kernel-reiser4.patch

   1 diff -urN linux-2.6.35.orig/Documentation/Changes linux-2.6.35/Documentation/Changes
   2 --- linux-2.6.35.orig/Documentation/Changes     2010-08-02 00:11:14.000000000 +0200
   3 +++ linux-2.6.35/Documentation/Changes  2010-08-04 15:44:57.000000000 +0200
   4 @@ -36,6 +36,7 @@
   5  o  e2fsprogs              1.41.4                  # e2fsck -V
   6  o  jfsutils               1.1.3                   # fsck.jfs -V
   7  o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
   8 +o  reiser4progs           1.0.0                   # fsck.reiser4 -V
   9  o  xfsprogs               2.6.0                   # xfs_db -V
  10  o  squashfs-tools         4.0                     # mksquashfs -version
  11  o  btrfs-progs            0.18                    # btrfsck
  12 @@ -157,6 +158,13 @@
  13  versions of mkreiserfs, resize_reiserfs, debugreiserfs and
  14  reiserfsck. These utils work on both i386 and alpha platforms.
  15
  16 +Reiser4progs
  17 +------------
  18 +
  19 +The reiser4progs package contains utilities for the reiser4 file system.
  20 +Detailed instructions are provided in the README file located at:
  21 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
  22 +
  23  Xfsprogs
  24  --------
  25
  26 @@ -345,6 +353,10 @@
  27  -------------
  28  o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
  29
  30 +Reiser4progs
  31 +------------
  32 +o  <ftp://ftp.namesys.com/pub/reiser4progs/>
  33 +
  34  Xfsprogs
  35  --------
  36  o  <ftp://oss.sgi.com/projects/xfs/download/>
  37 diff -urN linux-2.6.35.orig/Documentation/filesystems/reiser4.txt linux-2.6.35/Documentation/filesystems/reiser4.txt
  38 --- linux-2.6.35.orig/Documentation/filesystems/reiser4.txt     1970-01-01 01:00:00.000000000 +0100
  39 +++ linux-2.6.35/Documentation/filesystems/reiser4.txt  2010-08-04 15:44:57.000000000 +0200
  40 @@ -0,0 +1,75 @@
  41 +Reiser4 filesystem
  42 +==================
  43 +Reiser4 is a file system based on dancing tree algorithms, and is
  44 +described at http://www.namesys.com
  45 +
  46 +
  47 +References
  48 +==========
  49 +web page               http://namesys.com/v4/v4.html
  50 +source code            ftp://ftp.namesys.com/pub/reiser4-for-2.6/
  51 +userland tools         ftp://ftp.namesys.com/pub/reiser4progs/
  52 +install page           http://www.namesys.com/install_v4.html
  53 +
  54 +Compile options
  55 +===============
  56 +Enable reiser4 debug mode
  57 +       This checks everything imaginable while reiser4
  58 +       runs
  59 +
  60 +Mount options
  61 +=============
  62 +tmgr.atom_max_size=N
  63 +       Atoms containing more than N blocks will be forced to commit.
  64 +       N is decimal.
  65 +       Default is nr_free_pagecache_pages() / 2 at mount time.
  66 +
  67 +tmgr.atom_max_age=N
  68 +       Atoms older than N seconds will be forced to commit. N is decimal.
  69 +       Default is 600.
  70 +
  71 +tmgr.atom_max_flushers=N
  72 +       Limit of concurrent flushers for one atom. 0 means no limit.
  73 +       Default is 0.
  74 +
  75 +tree.cbk_cache.nr_slots=N
  76 +       Number of slots in the cbk cache.
  77 +
  78 +flush.relocate_threshold=N
  79 +       If flush finds more than N adjacent dirty leaf-level blocks it
  80 +       will force them to be relocated.
  81 +       Default is 64.
  82 +
  83 +flush.relocate_distance=N
  84 +       If flush can find a block allocation closer than at most
  85 +       N from the preceder it will relocate to that position.
  86 +       Default is 64.
  87 +
  88 +flush.scan_maxnodes=N
  89 +       The maximum number of nodes to scan left on a level during
  90 +       flush.
  91 +       Default is 10000.
  92 +
  93 +optimal_io_size=N
  94 +       Preferred IO size. This value is used to set st_blksize of
  95 +       struct stat.
  96 +       Default is 65536.
  97 +
  98 +bsdgroups
  99 +       Turn on BSD-style gid assignment.
 100 +
 101 +32bittimes
 102 +       By default files in reiser4 have 64 bit timestamps. Files
 103 +       created when filesystem is mounted with 32bittimes mount
 104 +       option will get 32 bit timestamps.
 105 +
 106 +mtflush
 107 +       Turn off concurrent flushing.
 108 +
 109 +nopseudo
 110 +       Disable pseudo files support. See
 111 +       http://namesys.com/v4/pseudo.html for more about pseudo files.
 112 +
 113 +dont_load_bitmap
 114 +       Don't load all bitmap blocks at mount time, it is useful for
 115 +       machines with tiny RAM and large disks.
 116 diff -urN linux-2.6.35.orig/fs/fs-writeback.c linux-2.6.35/fs/fs-writeback.c
 117 --- linux-2.6.35.orig/fs/fs-writeback.c 2010-08-02 00:11:14.000000000 +0200
 118 +++ linux-2.6.35/fs/fs-writeback.c      2010-08-04 20:33:23.000000000 +0200
 119 @@ -461,8 +461,10 @@
 120   * Return 1, if the caller writeback routine should be
 121   * interrupted. Otherwise return 0.
 122   */
 123 -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 124 -               struct writeback_control *wbc, bool only_this_sb)
 125 +int generic_writeback_sb_inodes(struct super_block *sb,
 126 +                               struct bdi_writeback *wb,
 127 +                               struct writeback_control *wbc,
 128 +                               bool only_this_sb)
 129  {
 130         while (!list_empty(&wb->b_io)) {
 131                 long pages_skipped;
 132 @@ -544,7 +546,10 @@
 133                         requeue_io(inode);
 134                         continue;
 135                 }
 136 -               ret = writeback_sb_inodes(sb, wb, wbc, false);
 137 +               if (sb->s_op->writeback_inodes)
 138 +                       ret = sb->s_op->writeback_inodes(sb, wb, wbc, false);
 139 +               else
 140 +                       ret = generic_writeback_sb_inodes(sb, wb, wbc, false);
 141                 drop_super(sb);
 142
 143                 if (ret)
 144 @@ -553,6 +558,7 @@
 145         spin_unlock(&inode_lock);
 146         /* Leave any unwritten inodes on b_io */
 147  }
 148 +EXPORT_SYMBOL(writeback_inodes_wb);
 149
 150  static void __writeback_inodes_sb(struct super_block *sb,
 151                 struct bdi_writeback *wb, struct writeback_control *wbc)
 152 @@ -563,7 +569,7 @@
 153         spin_lock(&inode_lock);
 154         if (!wbc->for_kupdate || list_empty(&wb->b_io))
 155                 queue_io(wb, wbc->older_than_this);
 156 -       writeback_sb_inodes(sb, wb, wbc, true);
 157 +       generic_writeback_sb_inodes(sb, wb, wbc, true);
 158         spin_unlock(&inode_lock);
 159  }
 160
 161 @@ -680,6 +686,32 @@
 162
 163         return wrote;
 164  }
 165 +EXPORT_SYMBOL(generic_writeback_sb_inodes);
 166 +
 167 +/*
 168 + * This function is for file systems which have their
 169 + * own means of periodic write-out of old data.
 170 + * NOTE: inode_lock must be held by the caller.
 171 + *
 172 + * Starting from the tail of wb->b_io, redirty every inode that belongs
 173 + * to @sb; stop at the first inode of another sb or when b_io is empty.
 174 + */
 175 +void writeback_skip_sb_inodes(struct super_block *sb,
 176 +                             struct bdi_writeback *wb)
 177 +{
 178 +       while (1) {
 179 +               struct inode *inode;
 180 +
 181 +               if (list_empty(&wb->b_io))
 182 +                       break;
 183 +               inode = list_entry(wb->b_io.prev, struct inode, i_list);
 184 +               if (sb != inode->i_sb)
 185 +                       break;
 186 +               redirty_tail(inode);
 187 +       }
 188 +}
 189 +EXPORT_SYMBOL(writeback_skip_sb_inodes);
 190 +
 191
 192  /*
 193   * Return the next wb_writeback_work struct that hasn't been processed yet.
 194 @@ -1159,3 +1191,12 @@
 195         return ret;
 196  }
 197  EXPORT_SYMBOL(sync_inode);
 198 +/*
 199 + * Local variables:
 200 + * c-indentation-style: "K&R"
 201 + * mode-name: "LC"
 202 + * c-basic-offset: 8
 203 + * tab-width: 8
 204 + * fill-column: 79
 205 + * End:
 206 + */
 207 diff -urN linux-2.6.35.orig/fs/inode.c linux-2.6.35/fs/inode.c
 208 --- linux-2.6.35.orig/fs/inode.c        2010-08-02 00:11:14.000000000 +0200
 209 +++ linux-2.6.35/fs/inode.c     2010-08-04 15:44:57.000000000 +0200
 210 @@ -84,6 +84,7 @@
 211   * the i_state of an inode while it is in use..
 212   */
 213  DEFINE_SPINLOCK(inode_lock);
 214 +EXPORT_SYMBOL_GPL(inode_lock);
 215
 216  /*
 217   * iprune_sem provides exclusion between the kswapd or try_to_free_pages
 218 diff -urN linux-2.6.35.orig/fs/Kconfig linux-2.6.35/fs/Kconfig
 219 --- linux-2.6.35.orig/fs/Kconfig        2010-08-02 00:11:14.000000000 +0200
 220 +++ linux-2.6.35/fs/Kconfig     2010-08-04 15:44:57.000000000 +0200
 221 @@ -27,6 +27,7 @@
 222         default y if EXT4_FS=y && EXT4_FS_XATTR
 223         default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 224
 225 +source "fs/reiser4/Kconfig"
 226  source "fs/reiserfs/Kconfig"
 227  source "fs/jfs/Kconfig"
 228
 229 diff -urN linux-2.6.35.orig/fs/Makefile linux-2.6.35/fs/Makefile
 230 --- linux-2.6.35.orig/fs/Makefile       2010-08-02 00:11:14.000000000 +0200
 231 +++ linux-2.6.35/fs/Makefile    2010-08-04 15:44:57.000000000 +0200
 232 @@ -65,6 +65,7 @@
 233  # Do not add any filesystems before this line
 234  obj-$(CONFIG_FSCACHE)          += fscache/
 235  obj-$(CONFIG_REISERFS_FS)      += reiserfs/
 236 +obj-$(CONFIG_REISER4_FS)       += reiser4/
 237  obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
 238  obj-$(CONFIG_EXT2_FS)          += ext2/
 239  # We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
 240 diff -urN linux-2.6.35.orig/fs/reiser4/as_ops.c linux-2.6.35/fs/reiser4/as_ops.c
 241 --- linux-2.6.35.orig/fs/reiser4/as_ops.c       1970-01-01 01:00:00.000000000 +0100
 242 +++ linux-2.6.35/fs/reiser4/as_ops.c    2010-08-04 15:44:57.000000000 +0200
 243 @@ -0,0 +1,337 @@
 244 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
 245 +
 246 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
 247 +
 248 +#include "forward.h"
 249 +#include "debug.h"
 250 +#include "dformat.h"
 251 +#include "coord.h"
 252 +#include "plugin/item/item.h"
 253 +#include "plugin/file/file.h"
 254 +#include "plugin/security/perm.h"
 255 +#include "plugin/disk_format/disk_format.h"
 256 +#include "plugin/plugin.h"
 257 +#include "plugin/plugin_set.h"
 258 +#include "plugin/object.h"
 259 +#include "txnmgr.h"
 260 +#include "jnode.h"
 261 +#include "znode.h"
 262 +#include "block_alloc.h"
 263 +#include "tree.h"
 264 +#include "vfs_ops.h"
 265 +#include "inode.h"
 266 +#include "page_cache.h"
 267 +#include "ktxnmgrd.h"
 268 +#include "super.h"
 269 +#include "reiser4.h"
 270 +#include "entd.h"
 271 +
 272 +#include <linux/profile.h>
 273 +#include <linux/types.h>
 274 +#include <linux/mount.h>
 275 +#include <linux/vfs.h>
 276 +#include <linux/mm.h>
 277 +#include <linux/buffer_head.h>
 278 +#include <linux/dcache.h>
 279 +#include <linux/list.h>
 280 +#include <linux/pagemap.h>
 281 +#include <linux/slab.h>
 282 +#include <linux/seq_file.h>
 283 +#include <linux/init.h>
 284 +#include <linux/module.h>
 285 +#include <linux/writeback.h>
 286 +#include <linux/backing-dev.h>
 287 +#include <linux/quotaops.h>
 288 +#include <linux/security.h>
 289 +
 290 +/* address space operations */
 291 +
 292 +/**
 293 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
 294 + * @page: page to be dirtied
 295 + *
 296 + * Operation of struct address_space_operations. This implementation is used by
 297 + * unix and cryptcompress file plugins.
 298 + *
 299 + * This is called when a reiser4 page gets dirtied outside of reiser4, for
 300 + * example, when the dirty bit is moved from pte to physical page.
 301 + *
 302 + * Tags page in the mapping's page tree with special tag so that it is possible
 303 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
 304 + * capturing by an atom) later because it can not be done in the contexts where
 305 + * set_page_dirty is called.
 306 + */
 307 +int reiser4_set_page_dirty(struct page *page)
 308 +{
 309 +       /* page must be unformatted: its host is none of the fake inodes */
 310 +       assert("vs-1734", (page->mapping &&
 311 +                          page->mapping->host &&
 312 +                          reiser4_get_super_fake(page->mapping->host->i_sb) !=
 313 +                          page->mapping->host &&
 314 +                          reiser4_get_cc_fake(page->mapping->host->i_sb) !=
 315 +                          page->mapping->host &&
 316 +                          reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
 317 +                          page->mapping->host));
 318 +       return __set_page_dirty_nobuffers(page);
 319 +}
 320 +
 321 +/* ->invalidatepage method for reiser4 */
 322 +
 323 +/*
 324 + * this is called for each truncated page from
 325 + * truncate_inode_pages()->truncate_{complete,partial}_page().
 326 + *
 327 + * At the moment of call, page is under lock, and outstanding io (if any) has
 328 + * completed.
 329 + */
 330 +
 331 +/**
 332 + * reiser4_invalidatepage - handle a page that is being truncated
 333 + * @page: page to invalidate
 334 + * @offset: starting offset for partial invalidation
 335 + *
 336 + */
 337 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
 338 +{
 339 +       int ret = 0;
 340 +       reiser4_context *ctx;
 341 +       struct inode *inode;
 342 +       jnode *node;
 343 +
 344 +       /*
 345 +        * This is called to truncate file's page.
 346 +        *
 347 +        * Originally, reiser4 implemented truncate in a standard way
 348 +        * (vmtruncate() calls ->invalidatepage() on all truncated pages
 349 +        * first, then file system ->truncate() call-back is invoked).
 350 +        *
 351 +        * This led to the problem when ->invalidatepage() was called on a
 352 +        * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
 353 +        * process. That is, truncate was bypassing transactions. To avoid
 354 +        * this, try_capture_page_to_invalidate() call was added here.
 355 +        *
 356 +        * After many troubles with vmtruncate() based truncate (including
 357 +        * races with flush, tail conversion, etc.) it was re-written in the
 358 +        * top-to-bottom style: items are killed in reiser4_cut_tree_object()
 359 +        * and pages belonging to extent are invalidated in kill_hook_extent().
 360 +        * So probably now additional call to capture is not needed here.
 361 +        */
 362 +
 363 +       assert("nikita-3137", PageLocked(page));
 364 +       assert("nikita-3138", !PageWriteback(page));
 365 +       inode = page->mapping->host;
 366 +
 367 +       /*
 368 +        * ->invalidatepage() should only be called for the unformatted
 369 +        * jnodes. Destruction of all other types of jnodes is performed
 370 +        * separately. But, during some corner cases (like handling errors
 371 +        * during mount) it is simpler to let ->invalidatepage to be called on
 372 +        * them. Check for this, and do nothing.
 373 +        */
 374 +       if (reiser4_get_super_fake(inode->i_sb) == inode)
 375 +               return;
 376 +       if (reiser4_get_cc_fake(inode->i_sb) == inode)
 377 +               return;
 378 +       if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
 379 +               return;
 380 +       assert("vs-1426", PagePrivate(page));
 381 +       assert("vs-1427",
 382 +              page->mapping == jnode_get_mapping(jnode_by_page(page)));
 383 +       assert("", jprivate(page) != NULL);
 384 +       assert("", ergo(inode_file_plugin(inode) !=
 385 +                       file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
 386 +                       offset == 0));
 387 +
 388 +       ctx = reiser4_init_context(inode->i_sb);
 389 +       if (IS_ERR(ctx))
 390 +               return;
 391 +
 392 +       node = jprivate(page);
 393 +       spin_lock_jnode(node);
 394 +       if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
 395 +                         (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
 396 +               /* there is no need to capture */
 397 +               jref(node);
 398 +               JF_SET(node, JNODE_HEARD_BANSHEE);
 399 +               page_clear_jnode(page, node);
 400 +               reiser4_uncapture_jnode(node);
 401 +               unhash_unformatted_jnode(node);
 402 +               jput(node);
 403 +               reiser4_exit_context(ctx);
 404 +               return;
 405 +       }
 406 +       spin_unlock_jnode(node);
 407 +
 408 +       /* capture page being truncated. */
 409 +       ret = try_capture_page_to_invalidate(page);
 410 +       if (ret != 0)
 411 +               warning("nikita-3141", "Cannot capture: %i", ret);
 412 +
 413 +       if (offset == 0) {
 414 +               /* remove jnode from transaction and detach it from page. */
 415 +               jref(node);
 416 +               JF_SET(node, JNODE_HEARD_BANSHEE);
 417 +               /* page cannot be detached from jnode concurrently, because it
 418 +                * is locked */
 419 +               reiser4_uncapture_page(page);
 420 +
 421 +               /* this detaches page from jnode, so that jdelete will not try
 422 +                * to lock page which is already locked */
 423 +               spin_lock_jnode(node);
 424 +               page_clear_jnode(page, node);
 425 +               spin_unlock_jnode(node);
 426 +               unhash_unformatted_jnode(node);
 427 +
 428 +               jput(node);
 429 +       }
 430 +
 431 +       reiser4_exit_context(ctx);
 432 +}
 433 +
 434 +/* helper function called from reiser4_releasepage(). It returns true if jnode
 435 + * can be detached from its page and page released. */
 436 +int jnode_is_releasable(jnode * node/* node to check */)
 437 +{
 438 +       assert("nikita-2781", node != NULL);
 439 +       assert_spin_locked(&(node->guard));
 440 +       assert_spin_locked(&(node->load));
 441 +
 442 +       /* if some thread is currently using jnode page, the latter cannot be
 443 +        * detached */
 444 +       if (atomic_read(&node->d_count) != 0)
 445 +               return 0;
 446 +
 447 +       assert("vs-1214", !jnode_is_loaded(node));
 448 +
 449 +       /*
 450 +        * can only release page if real block number is assigned to it. Simple
 451 +        * check for ->atom wouldn't do, because it is possible for node to be
 452 +        * clean, not in atom yet, and still having fake block number. For
 453 +        * example, node just created in jinit_new().
 454 +        */
 455 +       if (reiser4_blocknr_is_fake(jnode_get_block(node)))
 456 +               return 0;
 457 +
 458 +       /*
 459 +        * pages prepared for write can not be released anyway, so avoid
 460 +        * detaching jnode from the page
 461 +        */
 462 +       if (JF_ISSET(node, JNODE_WRITE_PREPARED))
 463 +               return 0;
 464 +
 465 +       /*
 466 +        * dirty jnode cannot be released. It can however be submitted to disk
 467 +        * as part of early flushing, but only after getting flush-prepped.
 468 +        */
 469 +       if (JF_ISSET(node, JNODE_DIRTY))
 470 +               return 0;
 471 +
 472 +       /* overwrite set is only written by log writer. */
 473 +       if (JF_ISSET(node, JNODE_OVRWR))
 474 +               return 0;
 475 +
 476 +       /* jnode is already under writeback */
 477 +       if (JF_ISSET(node, JNODE_WRITEBACK))
 478 +               return 0;
 479 +
 480 +       /* don't flush bitmaps or journal records */
 481 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
 482 +               return 0;
 483 +
 484 +       return 1;
 485 +}
 486 +
 487 +/*
 488 + * ->releasepage method for reiser4
 489 + *
 490 + * This is called by VM scanner when it comes across clean page.  What we have
 491 + * to do here is to check whether page can really be released (freed that is)
 492 + * and if so, detach jnode from it and remove page from the page cache.
 493 + *
 494 + * Check for releasability is done by jnode_is_releasable() function.
 495 + */
 496 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
 497 +{
 498 +       jnode *node;
 499 +
 500 +       assert("nikita-2257", PagePrivate(page));
 501 +       assert("nikita-2259", PageLocked(page));
 502 +       assert("nikita-2892", !PageWriteback(page));
 503 +       assert("nikita-3019", reiser4_schedulable());
 504 +
 505 +       /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
 506 +          is not clear what to do in this case. A lot of deadlocks seem to be
 507 +          possible. */
 508 +
 509 +       node = jnode_by_page(page);
 510 +       assert("nikita-2258", node != NULL);
 511 +       assert("reiser4-4", page->mapping != NULL);
 512 +       assert("reiser4-5", page->mapping->host != NULL);
 513 +
 514 +       if (PageDirty(page))
 515 +               return 0;
 516 +
 517 +       /* extra page reference is used by reiser4 to protect
 518 +        * jnode<->page link from this ->releasepage(). */
 519 +       if (page_count(page) > 3)
 520 +               return 0;
 521 +
 522 +       /* jnode_is_releasable() needs jnode lock, because it looks at the
 523 +        * jnode fields; jload_lock is needed to avoid races with jload(). */
 524 +       spin_lock_jnode(node);
 525 +       spin_lock(&(node->load));
 526 +       if (jnode_is_releasable(node)) {
 527 +               struct address_space *mapping;
 528 +
 529 +               mapping = page->mapping;
 530 +               jref(node);
 531 +               /* there is no need to synchronize against
 532 +                * jnode_extent_write() here, because pages seen by
 533 +                * jnode_extent_write() are !releasable(). */
 534 +               page_clear_jnode(page, node);
 535 +               spin_unlock(&(node->load));
 536 +               spin_unlock_jnode(node);
 537 +
 538 +               /* we are under memory pressure so release jnode also. */
 539 +               jput(node);
 540 +
 541 +               return 1;
 542 +       } else {
 543 +               spin_unlock(&(node->load));
 544 +               spin_unlock_jnode(node);
 545 +               assert("nikita-3020", reiser4_schedulable());
 546 +               return 0;
 547 +       }
 548 +}
 549 +
 550 +int reiser4_readpage(struct file *file, struct page *page)
 551 +{
 552 +       assert("edward-1533", PageLocked(page));
 553 +       assert("edward-1534", !PageUptodate(page));
 554 +       assert("edward-1535", page->mapping && page->mapping->host);
 555 +       /* delegate to the ->readpage method of the host inode's file plugin */
 556 +       return inode_file_plugin(page->mapping->host)->readpage(file, page);
 557 +}
 558 +
 559 +int reiser4_readpages(struct file *file, struct address_space *mapping,
 560 +                     struct list_head *pages, unsigned nr_pages)
 561 +{
 562 +       return inode_file_plugin(mapping->host)->readpages(file, mapping,
 563 +                                                          pages, nr_pages);
 564 +}
 565 +
 566 +int reiser4_writepages(struct address_space *mapping,
 567 +                      struct writeback_control *wbc)
 568 +{
 569 +       return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
 570 +}
 571 +
 572 +/* Make Linus happy.
 573 +   Local variables:
 574 +   c-indentation-style: "K&R"
 575 +   mode-name: "LC"
 576 +   c-basic-offset: 8
 577 +   tab-width: 8
 578 +   fill-column: 120
 579 +   End:
 580 +*/
 581 diff -urN linux-2.6.35.orig/fs/reiser4/block_alloc.c linux-2.6.35/fs/reiser4/block_alloc.c
 582 --- linux-2.6.35.orig/fs/reiser4/block_alloc.c  1970-01-01 01:00:00.000000000 +0100
 583 +++ linux-2.6.35/fs/reiser4/block_alloc.c       2010-08-04 15:44:57.000000000 +0200
 584 @@ -0,0 +1,1142 @@
 585 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
 586 +reiser4/README */
 587 +
 588 +#include "debug.h"
 589 +#include "dformat.h"
 590 +#include "plugin/plugin.h"
 591 +#include "txnmgr.h"
 592 +#include "znode.h"
 593 +#include "block_alloc.h"
 594 +#include "tree.h"
 595 +#include "super.h"
 596 +
 597 +#include <linux/types.h>       /* for __u??  */
 598 +#include <linux/fs.h>          /* for struct super_block  */
 599 +#include <linux/spinlock.h>
 600 +
 601 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
 602 +
 603 +/* We need to be able to reserve enough disk space to ensure that an atomic
 604 +   operation will have enough disk space to flush (see flush.c and
 605 +   http://namesys.com/v4/v4.html) and commit it once it is started.
 606 +
 607 +   In our design a call for reserving disk space may fail but not an actual
 608 +   block allocation.
 609 +
 610 +   All free blocks, already allocated blocks, and all kinds of reserved blocks
 611 +   are counted in different per-fs block counters.
 612 +
 613 +   A reiser4 super block's set of block counters currently is:
 614 +
 615 +   free -- free blocks,
 616 +   used -- already allocated blocks,
 617 +
 618 +   grabbed -- initially reserved for performing an fs operation, those blocks
 619 +        are taken from free blocks, then grabbed disk space leaks from grabbed
 620 +        blocks counter to other counters like "fake allocated", "flush
 621 +        reserved", "used", the rest of not used grabbed space is returned to
 622 +        free space at the end of fs operation;
 623 +
 624 +   fake allocated -- counts all nodes without real disk block numbers assigned,
 625 +                   we have separate accounting for formatted and unformatted
 626 +                   nodes (for easier debugging);
 627 +
 628 +   flush reserved -- disk space needed for flushing and committing an atom.
 629 +                   Each dirty already allocated block could be written as a
 630 +                   part of atom's overwrite set or as a part of atom's
 631 +                   relocate set.  In both cases one additional block is needed,
 632 +                   it is used as a wandered block if we do overwrite or as a
 633 +                   new location for a relocated block.
 634 +
 635 +   In addition, blocks in some states are counted on per-thread and per-atom
 636 +   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
 637 +   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
 638 +   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
 639 +   blocks, which are reserved for flush processing and atom commit. */
 640 +
 641 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
 642 +   number of blocks to grab for most expensive case of balancing when the leaf
 643 +   node we insert new item to gets split and new leaf node is allocated.
 644 +
 645 +   So, we need to grab blocks for
 646 +
 647 +   1) one block for possible dirtying the node we insert an item to. That block
 648 +      would be used for node relocation at flush time or for allocating of a
 649 +      wandered one, it depends what will be a result (what set, relocate or
 650 +      overwrite the node gets assigned to) of the node processing by the flush
 651 +      algorithm.
 652 +
 653 +   2) one block for either allocating a new node, or dirtying of right or left
 654 +      clean neighbor, only one case may happen.
 655 +
 656 +   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
 657 +   of left neighbor, right neighbor, current node, and creation of new node.
 658 +   Have I forgotten something?  email me.
 659 +
 660 +   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
 661 +   counter and in the fs-wide one (both ctx->grabbed_blocks and
 662 +   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
 663 +   decremented by 2.
 664 +
 665 +   Suppose both two blocks were spent for dirtying of an already allocated clean
 666 +   node (one block went from "grabbed" to "flush reserved") and for new block
 667 +   allocating (one block went from "grabbed" to "fake allocated formatted").
 668 +
 669 +   Inserting of a child pointer to the parent node caused parent node to be
 670 +   split, the balancing code takes care about this grabbing necessary space
 671 +   immediately by calling reiser4_grab with BA_RESERVED flag set which means
 672 +   "can use the 5% reserved disk space".
 673 +
 674 +   At this moment insertion completes and grabbed blocks (if they were not used)
 675 +   should be returned to the free space counter.
 676 +
 677 +   However the atom life-cycle is not completed.  The atom had one "flush
 678 +   reserved" block added by our insertion and the new fake allocated node is
 679 +   counted as a "fake allocated formatted" one.  The atom has to be fully
 680 +   processed by flush before commit.  Suppose that the flush moved the first,
 681 +   already allocated node to the atom's overwrite list, the new fake allocated
 682 +   node, obviously, went into the atom relocate set.  The reiser4 flush
 683 +   allocates the new node using one unit from "fake allocated formatted"
 684 +   counter, the log writer uses one from "flush reserved" for wandered block
 685 +   allocation.
 686 +
 687 +   And, it is not the end.  When the wandered block is deallocated after the
 688 +   atom gets fully played (see wander.c for term description), the disk space
 689 +   occupied for it is returned to free blocks. */
 690 +
 691 +/* BLOCK NUMBERS */
 692 +
 693 +/* Any reiser4 node has a block number assigned to it.  We use these numbers for
 694 +   indexing in hash tables, so if a block has not yet been assigned a location
 695 +   on disk we need to give it a temporary fake block number.
 696 +
 697 +   Current implementation of reiser4 uses 64-bit integers for block numbers. We
 698 +   use highest bit in 64-bit block number to distinguish fake and real block
 699 +   numbers. So, only 63 bits may be used for addressing of real device
 700 +   blocks. That "fake" block numbers space is divided into subspaces of fake
 701 +   block numbers for data blocks and for shadow (working) bitmap blocks.
 702 +
 703 +   Fake block numbers for data blocks are generated by a cyclic counter, which
 704 +   gets incremented after each real block allocation. We assume that it is
 705 +   impossible to overload this counter during one transaction life. */
 706 +
 707 +/* Initialize a blocknr hint. */
 708 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
 709 +{
 710 +       memset(hint, 0, sizeof(reiser4_blocknr_hint));
 711 +}
 712 +
 713 +/* Release any resources of a blocknr hint. */
 714 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
 715 +{
 716 +/* No resources should be freed in current blocknr_hint implementation. */
 717 +}
 718 +
 719 +/* see above for explanation of fake block number.  */
 720 +/* Audited by: green(2002.06.11) */
 721 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
 722 +{
 723 +       /* The reason for not simply returning result of '&' operation is that
 724 +          while return value is (possibly 32bit) int,  the reiser4_block_nr is
 725 +          at least 64 bits long, and high bit (which is the only possible
 726 +          non zero bit after the masking) would be stripped off */
 727 +       return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
 728 +}
 729 +
 730 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
 731 +   arithmetic. Mostly, they are isolated so that the same assertions are not
 732 +   repeated in several places. */
 733 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
 734 +{
 735 +       BUG_ON(ctx->grabbed_blocks < count);
 736 +       assert("zam-527", ctx->grabbed_blocks >= count);
 737 +       ctx->grabbed_blocks -= count;
 738 +}
 739 +
 740 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
 741 +{
 742 +       ctx->grabbed_blocks += count;
 743 +}
 744 +
 745 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
 746 +{
 747 +       assert("zam-525", sbinfo->blocks_grabbed >= count);
 748 +       sbinfo->blocks_grabbed -= count;
 749 +}
 750 +
 751 +/* Decrease the counter of blocks reserved for flush in super block. */
 752 +static void
 753 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
 754 +{
 755 +       assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
 756 +       sbinfo->blocks_flush_reserved -= count;
 757 +}
 758 +
 759 +static void
 760 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
 761 +                          reiser4_ba_flags_t flags)
 762 +{
 763 +       if (flags & BA_FORMATTED) {
 764 +               assert("zam-806", sbinfo->blocks_fake_allocated >= count);
 765 +               sbinfo->blocks_fake_allocated -= count;
 766 +       } else {
 767 +               assert("zam-528",
 768 +                      sbinfo->blocks_fake_allocated_unformatted >= count);
 769 +               sbinfo->blocks_fake_allocated_unformatted -= count;
 770 +       }
 771 +}
 772 +
 773 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
 774 +{
 775 +       assert("zam-530",
 776 +              sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
 777 +       sbinfo->blocks_used -= count;
 778 +}
 779 +
 780 +static void
 781 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
 782 +{
 783 +       assert("edward-501", sbinfo->blocks_clustered >= count);
 784 +       sbinfo->blocks_clustered -= count;
 785 +}
 786 +
 787 +/* Increase the counter of blocks reserved for flush in atom. */
 788 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
 789 +{
 790 +       assert("zam-772", atom != NULL);
 791 +       assert_spin_locked(&(atom->alock));
 792 +       atom->flush_reserved += count;
 793 +}
 794 +
 795 +/* Decrease the counter of blocks reserved for flush in atom. */
 796 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
 797 +{
 798 +       assert("zam-774", atom != NULL);
 799 +       assert_spin_locked(&(atom->alock));
 800 +       assert("nikita-2790", atom->flush_reserved >= count);
 801 +       atom->flush_reserved -= count;
 802 +}
 803 +
 804 +/* super block has 7 counters: free, used, grabbed, fake allocated (formatted
 805 +   and unformatted), flush reserved and clustered. Their sum must be the
 806 +   number of blocks on a device. Returns 1 if so, logs and returns 0 if not */
 807 +int reiser4_check_block_counters(const struct super_block *super)
 808 +{
 809 +       __u64 sum;
 810 +
 811 +       sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
 812 +           reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
 813 +           reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
 814 +           reiser4_clustered_blocks(super);
 815 +       if (reiser4_block_count(super) != sum) {
 816 +               printk("super block counters: "
 817 +                      "used %llu, free %llu, "
 818 +                      "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
 819 +                      "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
 820 +                      (unsigned long long)reiser4_data_blocks(super),
 821 +                      (unsigned long long)reiser4_free_blocks(super),
 822 +                      (unsigned long long)reiser4_grabbed_blocks(super),
 823 +                      (unsigned long long)reiser4_fake_allocated(super),
 824 +                      (unsigned long long)
 825 +                      reiser4_fake_allocated_unformatted(super),
 826 +                      (unsigned long long)reiser4_flush_reserved(super),
 827 +                      (unsigned long long)reiser4_clustered_blocks(super),
 828 +                      (unsigned long long)sum,
 829 +                      (unsigned long long)reiser4_block_count(super));
 830 +               return 0;       /* counters do not add up */
 831 +       }
 832 +       return 1;
 833 +}
 834 +
 835 +/* Adjust "working" free blocks counter for number of blocks we are going to
 836 +   allocate.  Record number of grabbed blocks in fs-wide and per-thread
 837 +   counters.  This function should be called before bitmap scanning or
 838 +   allocating fake block numbers
 839 +
 840 +   @ctx             -- current reiser4 context (its super block is used);
 841 +   @count           -- number of blocks we reserve;
 842 +   @flags           -- BA_RESERVED allows use of the reserved area;
 843 +   @return          -- 0 if success,  -ENOSPC, if all
 844 +                       free blocks are reserved or already allocated.
 845 +*/
 846 +
 847 +static int
 848 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
 849 +{
 850 +       __u64 free_blocks;
 851 +       int ret = 0, use_reserved = flags & BA_RESERVED;
 852 +       reiser4_super_info_data *sbinfo;
 853 +
 854 +       assert("vs-1276", ctx == get_current_context());
 855 +
 856 +       /* Do not grab anything on ro-mounted fs. */
 857 +       if (rofs_super(ctx->super)) {
 858 +               ctx->grab_enabled = 0;
 859 +               return 0;
 860 +       }
 861 +
 862 +       sbinfo = get_super_private(ctx->super);
 863 +
 864 +       spin_lock_reiser4_super(sbinfo);
 865 +
 866 +       free_blocks = sbinfo->blocks_free;
 867 +
 868 +       if ((use_reserved && free_blocks < count) ||
 869 +           (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
 870 +               ret = RETERR(-ENOSPC);
 871 +               goto unlock_and_ret;
 872 +       }
 873 +
 874 +       add_to_ctx_grabbed(ctx, count);
 875 +
 876 +       sbinfo->blocks_grabbed += count;
 877 +       sbinfo->blocks_free -= count;
 878 +
 879 +#if REISER4_DEBUG
 880 +       if (ctx->grabbed_initially == 0)
 881 +               ctx->grabbed_initially = count;
 882 +#endif
 883 +
 884 +       assert("nikita-2986", reiser4_check_block_counters(ctx->super));
 885 +
 886 +       /* disable grab space in current context */
 887 +       ctx->grab_enabled = 0;
 888 +
 889 +unlock_and_ret:
 890 +       spin_unlock_reiser4_super(sbinfo);
 891 +
 892 +       return ret;
 893 +}
 894 +
 895 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
 896 +{
 897 +       int ret;
 898 +       reiser4_context *ctx;
 899 +
 900 +       assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
 901 +                                  lock_stack_isclean(get_current_lock_stack
 902 +                                                     ())));
 903 +       ctx = get_current_context();
 904 +       if (!(flags & BA_FORCE) && !is_grab_enabled(ctx))
 905 +               return 0;
 906 +
 907 +       ret = reiser4_grab(ctx, count, flags);
 908 +       if (ret == -ENOSPC) {
 909 +
 910 +               /* Try to commit all transactions if BA_CAN_COMMIT flag is
 911 +                  present, then retry the grab once */
 912 +               if (flags & BA_CAN_COMMIT) {
 913 +                       txnmgr_force_commit_all(ctx->super, 0);
 914 +                       ctx->grab_enabled = 1;
 915 +                       ret = reiser4_grab(ctx, count, flags);
 916 +               }
 917 +       }
 918 +       /*
 919 +        * allocation from reserved pool cannot fail. This is severe error.
 920 +        */
 921 +       assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
 922 +       return ret;
 923 +}
 924 +
 925 +/*
 926 + * SPACE RESERVED FOR UNLINK/TRUNCATE
 927 + *
 928 + * Unlink and truncate require space in transaction (to update stat data, at
 929 + * least). But we don't want rm(1) to fail with "No space on device" error.
 930 + *
 931 + * Solution is to reserve 5% of disk space for truncates and
 932 + * unlinks. Specifically, normal space grabbing requests don't grab space from
 933 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
 934 + * drain it. Per super block delete mutex is used to allow only one
 935 + * thread at a time to grab from reserved area.
 936 + *
 937 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
 938 + * flag.
 939 + *
 940 + */
 941 +
 942 +int reiser4_grab_reserved(struct super_block *super,
 943 +                         __u64 count, reiser4_ba_flags_t flags)
 944 +{
 945 +       reiser4_super_info_data *sbinfo = get_super_private(super);
 946 +
 947 +       assert("nikita-3175", flags & BA_CAN_COMMIT);
 948 +
 949 +       /* Check whether the delete mutex is already taken by us; we assume
 950 +        * that reading of a machine word is atomic. */
 951 +       if (sbinfo->delete_mutex_owner == current) {
 952 +               if (reiser4_grab_space
 953 +                   (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
 954 +                       warning("zam-1003",
 955 +                               "nested call of grab_reserved fails count=(%llu)",
 956 +                               (unsigned long long)count);
 957 +                       reiser4_release_reserved(super);
 958 +                       return RETERR(-ENOSPC);
 959 +               }
 960 +               return 0;
 961 +       }
 962 +
 963 +       if (reiser4_grab_space(count, flags)) {
 964 +               mutex_lock(&sbinfo->delete_mutex);
 965 +               assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
 966 +               sbinfo->delete_mutex_owner = current;
 967 +
 968 +               if (reiser4_grab_space(count, flags | BA_RESERVED)) {
 969 +                       warning("zam-833",
 970 +                               "reserved space is not enough (%llu)",
 971 +                               (unsigned long long)count);
 972 +                       reiser4_release_reserved(super);
 973 +                       return RETERR(-ENOSPC);
 974 +               }
 975 +       }
 976 +       return 0;
 977 +}
 978 +
 979 +void reiser4_release_reserved(struct super_block *super)
 980 +{
 981 +       reiser4_super_info_data *info;
 982 +
 983 +       info = get_super_private(super);
 984 +       if (info->delete_mutex_owner == current) {
 985 +               info->delete_mutex_owner = NULL;
 986 +               mutex_unlock(&info->delete_mutex);
 987 +       }
 988 +}
 989 +
 990 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
 991 +{
 992 +       reiser4_context *ctx;
 993 +       reiser4_super_info_data *sbinfo;
 994 +
 995 +       ctx = get_current_context();
 996 +       sub_from_ctx_grabbed(ctx, count);
 997 +
 998 +       sbinfo = get_super_private(ctx->super);
 999 +       spin_lock_reiser4_super(sbinfo);
1000 +
1001 +       sub_from_sb_grabbed(sbinfo, count);
1002 +       /* return sbinfo locked */
1003 +       return sbinfo;
1004 +}
1005 +
1006 +/* is called after a fake block number is allocated and pointer to that
1007 +   block is inserted into tree; accounts exactly one formatted block. */
1008 +static void grabbed2fake_allocated_formatted(void)
1009 +{
1010 +       reiser4_super_info_data *sbinfo;
1011 +
1012 +       sbinfo = grabbed2fake_allocated_head(1);
1013 +       sbinfo->blocks_fake_allocated++;
1014 +
1015 +       assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
1016 +
1017 +       spin_unlock_reiser4_super(sbinfo);
1018 +}
1019 +
1020 +/**
1021 + * grabbed2fake_allocated_unformatted - recount grabbed blocks as fake allocated
1022 + * @count: number of blocks to move to the unformatted fake allocated counter
1023 + *
1024 + */
1025 +static void grabbed2fake_allocated_unformatted(int count)
1026 +{
1027 +       reiser4_super_info_data *sbinfo;
1028 +
1029 +       sbinfo = grabbed2fake_allocated_head(count); /* returns locked */
1030 +       sbinfo->blocks_fake_allocated_unformatted += count;
1031 +
1032 +       assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
1033 +
1034 +       spin_unlock_reiser4_super(sbinfo);
1035 +}
1036 +
1037 +void grabbed2cluster_reserved(int count)
1038 +{
1039 +       reiser4_context *ctx;
1040 +       reiser4_super_info_data *sbinfo;
1041 +
1042 +       ctx = get_current_context();
1043 +       sub_from_ctx_grabbed(ctx, count);
1044 +
1045 +       sbinfo = get_super_private(ctx->super);
1046 +       spin_lock_reiser4_super(sbinfo);
1047 +
1048 +       sub_from_sb_grabbed(sbinfo, count);
1049 +       sbinfo->blocks_clustered += count;
1050 +
1051 +       assert("edward-504", reiser4_check_block_counters(ctx->super));
1052 +
1053 +       spin_unlock_reiser4_super(sbinfo);
1054 +}
1055 +
1056 +void cluster_reserved2grabbed(int count)
1057 +{
1058 +       reiser4_context *ctx;
1059 +       reiser4_super_info_data *sbinfo;
1060 +
1061 +       ctx = get_current_context();
1062 +
1063 +       sbinfo = get_super_private(ctx->super);
1064 +       spin_lock_reiser4_super(sbinfo);
1065 +
1066 +       sub_from_cluster_reserved(sbinfo, count);
1067 +       sbinfo->blocks_grabbed += count;
1068 +
1069 +       assert("edward-505", reiser4_check_block_counters(ctx->super));
1070 +
1071 +       spin_unlock_reiser4_super(sbinfo);
1072 +       add_to_ctx_grabbed(ctx, count);
1073 +}
1074 +
1075 +void cluster_reserved2free(int count)
1076 +{
1077 +       reiser4_context *ctx;
1078 +       reiser4_super_info_data *sbinfo;
1079 +
1080 +       ctx = get_current_context();
1081 +       sbinfo = get_super_private(ctx->super);
1082 +
1083 +       cluster_reserved2grabbed(count);
1084 +       grabbed2free(ctx, sbinfo, count);
1085 +}
1086 +
1087 +static DEFINE_SPINLOCK(fake_lock);
1088 +static reiser4_block_nr fake_gen = 0;
1089 +
1090 +/**
1091 + * assign_fake_blocknr - hand out fake block number(s) from the generator
1092 + * @blocknr: out parameter, receives the (first) fake block number
1093 + * @count: how far to advance the generator, i.e. how many numbers are taken
1094 + *
1095 + * Obtain a fake block number for new node which will be used to refer to
1096 + * this newly allocated node until real allocation is done.
1097 + */
1098 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1099 +{
1100 +       spin_lock(&fake_lock);
1101 +       *blocknr = fake_gen;
1102 +       fake_gen += count;
1103 +       spin_unlock(&fake_lock);
1104 +
1105 +       BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1106 +       /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1107 +       *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1108 +       assert("zam-394", zlook(current_tree, blocknr) == NULL);
1109 +}
1110 +
1111 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1112 +{
1113 +       assign_fake_blocknr(blocknr, 1);
1114 +       grabbed2fake_allocated_formatted();
1115 +       return 0;
1116 +}
1117 +
1118 +/**
1119 + * fake_blocknr_unformatted - get fake block numbers for unformatted nodes
1120 + * @count: number of fake numbers to get
1121 + *
1122 + * Allocates @count fake block numbers for jnodes; returns the first of them
1123 + */
1124 +reiser4_block_nr fake_blocknr_unformatted(int count)
1125 +{
1126 +       reiser4_block_nr blocknr;
1127 +
1128 +       assign_fake_blocknr(&blocknr, count);
1129 +       grabbed2fake_allocated_unformatted(count);
1130 +
1131 +       return blocknr;
1132 +}
1133 +
1134 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1135 +   follows grabbing of free disk space. */
1136 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1137 +                        __u64 count)
1138 +{
1139 +       sub_from_ctx_grabbed(ctx, count);
1140 +
1141 +       spin_lock_reiser4_super(sbinfo);
1142 +
1143 +       sub_from_sb_grabbed(sbinfo, count);
1144 +       sbinfo->blocks_used += count;
1145 +
1146 +       assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1147 +
1148 +       spin_unlock_reiser4_super(sbinfo);
1149 +}
1150 +
1151 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1152 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1153 +                               reiser4_ba_flags_t flags)
1154 +{
1155 +       spin_lock_reiser4_super(sbinfo);
1156 +
1157 +       sub_from_sb_fake_allocated(sbinfo, count, flags);
1158 +       sbinfo->blocks_used += count;
1159 +
1160 +       assert("nikita-2680",
1161 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1162 +
1163 +       spin_unlock_reiser4_super(sbinfo);
1164 +}
1165 +
1166 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1167 +{
1168 +       reiser4_super_info_data *sbinfo;
1169 +
1170 +       assert("zam-787", atom != NULL);
1171 +       assert_spin_locked(&(atom->alock));
1172 +
1173 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1174 +
1175 +       sbinfo = get_current_super_private();
1176 +       spin_lock_reiser4_super(sbinfo);
1177 +
1178 +       sub_from_sb_flush_reserved(sbinfo, count);
1179 +       sbinfo->blocks_used += count;
1180 +
1181 +       assert("zam-789",
1182 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1183 +
1184 +       spin_unlock_reiser4_super(sbinfo);
1185 +}
1186 +
1187 +/* update the per fs  blocknr hint default value. */
1188 +void
1189 +update_blocknr_hint_default(const struct super_block *s,
1190 +                           const reiser4_block_nr * block)
1191 +{
1192 +       reiser4_super_info_data *sbinfo = get_super_private(s);
1193 +
1194 +       assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1195 +
1196 +       spin_lock_reiser4_super(sbinfo);
1197 +       if (*block < sbinfo->block_count) {
1198 +               sbinfo->blocknr_hint_default = *block;
1199 +       } else {
1200 +               warning("zam-676",
1201 +                       "block number %llu is too large to be used in a blocknr hint\n",
1202 +                       (unsigned long long)*block);
1203 +               dump_stack();
1204 +               DEBUGON(1);
1205 +       }
1206 +       spin_unlock_reiser4_super(sbinfo);
1207 +}
1208 +
1209 +/* get current value of the default blocknr hint. */
1210 +void get_blocknr_hint_default(reiser4_block_nr * result)
1211 +{
1212 +       reiser4_super_info_data *sbinfo = get_current_super_private();
1213 +
1214 +       spin_lock_reiser4_super(sbinfo);
1215 +       *result = sbinfo->blocknr_hint_default;
1216 +       assert("zam-677", *result < sbinfo->block_count);
1217 +       spin_unlock_reiser4_super(sbinfo);
1218 +}
1219 +
1220 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1221 + * method. Blocks are allocated in one contiguous disk region. The plugin
1222 + * independent part accounts blocks by subtracting allocated amount from grabbed
1223 + * or fake block counter and add the same amount to the counter of allocated
1224 + * blocks.
1225 + *
1226 + * @hint -- a reiser4 blocknr hint object which contains further block
1227 + *          allocation hints and parameters (search start, a stage of block
1228 + *          which will be mapped to disk, etc.),
1229 + * @blk  -- an out parameter for the beginning of the allocated region,
1230 + * @len  -- in/out parameter, it should contain the maximum number of allocated
1231 + *          blocks, after block allocation completes, it contains the length of
1232 + *          allocated disk region.
1233 + * @flags -- see reiser4_ba_flags_t description.
1234 + *
1235 + * @return -- 0 if success, error code otherwise.
1236 + */
1237 +int
1238 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1239 +                    reiser4_block_nr * len, reiser4_ba_flags_t flags)
1240 +{
1241 +       __u64 needed = *len;
1242 +       reiser4_context *ctx;
1243 +       reiser4_super_info_data *sbinfo;
1244 +       int ret;
1245 +
1246 +       assert("zam-986", hint != NULL);
1247 +
1248 +       ctx = get_current_context();
1249 +       sbinfo = get_super_private(ctx->super);
1250 +
1251 +       /* For write-optimized data we use default search start value, which is
1252 +        * close to last write location. */
1253 +       if (flags & BA_USE_DEFAULT_SEARCH_START)
1254 +               get_blocknr_hint_default(&hint->blk);
1255 +
1256 +       /* VITALY: allocator should grab this for internal/tx-lists/similar
1257 +          only. */
1258 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)?*/
1259 +       if (hint->block_stage == BLOCK_NOT_COUNTED) {
1260 +               ret = reiser4_grab_space_force(*len, flags);
1261 +               if (ret != 0)
1262 +                       return ret;
1263 +       }
1264 +
1265 +       ret =
1266 +           sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1267 +                           hint, (int)needed, blk, len);
1268 +
1269 +       if (!ret) {
1270 +               assert("zam-680", *blk < reiser4_block_count(ctx->super));
1271 +               assert("zam-681",
1272 +                      *blk + *len <= reiser4_block_count(ctx->super));
1273 +
1274 +               if (flags & BA_PERMANENT) {
1275 +                       /* we assume that current atom exists at this moment */
1276 +                       txn_atom *atom = get_current_atom_locked();
1277 +                       atom->nr_blocks_allocated += *len;
1278 +                       spin_unlock_atom(atom);
1279 +               }
1280 +
1281 +               switch (hint->block_stage) {
1282 +               case BLOCK_NOT_COUNTED:
1283 +               case BLOCK_GRABBED:
1284 +                       grabbed2used(ctx, sbinfo, *len);
1285 +                       break;
1286 +               case BLOCK_UNALLOCATED:
1287 +                       fake_allocated2used(sbinfo, *len, flags);
1288 +                       break;
1289 +               case BLOCK_FLUSH_RESERVED:
1290 +                       {
1291 +                               txn_atom *atom = get_current_atom_locked();
1292 +                               flush_reserved2used(atom, *len);
1293 +                               spin_unlock_atom(atom);
1294 +                       }
1295 +                       break;
1296 +               default:
1297 +                       impossible("zam-531", "wrong block stage");
1298 +               }
1299 +       } else {
1300 +               assert("zam-821",
1301 +                      ergo(hint->max_dist == 0
1302 +                           && !hint->backward, ret != -ENOSPC));
1303 +               if (hint->block_stage == BLOCK_NOT_COUNTED)
1304 +                       grabbed2free(ctx, sbinfo, needed);
1305 +       }
1306 +
1307 +       return ret;
1308 +}
1309 +
1310 +/* used -> fake_allocated -> grabbed -> free */
1311 +
1312 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1313 +   disk */
1314 +static void
1315 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1316 +                   int formatted)
1317 +{
1318 +       spin_lock_reiser4_super(sbinfo);
1319 +
1320 +       if (formatted)
1321 +               sbinfo->blocks_fake_allocated += count;
1322 +       else
1323 +               sbinfo->blocks_fake_allocated_unformatted += count;
1324 +
1325 +       sub_from_sb_used(sbinfo, count);
1326 +
1327 +       assert("nikita-2681",
1328 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1329 +
1330 +       spin_unlock_reiser4_super(sbinfo);
1331 +}
1332 +
1333 +static void
1334 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1335 +                   __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1336 +{
1337 +       assert("nikita-2791", atom != NULL);
1338 +       assert_spin_locked(&(atom->alock));
1339 +
1340 +       add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1341 +
1342 +       spin_lock_reiser4_super(sbinfo);
1343 +
1344 +       sbinfo->blocks_flush_reserved += count;
1345 +       /*add_to_sb_flush_reserved(sbinfo, count); */
1346 +       sub_from_sb_used(sbinfo, count);
1347 +
1348 +       assert("nikita-2681",
1349 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1350 +
1351 +       spin_unlock_reiser4_super(sbinfo);
1352 +}
1353 +
1354 +/* disk space, virtually used by fake block numbers is counted as "grabbed"
1355 +   again. */
1356 +static void
1357 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1358 +                      __u64 count, reiser4_ba_flags_t flags)
1359 +{
1360 +       add_to_ctx_grabbed(ctx, count);
1361 +
1362 +       spin_lock_reiser4_super(sbinfo);
1363 +
1364 +       assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1365 +
1366 +       sbinfo->blocks_grabbed += count;
1367 +       sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1368 +
1369 +       assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1370 +
1371 +       spin_unlock_reiser4_super(sbinfo);
1372 +}
1373 +
1374 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1375 +{
1376 +       reiser4_context *ctx;
1377 +       reiser4_super_info_data *sbinfo;
1378 +
1379 +       ctx = get_current_context();
1380 +       sbinfo = get_super_private(ctx->super);
1381 +
1382 +       fake_allocated2grabbed(ctx, sbinfo, count, flags);
1383 +       grabbed2free(ctx, sbinfo, count);
1384 +}
1385 +
1386 +void grabbed2free_mark(__u64 mark)
1387 +{
1388 +       reiser4_context *ctx;
1389 +       reiser4_super_info_data *sbinfo;
1390 +
1391 +       ctx = get_current_context();
1392 +       sbinfo = get_super_private(ctx->super);
1393 +
1394 +       assert("nikita-3007", (__s64) mark >= 0);
1395 +       assert("nikita-3006", ctx->grabbed_blocks >= mark);
1396 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1397 +}
1398 +
1399 +/**
1400 + * grabbed2free - adjust grabbed and free block counters
1401 + * @ctx: context to update grabbed block counter of
1402 + * @sbinfo: super block to update grabbed and free block counters of
1403 + * @count: number of blocks to adjust counters by
1404 + *
1405 + * Decreases context's and per filesystem's counters of grabbed
1406 + * blocks. Increases per filesystem's counter of free blocks.
1407 + */
1408 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1409 +                 __u64 count)
1410 +{
1411 +       sub_from_ctx_grabbed(ctx, count);
1412 +
1413 +       spin_lock_reiser4_super(sbinfo);
1414 +
1415 +       sub_from_sb_grabbed(sbinfo, count);
1416 +       sbinfo->blocks_free += count;
1417 +       assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1418 +
1419 +       spin_unlock_reiser4_super(sbinfo);
1420 +}
1421 +
1422 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1423 +{
1424 +       reiser4_context *ctx;
1425 +       reiser4_super_info_data *sbinfo;
1426 +
1427 +       assert("vs-1095", atom);
1428 +
1429 +       ctx = get_current_context();
1430 +       sbinfo = get_super_private(ctx->super);
1431 +
1432 +       sub_from_ctx_grabbed(ctx, count);
1433 +
1434 +       add_to_atom_flush_reserved_nolock(atom, count);
1435 +
1436 +       spin_lock_reiser4_super(sbinfo);
1437 +
1438 +       sbinfo->blocks_flush_reserved += count;
1439 +       sub_from_sb_grabbed(sbinfo, count);
1440 +
1441 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
1442 +
1443 +       spin_unlock_reiser4_super(sbinfo);
1444 +}
1445 +
1446 +void grabbed2flush_reserved(__u64 count)
1447 +{
1448 +       txn_atom *atom = get_current_atom_locked();
1449 +
1450 +       grabbed2flush_reserved_nolock(atom, count);
1451 +
1452 +       spin_unlock_atom(atom);
1453 +}
1454 +
1455 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1456 +{
1457 +       reiser4_context *ctx;
1458 +       reiser4_super_info_data *sbinfo;
1459 +
1460 +       assert("nikita-2788", atom != NULL);
1461 +       assert_spin_locked(&(atom->alock));
1462 +
1463 +       ctx = get_current_context();
1464 +       sbinfo = get_super_private(ctx->super);
1465 +
1466 +       add_to_ctx_grabbed(ctx, count);
1467 +
1468 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1469 +
1470 +       spin_lock_reiser4_super(sbinfo);
1471 +
1472 +       sbinfo->blocks_grabbed += count;
1473 +       sub_from_sb_flush_reserved(sbinfo, count);
1474 +
1475 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
1476 +
1477 +       spin_unlock_reiser4_super(sbinfo);
1478 +}
1479 +
1480 +/**
1481 + * all_grabbed2free - releases all blocks grabbed in context
1482 + *
1483 + * Decreases context's and super block's grabbed block counters by number of
1484 + * blocks grabbed by current context and increases super block's free block
1485 + * counter correspondingly.
1486 + */
1487 +void all_grabbed2free(void)
1488 +{
1489 +       reiser4_context *ctx = get_current_context();
1490 +
1491 +       grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1492 +}
1493 +
1494 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1495 +   after freeing, @count blocks become "grabbed". */
1496 +static void
1497 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1498 +            __u64 count)
1499 +{
1500 +       add_to_ctx_grabbed(ctx, count);
1501 +
1502 +       spin_lock_reiser4_super(sbinfo);
1503 +
1504 +       sbinfo->blocks_grabbed += count;
1505 +       sub_from_sb_used(sbinfo, count);
1506 +
1507 +       assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1508 +
1509 +       spin_unlock_reiser4_super(sbinfo);
1510 +}
1511 +
1512 +/* this used to be done through used2grabbed and grabbed2free*/
1513 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1514 +{
1515 +       spin_lock_reiser4_super(sbinfo);
1516 +
1517 +       sbinfo->blocks_free += count;
1518 +       sub_from_sb_used(sbinfo, count);
1519 +
1520 +       assert("nikita-2685",
1521 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1522 +
1523 +       spin_unlock_reiser4_super(sbinfo);
1524 +}
1525 +
1526 +#if REISER4_DEBUG
1527 +
1528 +/* check "allocated" state of given block range */
1529 +static void
1530 +reiser4_check_blocks(const reiser4_block_nr * start,
1531 +                    const reiser4_block_nr * len, int desired)
1532 +{
1533 +       sa_check_blocks(start, len, desired);
1534 +}
1535 +
1536 +/* check "allocated" state of given block */
1537 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1538 +{
1539 +       const reiser4_block_nr one = 1;
1540 +
1541 +       reiser4_check_blocks(block, &one, desired);
1542 +}
1543 +
1544 +#endif
1545 +
1546 +/* Blocks deallocation function may do an actual deallocation through space
1547 +   allocation plugin or store deleted block numbers in atom's delete_set data
1548 +   structure, depending on whether BA_DEFER is set in @flags. */
1549 +
1550 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks
1551 +   which will be deleted from WORKING bitmap. They might be just unmapped from
1552 +   disk, or freed but disk space is still grabbed by current thread, or these
1553 +   blocks must not be counted in any reiser4 sb block counters,
1554 +   see block_stage_t comment */
1555 +
1556 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1557 +   distinguish blocks allocated for unformatted and formatted nodes */
1558 +
1559 +int
1560 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1561 +                      const reiser4_block_nr * len,
1562 +                      block_stage_t target_stage, reiser4_ba_flags_t flags)
1563 +{
1564 +       txn_atom *atom = NULL;
1565 +       int ret;
1566 +       reiser4_context *ctx;
1567 +       reiser4_super_info_data *sbinfo;
1568 +
1569 +       ctx = get_current_context();
1570 +       sbinfo = get_super_private(ctx->super);
1571 +
1572 +       if (REISER4_DEBUG) {
1573 +               assert("zam-431", *len != 0);
1574 +               assert("zam-432", *start != 0);
1575 +               assert("zam-558", !reiser4_blocknr_is_fake(start));
1576 +
1577 +               spin_lock_reiser4_super(sbinfo);
1578 +               assert("zam-562", *start < sbinfo->block_count);
1579 +               spin_unlock_reiser4_super(sbinfo);
1580 +       }
1581 +
1582 +       if (flags & BA_DEFER) {
1583 +               blocknr_set_entry *bsep = NULL;
1584 +
1585 +               /* storing deleted block numbers in a blocknr set
1586 +                  datastructure for further actual deletion */
1587 +               do {
1588 +                       atom = get_current_atom_locked();
1589 +                       assert("zam-430", atom != NULL);
1590 +
1591 +                       ret =
1592 +                           blocknr_set_add_extent(atom, &atom->delete_set,
1593 +                                                  &bsep, start, len);
1594 +
1595 +                       if (ret == -ENOMEM)
1596 +                               return ret;
1597 +
1598 +                       /* This loop might spin at most two times */
1599 +               } while (ret == -E_REPEAT);
1600 +
1601 +               assert("zam-477", ret == 0);
1602 +               assert("zam-433", atom != NULL);
1603 +
1604 +               spin_unlock_atom(atom);
1605 +
1606 +       } else {
1607 +               assert("zam-425", get_current_super_private() != NULL);
1608 +               sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1609 +                                 *start, *len);
1610 +
1611 +               if (flags & BA_PERMANENT) {
1612 +                       /* These blocks were counted as allocated, we have to
1613 +                        * revert it back if allocation is discarded. */
1614 +                       txn_atom *atom = get_current_atom_locked();
1615 +                       atom->nr_blocks_allocated -= *len;
1616 +                       spin_unlock_atom(atom);
1617 +               }
1618 +
1619 +               switch (target_stage) {
1620 +               case BLOCK_NOT_COUNTED:
1621 +                       assert("vs-960", flags & BA_FORMATTED);
1622 +                       /* VITALY: This is what was grabbed for
1623 +                          internal/tx-lists/similar only */
1624 +                       used2free(sbinfo, *len);
1625 +                       break;
1626 +
1627 +               case BLOCK_GRABBED:
1628 +                       used2grabbed(ctx, sbinfo, *len);
1629 +                       break;
1630 +
1631 +               case BLOCK_UNALLOCATED:
1632 +                       used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1633 +                       break;
1634 +
1635 +               case BLOCK_FLUSH_RESERVED:{
1636 +                               txn_atom *atom;
1637 +
1638 +                               atom = get_current_atom_locked();
1639 +                               used2flush_reserved(sbinfo, atom, *len,
1640 +                                                   flags & BA_FORMATTED);
1641 +                               spin_unlock_atom(atom);
1642 +                               break;
1643 +                       }
1644 +               default:
1645 +                       impossible("zam-532", "wrong block stage");
1646 +               }
1647 +       }
1648 +
1649 +       return 0;
1650 +}
1651 +
1652 +/* wrappers for block allocator plugin methods */
1653 +int reiser4_pre_commit_hook(void)
1654 +{
1655 +       assert("zam-502", get_current_super_private() != NULL);
1656 +       sa_pre_commit_hook();
1657 +       return 0;
1658 +}
1659 +
1660 +/* an actor which applies delete set to block allocator data */
1661 +static int
1662 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1663 +          const reiser4_block_nr * b, void *data UNUSED_ARG)
1664 +{
1665 +       reiser4_context *ctx;
1666 +       reiser4_super_info_data *sbinfo;
1667 +
1668 +       __u64 len = 1;
1669 +
1670 +       ctx = get_current_context();
1671 +       sbinfo = get_super_private(ctx->super);
1672 +
1673 +       assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1674 +       assert("zam-552", sbinfo != NULL);
1675 +
1676 +       if (b != NULL)
1677 +               len = *b;
1678 +
1679 +       if (REISER4_DEBUG) {
1680 +               spin_lock_reiser4_super(sbinfo);
1681 +
1682 +               assert("zam-554", *a < reiser4_block_count(ctx->super));
1683 +               assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1684 +
1685 +               spin_unlock_reiser4_super(sbinfo);
1686 +       }
1687 +
1688 +       sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1689 +       /* adjust sb block counters */
1690 +       used2free(sbinfo, len);
1691 +       return 0;
1692 +}
1693 +
1694 +void reiser4_post_commit_hook(void)
1695 +{
1696 +       txn_atom *atom;
1697 +
1698 +       atom = get_current_atom_locked();
1699 +       assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1700 +       spin_unlock_atom(atom);
1701 +
1702 +       /* do the block deallocation which was deferred
1703 +          until commit is done */
1704 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1705 +
1706 +       assert("zam-504", get_current_super_private() != NULL);
1707 +       sa_post_commit_hook();
1708 +}
1709 +
1710 +void reiser4_post_write_back_hook(void)
1711 +{
1712 +       assert("zam-504", get_current_super_private() != NULL);
1713 +
1714 +       sa_post_commit_hook();
1715 +}
1716 +
1717 +/*
1718 +   Local variables:
1719 +   c-indentation-style: "K&R"
1720 +   mode-name: "LC"
1721 +   c-basic-offset: 8
1722 +   tab-width: 8
1723 +   fill-column: 120
1724 +   scroll-step: 1
1725 +   End:
1726 +*/
1727 diff -urN linux-2.6.35.orig/fs/reiser4/block_alloc.h linux-2.6.35/fs/reiser4/block_alloc.h
1728 --- linux-2.6.35.orig/fs/reiser4/block_alloc.h  1970-01-01 01:00:00.000000000 +0100
1729 +++ linux-2.6.35/fs/reiser4/block_alloc.h       2010-08-04 15:44:57.000000000 +0200
1730 @@ -0,0 +1,177 @@
1731 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1732 +
1733 +#if !defined(__FS_REISER4_BLOCK_ALLOC_H__)
1734 +#define __FS_REISER4_BLOCK_ALLOC_H__
1735 +
1736 +#include "dformat.h"
1737 +#include "forward.h"
1738 +
1739 +#include <linux/types.h>       /* for __u??  */
1740 +#include <linux/fs.h>
1741 +
1742 +/* Mask which, when applied to a given block number, shows whether that block
1743 +   number is a fake one */
1744 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
1745 +/* Mask which isolates a type of object this fake block number was assigned
1746 +   to */
1747 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1748 +
1749 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1750 +   against these two values to tell whether the object is unallocated or is a
1751 +   bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1752 +#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
1753 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
1754 +
1755 +/* specification how block allocation was counted in sb block counters */
1756 +typedef enum {
1757 +       BLOCK_NOT_COUNTED = 0,  /* reiser4 has no info about this block yet */
1758 +       BLOCK_GRABBED = 1,      /* free space grabbed for further allocation
1759 +                                  of this block */
1760 +       BLOCK_FLUSH_RESERVED = 2,       /* block is reserved for flush needs. */
1761 +       BLOCK_UNALLOCATED = 3,  /* block is used for existing in-memory object
1762 +                                  ( unallocated formatted or unformatted
1763 +                                  node) */
1764 +       BLOCK_ALLOCATED = 4     /* block is mapped to disk, real on-disk block
1765 +                                  number assigned */
1766 +} block_stage_t;
1767 +
1768 +/* a hint for block allocator */
1769 +struct reiser4_blocknr_hint {
1770 +       /* FIXME: I think we want to add a longterm lock on the bitmap block
1771 +          here. This is to prevent jnode_flush() calls from interleaving
1772 +          allocations on the same bitmap, once a hint is established. */
1773 +
1774 +       /* search start hint */
1775 +       reiser4_block_nr blk;
1776 +       /* if not zero, it is a region size we search for free blocks in */
1777 +       reiser4_block_nr max_dist;
1778 +       /* level for allocation, may be useful to have branch-level and higher
1779 +          write-optimized. */
1780 +       tree_level level;
1781 +       /* block allocator assumes that blocks, which will be mapped to disk,
1782 +          are in this specified block_stage */
1783 +       block_stage_t block_stage;
1784 +       /* If backward = 1, allocate blocks in backward direction from the end
1785 +        * of disk to the beginning of disk.  */
1786 +       unsigned int backward:1;
1787 +
1788 +};
1789 +
1790 +/* These flags control block allocation/deallocation behavior */
1791 +enum reiser4_ba_flags {
1792 +       /* do allocations from reserved (5%) area */
1793 +       BA_RESERVED = (1 << 0),
1794 +
1795 +       /* block allocator can do commit trying to recover free space */
1796 +       BA_CAN_COMMIT = (1 << 1),
1797 +
1798 +       /* if operation will be applied to formatted block */
1799 +       BA_FORMATTED = (1 << 2),
1800 +
1801 +       /* defer actual block freeing until transaction commit */
1802 +       BA_DEFER = (1 << 3),
1803 +
1804 +       /* allocate blocks for permanent fs objects (formatted or unformatted),
1805 +           not wandered or log blocks */
1806 +       BA_PERMANENT = (1 << 4),
1807 +
1808 +       /* grab space even if it was disabled */
1809 +       BA_FORCE = (1 << 5),
1810 +
1811 +       /* use default start value for free blocks search. */
1812 +       BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1813 +};
1814 +
1815 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1816 +
1817 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1818 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1819 +extern void update_blocknr_hint_default(const struct super_block *,
1820 +                                       const reiser4_block_nr *);
1821 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1822 +
1823 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1824 +
1825 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1826 +reiser4_block_nr fake_blocknr_unformatted(int);
1827 +
1828 +/* free -> grabbed -> fake_allocated -> used */
1829 +
1830 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1831 +void all_grabbed2free(void);
1832 +void grabbed2free(reiser4_context * , reiser4_super_info_data * , __u64 count);
1833 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1834 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1835 +void grabbed2flush_reserved(__u64 count);
1836 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1837 +                        reiser4_block_nr * start,
1838 +                        reiser4_block_nr * len, reiser4_ba_flags_t flags);
1839 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1840 +                          const reiser4_block_nr *,
1841 +                          block_stage_t, reiser4_ba_flags_t flags);
1842 +
1843 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1844 +                                     reiser4_block_nr * start,
1845 +                                     reiser4_ba_flags_t flags)
1846 +{
1847 +       reiser4_block_nr one = 1;
1848 +       return reiser4_alloc_blocks(hint, start, &one, flags);
1849 +}
1850 +
1851 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1852 +                                       block_stage_t stage,
1853 +                                       reiser4_ba_flags_t flags)
1854 +{
1855 +       const reiser4_block_nr one = 1;
1856 +       return reiser4_dealloc_blocks(block, &one, stage, flags);
1857 +}
1858 +
1859 +#define reiser4_grab_space_force(count, flags) /* grab with BA_FORCE set */ \
1860 +       reiser4_grab_space(count, (flags) | BA_FORCE)
1861 +
1862 +extern void grabbed2free_mark(__u64 mark);
1863 +extern int reiser4_grab_reserved(struct super_block *,
1864 +                                __u64, reiser4_ba_flags_t);
1865 +extern void reiser4_release_reserved(struct super_block *super);
1866 +
1867 +/* grabbed -> fake_allocated */
1868 +
1869 +/* fake_allocated -> used */
1870 +
1871 +/* used -> fake_allocated -> grabbed -> free */
1872 +
1873 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1874 +
1875 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1876 +
1877 +extern void grabbed2cluster_reserved(int count);
1878 +extern void cluster_reserved2grabbed(int count);
1879 +extern void cluster_reserved2free(int count);
1880 +
1881 +extern int reiser4_check_block_counters(const struct super_block *);
1882 +
1883 +#if REISER4_DEBUG
1884 +
1885 +extern void reiser4_check_block(const reiser4_block_nr *, int);
1886 +
1887 +#else
1888 +
1889 +#  define reiser4_check_block(beg, val)        noop
1890 +
1891 +#endif
1892 +
1893 +extern int reiser4_pre_commit_hook(void);
1894 +extern void reiser4_post_commit_hook(void);
1895 +extern void reiser4_post_write_back_hook(void);
1896 +
1897 +#endif                         /* __FS_REISER4_BLOCK_ALLOC_H__ */
1898 +
1899 +/* Make Linus happy.
1900 +   Local variables:
1901 +   c-indentation-style: "K&R"
1902 +   mode-name: "LC"
1903 +   c-basic-offset: 8
1904 +   tab-width: 8
1905 +   fill-column: 120
1906 +   End:
1907 +*/
1908 diff -urN linux-2.6.35.orig/fs/reiser4/blocknrset.c linux-2.6.35/fs/reiser4/blocknrset.c
1909 --- linux-2.6.35.orig/fs/reiser4/blocknrset.c   1970-01-01 01:00:00.000000000 +0100
1910 +++ linux-2.6.35/fs/reiser4/blocknrset.c        2010-08-04 15:44:57.000000000 +0200
1911 @@ -0,0 +1,371 @@
1912 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
1913 +reiser4/README */
1914 +
1915 +/* This file contains code for various block number sets used by the atom to
1916 +   track the deleted set and wandered block mappings. */
1917 +
1918 +#include "debug.h"
1919 +#include "dformat.h"
1920 +#include "txnmgr.h"
1921 +#include "context.h"
1922 +
1923 +#include <linux/slab.h>
1924 +
1925 +/* The proposed data structure for storing unordered block number sets is a
1926 +   list of elements, each of which contains an array of block numbers and/or
1927 +   an array of block number pairs. That element, called blocknr_set_entry,
1928 +   stores single block numbers from the beginning and pairs (extents) from the
1929 +   end of its ->entries array. The ->nr_singles and ->nr_pairs fields count
1930 +   the stored single blocks and pairs.
1931 +
1932 +   +----------------- blocknr_set_entry->entries -----------------+
1933 +   |block1|block2| ...     <free space>     ... |pair3|pair2|pair1|
1934 +   +--------------------------------------------------------------+
1935 +
1936 +   When current blocknr_set_entry is full, allocate a new one. */
1937 +
1938 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1939 + * set (single blocks and block extents), in that case blocknr pair represent an
1940 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1941 + * there represent a (real block) -> (wandered block) mapping. */
1942 +
1943 +/* Protection: blocknr sets belong to reiser4 atom, and
1944 + * their modifications are performed with the atom lock held */
1945 +
1946 +/* The total size of a blocknr_set_entry. */
1947 +#define BLOCKNR_SET_ENTRY_SIZE 128
1948 +
1949 +/* The number of blocks that can fit the blocknr data area. */
1950 +#define BLOCKNR_SET_ENTRIES_NUMBER             \
1951 +       ((BLOCKNR_SET_ENTRY_SIZE -              \
1952 +       2 * sizeof(unsigned) -                  \
1953 +       sizeof(struct list_head)) /             \
1954 +       sizeof(reiser4_block_nr))
1955 +
1956 +/* An entry of the blocknr_set */
1957 +struct blocknr_set_entry {
1958 +       unsigned nr_singles;
1959 +       unsigned nr_pairs;
1960 +       struct list_head link;
1961 +       reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1962 +};
1963 +
1964 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
1965 +struct blocknr_pair {
1966 +       reiser4_block_nr a;
1967 +       reiser4_block_nr b;
1968 +};
1969 +
1970 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
1971 +/* Audited by: green(2002.06.11) */
1972 +static unsigned bse_avail(blocknr_set_entry * bse)
1973 +{
1974 +       unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1975 +
1976 +       assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1977 +       cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1978 +
1979 +       return BLOCKNR_SET_ENTRIES_NUMBER - used;
1980 +}
1981 +
1982 +/* Initialize a blocknr_set_entry. */
1983 +static void bse_init(blocknr_set_entry *bse)
1984 +{
1985 +       bse->nr_singles = 0;
1986 +       bse->nr_pairs = 0;
1987 +       INIT_LIST_HEAD(&bse->link);
1988 +}
1989 +
1990 +/* Allocate and initialize a blocknr_set_entry. */
1991 +/* Audited by: green(2002.06.11) */
1992 +static blocknr_set_entry *bse_alloc(void)
1993 +{
1994 +       blocknr_set_entry *e;
1995 +
1996 +       if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1997 +                                          reiser4_ctx_gfp_mask_get())) == NULL)
1998 +               return NULL;
1999 +
2000 +       bse_init(e);
2001 +
2002 +       return e;
2003 +}
2004 +
2005 +/* Free a blocknr_set_entry. */
2006 +/* Audited by: green(2002.06.11) */
2007 +static void bse_free(blocknr_set_entry * bse)
2008 +{
2009 +       kfree(bse);
2010 +}
2011 +
2012 +/* Add a block number to a blocknr_set_entry */
2013 +/* Audited by: green(2002.06.11) */
2014 +static void
2015 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2016 +{
2017 +       assert("jmacd-5099", bse_avail(bse) >= 1);
2018 +
2019 +       bse->entries[bse->nr_singles++] = *block;
2020 +}
2021 +
2022 +/* Get a pair of block numbers */
2023 +/* Audited by: green(2002.06.11) */
2024 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
2025 +                                               unsigned pno)
2026 +{
2027 +       assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2028 +
2029 +       return (struct blocknr_pair *) (bse->entries +
2030 +                                       BLOCKNR_SET_ENTRIES_NUMBER -
2031 +                                       2 * (pno + 1));
2032 +}
2033 +
2034 +/* Add a pair of block numbers to a blocknr_set_entry */
2035 +/* Audited by: green(2002.06.11) */
2036 +static void
2037 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2038 +            const reiser4_block_nr * b)
2039 +{
2040 +       struct blocknr_pair *pair;
2041 +
2042 +       assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2043 +
2044 +       pair = bse_get_pair(bse, bse->nr_pairs++);
2045 +
2046 +       pair->a = *a;
2047 +       pair->b = *b;
2048 +}
2049 +
2050 +/* Add either a block or pair of blocks to the block number set.  The first
2051 +   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
2052 +   @b is non-NULL a pair is added.  The block number set belongs to atom, and
2053 +   the call is made with the atom lock held.  There may not be enough space in
2054 +   the current blocknr_set_entry.  If new_bsep points to a non-NULL
2055 +   blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2056 +   will be set to NULL.  If new_bsep contains NULL then the atom lock will be
2057 +   released and a new bse will be allocated in new_bsep.  E_REPEAT will be
2058 +   returned with the atom unlocked for the operation to be tried again.  If
2059 +   the operation succeeds, 0 is returned.  If new_bsep is non-NULL and not
2060 +   used during the call, it will be freed automatically. */
2061 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2062 +                          blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2063 +                          const reiser4_block_nr *b)
2064 +{
2065 +       blocknr_set_entry *bse;
2066 +       unsigned entries_needed;
2067 +
2068 +       assert("jmacd-5101", a != NULL);
2069 +
2070 +       entries_needed = (b == NULL) ? 1 : 2;
2071 +       if (list_empty(bset) ||
2072 +           bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2073 +               /* See if a bse was previously allocated. */
2074 +               if (*new_bsep == NULL) {
2075 +                       spin_unlock_atom(atom);
2076 +                       *new_bsep = bse_alloc();
2077 +                       return (*new_bsep != NULL) ? -E_REPEAT :
2078 +                               RETERR(-ENOMEM);
2079 +               }
2080 +
2081 +               /* Put it on the head of the list. */
2082 +               list_add(&((*new_bsep)->link), bset);
2083 +
2084 +               *new_bsep = NULL;
2085 +       }
2086 +
2087 +       /* Add the single or pair. */
2088 +       bse = list_entry(bset->next, blocknr_set_entry, link);
2089 +       if (b == NULL) {
2090 +               bse_put_single(bse, a);
2091 +       } else {
2092 +               bse_put_pair(bse, a, b);
2093 +       }
2094 +
2095 +       /* If new_bsep is non-NULL then there was an allocation race, free this
2096 +          copy. */
2097 +       if (*new_bsep != NULL) {
2098 +               bse_free(*new_bsep);
2099 +               *new_bsep = NULL;
2100 +       }
2101 +
2102 +       return 0;
2103 +}
2104 +
2105 +/* Add an extent to the block set.  If the length is 1, it is treated as a
2106 +   single block (e.g., reiser4_set_add_block). */
2107 +/* Audited by: green(2002.06.11) */
2108 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2109 +   kmalloc might schedule. The only exception is atom spinlock, which is
2110 +   properly freed. */
2111 +int
2112 +blocknr_set_add_extent(txn_atom * atom,
2113 +                      struct list_head *bset,
2114 +                      blocknr_set_entry ** new_bsep,
2115 +                      const reiser4_block_nr * start,
2116 +                      const reiser4_block_nr * len)
2117 +{
2118 +       assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2119 +       return blocknr_set_add(atom, bset, new_bsep, start,
2120 +                              *len == 1 ? NULL : len);
2121 +}
2122 +
2123 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2124 + * by an assertion that both arguments are not null.*/
2125 +/* Audited by: green(2002.06.11) */
2126 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2127 +   kmalloc might schedule. The only exception is atom spinlock, which is
2128 +   properly freed. */
2129 +int
2130 +blocknr_set_add_pair(txn_atom * atom,
2131 +                    struct list_head *bset,
2132 +                    blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2133 +                    const reiser4_block_nr * b)
2134 +{
2135 +       assert("jmacd-5103", a != NULL && b != NULL);
2136 +       return blocknr_set_add(atom, bset, new_bsep, a, b);
2137 +}
2138 +
2139 +/* Initialize a blocknr_set. */
2140 +void blocknr_set_init(struct list_head *bset)
2141 +{
2142 +       INIT_LIST_HEAD(bset);
2143 +}
2144 +
2145 +/* Release the entries of a blocknr_set. */
2146 +void blocknr_set_destroy(struct list_head *bset)
2147 +{
2148 +       blocknr_set_entry *bse;
2149 +
2150 +       while (!list_empty(bset)) {
2151 +               bse = list_entry(bset->next, blocknr_set_entry, link);
2152 +               list_del_init(&bse->link);
2153 +               bse_free(bse);
2154 +       }
2155 +}
2156 +
2157 +/* Merge blocknr_set entries out of @from into @into. */
2158 +/* Audited by: green(2002.06.11) */
2159 +/* Auditor comments: This merge does not know if merged sets contain
2160 +   blocks pairs (As for wandered sets) or extents, so it cannot really merge
2161 +   overlapping ranges if there is some. So I believe it may lead to
2162 +   some blocks being presented several times in one blocknr_set. To help
2163 +   debugging such problems it might help to check for duplicate entries on
2164 +   actual processing of this set. Testing this kind of stuff right here is
2165 +   also complicated by the fact that these sets are not sorted and going
2166 +   through whole set on each element addition is going to be CPU-heavy task */
2167 +void blocknr_set_merge(struct list_head *from, struct list_head *into)
2168 +{
2169 +       blocknr_set_entry *bse_into = NULL;
2170 +
2171 +       /* If @from is empty, no work to perform. */
2172 +       if (list_empty(from))
2173 +               return;
2174 +       /* If @into is not empty, try merging partial-entries. */
2175 +       if (!list_empty(into)) {
2176 +
2177 +               /* Neither set is empty, pop the front to members and try to
2178 +                  combine them. */
2179 +               blocknr_set_entry *bse_from;
2180 +               unsigned into_avail;
2181 +
2182 +               bse_into = list_entry(into->next, blocknr_set_entry, link);
2183 +               list_del_init(&bse_into->link);
2184 +               bse_from = list_entry(from->next, blocknr_set_entry, link);
2185 +               list_del_init(&bse_from->link);
2186 +
2187 +               /* Combine singles. */
2188 +               for (into_avail = bse_avail(bse_into);
2189 +                    into_avail != 0 && bse_from->nr_singles != 0;
2190 +                    into_avail -= 1) {
2191 +                       bse_put_single(bse_into,
2192 +                                      &bse_from->entries[--bse_from->
2193 +                                                         nr_singles]);
2194 +               }
2195 +
2196 +               /* Combine pairs. */
2197 +               for (; into_avail > 1 && bse_from->nr_pairs != 0;
2198 +                    into_avail -= 2) {
2199 +                       struct blocknr_pair *pair =
2200 +                               bse_get_pair(bse_from, --bse_from->nr_pairs);
2201 +                       bse_put_pair(bse_into, &pair->a, &pair->b);
2202 +               }
2203 +
2204 +               /* If bse_from is empty, delete it now. */
2205 +               if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2206 +                       bse_free(bse_from);
2207 +               } else {
2208 +                       /* Otherwise, bse_into is full or nearly full (e.g.,
2209 +                          it could have one slot avail and bse_from has one
2210 +                          pair left).  Push it back onto the list.  bse_from
2211 +                          becomes bse_into, which will be the new partial. */
2212 +                       list_add(&bse_into->link, into);
2213 +                       bse_into = bse_from;
2214 +               }
2215 +       }
2216 +
2217 +       /* Splice lists together. */
2218 +       list_splice_init(from, into->prev);
2219 +
2220 +       /* Add the partial entry back to the head of the list. */
2221 +       if (bse_into != NULL)
2222 +               list_add(&bse_into->link, into);
2223 +}
2224 +
2225 +/* Iterate over all blocknr set elements. */
2226 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2227 +                        blocknr_set_actor_f actor, void *data, int delete)
2228 +{
2229 +
2230 +       blocknr_set_entry *entry;
2231 +
2232 +       assert("zam-429", atom != NULL);
2233 +       assert("zam-430", atom_is_protected(atom));
2234 +       assert("zam-431", bset != 0);
2235 +       assert("zam-432", actor != NULL);
2236 +
2237 +       entry = list_entry(bset->next, blocknr_set_entry, link);
2238 +       while (bset != &entry->link) {
2239 +               blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2240 +               unsigned int i;
2241 +               int ret;
2242 +
2243 +               for (i = 0; i < entry->nr_singles; i++) {
2244 +                       ret = actor(atom, &entry->entries[i], NULL, data);
2245 +
2246 +                       /* We can't break a loop if delete flag is set. */
2247 +                       if (ret != 0 && !delete)
2248 +                               return ret;
2249 +               }
2250 +
2251 +               for (i = 0; i < entry->nr_pairs; i++) {
2252 +                       struct blocknr_pair *ab;
2253 +
2254 +                       ab = bse_get_pair(entry, i);
2255 +
2256 +                       ret = actor(atom, &ab->a, &ab->b, data);
2257 +
2258 +                       if (ret != 0 && !delete)
2259 +                               return ret;
2260 +               }
2261 +
2262 +               if (delete) {
2263 +                       list_del(&entry->link);
2264 +                       bse_free(entry);
2265 +               }
2266 +
2267 +               entry = tmp;
2268 +       }
2269 +
2270 +       return 0;
2271 +}
2272 +
2273 +/*
2274 + * Local variables:
2275 + * c-indentation-style: "K&R"
2276 + * mode-name: "LC"
2277 + * c-basic-offset: 8
2278 + * tab-width: 8
2279 + * fill-column: 79
2280 + * scroll-step: 1
2281 + * End:
2282 + */
2283 diff -urN linux-2.6.35.orig/fs/reiser4/carry.c linux-2.6.35/fs/reiser4/carry.c
2284 --- linux-2.6.35.orig/fs/reiser4/carry.c        1970-01-01 01:00:00.000000000 +0100
2285 +++ linux-2.6.35/fs/reiser4/carry.c     2010-08-04 15:44:57.000000000 +0200
2286 @@ -0,0 +1,1398 @@
2287 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
2288 +   reiser4/README */
2289 +/* Functions to "carry" tree modification(s) upward. */
2290 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2291 +   set of changes that need to be propagated to the next level.  We manage
2292 +   node locking such that any searches that collide with carrying are
2293 +   restarted, from the root if necessary.
2294 +
2295 +   Insertion of a new item may result in items being moved among nodes and
2296 +   this requires the delimiting key to be updated at the least common parent
2297 +   of the nodes modified to preserve search tree invariants. Also, insertion
2298 +   may require allocation of a new node. A pointer to the new node has to be
2299 +   inserted into some node on the parent level, etc.
2300 +
2301 +   Tree carrying is meant to be analogous to arithmetic carrying.
2302 +
2303 +   A carry operation is always associated with some node (&carry_node).
2304 +
2305 +   Carry process starts with some initial set of operations to be performed
2306 +   and an initial set of already locked nodes.  Operations are performed one
2307 +   by one. Performing each single operation has following possible effects:
2308 +
2309 +    - content of carry node associated with operation is modified
2310 +    - new carry nodes are locked and involved into carry process on this level
2311 +    - new carry operations are posted to the next level
2312 +
2313 +   After all carry operations on this level are done, process is repeated for
2314 +   the accumulated sequence on carry operations for the next level. This
2315 +   starts by trying to lock (in left to right order) all carry nodes
2316 +   associated with carry operations on the parent level. After this, we decide
2317 +   whether more nodes are required on the left of already locked set. If so,
2318 +   all locks taken on the parent level are released, new carry nodes are
2319 +   added, and locking process repeats.
2320 +
2321 +   It may happen that balancing process fails owing to unrecoverable error on
2322 +   some of upper levels of a tree (possible causes are io error, failure to
2323 +   allocate new node, etc.). In this case we should unmount the filesystem,
2324 +   rebooting if it is the root, and possibly advise the use of fsck.
2325 +
2326 +   USAGE:
2327 +
2328 +    int some_tree_operation( znode *node, ... )
2329 +    {
2330 +       // Allocate on a stack pool of carry objects: operations and nodes.
2331 +       // Most carry processes will only take objects from here, without
2332 +       // dynamic allocation.
2333 +
2334 +I feel uneasy about this pool.  It adds to code complexity, I understand why it
2335 +exists, but.... -Hans
2336 +
2337 +       carry_pool  pool;
2338 +       carry_level lowest_level;
2339 +       carry_op   *op;
2340 +
2341 +       init_carry_pool( &pool );
2342 +       init_carry_level( &lowest_level, &pool );
2343 +
2344 +       // operation may be one of:
2345 +       //   COP_INSERT    --- insert new item into node
2346 +       //   COP_CUT       --- remove part of or whole node
2347 +       //   COP_PASTE     --- increase size of item
2348 +       //   COP_DELETE    --- delete pointer from parent node
2349 +       //   COP_UPDATE    --- update delimiting key in least
2350 +       //                     common ancestor of two
2351 +
2352 +       op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2353 +       if( IS_ERR( op ) || ( op == NULL ) ) {
2354 +               handle error
2355 +       } else {
2356 +       // fill in remaining fields in @op, according to carry.h:carry_op
2357 +               result = reiser4_carry(&lowest_level, NULL);
2358 +       }
2359 +       done_carry_pool(&pool);
2360 +    }
2361 +
2362 +   When you are implementing node plugin method that participates in carry
2363 +   (shifting, insertion, deletion, etc.), do the following:
2364 +
2365 +   int foo_node_method(znode * node, ..., carry_level * todo)
2366 +   {
2367 +       carry_op   *op;
2368 +
2369 +       ....
2370 +
2371 +       // note, that last argument to node_post_carry() is non-zero
2372 +       // here, because @op is to be applied to the parent of @node, rather
2373 +       // than to the @node itself as in the previous case.
2374 +
2375 +       op = node_post_carry(todo, operation, node, 1);
2376 +       // fill in remaining fields in @op, according to carry.h:carry_op
2377 +
2378 +       ....
2379 +
2380 +   }
2381 +
2382 +   BATCHING:
2383 +
2384 +   One of the main advantages of level-by-level balancing implemented here is
2385 +   ability to batch updates on a parent level and to perform them more
2386 +   efficiently as a result.
2387 +
2388 +   Description To Be Done (TBD).
2389 +
2390 +   DIFFICULTIES AND SUBTLE POINTS:
2391 +
2392 +   1. complex plumbing is required, because:
2393 +
2394 +       a. effective allocation through pools is needed
2395 +
2396 +       b. target of operation is not exactly known when operation is
2397 +       posted. This is worked around through bitfields in &carry_node and
2398 +       logic in lock_carry_node()
2399 +
2400 +       c. of interaction with locking code: node should be added into sibling
2401 +       list when pointer to it is inserted into its parent, which is some time
2402 +       after node was created. Between these moments, node is somewhat in
2403 +       suspended state and is only registered in the carry lists
2404 +
2405 +    2. whole balancing logic is implemented here, in particular, insertion
2406 +    logic is coded in make_space().
2407 +
2408 +    3. special cases like insertion (reiser4_add_tree_root()) or deletion
2409 +    (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2410 +    (insert_paste()) have to be handled.
2411 +
2412 +    4. there is non-trivial interdependency between allocation of new nodes
2413 +    and almost everything else. This is mainly due to the (1.c) above. I shall
2414 +    write about this later.
2415 +
2416 +*/
2417 +
2418 +#include "forward.h"
2419 +#include "debug.h"
2420 +#include "key.h"
2421 +#include "coord.h"
2422 +#include "plugin/item/item.h"
2423 +#include "plugin/item/extent.h"
2424 +#include "plugin/node/node.h"
2425 +#include "jnode.h"
2426 +#include "znode.h"
2427 +#include "tree_mod.h"
2428 +#include "tree_walk.h"
2429 +#include "block_alloc.h"
2430 +#include "pool.h"
2431 +#include "tree.h"
2432 +#include "carry.h"
2433 +#include "carry_ops.h"
2434 +#include "super.h"
2435 +#include "reiser4.h"
2436 +
2437 +#include <linux/types.h>
2438 +
2439 +/* level locking/unlocking (file-local helpers) */
2440 +static int lock_carry_level(carry_level * level);
2441 +static void unlock_carry_level(carry_level * level, int failure);
2442 +static void done_carry_level(carry_level * level);
2443 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2444 +
2445 +int lock_carry_node(carry_level * level, carry_node * node);
2446 +int lock_carry_node_tail(carry_node * node);
2447 +
2448 +/* carry processing proper */
2449 +static int carry_on_level(carry_level * doing, carry_level * todo);
2450 +
2451 +static carry_op *add_op(carry_level * level, pool_ordering order,
2452 +                       carry_op * reference);
2453 +
2454 +/* handlers for carry operations. */
2455 +
2456 +static void fatal_carry_error(carry_level * doing, int ecode);
2457 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2458 +
2459 +static void print_level(const char *prefix, carry_level * level);
2460 +
2461 +#if REISER4_DEBUG
2462 +typedef enum {	/* queue kind checked by carry_level_invariant(); debug only */
2463 +       CARRY_TODO,	/* passed by lock_carry_level() */
2464 +       CARRY_DOING	/* passed by carry_on_level() */
2465 +} carry_queue_state;
2466 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2467 +#endif
2468 +
2469 +/* main entry point for tree balancing.
2470 +
2471 +   Tree carry performs operations from @doing and while doing so accumulates
2472 +   information about operations to be performed on the next level ("carried"
2473 +   to the parent level). Carried operations are performed, causing possibly
2474 +   more operations to be carried upward etc. reiser4_carry() takes care of
2475 +   locking and pinning znodes while operating on them.
2476 +
2477 +   For usage, see comment at the top of fs/reiser4/carry.c
2478 +
2479 +*/
2480 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2481 +                                      * performed */ ,
2482 +                 carry_level * done  /* must be NULL on entry (see the
2483 +                                      * BUG_ON below); reused as the queue
2484 +                                      * of the previous level */)
2485 +{
2486 +       int result = 0;
2487 +       /* queue of new requests */
2488 +       carry_level *todo;
2489 +       ON_DEBUG(STORE_COUNTERS);
2490 +
2491 +       assert("nikita-888", doing != NULL);
2492 +       BUG_ON(done != NULL);
2493 +
2494 +       todo = doing + 1;	/* @doing must be followed by two spare carry_level slots */
2495 +       init_carry_level(todo, doing->pool);
2496 +
2497 +       /* queue of requests performed on the previous level */
2498 +       done = todo + 1;
2499 +       init_carry_level(done, doing->pool);
2500 +
2501 +       /* iterate until there is nothing more to do */
2502 +       while (result == 0 && doing->ops_num > 0) {
2503 +               carry_level *tmp;
2504 +
2505 +               /* at this point @done is locked. */
2506 +               /* repeat lock/do/unlock while
2507 +
2508 +                  (1) lock_carry_level() fails due to deadlock avoidance, or
2509 +
2510 +                  (2) carry_on_level() decides that more nodes have to
2511 +                  be involved.
2512 +
2513 +                  (3) some unexpected error occurred while balancing on the
2514 +                  upper levels. In this case all changes are rolled back.
2515 +
2516 +                */
2517 +               while (1) {
2518 +                       result = lock_carry_level(doing);
2519 +                       if (result == 0) {
2520 +                               /* perform operations from @doing and
2521 +                                  accumulate new requests in @todo */
2522 +                               result = carry_on_level(doing, todo);
2523 +                               if (result == 0)
2524 +                                       break;
2525 +                               else if (result != -E_REPEAT ||
2526 +                                        !doing->restartable) {
2527 +                                       warning("nikita-1043",
2528 +                                               "Fatal error during carry: %i",
2529 +                                               result);
2530 +                                       print_level("done", done);
2531 +                                       print_level("doing", doing);
2532 +                                       print_level("todo", todo);
2533 +                                       /* do some rough stuff like aborting
2534 +                                          all pending transcrashes and thus
2535 +                                          pushing tree back to the consistent
2536 +                                          state. Alternatively, just panic.
2537 +                                        */
2538 +                                       fatal_carry_error(doing, result);
2539 +                                       return result;
2540 +                               }
2541 +                       } else if (result != -E_REPEAT) {
2542 +                               fatal_carry_error(doing, result);
2543 +                               return result;
2544 +                       }
2545 +                       unlock_carry_level(doing, 1);	/* only reached on -E_REPEAT: drop locks and retry */
2546 +               }
2547 +               /* at this point @done can be safely unlocked */
2548 +               done_carry_level(done);
2549 +
2550 +               /* cyclically shift queues */
2551 +               tmp = done;
2552 +               done = doing;
2553 +               doing = todo;
2554 +               todo = tmp;
2555 +               init_carry_level(todo, doing->pool);
2556 +
2557 +               /* give other threads chance to run */
2558 +               reiser4_preempt_point();
2559 +       }
2560 +       done_carry_level(done);
2561 +
2562 +       /* all counters, but x_refs should remain the same. x_refs can change
2563 +          owing to transaction manager */
2564 +       ON_DEBUG(CHECK_COUNTERS);
2565 +       return result;
2566 +}
2567 +
2568 +/* perform carry operations on given level.
2569 +
2570 +   Optimizations proposed by pooh:
2571 +
2572 +   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2573 +   required;
2574 +
2575 +   (2) unlock node if there are no more operations to be performed upon it and
2576 +   node didn't add any operation to @todo. This can be implemented by
2577 +   attaching to each node two counters: counter of operations working on this
2578 +   node and counter of operations carried upward from this node.
2579 +
2580 +*/
2581 +static int carry_on_level(carry_level * doing  /* queue of carry operations to
2582 +                                                * do on this level */ ,
2583 +                         carry_level * todo    /* queue where new carry
2584 +                                                * operations to be performed on
2585 +                                                * the parent level are
2586 +                                                * accumulated during @doing
2587 +                                                * processing. */ )
2588 +{
2589 +       int result;
2590 +       int (*f) (carry_op *, carry_level *, carry_level *);
2591 +       carry_op *op;
2592 +       carry_op *tmp_op;
2593 +
2594 +       assert("nikita-1034", doing != NULL);
2595 +       assert("nikita-1035", todo != NULL);
2596 +
2597 +       /* @doing->nodes are locked. */
2598 +
2599 +       /* This function can be split into two phases: analysis and modification
2600 +
2601 +          Analysis calculates precisely what items should be moved between
2602 +          nodes. This information is gathered in some structures attached to
2603 +          each carry_node in a @doing queue. Analysis also determines whether
2604 +          new nodes are to be allocated etc.
2605 +
2606 +          After analysis is completed, actual modification is performed. Here
2607 +          we can take advantage of "batch modification": if there are several
2608 +          operations acting on the same node, modifications can be performed
2609 +          more efficiently when batched together.
2610 +
2611 +          Above is an optimization left for the future.
2612 +        */
2613 +       /* Important, but delayed optimization: it's possible to batch
2614 +          operations together and perform them more efficiently as a
2615 +          result. For example, deletion of several neighboring items from a
2616 +          node can be converted to a single ->cut() operation.
2617 +
2618 +          Before processing queue, it should be scanned and "mergeable"
2619 +          operations merged.
2620 +        */
2621 +       result = 0;
2622 +       for_all_ops(doing, op, tmp_op) {
2623 +               carry_opcode opcode;
2624 +
2625 +               assert("nikita-1041", op != NULL);
2626 +               opcode = op->op;	/* NOTE(review): @opcode is not read again below */
2627 +               assert("nikita-1042", op->op < COP_LAST_OP);
2628 +               f = op_dispatch_table[op->op].handler;	/* handler selected by opcode */
2629 +               result = f(op, doing, todo);
2630 +               /* locking can fail with -E_REPEAT. Any different error is fatal
2631 +                  and will be handled by fatal_carry_error() sledgehammer.
2632 +                */
2633 +               if (result != 0)
2634 +                       break;
2635 +       }
2636 +       if (result == 0) {	/* every operation succeeded: look for empty nodes */
2637 +               carry_plugin_info info;
2638 +               carry_node *scan;
2639 +               carry_node *tmp_scan;
2640 +
2641 +               info.doing = doing;
2642 +               info.todo = todo;
2643 +
2644 +               assert("nikita-3002",
2645 +                      carry_level_invariant(doing, CARRY_DOING));
2646 +               for_all_nodes(doing, scan, tmp_scan) {
2647 +                       znode *node;
2648 +
2649 +                       node = reiser4_carry_real(scan);
2650 +                       assert("nikita-2547", node != NULL);
2651 +                       if (node_is_empty(node)) {
2652 +                               result =
2653 +                                   node_plugin_by_node(node)->
2654 +                                   prepare_removal(node, &info);
2655 +                               if (result != 0)
2656 +                                       break;
2657 +                       }
2658 +               }
2659 +       }
2660 +       return result;
2661 +}
2662 +
2663 +/* post carry operation
2664 +
2665 +   This is main function used by external carry clients: node layout plugins
2666 +   and tree operations to create new carry operation to be performed on some
2667 +   level.
2668 +
2669 +   New operation will be included in the @level queue. To actually perform it,
2670 +   call reiser4_carry( level, ... ). @node is asserted to be write locked on
2671 +   entry. Carry manages all its locks by itself, don't worry about this.
2672 +
2673 +   This function adds operation and node at the end of the queue. It is up to
2674 +   caller to guarantee proper ordering of node queue.
2675 +
2676 +*/
2677 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2678 +                                                  * is to be posted at */ ,
2679 +                             carry_opcode op /* opcode of operation */ ,
2680 +                             znode * node      /* node on which this operation
2681 +                                                * will operate */ ,
2682 +                             int apply_to_parent_p /* whether operation will
2683 +                                                    * operate directly on @node
2684 +                                                    * or on its parent. */)
2685 +{
2686 +       carry_op *result;
2687 +       carry_node *child;
2688 +
2689 +       assert("nikita-1046", level != NULL);
2690 +       assert("nikita-1788", znode_is_write_locked(node));
2691 +
2692 +       result = add_op(level, POOLO_LAST, NULL);
2693 +       if (IS_ERR(result))
2694 +               return result;
2695 +       child = reiser4_add_carry(level, POOLO_LAST, NULL);
2696 +       if (IS_ERR(child)) {	/* NOTE(review): level->ops_num, bumped in add_op(), is not adjusted here */
2697 +               reiser4_pool_free(&level->pool->op_pool, &result->header);
2698 +               return (carry_op *) child;	/* error pointer passed through as carry_op * */
2699 +       }
2700 +       result->node = child;
2701 +       result->op = op;
2702 +       child->parent = apply_to_parent_p;
2703 +       if (ZF_ISSET(node, JNODE_ORPHAN))
2704 +               child->left_before = 1;
2705 +       child->node = node;
2706 +       return result;
2707 +}
2708 +
2709 +/* initialize carry queue: zero @level, attach @pool, empty both lists */
2710 +void init_carry_level(carry_level * level /* level to initialize */ ,
2711 +                     carry_pool * pool /* pool @level will allocate objects
2712 +                                        * from */ )
2713 +{
2714 +       assert("nikita-1045", level != NULL);
2715 +       assert("nikita-967", pool != NULL);
2716 +
2717 +       memset(level, 0, sizeof *level);	/* also resets ops_num and nodes_num */
2718 +       level->pool = pool;
2719 +
2720 +       INIT_LIST_HEAD(&level->nodes);
2721 +       INIT_LIST_HEAD(&level->ops);
2722 +}
2723 +
2724 +/* allocate @size bytes for a carry pool and initialize its op and node pools */
2725 +carry_pool *init_carry_pool(int size)
2726 +{
2727 +       carry_pool *pool;
2728 +
2729 +       assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2730 +       pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2731 +       if (pool == NULL)
2732 +               return ERR_PTR(RETERR(-ENOMEM));	/* failure is an error pointer, not NULL */
2733 +
2734 +       reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2735 +                         (char *)pool->op);
2736 +       reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2737 +                         NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2738 +       return pool;
2739 +}
2740 +
2741 +/* finish with queue pools and free @pool itself */
2742 +void done_carry_pool(carry_pool * pool/* pool to destroy */)
2743 +{
2744 +       reiser4_done_pool(&pool->op_pool);
2745 +       reiser4_done_pool(&pool->node_pool);
2746 +       kfree(pool);	/* @pool must not be used after this call */
2747 +}
2748 +
2749 +/* add new carry node to the @level.
2750 +
2751 +   Returns pointer to the new carry node allocated from pool.  It's up to
2752 +   callers to maintain proper order in the @level. Assumption is that if carry
2753 +   nodes on one level are already sorted and modifications are performed from
2754 +   left to right, carry nodes added on the parent level will be ordered
2755 +   automatically. To control ordering use @order and @reference parameters.
2756 +
2757 +*/
2758 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2759 +                                                        * node to */ ,
2760 +                                  pool_ordering order  /* where to insert:
2761 +                                                        * at the beginning of
2762 +                                                        * @level,
2763 +                                                        * before @reference,
2764 +                                                        * after @reference,
2765 +                                                        * at the end of @level
2766 +                                                        */ ,
2767 +                                  carry_node * reference/* reference node for
2768 +                                                         * insertion */)
2769 +{
2770 +       ON_DEBUG(carry_node * orig_ref = reference);
2771 +
2772 +       if (order == POOLO_BEFORE) {
2773 +               reference = find_left_carry(reference, level);
2774 +               if (reference == NULL)	/* nothing found: use first node of @level */
2775 +                       reference = list_entry(level->nodes.next, carry_node,
2776 +                                              header.level_linkage);
2777 +               else	/* step to the list entry following the one found */
2778 +                       reference = list_entry(reference->header.level_linkage.next,
2779 +                                              carry_node, header.level_linkage);
2780 +       } else if (order == POOLO_AFTER) {
2781 +               reference = find_right_carry(reference, level);
2782 +               if (reference == NULL)	/* nothing found: use last node of @level */
2783 +                       reference = list_entry(level->nodes.prev, carry_node,
2784 +                                              header.level_linkage);
2785 +               else	/* step to the list entry preceding the one found */
2786 +                       reference = list_entry(reference->header.level_linkage.prev,
2787 +                                              carry_node, header.level_linkage);
2788 +       }
2789 +       assert("nikita-2209",
2790 +              ergo(orig_ref != NULL,
2791 +                   reiser4_carry_real(reference) ==
2792 +                   reiser4_carry_real(orig_ref)));
2793 +       return reiser4_add_carry(level, order, reference);
2794 +}
2795 +
2796 +carry_node *reiser4_add_carry(carry_level * level,   /* carry_level to add
2797 +                                                       node to */
2798 +                             pool_ordering order,   /* where to insert:
2799 +                                                     * at the beginning of
2800 +                                                     * @level;
2801 +                                                     * before @reference;
2802 +                                                     * after @reference;
2803 +                                                     * at the end of @level
2804 +                                                     */
2805 +                             carry_node * reference /* reference node for
2806 +                                                     * insertion */)
2807 +{
2808 +       carry_node *result;
2809 +
2810 +       result =
2811 +           (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2812 +                                          &level->nodes,
2813 +                                          order, &reference->header);	/* NOTE(review): @reference is NULL when called from reiser4_post_carry() */
2814 +       if (!IS_ERR(result) && (result != NULL))
2815 +               ++level->nodes_num;	/* count only successfully added nodes */
2816 +       return result;
2817 +}
2818 +
2819 +/**
2820 + * add new carry operation to the @level.
2821 + *
2822 + * Returns pointer to the new carry operation allocated from pool. It's up to
2823 + * callers to maintain proper order in the @level. To control ordering use
2824 + * @order and @reference parameters.
2825 + */
2826 +static carry_op *add_op(carry_level * level, /* &carry_level to add op to */
2827 +                       pool_ordering order, /* where to insert:
2828 +                                             * at the beginning of @level;
2829 +                                             * before @reference;
2830 +                                             * after @reference;
2831 +                                             * at the end of @level */
2832 +                       carry_op * reference /* reference op for insertion */)
2833 +{
2834 +       carry_op *result;
2835 +
2836 +       result =
2837 +           (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2838 +                                        order, &reference->header);	/* NOTE(review): callers in this file pass NULL @reference */
2839 +       if (!IS_ERR(result) && (result != NULL))
2840 +               ++level->ops_num;	/* count only successfully added operations */
2841 +       return result;
2842 +}
2843 +
2844 +/**
2845 + * Return node on the right of which @node was created.
2846 + *
2847 + * Each node is created on the right of some existing node (or it is new root,
2848 + * which is special case not handled here).
2849 + *
2850 + * @node is new node created on some level, but not yet inserted into its
2851 + * parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2852 + */
2853 +static carry_node *find_begetting_brother(carry_node * node,/* node to start
2854 +                                                               search from */
2855 +                                         carry_level * kin UNUSED_ARG
2856 +                                                           /* level to scan */)
2857 +{
2858 +       carry_node *scan;
2859 +
2860 +       assert("nikita-1614", node != NULL);
2861 +       assert("nikita-1615", kin != NULL);
2862 +       assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2863 +       assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2864 +                                  ZF_ISSET(reiser4_carry_real(node),
2865 +                                           JNODE_ORPHAN)));
2866 +       for (scan = node;;	/* walk the level list backward via ->prev */
2867 +            scan = list_entry(scan->header.level_linkage.prev, carry_node,
2868 +                              header.level_linkage)) {
2869 +               assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2870 +               if ((scan->node != node->node) &&	/* a different znode ... */
2871 +                   !ZF_ISSET(scan->node, JNODE_ORPHAN)) {	/* ... that is not an orphan */
2872 +                       assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2873 +                       break;
2874 +               }
2875 +       }
2876 +       return scan;
2877 +}
2878 +
2879 +static cmp_t	/* position of @n1 relative to @n2 in @level's node list */
2880 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2881 +{
2882 +       assert("nikita-2199", n1 != NULL);
2883 +       assert("nikita-2200", n2 != NULL);
2884 +
2885 +       if (n1 == n2)
2886 +               return EQUAL_TO;
2887 +       while (1) {	/* scan forward from @n1 looking for @n2 */
2888 +               n1 = carry_node_next(n1);
2889 +               if (carry_node_end(level, n1))
2890 +                       return GREATER_THAN;	/* @n2 was not found after @n1 */
2891 +               if (n1 == n2)
2892 +                       return LESS_THAN;	/* @n2 follows @n1 */
2893 +       }
2894 +       impossible("nikita-2201", "End of level reached");	/* not reached: loop exits only via return */
2895 +}
2896 +
2897 +carry_node *find_carry_node(carry_level * level, const znode * node)
2898 +{
2899 +       carry_node *scan;
2900 +       carry_node *tmp_scan;
2901 +
2902 +       assert("nikita-2202", level != NULL);
2903 +       assert("nikita-2203", node != NULL);
2904 +
2905 +       for_all_nodes(level, scan, tmp_scan) {
2906 +               if (reiser4_carry_real(scan) == node)
2907 +                       return scan;	/* first carry node whose lock handle holds @node */
2908 +       }
2909 +       return NULL;	/* no carry node in @level refers to @node */
2910 +}
2911 +
2912 +znode *reiser4_carry_real(const carry_node * node)
2913 +{
2914 +       assert("nikita-3061", node != NULL);
2915 +
2916 +       return node->lock_handle.node;	/* znode recorded in this carry node's lock handle */
2917 +}
2918 +
2919 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2920 +                             const znode * node)
2921 +{
2922 +       carry_node *base;
2923 +       carry_node *scan;
2924 +       carry_node *tmp_scan;
2925 +       carry_node *proj;
2926 +
2927 +       base = find_carry_node(doing, node);	/* carry node for @node in @doing */
2928 +       assert("nikita-2204", base != NULL);
2929 +
2930 +       for_all_nodes(todo, scan, tmp_scan) {
2931 +               proj = find_carry_node(doing, scan->node);	/* @scan's counterpart in @doing */
2932 +               assert("nikita-2205", proj != NULL);
2933 +               if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2934 +                       break;	/* first @todo node not ordered before @base */
2935 +       }
2936 +       return scan;	/* NOTE(review): without a break, the value is whatever for_all_nodes() leaves in @scan */
2937 +}
2938 +
2939 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2940 +                                    znode * node)
2941 +{
2942 +       carry_node *reference;
2943 +
2944 +       assert("nikita-2994", doing != NULL);
2945 +       assert("nikita-2995", todo != NULL);
2946 +       assert("nikita-2996", node != NULL);
2947 +
2948 +       reference = insert_carry_node(doing, todo, node);	/* insertion point in @todo */
2949 +       assert("nikita-2997", reference != NULL);
2950 +
2951 +       return reiser4_add_carry(todo, POOLO_BEFORE, reference);	/* new node goes before @reference */
2952 +}
2953 +
2954 +/* like reiser4_post_carry(), but designed to be called from node plugin
2955 +   methods. This function is different from reiser4_post_carry() in that it
2956 +   finds proper place to insert node in the queue. */
2957 +carry_op *node_post_carry(carry_plugin_info * info     /* carry parameters
2958 +                                                        * passed down to node
2959 +                                                        * plugin */ ,
2960 +                         carry_opcode op /* opcode of operation */ ,
2961 +                         znode * node  /* node on which this
2962 +                                        * operation will operate */ ,
2963 +                         int apply_to_parent_p /* whether operation will
2964 +                                                * operate directly on @node
2965 +                                                * or on its parent. */ )
2966 +{
2967 +       carry_op *result;
2968 +       carry_node *child;
2969 +
2970 +       assert("nikita-2207", info != NULL);
2971 +       assert("nikita-2208", info->todo != NULL);
2972 +
2973 +       if (info->doing == NULL)	/* no @doing level to order against: append at the end */
2974 +               return reiser4_post_carry(info->todo, op, node,
2975 +                                         apply_to_parent_p);
2976 +
2977 +       result = add_op(info->todo, POOLO_LAST, NULL);
2978 +       if (IS_ERR(result))
2979 +               return result;
2980 +       child = add_carry_atplace(info->doing, info->todo, node);
2981 +       if (IS_ERR(child)) {	/* NOTE(review): todo->ops_num, bumped in add_op(), is not adjusted here */
2982 +               reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2983 +               return (carry_op *) child;	/* error pointer passed through as carry_op * */
2984 +       }
2985 +       result->node = child;
2986 +       result->op = op;
2987 +       child->parent = apply_to_parent_p;
2988 +       if (ZF_ISSET(node, JNODE_ORPHAN))
2989 +               child->left_before = 1;
2990 +       child->node = node;
2991 +       return result;
2992 +}
2993 +
2994 +/* lock all carry nodes in @level; stop at the first lock_carry_node() failure */
2995 +static int lock_carry_level(carry_level * level/* level to lock */)
2996 +{
2997 +       int result;
2998 +       carry_node *node;
2999 +       carry_node *tmp_node;
3000 +
3001 +       assert("nikita-881", level != NULL);
3002 +       assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3003 +
3004 +       /* lock nodes from left to right */
3005 +       result = 0;
3006 +       for_all_nodes(level, node, tmp_node) {
3007 +               result = lock_carry_node(level, node);
3008 +               if (result != 0)
3009 +                       break;	/* nodes locked so far are not released here */
3010 +       }
3011 +       return result;	/* 0, or the first non-zero lock_carry_node() result */
3012 +}
3013 +
3014 +/* Synchronize delimiting keys between @node and its left neighbor.
3015 +
3016 +   To reduce contention on dk key and simplify carry code, we synchronize
3017 +   delimiting keys only when carry ultimately leaves tree level (carrying
3018 +   changes upward) and unlocks nodes at this level.
3019 +
3020 +   This function first finds left neighbor of @node and then updates left
3021 +   neighbor's right delimiting key to coincide with least key in @node.
3022 +
3023 +*/
3024 +
3025 +ON_DEBUG(extern atomic_t delim_key_version;
3026 +    )
3027 +
3028 +static void sync_dkeys(znode * spot/* node to update */)
3029 +{
3030 +       reiser4_key pivot;
3031 +       reiser4_tree *tree;
3032 +
3033 +       assert("nikita-1610", spot != NULL);
3034 +       assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3035 +
3036 +       tree = znode_get_tree(spot);
3037 +       read_lock_tree(tree);
3038 +       write_lock_dk(tree);
3039 +
3040 +       assert("nikita-2192", znode_is_loaded(spot));
3041 +
3042 +       /* sync left delimiting key of @spot with key in its leftmost item */
3043 +       if (node_is_empty(spot))
3044 +               pivot = *znode_get_rd_key(spot);	/* empty node: use its right delimiting key */
3045 +       else
3046 +               leftmost_key_in_node(spot, &pivot);
3047 +
3048 +       znode_set_ld_key(spot, &pivot);
3049 +
3050 +       /* there can be sequence of empty nodes pending removal on the left of
3051 +          @spot. Scan them and update their left and right delimiting keys to
3052 +          match left delimiting key of @spot. Also, update right delimiting
3053 +          key of first non-empty left neighbor.
3054 +        */
3055 +       while (1) {
3056 +               if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3057 +                       break;
3058 +
3059 +               spot = spot->left;
3060 +               if (spot == NULL)
3061 +                       break;
3062 +
3063 +               znode_set_rd_key(spot, &pivot);
3064 +               /* don't sink into the domain of another balancing */
3065 +               if (!znode_is_write_locked(spot))
3066 +                       break;
3067 +               if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3068 +                       znode_set_ld_key(spot, &pivot);	/* set both keys to @pivot, keep scanning left */
3069 +               else
3070 +                       break;
3071 +       }
3072 +
3073 +       write_unlock_dk(tree);	/* released in reverse order of acquisition */
3074 +       read_unlock_tree(tree);
3075 +}
3076 +
3077 +/* sync delimiting keys (unless @failure) and unlock all carry nodes in @level */
3078 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3079 +                              int failure      /* true if unlocking owing to
3080 +                                                * failure */ )
3081 +{
3082 +       carry_node *node;
3083 +       carry_node *tmp_node;
3084 +
3085 +       assert("nikita-889", level != NULL);
3086 +
3087 +       if (!failure) {
3088 +               znode *spot;
3089 +
3090 +               spot = NULL;
3091 +               /* update delimiting keys, once per run of carry nodes sharing a znode */
3092 +               for_all_nodes(level, node, tmp_node) {
3093 +                       if (reiser4_carry_real(node) != spot) {
3094 +                               spot = reiser4_carry_real(node);
3095 +                               sync_dkeys(spot);
3096 +                       }
3097 +               }
3098 +       }
3099 +
3100 +       /* nodes can be unlocked in arbitrary order.  In preemptible
3101 +          environment it's better to unlock in reverse order of locking,
3102 +          though.
3103 +        */
3104 +       for_all_nodes_back(level, node, tmp_node) {
3105 +               /* all allocated nodes should be already linked to their
3106 +                  parents at this moment. */
3107 +               assert("nikita-1631",
3108 +                      ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3109 +                                               JNODE_ORPHAN)));
3110 +               ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3111 +               unlock_carry_node(level, node, failure);
3112 +       }
3113 +       level->new_root = NULL; /* reset on both the success and the failure path */
3114 +}
3115 +
3116 +/* finish with @level
3117 +
3118 +   Unlock nodes (non-failure path) and free all carry nodes and ops of @level */
3119 +static void done_carry_level(carry_level * level/* level to finish */)
3120 +{
3121 +       carry_node *node;
3122 +       carry_node *tmp_node;
3123 +       carry_op *op;
3124 +       carry_op *tmp_op;
3125 +
3126 +       assert("nikita-1076", level != NULL);
3127 +
3128 +       unlock_carry_level(level, 0);   /* 0: not a failure, so delimiting keys are synced first */
3129 +       for_all_nodes(level, node, tmp_node) {
3130 +               assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3131 +               assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3132 +               reiser4_pool_free(&level->pool->node_pool, &node->header);
3133 +       }
3134 +       for_all_ops(level, op, tmp_op)
3135 +           reiser4_pool_free(&level->pool->op_pool, &op->header);
3136 +}
3137 +
3138 +/* helper function to complete locking of carry node
3139 +
3140 +   Finish locking of carry node. There are several ways in which new carry
3141 +   node can be added into carry level and locked. Normal is through
3142 +   lock_carry_node(), but also from find_{left|right}_neighbor(). This
3143 +   function factors out common final part of all locking scenarios. It
3144 +   supposes that @node -> lock_handle is lock handle for lock just taken,
3145 +   marks @node for unlocking and loads it. Returns result of zload().
3146 +
3147 +*/
3148 +int lock_carry_node_tail(carry_node * node/* node to complete locking of */)
3149 +{
3150 +       assert("nikita-1052", node != NULL);
3151 +       assert("nikita-1187", reiser4_carry_real(node) != NULL);
3152 +       assert("nikita-1188", !node->unlock);
3153 +
3154 +       node->unlock = 1;       /* unlock_carry_node() releases the lock only when this is set */
3155 +       /* Load node content into memory and install node plugin by
3156 +          looking at the node header.
3157 +
3158 +          Most of the time this call is cheap because the node is
3159 +          already in memory.
3160 +
3161 +          Corresponding zrelse() is in unlock_carry_node()
3162 +        */
3163 +       return zload(reiser4_carry_real(node));
3164 +}
3165 +
3166 +/* lock carry node
3167 +
3168 +   "Resolve" node to real znode, lock it and mark as locked.
3169 +   This requires recursive locking of znodes.
3170 +
3171 +   When operation is posted to the parent level, node it will be applied to is
3172 +   not yet known. For example, when shifting data between two nodes,
3173 +   delimiting keys have to be updated in parent or parents of nodes involved. But
3174 +   their parents are not yet locked and, moreover, said nodes can be reparented
3175 +   by concurrent balancing.
3176 +
3177 +   To work around this, carry operation is applied to special "carry node"
3178 +   rather than to the znode itself. Carry node consists of some "base" or
3179 +   "reference" znode and flags indicating how to get to the target of carry
3180 +   operation (->real_node field of carry_node) from base.
3181 +   Returns 0 on success, otherwise the error reported by the failing step.
3182 +*/
3183 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3184 +                   carry_node * node/* node to lock */)
3185 +{
3186 +       int result;
3187 +       znode *reference_point;
3188 +       lock_handle lh;
3189 +       lock_handle tmp_lh;
3190 +       reiser4_tree *tree;
3191 +
3192 +       assert("nikita-887", level != NULL);
3193 +       assert("nikita-882", node != NULL);
3194 +
3195 +       result = 0;
3196 +       reference_point = node->node;
3197 +       init_lh(&lh);
3198 +       init_lh(&tmp_lh);
3199 +       if (node->left_before) {
3200 +               /* handling of new nodes, allocated on the previous level:
3201 +
3202 +                  some carry ops were probably posted from the new node, but
3203 +                  this node neither has parent pointer set, nor is
3204 +                  connected. This will be done in ->create_hook() for
3205 +                  internal item.
3206 +
3207 +                  Nonetheless, parent of new node has to be locked. To do
3208 +                  this, first go to the "left" in the carry order. This
3209 +                  depends on the decision to always allocate new node on the
3210 +                  right of existing one.
3211 +
3212 +                  Loop handles case when multiple nodes, all orphans, were
3213 +                  inserted.
3214 +
3215 +                  Strictly speaking, taking tree lock is not necessary here,
3216 +                  because all nodes scanned by loop in
3217 +                  find_begetting_brother() are write-locked by this thread,
3218 +                  and thus, their sibling linkage cannot change.
3219 +
3220 +                */
3221 +               tree = znode_get_tree(reference_point);
3222 +               read_lock_tree(tree);
3223 +               reference_point = find_begetting_brother(node, level)->node;
3224 +               read_unlock_tree(tree);
3225 +               assert("nikita-1186", reference_point != NULL);
3226 +       }
3227 +       if (node->parent && (result == 0)) {
3228 +               result =
3229 +                   reiser4_get_parent(&tmp_lh, reference_point,
3230 +                                      ZNODE_WRITE_LOCK);
3231 +               if (result != 0) {
3232 +                       ;       /* nothing */
3233 +               } else if (znode_get_level(tmp_lh.node) == 0) {
3234 +                       assert("nikita-1347", znode_above_root(tmp_lh.node));
3235 +                       result = add_new_root(level, node, tmp_lh.node);
3236 +                       if (result == 0) {
3237 +                               reference_point = level->new_root;
3238 +                               move_lh(&lh, &node->lock_handle);
3239 +                       }
3240 +               } else if ((level->new_root != NULL)
3241 +                          && (level->new_root !=
3242 +                              znode_parent_nolock(reference_point))) {
3243 +                       /* parent of node exists, but this level already
3244 +                          created different new root, so */
3245 +                       warning("nikita-1109",
3246 +                               /* it should be "radicis", but tradition is
3247 +                                  tradition.  do banshees read latin? */
3248 +                               "hodie natus est radici frater");
3249 +                       result = -EIO;
3250 +               } else {
3251 +                       move_lh(&lh, &tmp_lh);
3252 +                       reference_point = lh.node;
3253 +               }
3254 +       }
3255 +       if (node->left && (result == 0)) {
3256 +               assert("nikita-1183", node->parent);
3257 +               assert("nikita-883", reference_point != NULL);
3258 +               result =
3259 +                   reiser4_get_left_neighbor(&tmp_lh, reference_point,
3260 +                                             ZNODE_WRITE_LOCK,
3261 +                                             GN_CAN_USE_UPPER_LEVELS);
3262 +               if (result == 0) {
3263 +                       done_lh(&lh);
3264 +                       move_lh(&lh, &tmp_lh);
3265 +                       reference_point = lh.node;
3266 +               }
3267 +       }
3268 +       if (!node->parent && !node->left && !node->left_before) {       /* no indirection flags: lock the base node itself */
3269 +               result =
3270 +                   longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3271 +                                       ZNODE_LOCK_HIPRI);
3272 +       }
3273 +       if (result == 0) {
3274 +               move_lh(&node->lock_handle, &lh);       /* hand the obtained lock over to the carry node */
3275 +               result = lock_carry_node_tail(node);
3276 +       }
3277 +       done_lh(&tmp_lh);
3278 +       done_lh(&lh);
3279 +       return result;
3280 +}
3281 +
3282 +/* release a lock on &carry_node.
3283 +
3284 +   Release if necessary lock on @node. This operation is pair of
3285 +   lock_carry_node() and is idempotent: you can call it more than once on the
3286 +   same node.
3287 +
3288 +*/
3289 +static void
3290 +unlock_carry_node(carry_level * level,
3291 +                 carry_node * node /* node to be released */ ,
3292 +                 int failure   /* non-zero if node is unlocked due
3293 +                                * to some error */ )
3294 +{
3295 +       znode *real_node;
3296 +
3297 +       assert("nikita-884", node != NULL);
3298 +
3299 +       real_node = reiser4_carry_real(node);
3300 +       /* pair to zload() in lock_carry_node_tail(). NOTE(review): runs before the NULL checks below -- confirm */
3301 +       zrelse(real_node);
3302 +       if (node->unlock && (real_node != NULL)) {
3303 +               assert("nikita-899", real_node == node->lock_handle.node);
3304 +               longterm_unlock_znode(&node->lock_handle);
3305 +       }
3306 +       if (failure) {
3307 +               if (node->deallocate && (real_node != NULL)) {
3308 +                       /* free node in bitmap
3309 +
3310 +                          Prepare node for removal. Last zput() will finish
3311 +                          with it.
3312 +                        */
3313 +                       ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3314 +               }
3315 +               if (node->free) {       /* carry node itself goes back to the pool */
3316 +                       assert("nikita-2177",
3317 +                              list_empty_careful(&node->lock_handle.locks_link));
3318 +                       assert("nikita-2112",
3319 +                              list_empty_careful(&node->lock_handle.owners_link));
3320 +                       reiser4_pool_free(&level->pool->node_pool,
3321 +                                         &node->header);
3322 +               }
3323 +       }
3324 +}
3325 +
3326 +/* fatal_carry_error() - all-catching error handling function
3327 +
3328 +   It is possible that carry faces unrecoverable error, like inability to
3329 +   insert pointer at the internal level. Our simple solution is just panic in
3330 +   this situation. More sophisticated things like attempt to remount
3331 +   file-system as read-only can be implemented without much difficulty.
3332 +
3333 +   It is believed, that:
3334 +
3335 +   1. instead of panicking, all current transactions can be aborted, rolling
3336 +   system back to the consistent state.
3337 +
3338 +Umm, if you simply panic without doing anything more at all, then all current
3339 +transactions are aborted and the system is rolled back to a consistent state,
3340 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3341 +precise.  If an internal node is corrupted on disk due to hardware failure,
3342 +then there may be no consistent state that can be rolled back to, so instead
3343 +we should say that it will rollback the transactions, which barring other
3344 +factors means rolling back to a consistent state.
3345 +
3346 +# Nikita: there is a subtle difference between panic and aborting
3347 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3348 +# not using reiser4 (not that we care about such processes), or using other
3349 +# reiser4 mounts (about them we do care) will simply continue to run. With
3350 +# some luck, even application using aborted file system can survive: it will
3351 +# get some error, like EBADF, from each file descriptor on failed file system,
3352 +# but applications that do care about tolerance will cope with this (squid
3353 +# will).
3354 +
3355 +It would be a nice feature though to support rollback without rebooting
3356 +followed by remount, but this can wait for later versions.
3357 +
3358 +   2. once isolated transactions will be implemented it will be possible to
3359 +   roll back offending transaction.
3360 +
3361 +2. is additional code complexity of inconsistent value (it implies that a
3362 +broken tree should be kept in operation), so we must think about it more
3363 +before deciding if it should be done.  -Hans
3364 +
3365 +*/
3366 +static void fatal_carry_error(carry_level * doing UNUSED_ARG   /* carry level
3367 +                                                                * where
3368 +                                                                * unrecoverable
3369 +                                                                * error
3370 +                                                                * occurred */ ,
3371 +                             int ecode/* error code, asserted negative */)
3372 +{
3373 +       assert("nikita-1230", doing != NULL);   /* @doing is referenced only by this assertion */
3374 +       assert("nikita-1231", ecode < 0);
3375 +
3376 +       reiser4_panic("nikita-1232", "Carry failed: %i", ecode);        /* the only action taken here */
3377 +}
3378 +
3379 +/**
3380 + * Add new root to the tree
3381 + *
3382 + * This function itself only manages changes in carry structures and delegates
3383 + * all hard work (allocation of znode for new root, changes of parent and
3384 + * sibling pointers) to the reiser4_add_tree_root().
3385 + *
3386 + * Locking: old tree root is locked by carry at this point. Fake znode is also
3387 + * locked. Returns 0 on success, or the error from root creation or locking.
3388 + */
3389 +static int add_new_root(carry_level * level,/* carry level in context of which
3390 +                                            * operation is performed */
3391 +                       carry_node * node,  /* carry node for existing root */
3392 +                       znode * fake        /* "fake" znode already locked by
3393 +                                            * us */)
3394 +{
3395 +       int result;
3396 +
3397 +       assert("nikita-1104", level != NULL);
3398 +       assert("nikita-1105", node != NULL);
3399 +
3400 +       assert("nikita-1403", znode_is_write_locked(node->node));
3401 +       assert("nikita-1404", znode_is_write_locked(fake));
3402 +
3403 +       /* trying to create new root. */
3404 +       /* @node is root and it's already locked by us. This
3405 +          means that nobody else can be trying to add/remove
3406 +          tree root right now.
3407 +        */
3408 +       if (level->new_root == NULL)    /* a root already recorded on this level is reused */
3409 +               level->new_root = reiser4_add_tree_root(node->node, fake);
3410 +       if (!IS_ERR(level->new_root)) {
3411 +               assert("nikita-1210", znode_is_root(level->new_root));
3412 +               node->deallocate = 1;
3413 +               result =
3414 +                   longterm_lock_znode(&node->lock_handle, level->new_root,
3415 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3416 +               if (result == 0)
3417 +                       zput(level->new_root);  /* NOTE(review): presumably drops the reference from reiser4_add_tree_root() -- confirm */
3418 +       } else {
3419 +               result = PTR_ERR(level->new_root);
3420 +               level->new_root = NULL; /* do not leave an error pointer in the level */
3421 +       }
3422 +       return result;
3423 +}
3424 +
3425 +/* allocate new znode and add the operation that inserts the
3426 +   pointer to it into the parent node into the todo level
3427 +
3428 +   Allocate new znode, add it into carry queue and post into @todo queue
3429 +   request to add pointer to new node into its parent.
3430 +
3431 +   This is carry related routine that calls reiser4_new_node() to allocate new
3432 +   node. Returns the new carry node, or an error pointer on failure.
3433 +*/
3434 +carry_node *add_new_znode(znode * brother      /* existing left neighbor of new
3435 +                                                * node */ ,
3436 +                         carry_node * ref      /* carry node after which new
3437 +                                                * carry node is to be inserted
3438 +                                                * into queue. This affects
3439 +                                                * locking. */ ,
3440 +                         carry_level * doing   /* carry queue where new node is
3441 +                                                * to be added */ ,
3442 +                         carry_level * todo    /* carry queue where COP_INSERT
3443 +                                                * operation to add pointer to
3444 +                                                * new node will be added */ )
3445 +{
3446 +       carry_node *fresh;
3447 +       znode *new_znode;
3448 +       carry_op *add_pointer;
3449 +       carry_plugin_info info;
3450 +
3451 +       assert("nikita-1048", brother != NULL);
3452 +       assert("nikita-1049", todo != NULL);
3453 +
3454 +       /* There is a lot of possible variations here: to what parent
3455 +          new node will be attached and where. For simplicity, always
3456 +          do the following:
3457 +
3458 +          (1) new node and @brother will have the same parent.
3459 +
3460 +          (2) new node is added on the right of @brother
3461 +
3462 +        */
3463 +
3464 +       fresh = reiser4_add_carry_skip(doing,
3465 +                                      ref ? POOLO_AFTER : POOLO_LAST, ref);
3466 +       if (IS_ERR(fresh))
3467 +               return fresh;
3468 +
3469 +       fresh->deallocate = 1;
3470 +       fresh->free = 1;
3471 +
3472 +       new_znode = reiser4_new_node(brother, znode_get_level(brother));
3473 +       if (IS_ERR(new_znode))
3474 +               /* @fresh will be deallocated automatically by error
3475 +                  handling code in the caller. */
3476 +               return (carry_node *) new_znode;
3477 +
3478 +       /* new_znode returned znode with x_count 1. Caller has to decrease
3479 +          it. make_space() does. */
3480 +
3481 +       ZF_SET(new_znode, JNODE_ORPHAN);
3482 +       fresh->node = new_znode;
3483 +
3484 +       while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {       /* NOTE(review): @ref is NULL-tested above but used unconditionally here -- confirm */
3485 +               ref = carry_node_prev(ref);
3486 +               assert("nikita-1606", !carry_node_end(doing, ref));
3487 +       }
3488 +
3489 +       info.todo = todo;
3490 +       info.doing = doing;
3491 +       add_pointer = node_post_carry(&info, COP_INSERT,
3492 +                                     reiser4_carry_real(ref), 1);
3493 +       if (IS_ERR(add_pointer)) {
3494 +               /* no need to deallocate @new_znode here: it will be
3495 +                  deallocated during carry error handling. */
3496 +               return (carry_node *) add_pointer;
3497 +       }
3498 +
3499 +       add_pointer->u.insert.type = COPT_CHILD;
3500 +       add_pointer->u.insert.child = fresh;
3501 +       add_pointer->u.insert.brother = brother;
3502 +       /* initially new node spans empty key range */
3503 +       write_lock_dk(znode_get_tree(brother));
3504 +       znode_set_ld_key(new_znode,
3505 +                        znode_set_rd_key(new_znode,
3506 +                                         znode_get_rd_key(brother)));
3507 +       write_unlock_dk(znode_get_tree(brother));
3508 +       return fresh;
3509 +}
3510 +
3511 +/* DEBUGGING FUNCTIONS.
3512 +
3513 +   Probably we also should leave them on even when
3514 +   debugging is turned off to print dumps at errors.
3515 +*/
3516 +#if REISER4_DEBUG
3517 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3518 +{
3519 +       carry_node *node;
3520 +       carry_node *tmp_node;
3521 +
3522 +       if (level == NULL)
3523 +               return 0;       /* 0 means the invariant does not hold, 1 that it does */
3524 +
3525 +       if (level->track_type != 0 &&
3526 +           level->track_type != CARRY_TRACK_NODE &&
3527 +           level->track_type != CARRY_TRACK_CHANGE)
3528 +               return 0;
3529 +
3530 +       /* check that nodes are in ascending order of their leftmost keys */
3531 +       for_all_nodes(level, node, tmp_node) {
3532 +               znode *left;
3533 +               znode *right;
3534 +
3535 +               reiser4_key lkey;
3536 +               reiser4_key rkey;
3537 +
3538 +               if (node != carry_node_front(level)) {
3539 +                       if (state == CARRY_TODO) {      /* CARRY_TODO: compare base znodes, otherwise the real ones */
3540 +                               right = node->node;
3541 +                               left = carry_node_prev(node)->node;
3542 +                       } else {
3543 +                               right = reiser4_carry_real(node);
3544 +                               left = reiser4_carry_real(carry_node_prev(node));
3545 +                       }
3546 +                       if (right == NULL || left == NULL)
3547 +                               continue;
3548 +                       if (node_is_empty(right) || node_is_empty(left))
3549 +                               continue;       /* pairs with a missing or empty node are not compared */
3550 +                       if (!keyle(leftmost_key_in_node(left, &lkey),
3551 +                                  leftmost_key_in_node(right, &rkey))) {
3552 +                               warning("", "wrong key order");
3553 +                               return 0;
3554 +                       }
3555 +               }
3556 +       }
3557 +       return 1;
3558 +}
3559 +#endif
3560 +
3561 +/* get symbolic name for boolean: "t" if @boolean is non-zero, "f" otherwise */
3562 +static const char *tf(int boolean/* truth value */)
3563 +{
3564 +       return boolean ? "t" : "f";
3565 +}
3566 +
3567 +/* symbolic name for carry opcode; unknown values are formatted into a static buffer */
3568 +static const char *carry_op_name(carry_opcode op/* carry opcode */)
3569 +{
3570 +       switch (op) {
3571 +       case COP_INSERT:
3572 +               return "COP_INSERT";
3573 +       case COP_DELETE:
3574 +               return "COP_DELETE";
3575 +       case COP_CUT:
3576 +               return "COP_CUT";
3577 +       case COP_PASTE:
3578 +               return "COP_PASTE";
3579 +       case COP_UPDATE:
3580 +               return "COP_UPDATE";
3581 +       case COP_EXTENT:
3582 +               return "COP_EXTENT";
3583 +       case COP_INSERT_FLOW:
3584 +               return "COP_INSERT_FLOW";
3585 +       default:{
3586 +                       /* not mt safe, but who cares? */
3587 +                       static char buf[24];    /* "unknown op: " (12) + 8 hex digits + NUL = 21 bytes */
3588 +
3589 +                       snprintf(buf, sizeof(buf), "unknown op: %x", op);       /* bounded: cannot overrun buf */
3590 +                       return buf;
3591 +               }
3592 +       }
3593 +}
3594 +
3595 +/* dump address and flags of carry node (->left_before is not printed) */
3596 +static void print_carry(const char *prefix /* prefix to print */ ,
3597 +                       carry_node * node/* node to print, may be NULL */)
3598 +{
3599 +       if (node == NULL) {
3600 +               printk("%s: null\n", prefix);
3601 +               return;
3602 +       }
3603 +       printk
3604 +           ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3605 +            prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3606 +            tf(node->free), tf(node->deallocate));
3607 +}
3608 +
3609 +/* dump carry operation: opcode, its carry node and opcode-specific arguments */
3610 +static void print_op(const char *prefix /* prefix to print */ ,
3611 +                    carry_op * op/* operation to print, may be NULL */)
3612 +{
3613 +       if (op == NULL) {
3614 +               printk("%s: null\n", prefix);
3615 +               return;
3616 +       }
3617 +       printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3618 +       print_carry("\tnode", op->node);
3619 +       switch (op->op) {
3620 +       case COP_INSERT:
3621 +       case COP_PASTE:
3622 +               print_coord("\tcoord",
3623 +                           op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3624 +               reiser4_print_key("\tkey",
3625 +                                 op->u.insert.d ? op->u.insert.d->key : NULL);
3626 +               print_carry("\tchild", op->u.insert.child);
3627 +               break;
3628 +       case COP_DELETE:
3629 +               print_carry("\tchild", op->u.delete.child);
3630 +               break;
3631 +       case COP_CUT:
3632 +               if (op->u.cut_or_kill.is_cut) { /* ->is_cut set: read the cut member of the union, else kill */
3633 +                       print_coord("\tfrom",
3634 +                                   op->u.cut_or_kill.u.cut->params.from, 0);
3635 +                       print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3636 +                                   0);
3637 +               } else {
3638 +                       print_coord("\tfrom",
3639 +                                   op->u.cut_or_kill.u.kill->params.from, 0);
3640 +                       print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3641 +                                   0);
3642 +               }
3643 +               break;
3644 +       case COP_UPDATE:
3645 +               print_carry("\tleft", op->u.update.left);
3646 +               break;
3647 +       default:
3648 +               /* do nothing */
3649 +               break;
3650 +       }
3651 +}
3652 +
3653 +/* dump @level: its ->restartable flag, then every carry node and carry op in it */
3654 +static void print_level(const char *prefix /* prefix to print */ ,
3655 +                       carry_level * level/* level to print, may be NULL */)
3656 +{
3657 +       carry_node *node;
3658 +       carry_node *tmp_node;
3659 +       carry_op *op;
3660 +       carry_op *tmp_op;
3661 +
3662 +       if (level == NULL) {
3663 +               printk("%s: null\n", prefix);
3664 +               return;
3665 +       }
3666 +       printk("%s: %p, restartable: %s\n",
3667 +              prefix, level, tf(level->restartable));
3668 +
3669 +       for_all_nodes(level, node, tmp_node)
3670 +           print_carry("\tcarry node", node);
3671 +       for_all_ops(level, op, tmp_op)
3672 +           print_op("\tcarry op", op);
3673 +}
3674 +
3675 +/* Make Linus happy.
3676 +   Local variables:
3677 +   c-indentation-style: "K&R"
3678 +   mode-name: "LC"
3679 +   c-basic-offset: 8
3680 +   tab-width: 8
3681 +   fill-column: 120
3682 +   scroll-step: 1
3683 +   End:
3684 +*/
3685 diff -urN linux-2.6.35.orig/fs/reiser4/carry.h linux-2.6.35/fs/reiser4/carry.h
3686 --- linux-2.6.35.orig/fs/reiser4/carry.h        1970-01-01 01:00:00.000000000 +0100
3687 +++ linux-2.6.35/fs/reiser4/carry.h     2010-08-04 15:44:57.000000000 +0200
3688 @@ -0,0 +1,445 @@
3689 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
3690 +   reiser4/README */
3691 +
3692 +/* Functions and data types to "carry" tree modification(s) upward.
3693 +   See fs/reiser4/carry.c for details. */
3694 +
3695 +#if !defined(__FS_REISER4_CARRY_H__)
3696 +#define __FS_REISER4_CARRY_H__
3697 +
3698 +#include "forward.h"
3699 +#include "debug.h"
3700 +#include "pool.h"
3701 +#include "znode.h"
3702 +
3703 +#include <linux/types.h>
3704 +
3705 +/* &carry_node - "location" of carry node.
3706 +
3707 +   "location" of node that is involved or going to be involved in
3708 +   carry process. Node where operation will be carried to on the
3709 +   parent level cannot be recorded explicitly. Operation will be carried
3710 +   usually to the parent of some node (where changes are performed at
3711 +   the current level) or, to the left neighbor of its parent. But while
3712 +   modifications are performed at the current level, parent may
3713 +   change. So, we have to allow some indirection (or, positively,
3714 +   flexibility) in locating carry nodes.
3715 +
3716 +*/
3717 +typedef struct carry_node {
3718 +       /* pool linkage */
3719 +       struct reiser4_pool_header header;
3720 +
3721 +       /* base node from which real_node is calculated. See
3722 +          fs/reiser4/carry.c:lock_carry_node(). */
3723 +       znode *node;
3724 +
3725 +       /* how to get ->real_node */
3726 +       /* to get ->real_node obtain parent of ->node */
3727 +       __u32 parent:1;
3728 +       /* to get ->real_node obtain left neighbor of parent of
3729 +          ->node */
3730 +       __u32 left:1;
3731 +       __u32 left_before:1;    /* lock_carry_node() then takes the base from find_begetting_brother() */
3732 +
3733 +       /* locking */
3734 +
3735 +       /* this node was locked by carry process and should be
3736 +          unlocked when carry leaves a level */
3737 +       __u32 unlock:1;
3738 +
3739 +       /* disk block for this node was allocated by carry process and
3740 +          should be deallocated when carry leaves a level */
3741 +       __u32 deallocate:1;
3742 +       /* this carry node was allocated by carry process and should be
3743 +          freed when carry leaves a level */
3744 +       __u32 free:1;
3745 +
3746 +       /* handle of the long-term lock taken on the real node */
3747 +       lock_handle lock_handle;
3748 +} carry_node;
3749 +
3750 +/* &carry_opcode - elementary operations that can be carried upward
3751 +
3752 +   Operations that carry() can handle. This list is supposed to be
3753 +   expanded.
3754 +
3755 +   Each carry operation (cop) is handled by appropriate function defined
3756 +   in fs/reiser4/carry.c. For example COP_INSERT is handled by
3757 +   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3758 +   call plugins of nodes affected by operation to modify nodes' content
3759 +   and to gather operations to be performed on the next level.
3760 +
3761 +*/
3762 +typedef enum {
3763 +       /* insert new item into node. */
3764 +       COP_INSERT,
3765 +       /* delete pointer from parent node */
3766 +       COP_DELETE,
3767 +       /* remove part of or whole node. */
3768 +       COP_CUT,
3769 +       /* increase size of item. */
3770 +       COP_PASTE,
3771 +       /* insert extent (that is sequence of unformatted nodes). */
3772 +       COP_EXTENT,
3773 +       /* update delimiting key in least common ancestor of two
3774 +          nodes. This is performed when items are moved between two
3775 +          nodes.
3776 +        */
3777 +       COP_UPDATE,
3778 +       /* insert flow */
3779 +       COP_INSERT_FLOW,
3780 +       COP_LAST_OP,    /* presumably a count/sentinel rather than a real operation -- confirm */
3781 +} carry_opcode;
3782 +
3783 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3784 +
3785 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3786 +   item is determined; stored in the ->u.insert.type field of a carry op. */
3787 +typedef enum {
3788 +       /* target item is one containing pointer to the ->child node */
3789 +       COPT_CHILD,
3790 +       /* target item is given explicitly by @coord */
3791 +       COPT_ITEM_DATA,
3792 +       /* target item is given by key */
3793 +       COPT_KEY,
3794 +       /* see insert_paste_common() for more comments on this. */
3795 +       COPT_PASTE_RESTARTED,
3796 +} cop_insert_pos_type;
3797 +
3798 +/* flags to cut and delete (bit mask values) */
3799 +typedef enum {
3800 +       /* don't kill node even if it became completely empty as a result of
3801 +        * cut. This is needed for eottl handling. See carry_extent() for
3802 +        * details. */
3803 +       DELETE_RETAIN_EMPTY = (1 << 0)
3804 +} cop_delete_flag;
3805 +
3806 +/*
3807 + * carry() implements "lock handle tracking" feature.
3808 + *
3809 + * Callers supply carry with node where to perform initial operation and lock
3810 + * handle on this node. Trying to optimize node utilization carry may actually
3811 + * move insertion point to different node. Callers expect that lock handle
3812 + * will be transferred to the new node also.
3813 + *
3814 + */
3815 +typedef enum {
3816 +       /* transfer lock handle along with insertion point */
3817 +       CARRY_TRACK_CHANGE = 1,
3818 +       /* acquire new lock handle to the node where insertion point is. This
3819 +        * is used when carry() client doesn't initially possess lock handle
3820 +        * on the insertion point node, for example, by extent insertion
3821 +        * code. See carry_extent(). */
3822 +       CARRY_TRACK_NODE = 2
3823 +} carry_track_type;
3824 +
3825 +/* data supplied to COP_{INSERT|PASTE} by callers */
3826 +typedef struct carry_insert_data {
3827 +       /* position where new item is to be inserted */
3828 +       coord_t *coord;
3829 +       /* new item description */
3830 +       reiser4_item_data * data;
3831 +       /* key of new item */
3832 +       const reiser4_key * key;
3833 +} carry_insert_data;
3834 +
3835 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the
3836 +   below structure of parameters */
3837 +struct cut_kill_params {
3838 +       /* coord where cut starts (inclusive) */
3839 +       coord_t *from;
3840 +       /* coord where cut stops (inclusive, this item/unit will also be
3841 +        * cut) */
3842 +       coord_t *to;
3843 +       /* starting key. This is necessary when item and unit pos don't
3844 +        * uniquely identify what portion of tree to remove. For example, this
3845 +        * indicates what portion of extent unit will be affected. */
3846 +       const reiser4_key * from_key;
3847 +       /* exclusive stop key */
3848 +       const reiser4_key * to_key;
3849 +       /* if this is not NULL, smallest actually removed key is stored
3850 +        * here. */
3851 +       reiser4_key *smallest_removed;
3852 +       /* kill_node_content()  is called for file truncate */
3853 +       int truncate;
3854 +};
3855 +
3856 +struct carry_cut_data {
3857 +       struct cut_kill_params params;
3858 +};
3859 +
3860 +struct carry_kill_data {
3861 +       struct cut_kill_params params;
3862 +       /* parameter to be passed to the ->kill_hook() method of item
3863 +        * plugin */
3864 +       /*void *iplug_params; *//* FIXME: unused currently */
3865 +       /* if not NULL---inode whose items are being removed. This is needed
3866 +        * for ->kill_hook() of extent item to update VM structures when
3867 +        * removing pages. */
3868 +       struct inode *inode;
3869 +       /* sibling list maintenance is complicated by existence of eottl. When
3870 +        * eottl whose left and right neighbors are formatted leaves is
3871 +        * removed, one has to connect said leaves in the sibling list. This
3872 +        * cannot be done when extent removal is just started as locking rules
3873 +        * require sibling list update to happen atomically with removal of
3874 +        * extent item. Therefore: 1. pointers to left and right neighbors
3875 +        * have to be passed down to the ->kill_hook() of extent item, and
3876 +        * 2. said neighbors have to be locked. */
3877 +       lock_handle *left;
3878 +       lock_handle *right;
3879 +       /* flags modifying behavior of kill. Currently, it may have
3880 +          DELETE_RETAIN_EMPTY set. */
3881 +       unsigned flags;
3882 +       char *buf;
3883 +};
3884 +
3885 +/* &carry_tree_op - operation to "carry" upward.
3886 +
3887 +   Description of an operation we want to "carry" to the upper level of
3888 +   a tree: e.g, when we insert something and there is not enough space
3889 +   we allocate a new node and "carry" the operation of inserting a
3890 +   pointer to the new node to the upper level, on removal of empty node,
3891 +   we carry up operation of removing appropriate entry from parent.
3892 +
3893 +   There are two types of carry ops: when adding or deleting a node, the
3894 +   node at the parent level where appropriate modification has to be
3895 +   performed is known in advance. When shifting items between nodes
3896 +   (split, merge), delimiting key should be changed in the least common
3897 +   parent of the nodes involved that is not known in advance.
3898 +
3899 +   For the operations of the first type we store in &carry_op pointer to
3900 +   the &carry_node at the parent level. For the operation of the second
3901 +   type we store &carry_node of parents of the left and right nodes
3902 +   modified and keep track of them upward until they coincide.
3903 +
3904 +*/
3905 +typedef struct carry_op {
3906 +       /* pool linkage */
3907 +       struct reiser4_pool_header header;
3908 +       carry_opcode op;
3909 +       /* node on which operation is to be performed:
3910 +
3911 +          for insert, paste: node where new item is to be inserted
3912 +
3913 +          for delete: node where pointer is to be deleted
3914 +
3915 +          for cut: node to cut from
3916 +
3917 +          for update: node where delimiting key is to be modified
3918 +
3919 +          for modify: parent of modified node
3920 +
3921 +        */
3922 +       carry_node *node;
3923 +       union {
3924 +               struct {
3925 +                       /* (sub-)type of insertion/paste. Taken from
3926 +                          cop_insert_pos_type. */
3927 +                       __u8 type;
3928 +                       /* various operation flags. Taken from
3929 +                          cop_insert_flag. */
3930 +                       __u8 flags;
3931 +                       carry_insert_data *d;
3932 +                       carry_node *child;
3933 +                       znode *brother;
3934 +               } insert, paste, extent;
3935 +
3936 +               struct {
3937 +                       int is_cut;
3938 +                       union {
3939 +                               carry_kill_data *kill;
3940 +                               carry_cut_data *cut;
3941 +                       } u;
3942 +               } cut_or_kill;
3943 +
3944 +               struct {
3945 +                       carry_node *left;
3946 +               } update;
3947 +               struct {
3948 +                       /* changed child */
3949 +                       carry_node *child;
3950 +                       /* bitmask of changes. See &cop_modify_flag */
3951 +                       __u32 flag;
3952 +               } modify;
3953 +               struct {
3954 +                       /* flags to deletion operation. Are taken from
3955 +                          cop_delete_flag */
3956 +                       __u32 flags;
3957 +                       /* child to delete from parent. If this is
3958 +                          NULL, delete op->node.  */
3959 +                       carry_node *child;
3960 +               } delete;
3961 +               struct {
3962 +                       /* various operation flags. Taken from
3963 +                          cop_insert_flag. */
3964 +                       __u32 flags;
3965 +                       flow_t *flow;
3966 +                       coord_t *insert_point;
3967 +                       reiser4_item_data *data;
3968 +                       /* flow insertion is limited by number of new blocks
3969 +                          added in that operation which do not get any data
3970 +                          but part of flow. This limit is set by macro
3971 +                          CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
3972 +                          of nodes added already during one carry_flow */
3973 +                       int new_nodes;
3974 +               } insert_flow;
3975 +       } u;
3976 +} carry_op;
3977 +
3978 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
3979 +typedef struct carry_pool {
3980 +       carry_op op[CARRIES_POOL_SIZE];
3981 +       struct reiser4_pool op_pool;
3982 +       carry_node node[NODES_LOCKED_POOL_SIZE];
3983 +       struct reiser4_pool node_pool;
3984 +} carry_pool;
3985 +
3986 +/* &carry_tree_level - carry process on given level
3987 +
3988 +   Description of balancing process on the given level.
3989 +
3990 +   No need for locking here, as carry_tree_level is essentially per
3991 +   thread thing (for now).
3992 +
3993 +*/
3994 +struct carry_level {
3995 +       /* this level may be restarted */
3996 +       __u32 restartable:1;
3997 +       /* list of carry nodes on this level, ordered by key order */
3998 +       struct list_head nodes;
3999 +       struct list_head ops;
4000 +       /* pool where new objects are allocated from */
4001 +       carry_pool *pool;
4002 +       int ops_num;
4003 +       int nodes_num;
4004 +       /* new root created on this level, if any */
4005 +       znode *new_root;
4006 +       /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
4007 +          when they want ->tracked to automagically wander to the node where
4008 +          insertion point moved after insert or paste.
4009 +        */
4010 +       carry_track_type track_type;
4011 +       /* lock handle supplied by user that we are tracking. See
4012 +          above. */
4013 +       lock_handle *tracked;
4014 +};
4015 +
4016 +/* information carry passes to plugin methods that may add new operations to
4017 +   the @todo queue  */
4018 +struct carry_plugin_info {
4019 +       carry_level *doing;
4020 +       carry_level *todo;
4021 +};
4022 +
4023 +int reiser4_carry(carry_level * doing, carry_level * done);
4024 +
4025 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4026 +                             carry_node * reference);
4027 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4028 +                                  carry_node * reference);
4029 +
4030 +extern carry_node *insert_carry_node(carry_level * doing,
4031 +                                    carry_level * todo, const znode * node);
4032 +
4033 +extern carry_pool *init_carry_pool(int);
4034 +extern void done_carry_pool(carry_pool * pool);
4035 +
4036 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4037 +
4038 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4039 +                                   znode * node, int apply_to_parent);
4040 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4041 +                                znode * node, int apply_to_parent_p);
4042 +
4043 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4044 +                         carry_level * doing, carry_level * todo);
4045 +
4046 +carry_node *find_carry_node(carry_level * level, const znode * node);
4047 +
4048 +extern znode *reiser4_carry_real(const carry_node * node);
4049 +
4050 +/* helper macros to iterate over carry queues */
4051 +
4052 +#define carry_node_next(node)                                          \
4053 +       list_entry((node)->header.level_linkage.next, carry_node,       \
4054 +                  header.level_linkage)
4055 +
4056 +#define carry_node_prev(node)                                          \
4057 +       list_entry((node)->header.level_linkage.prev, carry_node,       \
4058 +                  header.level_linkage)
4059 +
4060 +#define carry_node_front(level)                                                \
4061 +       list_entry((level)->nodes.next, carry_node, header.level_linkage)
4062 +
4063 +#define carry_node_back(level)                                         \
4064 +       list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4065 +
4066 +#define carry_node_end(level, node)                            \
4067 +       (&(level)->nodes == &(node)->header.level_linkage)
4068 +
4069 +/* macro to iterate over all operations in a @level */
4070 +#define for_all_ops(level /* carry level (of type carry_level *) */,          \
4071 +                   op    /* pointer to carry operation, modified by loop (of  \
4072 +                          * type carry_op *) */,                              \
4073 +                   tmp   /* pointer to carry operation (of type carry_op *),  \
4074 +                          * used to make iterator stable in the face of       \
4075 +                          * deletions from the level */ )                     \
4076 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage),        \
4077 +     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage);  \
4078 +     &op->header.level_linkage != &level->ops;                                \
4079 +     op = tmp,                                                                \
4080 +     tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4081 +
4082 +#if 0
4083 +for (op = (carry_op *) pool_level_list_front(&level->ops),            \
4084 +     tmp = (carry_op *) pool_level_list_next(&op->header) ;           \
4085 +     !pool_level_list_end(&level->ops, &op->header) ;                 \
4086 +     op = tmp, tmp = (carry_op *) pool_level_list_next(&op->header))
4087 +#endif
4088 +
4089 +/* macro to iterate over all nodes in a @level */
4090 +#define for_all_nodes(level /* carry level (of type carry_level *) */,        \
4091 +                     node  /* pointer to carry node, modified by loop (of     \
4092 +                             * type carry_node *) */,                         \
4093 +                     tmp   /* pointer to carry node (of type carry_node *),   \
4094 +                             * used to make iterator stable in the face of *  \
4095 +                             * deletions from the level */ )                  \
4096 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage),   \
4097 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4098 +     &node->header.level_linkage != &level->nodes;                            \
4099 +     node = tmp,                                                              \
4100 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4101 +
4102 +#if 0
4103 +for (node = carry_node_front(level),                                   \
4104 +     tmp = carry_node_next(node) ; !carry_node_end(level, node) ;      \
4105 +     node = tmp, tmp = carry_node_next(node))
4106 +#endif
4107 +
4108 +/* macro to iterate over all nodes in a @level in reverse order
4109 +
4110 +   This is used, because nodes are unlocked in reversed order of locking */
4111 +#define for_all_nodes_back(level /* carry level (of type carry_level *) */,    \
4112 +                          node  /* pointer to carry node, modified by loop    \
4113 +                                  * (of type carry_node *) */,                \
4114 +                          tmp   /* pointer to carry node (of type carry_node  \
4115 +                                  * *), used to make iterator stable in the   \
4116 +                                  * face of deletions from the level */ )     \
4117 +for (node = carry_node_back(level),            \
4118 +     tmp = carry_node_prev(node) ; !carry_node_end(level, node) ;      \
4119 +     node = tmp, tmp = carry_node_prev(node))
4120 +
4121 +/* __FS_REISER4_CARRY_H__ */
4122 +#endif
4123 +
4124 +/* Make Linus happy.
4125 +   Local variables:
4126 +   c-indentation-style: "K&R"
4127 +   mode-name: "LC"
4128 +   c-basic-offset: 8
4129 +   tab-width: 8
4130 +   fill-column: 120
4131 +   scroll-step: 1
4132 +   End:
4133 +*/
4134 diff -urN linux-2.6.35.orig/fs/reiser4/carry_ops.c linux-2.6.35/fs/reiser4/carry_ops.c
4135 --- linux-2.6.35.orig/fs/reiser4/carry_ops.c    1970-01-01 01:00:00.000000000 +0100
4136 +++ linux-2.6.35/fs/reiser4/carry_ops.c 2010-08-04 15:44:57.000000000 +0200
4137 @@ -0,0 +1,2132 @@
4138 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
4139 +   reiser4/README */
4140 +
4141 +/* implementation of carry operations */
4142 +
4143 +#include "forward.h"
4144 +#include "debug.h"
4145 +#include "key.h"
4146 +#include "coord.h"
4147 +#include "plugin/item/item.h"
4148 +#include "plugin/node/node.h"
4149 +#include "jnode.h"
4150 +#include "znode.h"
4151 +#include "block_alloc.h"
4152 +#include "tree_walk.h"
4153 +#include "pool.h"
4154 +#include "tree_mod.h"
4155 +#include "carry.h"
4156 +#include "carry_ops.h"
4157 +#include "tree.h"
4158 +#include "super.h"
4159 +#include "reiser4.h"
4160 +
4161 +#include <linux/types.h>
4162 +#include <linux/err.h>
4163 +
4164 +static int carry_shift_data(sideof side, coord_t *insert_coord, znode * node,
4165 +                           carry_level * doing, carry_level * todo,
4166 +                           unsigned int including_insert_coord_p);
4167 +
4168 +extern int lock_carry_node(carry_level * level, carry_node * node);
4169 +extern int lock_carry_node_tail(carry_node * node);
4170 +
4171 +/* find left neighbor of a carry node
4172 +
4173 +   Look for left neighbor of @node and add it to the @doing queue. See
4174 +   comments in the body.
4175 +
4176 +*/
4177 +static carry_node *find_left_neighbor(carry_op * op    /* node to find left
4178 +                                                        * neighbor of */ ,
4179 +                                     carry_level * doing/* level to scan */)
4180 +{
4181 +       int result;
4182 +       carry_node *node;
4183 +       carry_node *left;
4184 +       int flags;
4185 +       reiser4_tree *tree;
4186 +
4187 +       node = op->node;
4188 +
4189 +       tree = current_tree;
4190 +       read_lock_tree(tree);
4191 +       /* first, check whether left neighbor is already in a @doing queue */
4192 +       if (reiser4_carry_real(node)->left != NULL) {
4193 +               /* NOTE: there is locking subtlety here. Look into
4194 +                * find_right_neighbor() for more info */
4195 +               if (find_carry_node(doing,
4196 +                                   reiser4_carry_real(node)->left) != NULL) {
4197 +                       read_unlock_tree(tree);
4198 +                       left = node;
4199 +                       do {
4200 +                               left = list_entry(left->header.level_linkage.prev,
4201 +                                                 carry_node, header.level_linkage);
4202 +                               assert("nikita-3408", !carry_node_end(doing,
4203 +                                                                     left));
4204 +                       } while (reiser4_carry_real(left) ==
4205 +                                reiser4_carry_real(node));
4206 +                       return left;
4207 +               }
4208 +       }
4209 +       read_unlock_tree(tree);
4210 +
4211 +       left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4212 +       if (IS_ERR(left))
4213 +               return left;
4214 +
4215 +       left->node = node->node;
4216 +       left->free = 1;
4217 +
4218 +       flags = GN_TRY_LOCK;
4219 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4220 +               flags |= GN_NO_ALLOC;
4221 +
4222 +       /* then, feeling lucky, peek left neighbor in the cache. */
4223 +       result = reiser4_get_left_neighbor(&left->lock_handle,
4224 +                                          reiser4_carry_real(node),
4225 +                                          ZNODE_WRITE_LOCK, flags);
4226 +       if (result == 0) {
4227 +               /* ok, node found and locked. */
4228 +               result = lock_carry_node_tail(left);
4229 +               if (result != 0)
4230 +                       left = ERR_PTR(result);
4231 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4232 +               /* node is leftmost node in a tree, or neighbor wasn't in
4233 +                  cache, or there is an extent on the left. */
4234 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4235 +               left = NULL;
4236 +       } else if (doing->restartable) {
4237 +               /* if left neighbor is locked, and level is restartable, add
4238 +                  new node to @doing and restart. */
4239 +               assert("nikita-913", node->parent != 0);
4240 +               assert("nikita-914", node->node != NULL);
4241 +               left->left = 1;
4242 +               left->free = 0;
4243 +               left = ERR_PTR(-E_REPEAT);
4244 +       } else {
4245 +               /* left neighbor is locked, level cannot be restarted. Just
4246 +                  ignore left neighbor. */
4247 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4248 +               left = NULL;
4249 +       }
4250 +       return left;
4251 +}
4252 +
4253 +/* find right neighbor of a carry node
4254 +
4255 +   Look for right neighbor of @node and add it to the @doing queue. See
4256 +   comments in the body.
4257 +
4258 +*/
4259 +static carry_node *find_right_neighbor(carry_op * op   /* node to find right
4260 +                                                        * neighbor of */ ,
4261 +                                      carry_level * doing/* level to scan */)
4262 +{
4263 +       int result;
4264 +       carry_node *node;
4265 +       carry_node *right;
4266 +       lock_handle lh;
4267 +       int flags;
4268 +       reiser4_tree *tree;
4269 +
4270 +       init_lh(&lh);
4271 +
4272 +       node = op->node;
4273 +
4274 +       tree = current_tree;
4275 +       read_lock_tree(tree);
4276 +       /* first, check whether right neighbor is already in a @doing queue */
4277 +       if (reiser4_carry_real(node)->right != NULL) {
4278 +               /*
4279 +                * Tree lock is taken here anyway, because, even if _outcome_
4280 +                * of (find_carry_node() != NULL) doesn't depend on
4281 +                * concurrent updates to ->right, find_carry_node() cannot
4282 +                * work with second argument NULL. Hence, following comment is
4283 +                * of historic importance only.
4284 +                *
4285 +                * Subtle:
4286 +                *
4287 +                * Q: why don't we need tree lock here, looking for the right
4288 +                * neighbor?
4289 +                *
4290 +                * A: even if value of node->real_node->right were changed
4291 +                * during find_carry_node() execution, outcome of execution
4292 +                * wouldn't change, because (in short) other thread cannot add
4293 +                * elements to the @doing, and if node->real_node->right
4294 +                * already was in @doing, value of node->real_node->right
4295 +                * couldn't change, because node cannot be inserted between
4296 +                * locked neighbors.
4297 +                */
4298 +               if (find_carry_node(doing,
4299 +                                   reiser4_carry_real(node)->right) != NULL) {
4300 +                       read_unlock_tree(tree);
4301 +                       /*
4302 +                        * What we are doing here (this is also applicable to
4303 +                        * the find_left_neighbor()).
4304 +                        *
4305 +                        * tree_walk.c code requires that insertion of a
4306 +                        * pointer to a child, modification of parent pointer
4307 +                        * in the child, and insertion of the child into
4308 +                        * sibling list are atomic (see
4309 +                        * plugin/item/internal.c:create_hook_internal()).
4310 +                        *
4311 +                        * carry allocates new node long before pointer to it
4312 +                        * is inserted into parent and, actually, long before
4313 +                        * parent is even known. Such allocated-but-orphaned
4314 +                        * nodes are only trackable through carry level lists.
4315 +                        *
4316 +                        * Situation that is handled here is following: @node
4317 +                        * has valid ->right pointer, but there is
4318 +                        * allocated-but-orphaned node in the carry queue that
4319 +                        * is logically between @node and @node->right. Here
4320 +                        * we are searching for it. Critical point is that
4321 +                        * this is only possible if @node->right is also in
4322 +                        * the carry queue (this is checked above), because
4323 +                        * this is the only way new orphaned node could be
4324 +                        * inserted between them (before inserting new node,
4325 +                        * make_space() first tries to shift to the right, so,
4326 +                        * right neighbor will be locked and queued).
4327 +                        *
4328 +                        */
4329 +                       right = node;
4330 +                       do {
4331 +                               right = list_entry(right->header.level_linkage.next,
4332 +                                                  carry_node, header.level_linkage);
4333 +                               assert("nikita-3408", !carry_node_end(doing,
4334 +                                                                     right));
4335 +                       } while (reiser4_carry_real(right) ==
4336 +                                reiser4_carry_real(node));
4337 +                       return right;
4338 +               }
4339 +       }
4340 +       read_unlock_tree(tree);
4341 +
4342 +       flags = GN_CAN_USE_UPPER_LEVELS;
4343 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4344 +               flags = GN_NO_ALLOC;
4345 +
4346 +       /* then, try to lock right neighbor */
4347 +       init_lh(&lh);
4348 +       result = reiser4_get_right_neighbor(&lh,
4349 +                                           reiser4_carry_real(node),
4350 +                                           ZNODE_WRITE_LOCK, flags);
4351 +       if (result == 0) {
4352 +               /* ok, node found and locked. */
4353 +               right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4354 +               if (!IS_ERR(right)) {
4355 +                       right->node = lh.node;
4356 +                       move_lh(&right->lock_handle, &lh);
4357 +                       right->free = 1;
4358 +                       result = lock_carry_node_tail(right);
4359 +                       if (result != 0)
4360 +                               right = ERR_PTR(result);
4361 +               }
4362 +       } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4363 +               /* node is rightmost node in a tree, or neighbor wasn't in
4364 +                  cache, or there is an extent on the right. */
4365 +               right = NULL;
4366 +       } else
4367 +               right = ERR_PTR(result);
4368 +       done_lh(&lh);
4369 +       return right;
4370 +}
4371 +
4372 +/* how much free space in a @node is needed for @op
4373 +
4374 +   How much space in @node is required for completion of @op, where @op is
4375 +   insert or paste operation.
4376 +*/
4377 +static unsigned int space_needed_for_op(znode * node   /* znode data are
4378 +                                                        * inserted or
4379 +                                                        * pasted in */ ,
4380 +                                       carry_op * op   /* carry
4381 +                                                          operation */ )
4382 +{
4383 +       assert("nikita-919", op != NULL);
4384 +
4385 +       switch (op->op) {
4386 +       default:
4387 +               impossible("nikita-1701", "Wrong opcode");
4388 +       case COP_INSERT:
4389 +               return space_needed(node, NULL, op->u.insert.d->data, 1);
4390 +       case COP_PASTE:
4391 +               return space_needed(node, op->u.insert.d->coord,
4392 +                                   op->u.insert.d->data, 0);
4393 +       }
4394 +}
4395 +
4396 +/* how much space in @node is required to insert or paste @data at
4397 +   @coord. */
4398 +unsigned int space_needed(const znode * node   /* node data are inserted or
4399 +                                                * pasted in */ ,
4400 +                         const coord_t *coord  /* coord where data are
4401 +                                                * inserted or pasted
4402 +                                                * at */ ,
4403 +                         const reiser4_item_data * data /* data to insert or
4404 +                                                         * paste */ ,
4405 +                         int insertion/* non-0 is inserting, 0---paste */)
4406 +{
4407 +       int result;
4408 +       item_plugin *iplug;
4409 +
4410 +       assert("nikita-917", node != NULL);
4411 +       assert("nikita-918", node_plugin_by_node(node) != NULL);
4412 +       assert("vs-230", !insertion || (coord == NULL));
4413 +
4414 +       result = 0;
4415 +       iplug = data->iplug;
4416 +       if (iplug->b.estimate != NULL) {
4417 +               /* ask item plugin how much space is needed to insert this
4418 +                  item */
4419 +               result += iplug->b.estimate(insertion ? NULL : coord, data);
4420 +       } else {
4421 +               /* reasonable default */
4422 +               result += data->length;
4423 +       }
4424 +       if (insertion) {
4425 +               node_plugin *nplug;
4426 +
4427 +               nplug = node->nplug;
4428 +               /* and add node overhead */
4429 +               if (nplug->item_overhead != NULL)
4430 +                       result += nplug->item_overhead(node, NULL);
4431 +       }
4432 +       return result;
4433 +}
4434 +
4435 +/* find &coord in parent where pointer to new child is to be stored. */
4436 +static int find_new_child_coord(carry_op * op  /* COP_INSERT carry operation to
4437 +                                                * insert pointer to new
4438 +                                                * child */ )
4439 +{
4440 +       int result;
4441 +       znode *node;
4442 +       znode *child;
4443 +
4444 +       assert("nikita-941", op != NULL);
4445 +       assert("nikita-942", op->op == COP_INSERT);
4446 +
4447 +       node = reiser4_carry_real(op->node);
4448 +       assert("nikita-943", node != NULL);
4449 +       assert("nikita-944", node_plugin_by_node(node) != NULL);
4450 +
4451 +       child = reiser4_carry_real(op->u.insert.child);
4452 +       result =
4453 +           find_new_child_ptr(node, child, op->u.insert.brother,
4454 +                              op->u.insert.d->coord);
4455 +
4456 +       build_child_ptr_data(child, op->u.insert.d->data);
4457 +       return result;
4458 +}
4459 +
4460 +/* additional amount of free space in @node required to complete @op */
4461 +static int free_space_shortage(znode * node /* node to check */ ,
4462 +                              carry_op * op/* operation being performed */)
4463 +{
4464 +       assert("nikita-1061", node != NULL);
4465 +       assert("nikita-1062", op != NULL);
4466 +
4467 +       switch (op->op) {
4468 +       default:
4469 +               impossible("nikita-1702", "Wrong opcode");
4470 +       case COP_INSERT:
4471 +       case COP_PASTE:
4472 +               return space_needed_for_op(node, op) - znode_free_space(node);
4473 +       case COP_EXTENT:
4474 +               /* when inserting extent shift data around until insertion
4475 +                  point is utmost in the node. */
4476 +               if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4477 +                       return +1;
4478 +               else
4479 +                       return -1;
4480 +       }
4481 +}
4482 +
4483 +/* helper function: point @op at @target if the insertion coord no longer
4484 +   lies in the node of @op. Returns the node of the insertion coord. */
4485 +static znode *sync_op(carry_op * op, carry_node * target)
4486 +{
4487 +       znode *insertion_node;
4488 +
4489 +       /* re-read node from coord: a preceding shift might have moved the
4490 +          insertion coord to the neighbor */
4491 +       insertion_node = op->u.insert.d->coord->node;
4492 +       /* if insertion point is no longer in the node of @op, switch the
4493 +          carry node pointer in operation to @target. */
4494 +       if (insertion_node != reiser4_carry_real(op->node)) {
4495 +               op->node = target;
4496 +               assert("nikita-2540",
4497 +                      reiser4_carry_real(target) == insertion_node);
4498 +       }
4499 +       assert("nikita-2541",
4500 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4501 +       return insertion_node;
4502 +}
4503 +
4504 +/*
4505 + * complete make_space() call: re-lock tracked lock handle on the insertion
4506 + * node if necessary (see comments for fs/reiser4/carry.h:carry_track_type).
4507 + */
4508 +static int
4509 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4510 +{
4511 +       int result;
4512 +       carry_track_type tracking;
4513 +       znode *node;
4514 +
4515 +       tracking = doing->track_type;
4516 +       node = op->u.insert.d->coord->node;
4517 +
4518 +       if (tracking == CARRY_TRACK_NODE ||
4519 +           (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4520 +               /* node is tracked unconditionally, or insertion point left
4521 +                  @orig_node. Re-take lock handle supplied by caller. */
4522 +               assert("nikita-1417", doing->tracked != NULL);
4523 +               done_lh(doing->tracked);
4524 +               init_lh(doing->tracked);
4525 +               result = longterm_lock_znode(doing->tracked, node,
4526 +                                            ZNODE_WRITE_LOCK,
4527 +                                            ZNODE_LOCK_HIPRI);
4528 +       } else
4529 +               result = 0;
4530 +       return result;
4531 +}
4532 +
4533 +/* This is the insertion policy function. It shifts data to the left and right
4534 +   neighbors of insertion coord and allocates new nodes until there is enough
4535 +   free space to complete @op.
4536 +   Returns 0 on success and -E_NODE_FULL if enough space could not be freed;
4537 +   other errors are passed up from helpers. See comments in the body.
4538 +
4539 +   Assumes that the node format favors insertions at the right end of the node
4540 +   as node40 does.
4541 +
4542 +   See carry_flow() for details about flow insertion.
4543 +*/
4544 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4545 +                     carry_level * doing /* current carry queue */ ,
4546 +                     carry_level * todo/* carry queue on the parent level */)
4547 +{
4548 +       znode *node;
4549 +       int result;
4550 +       int not_enough_space;
4551 +       int blk_alloc;
4552 +       znode *orig_node;
4553 +       __u32 flags;
4554 +
4555 +       coord_t *coord;
4556 +
4557 +       assert("nikita-890", op != NULL);
4558 +       assert("nikita-891", todo != NULL);
4559 +       assert("nikita-892",
4560 +              op->op == COP_INSERT ||
4561 +              op->op == COP_PASTE || op->op == COP_EXTENT);
4562 +       assert("nikita-1607",
4563 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4564 +
4565 +       flags = op->u.insert.flags;
4566 +
4567 +       /* NOTE check that new node can only be allocated after checking left
4568 +        * and right neighbors. This is necessary for proper work of
4569 +        * find_{left,right}_neighbor(). */
4570 +       assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4571 +                                  flags & COPI_DONT_SHIFT_LEFT));
4572 +       assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4573 +                                  flags & COPI_DONT_SHIFT_RIGHT));
4574 +
4575 +       coord = op->u.insert.d->coord;
4576 +       orig_node = node = coord->node;
4577 +
4578 +       assert("nikita-908", node != NULL);
4579 +       assert("nikita-909", node_plugin_by_node(node) != NULL);
4580 +
4581 +       result = 0;
4582 +       /* If there is not enough space in a node, try to shift something to
4583 +          the left neighbor. This is a bit tricky, as locking to the left is
4584 +          low priority. This is handled by restart logic in carry().
4585 +        */
4586 +       not_enough_space = free_space_shortage(node, op);
4587 +       if (not_enough_space <= 0)
4588 +               /* it is possible that carry was called when there actually
4589 +                  was enough space in the node. For example, when inserting
4590 +                  leftmost item so that delimiting keys have to be updated.
4591 +                */
4592 +               return make_space_tail(op, doing, orig_node);
4593 +       if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4594 +               carry_node *left;
4595 +               /* look up the left neighbor in order to try moving
4596 +                  something into it */
4597 +               left = find_left_neighbor(op, doing);
4598 +               if (unlikely(IS_ERR(left))) {
4599 +                       if (PTR_ERR(left) == -E_REPEAT)
4600 +                               return -E_REPEAT;
4601 +                       else {
4602 +                               /* some error other than restart request
4603 +                                  occurred. This shouldn't happen. Issue a
4604 +                                  warning and continue as if left neighbor
4605 +                                  did not exist.
4606 +                                */
4607 +                               warning("nikita-924",
4608 +                                       "Error accessing left neighbor: %li",
4609 +                                       PTR_ERR(left));
4610 +                       }
4611 +               } else if (left != NULL) {
4612 +
4613 +                       /* shift everything possible on the left of and
4614 +                          including insertion coord into the left neighbor */
4615 +                       result = carry_shift_data(LEFT_SIDE, coord,
4616 +                                                 reiser4_carry_real(left),
4617 +                                                 doing, todo,
4618 +                                                 flags & COPI_GO_LEFT);
4619 +
4620 +                       /* re-read node from coord: the shift might have moved
4621 +                          insertion coord to the left neighbor */
4622 +                       node = sync_op(op, left);
4623 +
4624 +                       not_enough_space = free_space_shortage(node, op);
4625 +                       /* There is not enough free space in @node, but
4626 +                          maybe there is enough free space in
4627 +                          @left. Various balancing decisions are valid here.
4628 +                          The same holds for shifting to the right.
4629 +                        */
4630 +               }
4631 +       }
4632 +       /* If there still is not enough space, shift to the right */
4633 +       if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4634 +               carry_node *right;
4635 +
4636 +               right = find_right_neighbor(op, doing);
4637 +               if (IS_ERR(right)) {
4638 +                       warning("nikita-1065",
4639 +                               "Error accessing right neighbor: %li",
4640 +                               PTR_ERR(right));
4641 +               } else if (right != NULL) {
4642 +                       /* node containing insertion point, and its right
4643 +                          neighbor node are write locked by now.
4644 +
4645 +                          shift everything possible on the right of but
4646 +                          excluding insertion coord into the right neighbor
4647 +                        */
4648 +                       result = carry_shift_data(RIGHT_SIDE, coord,
4649 +                                                 reiser4_carry_real(right),
4650 +                                                 doing, todo,
4651 +                                                 flags & COPI_GO_RIGHT);
4652 +                       /* re-read node from coord: the shift might have moved
4653 +                          insertion coord to the right neighbor */
4654 +                       node = sync_op(op, right);
4655 +                       not_enough_space = free_space_shortage(node, op);
4656 +               }
4657 +       }
4658 +       /* If there is still not enough space, allocate new node(s).
4659 +
4660 +          We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4661 +          the carry operation flags (currently this is needed during flush
4662 +          only).
4663 +        */
4664 +       for (blk_alloc = 0;
4665 +            not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4666 +            !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4667 +               carry_node *fresh;      /* new node we are allocating */
4668 +               coord_t coord_shadow;   /* remembered insertion point before
4669 +                                        * shifting data into new node */
4670 +               carry_node *node_shadow;        /* remembered insertion node
4671 +                                                * before shifting */
4672 +               unsigned int gointo;    /* whether insertion point should move
4673 +                                        * into newly allocated node */
4674 +
4675 +               /* allocate new node on the right of @node. Znode and disk
4676 +                  fake block number for new node are allocated.
4677 +
4678 +                  add_new_znode() posts carry operation COP_INSERT with
4679 +                  COPT_CHILD option to the parent level to add
4680 +                  pointer to newly created node to its parent.
4681 +
4682 +                  Subtle point: if several new nodes are required to complete
4683 +                  insertion operation at this level, they will be inserted
4684 +                  into their parents in the order of creation, which means
4685 +                  that @node will be valid "cookie" at the time of insertion.
4686 +
4687 +                */
4688 +               fresh = add_new_znode(node, op->node, doing, todo);
4689 +               if (IS_ERR(fresh))
4690 +                       return PTR_ERR(fresh);
4691 +
4692 +               /* Try to shift into new node. */
4693 +               result = lock_carry_node(doing, fresh);
4694 +               zput(reiser4_carry_real(fresh));
4695 +               if (result != 0) {
4696 +                       warning("nikita-947",
4697 +                               "Cannot lock new node: %i", result);
4698 +                       return result;
4699 +               }
4700 +
4701 +               /* both nodes are write locked by now.
4702 +
4703 +                  shift everything possible on the right of and
4704 +                  including insertion coord into the right neighbor.
4705 +                */
4706 +               coord_dup(&coord_shadow, op->u.insert.d->coord);
4707 +               node_shadow = op->node;
4708 +               /* move insertion point into newly created node if:
4709 +
4710 +                  . insertion point is rightmost in the source node, or
4711 +                  . this is not the first node we are allocating in a row.
4712 +                */
4713 +               gointo =
4714 +                   (blk_alloc > 0) ||
4715 +                   coord_is_after_rightmost(op->u.insert.d->coord);
4716 +
4717 +               if (gointo &&
4718 +                   op->op == COP_PASTE &&
4719 +                   coord_is_existing_item(op->u.insert.d->coord) &&
4720 +                   is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4721 +                       /* paste into solid (atomic) item, which can contain
4722 +                          only one unit, so we need to shift it right, where
4723 +                          insertion point is supposed to be */
4724 +
4725 +                       assert("edward-1444", op->u.insert.d->data->iplug ==
4726 +                              item_plugin_by_id(STATIC_STAT_DATA_ID));
4727 +                       assert("edward-1445",
4728 +                              op->u.insert.d->data->length >
4729 +                              node_plugin_by_node(coord->node)->free_space
4730 +                              (coord->node));
4731 +
4732 +                       op->u.insert.d->coord->between = BEFORE_UNIT;
4733 +               }
4734 +
4735 +               result = carry_shift_data(RIGHT_SIDE, coord,
4736 +                                         reiser4_carry_real(fresh),
4737 +                                         doing, todo, gointo);
4738 +               /* if insertion point was actually moved into new node,
4739 +                  update carry node pointer in operation. */
4740 +               node = sync_op(op, fresh);
4741 +               not_enough_space = free_space_shortage(node, op);
4742 +               if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4743 +                       /* there is not enough free space in new node. Move
4744 +                          insertion point back to @node_shadow so that
4745 +                          next new node would be inserted between
4746 +                          @node_shadow and @fresh.
4747 +                        */
4748 +                       coord_normalize(&coord_shadow);
4749 +                       coord_dup(coord, &coord_shadow);
4750 +                       node = coord->node;
4751 +                       op->node = node_shadow;
4752 +                       if (1 || (flags & COPI_STEP_BACK)) {
4753 +                               /* "1 ||" above: always re-check, there may
4754 +                                  be enough space in the source node (i.e.,
4755 +                                  node data were moved from) now.
4756 +                                */
4757 +                               not_enough_space =
4758 +                                   free_space_shortage(node, op);
4759 +                       }
4760 +               }
4761 +       }
4762 +       if (not_enough_space > 0) {
4763 +               if (!(flags & COPI_DONT_ALLOCATE))
4764 +                       warning("nikita-948", "Cannot insert new item");
4765 +               result = -E_NODE_FULL;
4766 +       }
4767 +       assert("nikita-1622", ergo(result == 0,
4768 +                                 reiser4_carry_real(op->node) == coord->node));
4769 +       assert("nikita-2616", coord == op->u.insert.d->coord);
4770 +       if (result == 0)
4771 +               result = make_space_tail(op, doing, orig_node);
4772 +       return result;
4773 +}
4774 +
4775 +/* insert_paste_common() - common part of insert and paste operations
4776 +
4777 +   This function performs common part of COP_INSERT and COP_PASTE.
4778 +
4779 +   There are three ways in which insertion/paste can be requested:
4780 +
4781 +    . by directly supplying reiser4_item_data. In this case, op ->
4782 +    u.insert.type is set to COPT_ITEM_DATA.
4783 +
4784 +    . by supplying child, pointer to which is to be inserted into parent. In
4785 +    this case op -> u.insert.type == COPT_CHILD.
4786 +
4787 +    . by supplying key of new item/unit (op -> u.insert.type == COPT_KEY).
4788 +    This is currently only used during extent insertion.
4789 +
4790 +   This is required, because when new node is allocated we don't know at what
4791 +   position pointer to it is to be stored in the parent. Actually, we don't
4792 +   even know what its parent will be, because parent can be re-balanced
4793 +   concurrently and new node re-parented, and because parent can be full and
4794 +   pointer to the new node will go into some other node.
4795 +
4796 +   insert_paste_common() resolves pointer to child node into position in the
4797 +   parent by calling find_new_child_coord(), that fills
4798 +   reiser4_item_data. After this, insertion/paste proceeds uniformly.
4799 +
4800 +   Another complication is with finding free space during pasting. It may
4801 +   happen that while shifting items to the neighbors and newly allocated
4802 +   nodes, insertion coord can no longer be in the item we wanted to paste
4803 +   into. At this point, paste becomes (morphs) into insert. Moreover free
4804 +   space analysis has to be repeated, because amount of space required for
4805 +   insertion is different from that of paste (item header overhead, etc).
4806 +
4807 +   This function "unifies" different insertion modes (by resolving child
4808 +   pointer or key into insertion coord), and then calls make_space() to free
4809 +   enough space in the node by shifting data to the left and right and by
4810 +   allocating new nodes if necessary. Carry operation knows amount of space
4811 +   required for its completion. After enough free space is obtained, caller of
4812 +   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4813 +   by calling item plugin method.
4814 +
4815 +*/
4816 +static int insert_paste_common(carry_op * op   /* carry operation being
4817 +                                                * performed */ ,
4818 +                              carry_level * doing /* current carry level */ ,
4819 +                              carry_level * todo /* next carry level */ ,
4820 +                              carry_insert_data * cdata        /* pointer to
4821 +                                                                * cdata */ ,
4822 +                              coord_t *coord /* insertion/paste coord */ ,
4823 +                              reiser4_item_data * data /* data to be
4824 +                                                        * inserted/pasted */ )
4825 +{
4826 +       assert("nikita-981", op != NULL);
4827 +       assert("nikita-980", todo != NULL);
4828 +       assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4829 +              || (op->op == COP_EXTENT));
4830 +
4831 +       if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4832 +               /* nothing to do. Fall through to make_space(). */
4833 +               ;
4834 +       } else if (op->u.insert.type == COPT_KEY) {
4835 +               node_search_result intra_node;
4836 +               znode *node;
4837 +               /* Problem with doing batching at the lowest level, is that
4838 +                  operations here are given by coords where modification is
4839 +                  to be performed, and one modification can invalidate coords
4840 +                  of all following operations.
4841 +
4842 +                  So, we are implementing yet another type for operation that
4843 +                  will use (the only) "locator" stable across shifting of
4844 +                  data between nodes, etc.: key (COPT_KEY).
4845 +
4846 +                  This clause resolves key to the coord in the node.
4847 +
4848 +                  But node can change also. Probably some pieces have to be
4849 +                  added to the lock_carry_node(), to lock node by its key.
4850 +
4851 +                */
4852 +               /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4853 +                  if you need something else. */
4854 +               op->u.insert.d->coord = coord;
4855 +               node = reiser4_carry_real(op->node);
4856 +               intra_node = node_plugin_by_node(node)->lookup
4857 +                   (node, op->u.insert.d->key, FIND_EXACT,
4858 +                    op->u.insert.d->coord);
4859 +               if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4860 +                       warning("nikita-1715", "Intra node lookup failure: %i",
4861 +                               intra_node);
4862 +                       return intra_node;
4863 +               }
4864 +       } else if (op->u.insert.type == COPT_CHILD) {
4865 +               /* if we are asked to insert pointer to the child into
4866 +                  internal node, first convert pointer to the child into
4867 +                  coord within parent node.
4868 +                */
4869 +               znode *child;
4870 +               int result;
4871 +
4872 +               op->u.insert.d = cdata;
4873 +               op->u.insert.d->coord = coord;
4874 +               op->u.insert.d->data = data;
4875 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4876 +               result = find_new_child_coord(op);
4877 +               child = reiser4_carry_real(op->u.insert.child);
4878 +               if (result != NS_NOT_FOUND) {
4879 +                       warning("nikita-993",
4880 +                               "Cannot find a place for child pointer: %i",
4881 +                               result);
4882 +                       return result;
4883 +               }
4884 +               /* This only happens when we did multiple insertions at
4885 +                  the previous level, trying to insert single item and
4886 +                  it so happened, that insertion of pointers to all new
4887 +                  nodes before this one already caused parent node to
4888 +                  split (may be several times).
4889 +
4890 +                  I am going to come up with better solution.
4891 +
4892 +                  You are not expected to understand this.
4893 +                  -- v6root/usr/sys/ken/slp.c
4894 +
4895 +                  Basically, what happens here is the following: carry came
4896 +                  to the parent level and is about to insert internal item
4897 +                  pointing to the child node that it just inserted in the
4898 +                  level below. Position where internal item is to be inserted
4899 +                  was found by find_new_child_coord() above, but node of the
4900 +                  current carry operation (that is, parent node of child
4901 +                  inserted on the previous level), was determined earlier in
4902 +                  the lock_carry_level/lock_carry_node. It could so happen
4903 +                  that other carry operations already performed on the parent
4904 +                  level already split parent node, so that insertion point
4905 +                  moved into another node. Handle this by creating new carry
4906 +                  node for insertion point if necessary.
4907 +                */
4908 +               if (reiser4_carry_real(op->node) !=
4909 +                   op->u.insert.d->coord->node) {
4910 +                       pool_ordering direction;
4911 +                       znode *z1;
4912 +                       znode *z2;
4913 +                       reiser4_key k1;
4914 +                       reiser4_key k2;
4915 +
4916 +                       /*
4917 +                        * determine in what direction insertion point
4918 +                        * moved. Do this by comparing delimiting keys.
4919 +                        */
4920 +                       z1 = op->u.insert.d->coord->node;
4921 +                       z2 = reiser4_carry_real(op->node);
4922 +                       if (keyle(leftmost_key_in_node(z1, &k1),
4923 +                                 leftmost_key_in_node(z2, &k2)))
4924 +                               /* insertion point moved to the left */
4925 +                               direction = POOLO_BEFORE;
4926 +                       else
4927 +                               /* insertion point moved to the right */
4928 +                               direction = POOLO_AFTER;
4929 +
4930 +                       op->node = reiser4_add_carry_skip(doing,
4931 +                                                         direction, op->node);
4932 +                       if (IS_ERR(op->node))
4933 +                               return PTR_ERR(op->node);
4934 +                       op->node->node = op->u.insert.d->coord->node;
4935 +                       op->node->free = 1;
4936 +                       result = lock_carry_node(doing, op->node);
4937 +                       if (result != 0)
4938 +                               return result;
4939 +               }
4940 +
4941 +               /*
4942 +                * set up key of an item being inserted: we are inserting
4943 +                * internal item and its key (by the very definition of
4944 +                * search tree) is leftmost key in the child node.
4945 +                */
4946 +               write_lock_dk(znode_get_tree(child));
4947 +               op->u.insert.d->key = leftmost_key_in_node(child,
4948 +                                                          znode_get_ld_key(child));
4949 +               write_unlock_dk(znode_get_tree(child));
4950 +               op->u.insert.d->data->arg = op->u.insert.brother;
4951 +       } else {
4952 +               assert("vs-243", op->u.insert.d->coord != NULL);
4953 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4954 +       }
4955 +
4956 +       /* find free space. */
4957 +       return make_space(op, doing, todo);
4958 +}
4959 +
4960 +/* handle carry COP_INSERT operation.
4961 +
4962 +   Insert new item into node. New item can be given in one of two ways:
4963 +
4964 +   - by passing insertion coord and &reiser4_item_data as part of @op. This
4965 +   is only applicable at the leaf/twig level.
4966 +
4967 +   - by passing a child node, pointer to which is to be inserted here.
4968 +
4969 +   Returns error of insert_paste_common(), else result of ->create_item().
4970 +*/
4971 +static int carry_insert(carry_op * op /* operation to perform */ ,
4972 +                       carry_level * doing     /* queue of operations @op
4973 +                                                * is part of */ ,
4974 +                       carry_level * todo      /* queue where new operations
4975 +                                                * are accumulated */ )
4976 +{
4977 +       znode *node;
4978 +       carry_insert_data cdata;
4979 +       coord_t coord;
4980 +       reiser4_item_data data;
4981 +       carry_plugin_info info;
4982 +       int result;
4983 +
4984 +       assert("nikita-1036", op != NULL);
4985 +       assert("nikita-1037", todo != NULL);
4986 +       assert("nikita-1038", op->op == COP_INSERT);
4987 +
4988 +       coord_init_zero(&coord);
4989 +
4990 +       /* perform common functionality of insert and paste. */
4991 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4992 +       if (result != 0)
4993 +               return result;
4994 +
4995 +       node = op->u.insert.d->coord->node;
4996 +       assert("nikita-1039", node != NULL);
4997 +       assert("nikita-1040", node_plugin_by_node(node) != NULL);
4998 +
4999 +       assert("nikita-949",
5000 +              space_needed_for_op(node, op) <= znode_free_space(node));
5001 +
5002 +       /* ask node plugin to create new item at the insertion coord. */
5003 +       info.doing = doing;
5004 +       info.todo = todo;
5005 +       result = node_plugin_by_node(node)->create_item
5006 +           (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5007 +            &info);
5008 +       doing->restartable = 0;
5009 +       znode_make_dirty(node);
5010 +
5011 +       return result;
5012 +}
5013 +
5014 +/*
5015 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5016 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
5017 + * tree by slicing it into multiple items.
5018 + */
5019 +
5020 +#define flow_insert_point(op) ((op)->u.insert_flow.insert_point) /* coord */
5021 +#define flow_insert_flow(op) ((op)->u.insert_flow.flow) /* the flow itself */
5022 +#define flow_insert_data(op) ((op)->u.insert_flow.data) /* item data */
5023 +
5024 +static size_t item_data_overhead(carry_op * op)
5025 +{
5026 +       if (flow_insert_data(op)->iplug->b.estimate == NULL)
5027 +               return 0;       /* no estimate method: no overhead reported */
5028 +       return (flow_insert_data(op)->iplug->b.
5029 +               estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5030 +               flow_insert_data(op)->length); /* estimate minus data length */
5031 +}
5032 +
5033 +/* Overhead of inserting flow data that cannot be pasted at the insertion
5034 +   point; 0 if it can be pasted or node plugin has no ->item_overhead method.
5035 +   FIXME-VS: this is called several times during one make_flow_for_insertion
5036 +   and it will always return the same result; it could be calculated once and
5037 +   passed around, which would reduce some flexibility in future changes. */
5038 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5039 +static size_t flow_insertion_overhead(carry_op * op)
5040 +{
5041 +       znode *node;
5042 +       size_t insertion_overhead;
5043 +
5044 +       node = flow_insert_point(op)->node;
5045 +       insertion_overhead = 0;
5046 +       if (node->nplug->item_overhead &&
5047 +           !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5048 +                      flow_insert_data(op)))
5049 +               insertion_overhead =
5050 +                   node->nplug->item_overhead(node, NULL) +
5051 +                       item_data_overhead(op);
5052 +       return insertion_overhead;
5053 +}
5054 +
5055 +/* how many bytes of flow fit into the insertion point node (0 if none) */
5056 +static int what_can_fit_into_node(carry_op * op)
5057 +{
5058 +       size_t free, overhead;
5059 +
5060 +       overhead = flow_insertion_overhead(op);
5061 +       free = znode_free_space(flow_insert_point(op)->node);
5062 +       if (free <= overhead)
5063 +               return 0;
5064 +       free -= overhead;
5065 +       /* FIXME: flow->length is loff_t only so that it does not overflow in
5066 +          case of expanding truncate */
5067 +       if (free < op->u.insert_flow.flow->length)
5068 +               return free;
5069 +       return (int)op->u.insert_flow.flow->length;
5070 +}
5071 +
5072 +/* make_space_for_flow_insertion needs to check either whether the whole flow
5073 +   fits into a node (done here) or whether a minimal fraction of it does */
5074 +static int enough_space_for_whole_flow(carry_op * op)
5075 +{
5076 +       return (unsigned)what_can_fit_into_node(op) ==
5077 +           op->u.insert_flow.flow->length;
5078 +}
5079 +
5080 +#define MIN_FLOW_FRACTION 1    /* compared against bytes of flow that fit */
5081 +static int enough_space_for_min_flow_fraction(carry_op * op)
5082 +{
5083 +       assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5084 +
5085 +       return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5086 +}
5087 +
5088 +/* this returns 0 if left neighbor was obtained successfully and everything
5089 +   up to and including insertion point was shifted and left neighbor still
5090 +   has some free space to put minimal fraction of flow into it; 1 otherwise */
5091 +static int
5092 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5093 +{
5094 +       carry_node *left;
5095 +       znode *orig;
5096 +
5097 +       left = find_left_neighbor(op, doing);
5098 +       if (unlikely(IS_ERR(left))) {
5099 +               warning("vs-899",
5100 +                       "make_space_by_shift_left: "
5101 +                       "error accessing left neighbor: %li", PTR_ERR(left));
5102 +               return 1;
5103 +       }
5104 +       if (left == NULL)
5105 +               /* left neighbor either does not exist or is unformatted
5106 +                  node */
5107 +               return 1;
5108 +
5109 +       orig = flow_insert_point(op)->node;
5110 +       /* try to shift content of node @orig from its head up to insert point
5111 +          including insertion point into the left neighbor */
5112 +       carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5113 +                        reiser4_carry_real(left), doing, todo,
5114 +                        1/* including insert point */);
5115 +       if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5116 +               /* insertion point did not move */
5117 +               return 1;
5118 +       }
5119 +
5120 +       /* insertion point is set after last item in the node */
5121 +       assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5122 +
5123 +       if (!enough_space_for_min_flow_fraction(op)) {
5124 +               /* insertion point node does not have enough free space to put
5125 +                  even minimal portion of flow into it, therefore, move
5126 +                  insertion point back to orig node (before first item) */
5127 +               coord_init_before_first_item(flow_insert_point(op), orig);
5128 +               return 1;
5129 +       }
5130 +
5131 +       /* part of flow is to be written to the end of node */
5132 +       op->node = left;
5133 +       return 0;
5134 +}
5135 +
5136 +/* Returns 0 if, after shifting everything to the right of the insertion point
5137 +   into the right neighbor (when there is one), the insertion point is at the
5138 +   node end and the node can take a minimal fraction of flow; 1 otherwise */
5139 +static int
5140 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5141 +                         carry_level * todo)
5142 +{
5143 +       carry_node *right;
5144 +
5145 +       right = find_right_neighbor(op, doing);
5146 +       if (unlikely(IS_ERR(right))) {
5147 +               warning("nikita-1065", "shift_right_excluding_insert_point: "
5148 +                       "error accessing right neighbor: %li", PTR_ERR(right));
5149 +               return 1;
5150 +       }
5151 +       if (right) {
5152 +               /* shift everything possible on the right of, but excluding,
5153 +                  the insertion coord into the right neighbor */
5154 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5155 +                                reiser4_carry_real(right), doing, todo,
5156 +                                0/* not including insert point */);
5157 +       } else {
5158 +               /* right neighbor either does not exist or is an unformatted
5159 +                  node: nothing to shift, but the checks below still apply */
5160 +               ;
5161 +       }
5162 +       if (coord_is_after_rightmost(flow_insert_point(op))) {
5163 +               if (enough_space_for_min_flow_fraction(op)) {
5164 +                       /* part of flow is to be written to the end of node */
5165 +                       return 0;
5166 +               }
5167 +       }
5168 +
5169 +       /* insert point is not at the node end, or the node did not get enough
5170 +          space for even a minimal fraction of flow */
5171 +       return 1;
5172 +}
5173 +
5174 +/* Returns 0 when the insert coord either stays at the end of its node (which
5175 +   can take a fraction of flow) or is moved to a new node; error otherwise */
5176 +static int
5177 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5178 +{
5179 +       int result;
5180 +       znode *node;
5181 +       carry_node *new;
5182 +
5183 +       node = flow_insert_point(op)->node;
5184 +
5185 +       if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5186 +               return RETERR(-E_NODE_FULL);
5187 +       /* add new node after insert point node */
5188 +       new = add_new_znode(node, op->node, doing, todo);
5189 +       if (unlikely(IS_ERR(new)))
5190 +               return PTR_ERR(new);
5191 +       result = lock_carry_node(doing, new);
5192 +       zput(reiser4_carry_real(new));
5193 +       if (unlikely(result))
5194 +               return result;
5195 +       op->u.insert_flow.new_nodes++;
5196 +       if (!coord_is_after_rightmost(flow_insert_point(op))) {
5197 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5198 +                                reiser4_carry_real(new), doing, todo,
5199 +                                0/* not including insert point */);
5200 +               assert("vs-901",
5201 +                      coord_is_after_rightmost(flow_insert_point(op)));
5202 +
5203 +               if (enough_space_for_min_flow_fraction(op))
5204 +                       return 0;
5205 +               if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5206 +                       return RETERR(-E_NODE_FULL);
5207 +
5208 +               /* add one more new node, again right after @node */
5209 +               new = add_new_znode(node, op->node, doing, todo);
5210 +               if (unlikely(IS_ERR(new)))
5211 +                       return PTR_ERR(new);
5212 +               result = lock_carry_node(doing, new);
5213 +               zput(reiser4_carry_real(new));
5214 +               if (unlikely(result))
5215 +                       return result;
5216 +               op->u.insert_flow.new_nodes++;
5217 +       }
5218 +
5219 +       /* move insertion point to the (most recently added) new node */
5220 +       coord_init_before_first_item(flow_insert_point(op),
5221 +                                    reiser4_carry_real(new));
5222 +       op->node = new;
5223 +       return 0;
5224 +}
5225 +
5226 +static int
5227 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5228 +                             carry_level * todo)
5229 +{
5230 +       __u32 flags = op->u.insert_flow.flags;
5231 +       /* tried in order: node as is, shift left, shift right, new nodes */
5232 +       if (enough_space_for_whole_flow(op)) {
5233 +               /* whole flow fits into insert point node */
5234 +               return 0;
5235 +       }
5236 +
5237 +       if (!(flags & COPI_DONT_SHIFT_LEFT)
5238 +           && (make_space_by_shift_left(op, doing, todo) == 0)) {
5239 +               /* insert point is shifted to left neighbor of original insert
5240 +                  point node and is set after last unit in that node. It has
5241 +                  enough space to fit at least minimal fraction of flow. */
5242 +               return 0;
5243 +       }
5244 +
5245 +       if (enough_space_for_whole_flow(op)) {
5246 +               /* whole flow fits into insert point node */
5247 +               return 0;
5248 +       }
5249 +
5250 +       if (!(flags & COPI_DONT_SHIFT_RIGHT)
5251 +           && (make_space_by_shift_right(op, doing, todo) == 0)) {
5252 +               /* insert point is still set to the same node, but there is
5253 +                  nothing to the right of insert point. */
5254 +               return 0;
5255 +       }
5256 +
5257 +       if (enough_space_for_whole_flow(op)) {
5258 +               /* whole flow fits into insert point node */
5259 +               return 0;
5260 +       }
5261 +       /* last resort: add new node(s); this may fail, e.g. with -E_NODE_FULL */
5262 +       return make_space_by_new_nodes(op, doing, todo);
5263 +}
5264 +
5265 +/* implements COP_INSERT_FLOW: writes the flow piecewise until none is left */
5266 +static int
5267 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5268 +{
5269 +       int result;
5270 +       flow_t *f;
5271 +       coord_t *insert_point;
5272 +       node_plugin *nplug;
5273 +       carry_plugin_info info;
5274 +       znode *orig_node;
5275 +       lock_handle *orig_lh;
5276 +
5277 +       f = op->u.insert_flow.flow;
5278 +       result = 0;
5279 +
5280 +       /* carry system needs this to work */
5281 +       info.doing = doing;
5282 +       info.todo = todo;
5283 +
5284 +       orig_node = flow_insert_point(op)->node;
5285 +       orig_lh = doing->tracked;
5286 +
5287 +       while (f->length) {
5288 +               result = make_space_for_flow_insertion(op, doing, todo);
5289 +               if (result)
5290 +                       break;
5291 +
5292 +               insert_point = flow_insert_point(op);
5293 +               nplug = node_plugin_by_node(insert_point->node);
5294 +
5295 +               /* compose item data for insertion/pasting: take as much of
5296 +                  the flow as what_can_fit_into_node() reports */
5297 +               flow_insert_data(op)->data = f->data;
5298 +               flow_insert_data(op)->length = what_can_fit_into_node(op);
5299 +               if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5300 +                       /* insert point is set to item of file we are writing to
5301 +                          and we have to append to it */
5302 +                       assert("vs-903", insert_point->between == AFTER_UNIT);
5303 +                       nplug->change_item_size(insert_point,
5304 +                                               flow_insert_data(op)->length);
5305 +                       flow_insert_data(op)->iplug->b.paste(insert_point,
5306 +                                                            flow_insert_data
5307 +                                                            (op), &info);
5308 +               } else {
5309 +                       /* new item must be inserted */
5310 +                       pos_in_node_t new_pos;
5311 +                       flow_insert_data(op)->length += item_data_overhead(op);
5312 +
5313 +                       /* FIXME-VS: position is precomputed because
5314 +                          node40_create_item changes insert_point */
5315 +                       switch (insert_point->between) {
5316 +                       case AFTER_ITEM:
5317 +                               new_pos = insert_point->item_pos + 1;
5318 +                               break;
5319 +                       case EMPTY_NODE:
5320 +                               new_pos = 0;
5321 +                               break;
5322 +                       case BEFORE_ITEM:
5323 +                               assert("vs-905", insert_point->item_pos == 0);
5324 +                               new_pos = 0;
5325 +                               break;
5326 +                       default:
5327 +                               impossible("vs-906",
5328 +                                          "carry_insert_flow: invalid coord");
5329 +                               new_pos = 0;
5330 +                               break;
5331 +                       }
5332 +
5333 +                       nplug->create_item(insert_point, &f->key,
5334 +                                          flow_insert_data(op), &info);
5335 +                       coord_set_item_pos(insert_point, new_pos);
5336 +               }
5337 +               coord_init_after_item_end(insert_point);
5338 +               doing->restartable = 0;
5339 +               znode_make_dirty(insert_point->node);
5340 +
5341 +               move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5342 +       }
5343 +       /* NOTE(review): an error left in result by the loop is overwritten below */
5344 +       if (orig_node != flow_insert_point(op)->node) {
5345 +               /* move lock to new insert point */
5346 +               done_lh(orig_lh);
5347 +               init_lh(orig_lh);
5348 +               result =
5349 +                   longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5350 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5351 +       }
5352 +
5353 +       return result;
5354 +}
5355 +
5356 +/* implements COP_DELETE operation
5357 +
5358 +   Remove pointer to @op -> u.delete.child from its parent.
5359 +
5360 +   This function also handles killing of a tree root when a single pointer is
5361 +   left in it after the removal. This is complicated by our handling of "twig"
5362 +   level: root on twig level is never killed.
5363 +   Returns 0 on success and the result of the failing call otherwise.
5364 +*/
5365 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5366 +                       carry_level * doing UNUSED_ARG  /* current carry
5367 +                                                        * level */ ,
5368 +                       carry_level * todo/* next carry level */)
5369 +{
5370 +       int result;
5371 +       coord_t coord;
5372 +       coord_t coord2;
5373 +       znode *parent;
5374 +       znode *child;
5375 +       carry_plugin_info info;
5376 +       reiser4_tree *tree;
5377 +
5378 +       /*
5379 +        * This operation is called to delete internal item pointing to the
5380 +        * child node that was removed by carry from the tree on the previous
5381 +        * tree level.
5382 +        */
5383 +
5384 +       assert("nikita-893", op != NULL);
5385 +       assert("nikita-894", todo != NULL);
5386 +       assert("nikita-895", op->op == COP_DELETE);
5387 +
5388 +       coord_init_zero(&coord);
5389 +       coord_init_zero(&coord2);
5390 +
5391 +       parent = reiser4_carry_real(op->node);
5392 +       child = op->u.delete.child ?
5393 +               reiser4_carry_real(op->u.delete.child) : op->node->node;
5394 +       tree = znode_get_tree(child);
5395 +       read_lock_tree(tree);
5396 +
5397 +       /*
5398 +        * @parent was determined when carry entered parent level
5399 +        * (lock_carry_level/lock_carry_node). Since then, actual parent of
5400 +        * @child node could change due to other carry operations performed on
5401 +        * the parent level. Check for this.
5402 +        */
5403 +
5404 +       if (znode_parent(child) != parent) {
5405 +               /* NOTE-NIKITA add stat counter for this. */
5406 +               parent = znode_parent(child);
5407 +               assert("nikita-2581", find_carry_node(doing, parent));
5408 +       }
5409 +       read_unlock_tree(tree);
5410 +
5411 +       assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5412 +
5413 +       /* Twig level horrors: tree should be of height at least 2. So, last
5414 +          pointer from the root at twig level is preserved even if child is
5415 +          empty. This is ugly, but so it was architectured.
5416 +        */
5417 +
5418 +       if (znode_is_root(parent) &&
5419 +           znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5420 +           node_num_items(parent) == 1) {
5421 +               /* Delimiting key manipulations. */
5422 +               write_lock_dk(tree);
5423 +               znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5424 +               znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5425 +               ZF_SET(child, JNODE_DKSET);
5426 +               write_unlock_dk(tree);
5427 +
5428 +               /* @child escaped imminent death! */
5429 +               ZF_CLR(child, JNODE_HEARD_BANSHEE);
5430 +               return 0;
5431 +       }
5432 +
5433 +       /* convert child pointer to the coord_t */
5434 +       result = find_child_ptr(parent, child, &coord);
5435 +       if (result != NS_FOUND) {
5436 +               warning("nikita-994", "Cannot find child pointer: %i", result);
5437 +               print_coord_content("coord", &coord);
5438 +               return result;
5439 +       }
5440 +
5441 +       coord_dup(&coord2, &coord);
5442 +       info.doing = doing;
5443 +       info.todo = todo;
5444 +       {
5445 +               /*
5446 +                * Actually kill internal item: prepare structure with
5447 +                * arguments for ->cut_and_kill() method...
5448 +                */
5449 +
5450 +               struct carry_kill_data kdata;
5451 +               kdata.params.from = &coord;
5452 +               kdata.params.to = &coord2;
5453 +               kdata.params.from_key = NULL;
5454 +               kdata.params.to_key = NULL;
5455 +               kdata.params.smallest_removed = NULL;
5456 +               kdata.params.truncate = 1;
5457 +               kdata.flags = op->u.delete.flags;
5458 +               kdata.inode = NULL;
5459 +               kdata.left = NULL;
5460 +               kdata.right = NULL;
5461 +               kdata.buf = NULL;
5462 +               /* ... and call it. */
5463 +               result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5464 +                                                                  &info);
5465 +       }
5466 +       doing->restartable = 0;
5467 +
5468 +       /* check whether root should be killed violently */
5469 +       if (znode_is_root(parent) &&
5470 +           /* don't kill roots at and lower than twig level */
5471 +           znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5472 +           node_num_items(parent) == 1)
5473 +               result = reiser4_kill_tree_root(coord.node);
5474 +       /* pass the error code itself up; positive results count as success */
5475 +       return result < 0 ? result : 0;
5476 +}
5477 +
5478 +/* implements COP_CUT operation
5479 +
5480 +   Cuts part or whole content of node.
5481 +   Returns 0 on success, negative error code of the node plugin otherwise.
5482 +*/
5483 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5484 +                    carry_level * doing /* current carry level */ ,
5485 +                    carry_level * todo/* next carry level */)
5486 +{
5487 +       int result;
5488 +       carry_plugin_info info;
5489 +       node_plugin *nplug;
5490 +
5491 +       assert("nikita-896", op != NULL);
5492 +       assert("nikita-897", todo != NULL);
5493 +       assert("nikita-898", op->op == COP_CUT);
5494 +
5495 +       info.doing = doing;
5496 +       info.todo = todo;
5497 +
5498 +       nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5499 +       if (op->u.cut_or_kill.is_cut)
5500 +               result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5501 +       else
5502 +               result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5503 +
5504 +       doing->restartable = 0;
5505 +       return result < 0 ? result : 0; /* error code itself, not (result < 0) */
5506 +}
5507 +
5508 +/* helper for carry_paste() and carry_insert_flow(): returns 1 if data can be
5509 +   pasted into an item at or next to @icoord (which may be adjusted), else 0 */
5510 +static int
5511 +can_paste(coord_t *icoord, const reiser4_key * key,
5512 +         const reiser4_item_data * data)
5513 +{
5514 +       coord_t circa;
5515 +       item_plugin *new_iplug;
5516 +       item_plugin *old_iplug;
5517 +       int result = 0;         /* to keep gcc shut */
5518 +
5519 +       assert("", icoord->between != AT_UNIT);
5520 +
5521 +       /* obviously, one cannot paste when node is empty---there is nothing
5522 +          to paste into. */
5523 +       if (node_is_empty(icoord->node))
5524 +               return 0;
5525 +       /* if insertion point is at the middle of the item, then paste */
5526 +       if (!coord_is_between_items(icoord))
5527 +               return 1;
5528 +       coord_dup(&circa, icoord);
5529 +       circa.between = AT_UNIT;
5530 +
5531 +       old_iplug = item_plugin_by_coord(&circa);
5532 +       new_iplug = data->iplug;
5533 +
5534 +       /* check whether we can paste to the item @icoord is "at" when we
5535 +          ignore ->between field */
5536 +       if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data))
5537 +               result = 1;
5538 +       else if (icoord->between == BEFORE_UNIT
5539 +                  || icoord->between == BEFORE_ITEM) {
5540 +               /* otherwise, try to glue to the item at the left, if any */
5541 +               coord_dup(&circa, icoord);
5542 +               if (coord_set_to_left(&circa)) {
5543 +                       result = 0;
5544 +                       coord_init_before_item(icoord);
5545 +               } else {
5546 +                       old_iplug = item_plugin_by_coord(&circa);
5547 +                       result = (old_iplug == new_iplug)
5548 +                           && item_can_contain_key(icoord, key, data); /* NOTE(review): right-hand branch passes &circa; confirm icoord is intended */
5549 +                       if (result) {
5550 +                               coord_dup(icoord, &circa);
5551 +                               icoord->between = AFTER_UNIT;
5552 +                       }
5553 +               }
5554 +       } else if (icoord->between == AFTER_UNIT
5555 +                  || icoord->between == AFTER_ITEM) {
5556 +               coord_dup(&circa, icoord);
5557 +               /* otherwise, try to glue to the item at the right, if any */
5558 +               if (coord_set_to_right(&circa)) {
5559 +                       result = 0;
5560 +                       coord_init_after_item(icoord);
5561 +               } else {
5562 +                       int (*cck) (const coord_t *, const reiser4_key *,
5563 +                                   const reiser4_item_data *);
5564 +
5565 +                       old_iplug = item_plugin_by_coord(&circa);
5566 +
5567 +                       cck = old_iplug->b.can_contain_key;
5568 +                       if (cck == NULL)
5569 +                               /* item doesn't define ->can_contain_key
5570 +                                  method? So it is not expandable. */
5571 +                               result = 0;
5572 +                       else {
5573 +                               result = (old_iplug == new_iplug)
5574 +                                   && cck(&circa /*icoord */ , key, data);
5575 +                               if (result) {
5576 +                                       coord_dup(icoord, &circa);
5577 +                                       icoord->between = BEFORE_UNIT;
5578 +                               }
5579 +                       }
5580 +               }
5581 +       } else
5582 +               impossible("nikita-2513", "Nothing works");
5583 +       if (result) {
5584 +               if (icoord->between == BEFORE_ITEM) {
5585 +                       assert("vs-912", icoord->unit_pos == 0);
5586 +                       icoord->between = BEFORE_UNIT;
5587 +               } else if (icoord->between == AFTER_ITEM) {
5588 +                       coord_init_after_item_end(icoord);
5589 +               }
5590 +       }
5591 +       return result;
5592 +}
5593 +
5594 +/* implements COP_PASTE operation
5595 +
5596 +   Paste data into an existing item. This is complicated by the fact that after
5597 +   we shifted something to the left or right neighbors trying to free some
5598 +   space, the item we were supposed to paste into can be in a different node
5599 +   than the insertion coord. If so, we are no longer doing paste, but insert.
5600 +   See comments in insert_paste_common().
5601 +
5602 +*/
5603 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5604 +                      carry_level * doing UNUSED_ARG   /* current carry
5605 +                                                        * level */ ,
5606 +                      carry_level * todo/* next carry level */)
5607 +{
5608 +       znode *node;
5609 +       carry_insert_data cdata;
5610 +       coord_t dcoord;
5611 +       reiser4_item_data data;
5612 +       int result;
5613 +       int real_size;
5614 +       item_plugin *iplug;
5615 +       carry_plugin_info info;
5616 +       coord_t *coord;
5617 +
5618 +       assert("nikita-982", op != NULL);
5619 +       assert("nikita-983", todo != NULL);
5620 +       assert("nikita-984", op->op == COP_PASTE);
5621 +
5622 +       coord_init_zero(&dcoord);
5623 +
5624 +       result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5625 +       if (result != 0)
5626 +               return result;
5627 +
5628 +       coord = op->u.insert.d->coord;
5629 +
5630 +       /* handle the case when op -> u.insert.d -> coord doesn't point to an
5631 +          item of the required type: restart as insert. */
5632 +       if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5633 +               op->op = COP_INSERT;
5634 +               op->u.insert.type = COPT_PASTE_RESTARTED;
5635 +               result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5636 +
5637 +               return result;
5638 +       }
5639 +
5640 +       node = coord->node;
5641 +       iplug = item_plugin_by_coord(coord);
5642 +       assert("nikita-992", iplug != NULL);
5643 +
5644 +       assert("nikita-985", node != NULL);
5645 +       assert("nikita-986", node_plugin_by_node(node) != NULL);
5646 +
5647 +       assert("nikita-987",
5648 +              space_needed_for_op(node, op) <= znode_free_space(node));
5649 +
5650 +       assert("nikita-1286", coord_is_existing_item(coord));
5651 +
5652 +       /*
5653 +        * if item is expanded as a result of this operation, we should first
5654 +        * change item size, then call ->b.paste item method. If item is
5655 +        * shrunk, it should be done the other way around: first call ->b.paste
5656 +        * method, then reduce item size.
5657 +        */
5658 +
5659 +       real_size = space_needed_for_op(node, op);
5660 +       if (real_size > 0)
5661 +               node->nplug->change_item_size(coord, real_size);
5662 +
5663 +       doing->restartable = 0;
5664 +       info.doing = doing;
5665 +       info.todo = todo;
5666 +
5667 +       result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5668 +
5669 +       if (real_size < 0)
5670 +               node->nplug->change_item_size(coord, real_size);
5671 +
5672 +       /* if we pasted at the beginning of the item, update item's key. */
5673 +       if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5674 +               node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5675 +
5676 +       znode_make_dirty(node);
5677 +       return result;
5678 +}
5679 +
5680 +/* handle carry COP_EXTENT operation: queues extent insertion for the parent */
5681 +static int carry_extent(carry_op * op /* operation to perform */ ,
5682 +                       carry_level * doing     /* queue of operations @op
5683 +                                                * is part of */ ,
5684 +                       carry_level * todo      /* queue where new operations
5685 +                                                * are accumulated */ )
5686 +{
5687 +       znode *node;
5688 +       carry_insert_data cdata;
5689 +       coord_t coord;
5690 +       reiser4_item_data data;
5691 +       carry_op *delete_dummy;
5692 +       carry_op *insert_extent;
5693 +       int result;
5694 +       carry_plugin_info info;
5695 +
5696 +       assert("nikita-1751", op != NULL);
5697 +       assert("nikita-1752", todo != NULL);
5698 +       assert("nikita-1753", op->op == COP_EXTENT);
5699 +
5700 +       /* extent insertion overview:
5701 +
5702 +          extents live on the TWIG LEVEL, which is level one above the leaf
5703 +          one. This complicates extent insertion logic somewhat: it may
5704 +          happen (and is going to happen all the time) that in logical key
5705 +          ordering extent has to be placed between items I1 and I2, located
5706 +          at the leaf level, but I1 and I2 are in the same formatted leaf
5707 +          node N1. To insert extent one has to
5708 +
5709 +          (1) reach node N1 and shift data between N1, its neighbors and
5710 +          possibly newly allocated nodes until I1 and I2 fall into different
5711 +          nodes. Since I1 and I2 are still neighboring items in logical key
5712 +          order, they will necessarily be the outermost items in their
5713 +          respective nodes.
5714 +
5715 +          (2) After this, the new extent item is inserted into node on the
5716 +          twig level.
5717 +
5718 +          Fortunately this process can reuse almost all code from standard
5719 +          insertion procedure (viz. make_space() and insert_paste_common()),
5720 +          due to the following observation: make_space() only shifts data up
5721 +          to and excluding or including insertion point. It never
5722 +          "over-moves" through insertion point. Thus, one can use
5723 +          make_space() to perform step (1). All that is required for this is
5724 +          to instruct free_space_shortage() to keep make_space() shifting data
5725 +          until insertion point is at the node border.
5726 +
5727 +        */
5728 +
5729 +       /* perform common functionality of insert and paste. */
5730 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5731 +       if (result != 0)
5732 +               return result;
5733 +
5734 +       node = op->u.extent.d->coord->node;
5735 +       assert("nikita-1754", node != NULL);
5736 +       assert("nikita-1755", node_plugin_by_node(node) != NULL);
5737 +       assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5738 +
5739 +       /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5740 +          extent fits between items. */
5741 +
5742 +       info.doing = doing;
5743 +       info.todo = todo;
5744 +
5745 +       /* there is another complication due to placement of extents on the
5746 +          twig level: extents are "rigid" in the sense that key-range
5747 +          occupied by extent cannot grow indefinitely to the right as it is
5748 +          for the formatted leaf nodes. Because of this when search finds two
5749 +          adjacent extents on the twig level, it has to "drill" to the leaf
5750 +          level, creating new node. Here we are removing this node.
5751 +        */
5752 +       if (node_is_empty(node)) {
5753 +               delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5754 +               if (IS_ERR(delete_dummy))
5755 +                       return PTR_ERR(delete_dummy);
5756 +               delete_dummy->u.delete.child = NULL;
5757 +               delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5758 +               ZF_SET(node, JNODE_HEARD_BANSHEE);
5759 +       }
5760 +
5761 +       /* proceed with inserting extent item into parent. We are definitely
5762 +          inserting rather than pasting if we get that far. */
5763 +       insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5764 +       if (IS_ERR(insert_extent))
5765 +               /* @delete_dummy will be automatically destroyed on the level
5766 +                  exiting  */
5767 +               return PTR_ERR(insert_extent);
5768 +       /* NOTE-NIKITA insertion by key is the simplest option here. Another
5769 +          possibility is to insert on the left or right of an already
5770 +          existing item.
5771 +        */
5772 +       insert_extent->u.insert.type = COPT_KEY;
5773 +       insert_extent->u.insert.d = op->u.extent.d;
5774 +       assert("nikita-1719", op->u.extent.d->key != NULL);
5775 +       insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5776 +       insert_extent->u.insert.flags =
5777 +           znode_get_tree(node)->carry.new_extent_flags;
5778 +
5779 +       /*
5780 +        * if carry was asked to track lock handle we should actually track
5781 +        * lock handle on the twig node rather than on the leaf where
5782 +        * operation was started from. Transfer tracked lock handle.
5783 +        */
5784 +       if (doing->track_type) {
5785 +               assert("nikita-3242", doing->tracked != NULL);
5786 +               assert("nikita-3244", todo->tracked == NULL);
5787 +               todo->tracked = doing->tracked;
5788 +               todo->track_type = CARRY_TRACK_NODE;
5789 +               doing->tracked = NULL;
5790 +               doing->track_type = 0;
5791 +       }
5792 +
5793 +       return 0;
5794 +}
5795 +
5796 +/* update key in @parent between pointers to @left and @right.
5797 +
5798 +   Helper called by carry_update(). Finds the position of the internal item
5799 +   pointing to @right (and, when @left is given and @right is not leftmost,
5800 +   of the one pointing to @left) and sets the key of @right's item through
5801 +   ->update_item_key(). Returns 0, or an error with @error_msg filled in.
5802 +*/
5803 +static int update_delimiting_key(znode * parent        /* node key is updated
5804 +                                                * in */ ,
5805 +                                znode * left /* child of @parent, may be NULL */ ,
5806 +                                znode * right /* child of @parent */ ,
5807 +                                carry_level * doing    /* current carry
5808 +                                                        * level */ ,
5809 +                                carry_level * todo     /* parent carry
5810 +                                                        * level */ ,
5811 +                                const char **error_msg /* place to
5812 +                                                        * store error
5813 +                                                        * message */ )
5814 +{
5815 +       coord_t left_pos;
5816 +       coord_t right_pos;
5817 +       int result;
5818 +       reiser4_key ldkey;
5819 +       carry_plugin_info info;
5820 +
5821 +       assert("nikita-1177", right != NULL);
5822 +       /* find position of the right child in the parent */
5823 +       result = find_child_ptr(parent, right, &right_pos);
5824 +       if (result != NS_FOUND) {
5825 +               *error_msg = "Cannot find position of right child";
5826 +               return result;
5827 +       }
5828 +
5829 +       if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5830 +               /* find position of the left child in the parent */
5831 +               result = find_child_ptr(parent, left, &left_pos);
5832 +               if (result != NS_FOUND) {
5833 +                       *error_msg = "Cannot find position of left child";
5834 +                       return result;
5835 +               }
5836 +               assert("nikita-1355", left_pos.node != NULL);
5837 +       } else
5838 +               left_pos.node = NULL;
5839 +
5840 +       /* check that they are separated by exactly one key and are basically
5841 +          sane (debug builds only; @left_pos is not used after this) */
5842 +       if (REISER4_DEBUG) {
5843 +               if ((left_pos.node != NULL)
5844 +                   && !coord_is_existing_unit(&left_pos)) {
5845 +                       *error_msg = "Left child is bastard";
5846 +                       return RETERR(-EIO);
5847 +               }
5848 +               if (!coord_is_existing_unit(&right_pos)) {
5849 +                       *error_msg = "Right child is bastard";
5850 +                       return RETERR(-EIO);
5851 +               }
5852 +               if (left_pos.node != NULL &&
5853 +                   !coord_are_neighbors(&left_pos, &right_pos)) {
5854 +                       *error_msg = "Children are not direct siblings";
5855 +                       return RETERR(-EIO);
5856 +               }
5857 +       }
5858 +       *error_msg = NULL;
5859 +
5860 +       info.doing = doing;
5861 +       info.todo = todo;
5862 +
5863 +       /*
5864 +        * If child node is not empty, new key of internal item is a key of
5865 +        * leftmost item in the child node. If the child is empty, take its
5866 +        * right delimiting key as a new key of the internal item. Precise key
5867 +        * in the latter case is not important per se, because the child (and
5868 +        * the internal item) are going to be killed shortly anyway, but we
5869 +        * have to preserve correct order of keys in the parent node.
5870 +        */
5871 +
5872 +       if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5873 +               leftmost_key_in_node(right, &ldkey);
5874 +       else {
5875 +               read_lock_dk(znode_get_tree(parent));
5876 +               ldkey = *znode_get_rd_key(right);
5877 +               read_unlock_dk(znode_get_tree(parent));
5878 +       }
5879 +       node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5880 +       doing->restartable = 0;
5881 +       znode_make_dirty(parent);
5882 +       return 0;
5883 +}
5884 +
5885 +/* implements COP_UPDATE operation
5886 +
5887 +   Update delimiting keys.
5888 +
5889 +*/
5890 +static int carry_update(carry_op * op /* operation to be performed */ ,
5891 +                       carry_level * doing /* current carry level */ ,
5892 +                       carry_level * todo/* next carry level */)
5893 +{
5894 +       int result;
5895 +       carry_node *missing UNUSED_ARG;
5896 +       znode *left;    /* NOTE(review): assigned below but never read */
5897 +       znode *right;   /* holds the parent of rchild->node, despite the name */
5898 +       carry_node *lchild;
5899 +       carry_node *rchild;
5900 +       const char *error_msg;
5901 +       reiser4_tree *tree;
5902 +
5903 +       /*
5904 +        * This operation is called to update key of internal item. This is
5905 +        * necessary when carry shifted or cut data on the child
5906 +        * level. Arguments of this operation are:
5907 +        *
5908 +        *     @right --- child node. Operation should update key of internal
5909 +        *     item pointing to @right.
5910 +        *
5911 +        *     @left --- left neighbor of @right. This parameter is optional.
5912 +        */
5913 +
5914 +       assert("nikita-902", op != NULL);
5915 +       assert("nikita-903", todo != NULL);
5916 +       assert("nikita-904", op->op == COP_UPDATE);
5917 +
5918 +       lchild = op->u.update.left;
5919 +       rchild = op->node;
5920 +
5921 +       if (lchild != NULL) {
5922 +               assert("nikita-1001", lchild->parent);
5923 +               assert("nikita-1003", !lchild->left);
5924 +               left = reiser4_carry_real(lchild);
5925 +       } else
5926 +               left = NULL;
5927 +
5928 +       tree = znode_get_tree(rchild->node);
5929 +       read_lock_tree(tree);
5930 +       right = znode_parent(rchild->node);
5931 +       read_unlock_tree(tree);
5932 +
5933 +       if (right != NULL) {
5934 +               result = update_delimiting_key(right,
5935 +                                              lchild ? lchild->node : NULL,
5936 +                                              rchild->node,
5937 +                                              doing, todo, &error_msg);
5938 +       } else {
5939 +               error_msg = "Cannot find node to update key in";
5940 +               result = RETERR(-EIO);
5941 +       }
5942 +       /* operation will be reposted to the next level by the
5943 +          ->update_item_key() method of node plugin, if necessary. */
5944 +
5945 +       if (result != 0) {
5946 +               warning("nikita-999", "Error updating delimiting key: %s (%i)",
5947 +                       error_msg ? : "", result);
5948 +       }
5949 +       return result;
5950 +}
5951 +
5952 +/* move items from @node during carry; always returns 0 */
5953 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
5954 +                           coord_t *insert_coord       /* coord where new item
5955 +                                                        * is to be inserted */,
5956 +                           znode * node /* node which data are moved from */ ,
5957 +                           carry_level * doing /* active carry queue */ ,
5958 +                           carry_level * todo  /* carry queue where new
5959 +                                                * operations are to be put
5960 +                                                * in */ ,
5961 +                           unsigned int including_insert_coord_p
5962 +                               /* true if @insert_coord can be moved */ )
5963 +{
5964 +       int result;
5965 +       znode *source;
5966 +       carry_plugin_info info;
5967 +       node_plugin *nplug;
5968 +
5969 +       source = insert_coord->node;
5970 +
5971 +       info.doing = doing;
5972 +       info.todo = todo;
5973 +
5974 +       nplug = node_plugin_by_node(node);
5975 +       result = nplug->shift(insert_coord, node,
5976 +                             (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5977 +                             (int)including_insert_coord_p, &info);
5978 +       /* the only error ->shift() method of node plugin can return is
5979 +          -ENOMEM due to carry node/operation allocation. */
5980 +       assert("nikita-915", result >= 0 || result == -ENOMEM);
5981 +       if (result > 0) {
5982 +               /*
5983 +                * if some number of bytes was actually shifted, mark nodes
5984 +                * dirty, and carry level as non-restartable.
5985 +                */
5986 +               doing->restartable = 0;
5987 +               znode_make_dirty(source);
5988 +               znode_make_dirty(node);
5989 +       }
5990 +
5991 +       assert("nikita-2077", coord_check(insert_coord));
5992 +       return 0;       /* NOTE(review): a negative @result is not propagated */
5993 +}
5994 +
5995 +typedef carry_node *(*carry_iterator) (carry_node * node); /* list step */
5996 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5997 +                                 carry_iterator iterator);
5998 +/* carry node whose level linkage precedes that of @node */
5999 +static carry_node *pool_level_list_prev(carry_node *node)
6000 +{
6001 +       return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6002 +}
6003 +
6004 +/* look for the left neighbor of given carry node in a carry queue.
6005 +   Returns NULL when the head of the level list is reached first.
6006 +   This is used by find_left_neighbor(), but I am not sure that this
6007 +   really gives any advantage. More statistics required.
6008 +
6009 +*/
6010 +carry_node *find_left_carry(carry_node * node  /* node to find left neighbor
6011 +                                                * of */ ,
6012 +                           carry_level * level/* level to scan */)
6013 +{
6014 +       return find_dir_carry(node, level,
6015 +                             (carry_iterator) pool_level_list_prev);
6016 +}
6017 +/* carry node whose level linkage follows that of @node */
6018 +static carry_node *pool_level_list_next(carry_node *node)
6019 +{
6020 +       return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6021 +}
6022 +
6023 +/* look for the right neighbor of given carry node in a
6024 +   carry queue.
6025 +   Returns NULL when the head of the level list is reached first.
6026 +   This is used by find_right_neighbor(), but I am not sure that this
6027 +   really gives any advantage. More statistics required.
6028 +
6029 +*/
6030 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6031 +                                                * of */ ,
6032 +                            carry_level * level/* level to scan */)
6033 +{
6034 +       return find_dir_carry(node, level,
6035 +                             (carry_iterator) pool_level_list_next);
6036 +}
6037 +
6038 +/* look for the left or right neighbor of given carry node in a carry
6039 +   queue.
6040 +
6041 +   Helper function used by find_{left|right}_carry(). Returns NULL if none.
6042 +*/
6043 +static carry_node *find_dir_carry(carry_node * node    /* node to start
6044 +                                                        * scanning from */ ,
6045 +                                 carry_level * level /* level to scan */ ,
6046 +                                 carry_iterator iterator       /* operation to
6047 +                                                                * move to the
6048 +                                                                * next node */)
6049 +{
6050 +       carry_node *neighbor;
6051 +
6052 +       assert("nikita-1059", node != NULL);
6053 +       assert("nikita-1060", level != NULL);
6054 +
6055 +       /* scan list of carry nodes on this level dir-ward, skipping all
6056 +          carry nodes referencing the same znode. */
6057 +       neighbor = node;
6058 +       while (1) {
6059 +               neighbor = iterator(neighbor);
6060 +               if (carry_node_end(level, neighbor))
6061 +                       /* list head is reached */
6062 +                       return NULL;
6063 +               if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6064 +                       return neighbor;
6065 +       }
6066 +}
6067 +
6068 +/*
6069 + * Memory reservation estimation.
6070 + *
6071 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6072 + * takes tree in consistent state (e.g., that search tree invariants hold),
6073 + * and leaves tree consistent after it finishes. This means that when some
6074 + * error occurs carry cannot simply return if there are pending carry
6075 + * operations. Generic solution for this problem is carry-undo either as
6076 + * transaction manager feature (requiring checkpoints and isolation), or
6077 + * through some carry specific mechanism.
6078 + *
6079 + * Our current approach is to panic if carry hits an error while tree is
6080 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6081 + * this "memory reservation" mechanism was added.
6082 + *
6083 + * Memory reservation is implemented by perthread-pages.diff patch from
6084 + * core-patches. Its API is defined in <linux/gfp.h>
6085 + *
6086 + *     int  perthread_pages_reserve(int nrpages, gfp_t gfp);
6087 + *     void perthread_pages_release(int nrpages);
6088 + *     int  perthread_pages_count(void);
6089 + *
6090 + * carry estimates its worst case memory requirements at the entry, reserves
6091 + * enough memory, and releases unused pages before returning.
6092 + *
6093 + * Code below estimates worst case memory requirements for a given carry
6094 + * queue. This is done by summing worst case memory requirements for each
6095 + * operation in the queue.
6096 + *
6097 + */
6098 +
6099 +/*
6100 + * Memory requirements of many operations depend on the tree
6101 + * height. For example, item insertion requires new node to be inserted at
6102 + * each tree level in the worst case. What tree height should be used for
6103 + * estimation? Current tree height is wrong, because tree height can change
6104 + * between the time when estimation was done and the time when operation is
6105 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6106 + * is also not desirable, because it would lead to the huge over-estimation
6107 + * all the time. Plausible solution is "capped tree height": if current tree
6108 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6109 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6110 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6111 + * to be increased even more during short interval of time.
6112 + */
6113 +#define TREE_HEIGHT_CAP (5)
6114 +
6115 +/* return capped height of @tree: max of ->height and TREE_HEIGHT_CAP. */
6116 +static int cap_tree_height(reiser4_tree * tree)
6117 +{
6118 +       return max_t(int, tree->height, TREE_HEIGHT_CAP);
6119 +}
6120 +
6121 +/* return capped tree height for the current tree (current_tree). */
6122 +static int capped_height(void)
6123 +{
6124 +       return cap_tree_height(current_tree);
6125 +}
6126 +
6127 +/* return number of pages required to store @bytes bytes, rounding up */
6128 +static int bytes_to_pages(int bytes)
6129 +{
6130 +       return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6131 +}
6132 +
6133 +/* how many pages are required to allocate znodes during item insertion. */
6134 +static int carry_estimate_znodes(void)
6135 +{
6136 +       /*
6137 +        * Note that we have a problem here: there is no way to
6138 +        * reserve pages specifically for the given slab. This means that
6139 +        * these pages can be hijacked for some other end.
6140 +        */
6141 +
6142 +       /* in the worst case we need 3 new znodes on each tree level */
6143 +       return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6144 +}
6145 +
6146 +/*
6147 + * pages required to load bitmaps, one bitmap per level; 0 if pre-loaded.
6148 + */
6149 +static int carry_estimate_bitmaps(void)
6150 +{
6151 +       if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6152 +               int bytes;
6153 +
6154 +               bytes = capped_height() * (0 +  /* bnode should be added, but
6155 +                                                * it is private to bitmap.c,
6156 +                                                * skip for now. */
6157 +                                          2 * sizeof(jnode));
6158 +                                               /* working and commit jnodes */
6159 +               return bytes_to_pages(bytes) + 2;       /* and their contents */
6160 +       } else
6161 +               /* bitmaps were pre-loaded during mount */
6162 +               return 0;
6163 +}
6164 +
6165 +/* worst case item insertion memory requirements, in pages */
6166 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6167 +{
6168 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6169 +                                                               /* new atom */
6170 +           capped_height() +   /* new block on each level */
6171 +           1 +         /* and possibly extra new block at the leaf level */
6172 +           3;                  /* loading of leaves into memory */
6173 +}
6174 +
6175 +/* worst case item deletion memory requirements, in pages */
6176 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6177 +{
6178 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6179 +                                                               /* new atom */
6180 +           3;                  /* loading of leaves into memory */
6181 +}
6182 +
6183 +/* worst case tree cut memory requirements, in pages */
6184 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6185 +{
6186 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6187 +                                                               /* new atom */
6188 +           3;                  /* loading of leaves into memory */
6189 +}
6190 +
6191 +/* worst case memory requirements of pasting into item, in pages */
6192 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6193 +{
6194 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 +
6195 +                                                               /* new atom */
6196 +           capped_height() +   /* new block on each level */
6197 +           1 +         /* and possibly extra new block at the leaf level */
6198 +           3;                  /* loading of leaves into memory */
6199 +}
6200 +
6201 +/* worst case memory requirements of extent insertion, in pages */
6202 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6203 +{
6204 +       return carry_estimate_insert(op, level) +       /* insert extent */
6205 +           carry_estimate_delete(op, level);   /* kill leaf */
6206 +}
6207 +
6208 +/* worst case memory requirements of key update: nothing is reserved */
6209 +static int carry_estimate_update(carry_op * op, carry_level * level)
6210 +{
6211 +       return 0;
6212 +}
6213 +
6214 +/* worst case memory requirements of flow insertion, in pages */
6215 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6216 +{
6217 +       int newnodes;
6218 +
6219 +       newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6220 +                      CARRY_FLOW_NEW_NODES_LIMIT);
6221 +       /*
6222 +        * roughly estimate insert_flow as a sequence of insertions.
6223 +        */
6224 +       return newnodes * carry_estimate_insert(op, level);
6225 +}
6226 +
6227 +/* This is the dispatch table for carry operations, indexed by COP_* opcode.
6228 +   It can be trivially abstracted into useful plugin: tunable balancing
6229 +   policy is a good thing. */
6230 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6231 +       [COP_INSERT] = {
6232 +                       .handler = carry_insert,
6233 +                       .estimate = carry_estimate_insert}
6234 +       ,
6235 +       [COP_DELETE] = {
6236 +                       .handler = carry_delete,
6237 +                       .estimate = carry_estimate_delete}
6238 +       ,
6239 +       [COP_CUT] = {
6240 +                    .handler = carry_cut,
6241 +                    .estimate = carry_estimate_cut}
6242 +       ,
6243 +       [COP_PASTE] = {
6244 +                      .handler = carry_paste,
6245 +                      .estimate = carry_estimate_paste}
6246 +       ,
6247 +       [COP_EXTENT] = {
6248 +                       .handler = carry_extent,
6249 +                       .estimate = carry_estimate_extent}
6250 +       ,
6251 +       [COP_UPDATE] = {
6252 +                       .handler = carry_update,
6253 +                       .estimate = carry_estimate_update}
6254 +       ,
6255 +       [COP_INSERT_FLOW] = {
6256 +                            .handler = carry_insert_flow,
6257 +                            .estimate = carry_estimate_insert_flow}
6258 +};
6259 +
6260 +/* Make Linus happy.
6261 +   Local variables:
6262 +   c-indentation-style: "K&R"
6263 +   mode-name: "LC"
6264 +   c-basic-offset: 8
6265 +   tab-width: 8
6266 +   fill-column: 120
6267 +   scroll-step: 1
6268 +   End:
6269 +*/
6270 diff -urN linux-2.6.35.orig/fs/reiser4/carry_ops.h linux-2.6.35/fs/reiser4/carry_ops.h
6271 --- linux-2.6.35.orig/fs/reiser4/carry_ops.h    1970-01-01 01:00:00.000000000 +0100
6272 +++ linux-2.6.35/fs/reiser4/carry_ops.h 2010-08-04 15:44:57.000000000 +0200
6273 @@ -0,0 +1,43 @@
6274 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
6275 +   reiser4/README */
6276 +
6277 +/* implementation of carry operations. See carry_ops.c for details. */
6278 +
6279 +#if !defined(__CARRY_OPS_H__)
6280 +#define __CARRY_OPS_H__
6281 +
6282 +#include "forward.h"
6283 +#include "znode.h"
6284 +#include "carry.h"
6285 +
6286 +/* carry operation handlers: one entry of op_dispatch_table[] */
6287 +typedef struct carry_op_handler {
6288 +       /* perform operation */
6289 +       int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6290 +       /* estimate worst case memory requirements for @op */
6291 +       int (*estimate) (carry_op * op, carry_level * level);
6292 +} carry_op_handler;
6293 +
6294 +/* This is dispatch table for carry operations. It can be trivially
6295 +   abstracted into useful plugin: tunable balancing policy is a good
6296 +   thing. */
6297 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6298 +
6299 +unsigned int space_needed(const znode * node, const coord_t *coord,
6300 +                         const reiser4_item_data * data, int inserting);
6301 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6302 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6303 +
6304 +/* __CARRY_OPS_H__ */
6305 +#endif
6306 +
6307 +/* Make Linus happy.
6308 +   Local variables:
6309 +   c-indentation-style: "K&R"
6310 +   mode-name: "LC"
6311 +   c-basic-offset: 8
6312 +   tab-width: 8
6313 +   fill-column: 120
6314 +   scroll-step: 1
6315 +   End:
6316 +*/
6317 diff -urN linux-2.6.35.orig/fs/reiser4/context.c linux-2.6.35/fs/reiser4/context.c
6318 --- linux-2.6.35.orig/fs/reiser4/context.c      1970-01-01 01:00:00.000000000 +0100
6319 +++ linux-2.6.35/fs/reiser4/context.c   2010-08-04 15:44:57.000000000 +0200
6320 @@ -0,0 +1,289 @@
6321 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6322 +
6323 +/* Manipulation of reiser4_context */
6324 +
6325 +/*
6326 + * global context used during system call. Variable of this type is allocated
6327 + * on the stack at the beginning of the reiser4 part of the system call and
6328 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6329 + * passing pointer to current transaction and current lockstack (both in
6330 + * one-to-one mapping with threads) all over the call chain.
6331 + *
6332 + * It's kind of like those global variables the prof used to tell you not to
6333 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6334 + *
6335 + * In some situations it is desirable to have ability to enter reiser4_context
6336 + * more than once for the same thread (nested contexts). For example, there
6337 + * are some functions that can be called either directly from VFS/VM or from
6338 + * already active reiser4 context (->writepage, for example).
6339 + *
6340 + * In such situations "child" context acts like dummy: all activity is
6341 + * actually performed in the top level context, and get_current_context()
6342 + * always returns top level context.
6343 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6344 + * nested anyway.
6345 + *
6346 + * Note that there is an important difference between the way reiser4 uses
6347 + * ->fs_context and the way other file systems use it. Other file systems
6348 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6349 + * (this is why ->fs_context was initially called ->journal_info). This means,
6350 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6351 + * to the file system, they assume that some transaction is already underway,
6352 + * and usually bail out, because starting nested transaction would most likely
6353 + * lead to the deadlock. This gives false positives with reiser4, because we
6354 + * set ->fs_context before starting transaction.
6355 + */
6356 +
6357 +#include "debug.h"
6358 +#include "super.h"
6359 +#include "context.h"
6360 +#include "vfs_ops.h"   /* for reiser4_throttle_write() */
6361 +
6362 +#include <linux/writeback.h> /* NOTE(review): was "for current_is_pdflush()"; no longer called here */
6363 +#include <linux/hardirq.h>
6364 +/* zero @context, make it current->journal_info, call reiser4_txn_begin() */
6365 +static void _reiser4_init_context(reiser4_context * context,
6366 +                                 struct super_block *super)
6367 +{
6368 +       memset(context, 0, sizeof(*context));
6369 +
6370 +       context->super = super;
6371 +       context->magic = context_magic;
6372 +       context->outer = current->journal_info;
6373 +       current->journal_info = (void *)context;
6374 +       context->nr_children = 0;
6375 +       context->gfp_mask = GFP_KERNEL;
6376 +
6377 +       init_lock_stack(&context->stack);
6378 +
6379 +       reiser4_txn_begin(context);
6380 +
6381 +       /* initialize head of tap list */
6382 +       INIT_LIST_HEAD(&context->taps);
6383 +#if REISER4_DEBUG
6384 +       context->task = current;
6385 +#endif
6386 +       grab_space_enable();
6387 +}
6388 +
6389 +/* initialize context and bind it to the current thread
6390 +
6391 +   Call at the beginning of the reiser4 part of a syscall. Reuses an active
6392 +   context for the same @super (nested entry); ERR_PTR(-ENOMEM) on failure.
6393 +*/
6394 +reiser4_context * reiser4_init_context(struct super_block *super)
6395 +{
6396 +       reiser4_context *context;
6397 +
6398 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6399 +       assert("nikita-3357", super != NULL);
6400 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6401 +
6402 +       context = get_current_context_check();
6403 +       if (context && context->super == super) {
6404 +               context = (reiser4_context *) current->journal_info;
6405 +               context->nr_children++;
6406 +               return context;
6407 +       }
6408 +
6409 +       context = kmalloc(sizeof(*context), GFP_KERNEL);
6410 +       if (context == NULL)
6411 +               return ERR_PTR(RETERR(-ENOMEM));
6412 +
6413 +       _reiser4_init_context(context, super);
6414 +       return context;
6415 +}
6416 +
6417 +/* initialize caller-provided @context and mark it ->on_stack. This is used in
6418 +   scan_mgr which is called with spinlock held and in reiser4_fill_super */
6419 +void init_stack_context(reiser4_context *context, struct super_block *super)
6420 +{
6421 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6422 +       assert("nikita-3357", super != NULL);
6423 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6424 +       assert("vs-12", !is_in_reiser4_context());
6425 +
6426 +       _reiser4_init_context(context, super);
6427 +       context->on_stack = 1;
6428 +       return;
6429 +}
6430 +
6431 +/* map lock stack embedded into reiser4 context to its containing context */
6432 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6433 +{
6434 +       return container_of(owner, reiser4_context, stack);
6435 +}
6436 +
6437 +/* true if current->journal_info is _any_ reiser4 context (->magic matches) */
6438 +int is_in_reiser4_context(void)
6439 +{
6440 +       reiser4_context *ctx;
6441 +
6442 +       ctx = current->journal_info;
6443 +       return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6444 +}
6445 +
6446 +/*
6447 + * call balance dirty pages for the current context.
6448 + *
6449 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6450 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6451 + * write---this covers vast majority of all dirty traffic), but we cannot do
6452 + * this immediately when formatted node is dirtied, because long term lock is
6453 + * usually held at that time. To work around this, dirtying of formatted node
6454 + * simply increases ->nr_marked_dirty counter in the current reiser4
6455 + * context. When we are about to leave this context,
6456 + * balance_dirty_pages_ratelimited() is called, if necessary.
6457 + *
6458 + * This introduces another problem: sometimes we do not want to run
6459 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6460 + * because some important lock (like ->i_mutex on the parent directory) is
6461 + * held. To achieve this, ->nobalance flag can be set in the current context.
6462 + */
6463 +static void reiser4_throttle_write_at(reiser4_context *context)
6464 +{
6465 +       reiser4_super_info_data *sbinfo = get_super_private(context->super);
6466 +
6467 +       /*
6468 +        * call reiser4_throttle_write() to process formatted nodes
6469 +        * dirtied during this system call. Do that only if we are not in mount
6470 +        * and there were nodes dirtied in this context and we are not in
6471 +        * writepage (to avoid deadlock) and not in a flush_bd task
6472 +        */
6473 +       if (sbinfo != NULL && sbinfo->fake != NULL &&
6474 +           context->nr_marked_dirty != 0 &&
6475 +           !(current->flags & PF_MEMALLOC) &&
6476 +           !current_is_flush_bd_task())
6477 +               /* FIXME-EDWARD: throttle with nr_marked_dirty? */
6478 +               reiser4_throttle_write(sbinfo->fake, 1);
6479 +}
6480 +
6481 +/* release resources associated with context.
6482 +
6483 +   This function should be called at the end of "session" with reiser4,
6484 +   typically just before leaving reiser4 driver back to VFS.
6485 +
6486 +   This is good place to put some debugging consistency checks, like that
6487 +   thread released all locks and closed transcrash etc.
6488 +
6489 +*/
6490 +static void reiser4_done_context(reiser4_context * context)
6491 +                               /* context being released */
6492 +{
6493 +       assert("nikita-860", context != NULL);
6494 +       assert("nikita-859", context->magic == context_magic);
6495 +       assert("vs-646", (reiser4_context *) current->journal_info == context);
6496 +       assert("zam-686", !in_interrupt() && !in_irq());
6497 +
6498 +       /* only do anything when leaving top-level reiser4 context. All nested
6499 +        * contexts are just dummies. */
6500 +       if (context->nr_children == 0) {
6501 +               assert("jmacd-673", context->trans == NULL);
6502 +               assert("jmacd-1002", lock_stack_isclean(&context->stack));
6503 +               assert("nikita-1936", reiser4_no_counters_are_held());
6504 +               assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6505 +               assert("zam-1004", ergo(get_super_private(context->super),
6506 +                                       get_super_private(context->super)->delete_mutex_owner !=
6507 +                                       current));
6508 +
6509 +               /* release all grabbed but as yet unused blocks */
6510 +               if (context->grabbed_blocks != 0)
6511 +                       all_grabbed2free();
6512 +
6513 +               /*
6514 +                * synchronize against longterm_unlock_znode():
6515 +                * wake_up_requestor() wakes up requestors without holding
6516 +                * zlock (otherwise they will immediately bump into that lock
6517 +                * after wake up on another CPU). To work around (rare)
6518 +                * situation where requestor has been woken up asynchronously
6519 +                * and managed to run until completion (and destroy its
6520 +                * context and lock stack) before wake_up_requestor() called
6521 +                * wake_up() on it, wake_up_requestor() synchronize on lock
6522 +                * stack spin lock. It has actually been observed that spin
6523 +                * lock _was_ locked at this point, because
6524 +                * wake_up_requestor() took interrupt.
6525 +                */
6526 +               spin_lock_stack(&context->stack);
6527 +               spin_unlock_stack(&context->stack);
6528 +
6529 +               assert("zam-684", context->nr_children == 0);
6530 +               /* restore original ->fs_context (current->journal_info) value */
6531 +               current->journal_info = context->outer;
6532 +               if (context->on_stack == 0)
6533 +                       kfree(context);
6534 +       } else {
6535 +               context->nr_children--;
6536 +#if REISER4_DEBUG
6537 +               assert("zam-685", context->nr_children >= 0);
6538 +#endif
6539 +       }
6540 +}
6541 +
6542 +/*
6543 + * exit reiser4 context. Call reiser4_throttle_write_at() if necessary. Close
6544 + * transaction. Call reiser4_done_context() to do context related book-keeping.
6545 + */
6546 +void reiser4_exit_context(reiser4_context * context)
6547 +{
6548 +       assert("nikita-3021", reiser4_schedulable());
6549 +
6550 +       if (context->nr_children == 0) {
6551 +               if (!context->nobalance)
6552 +                       reiser4_throttle_write_at(context);
6553 +
6554 +               /* if filesystem is mounted with -o sync or -o dirsync - commit
6555 +                  transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
6556 +                  committing on exit_context when inode semaphore is held and
6557 +                  to have ktxnmgrd to do commit instead to get better
6558 +                  concurrent filesystem accesses. But, when one mounts with -o
6559 +                  sync, he cares more about reliability than about
6560 +                  performance. So, for now we have this simple mount -o sync
6561 +                  support. */
6562 +               if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6563 +                       txn_atom *atom;
6564 +
6565 +                       atom = get_current_atom_locked_nocheck();
6566 +                       if (atom) {
6567 +                               atom->flags |= ATOM_FORCE_COMMIT;
6568 +                               context->trans->flags &= ~TXNH_DONT_COMMIT;
6569 +                               spin_unlock_atom(atom);
6570 +                       }
6571 +               }
6572 +               reiser4_txn_end(context);
6573 +       }
6574 +       reiser4_done_context(context);
6575 +}
6576 +
6577 +void reiser4_ctx_gfp_mask_set(void)
6578 +{
6579 +       reiser4_context *ctx;
6580 +
6581 +       ctx = get_current_context();
6582 +       if (ctx->entd == 0 &&   /* current thread is not an ent thread */
6583 +           list_empty(&ctx->stack.locks) &&    /* lock stack's list of locks is empty */
6584 +           ctx->trans->atom == NULL)   /* transaction handle has no atom */
6585 +               ctx->gfp_mask = GFP_KERNEL;
6586 +       else
6587 +               ctx->gfp_mask = GFP_NOFS;
6588 +}
6589 +
6590 +void reiser4_ctx_gfp_mask_force(gfp_t mask)
6591 +{
6592 +       reiser4_context *ctx;
6593 +       ctx = get_current_context();
6594 +       /* NOTE(review): get_context() already asserts on ctx->magic; NULL check below looks redundant - confirm */
6595 +       assert("edward-1454", ctx != NULL);
6596 +
6597 +       ctx->gfp_mask = mask;   /* unconditionally replace the gfp mask of the current context */
6598 +}
6599 +
6600 +/*
6601 + * Local variables:
6602 + * c-indentation-style: "K&R"
6603 + * mode-name: "LC"
6604 + * c-basic-offset: 8
6605 + * tab-width: 8
6606 + * fill-column: 120
6607 + * scroll-step: 1
6608 + * End:
6609 + */
6610 diff -urN linux-2.6.35.orig/fs/reiser4/context.h linux-2.6.35/fs/reiser4/context.h
6611 --- linux-2.6.35.orig/fs/reiser4/context.h      1970-01-01 01:00:00.000000000 +0100
6612 +++ linux-2.6.35/fs/reiser4/context.h   2010-08-04 15:44:57.000000000 +0200
6613 @@ -0,0 +1,228 @@
6614 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6615 + * reiser4/README */
6616 +
6617 +/* Reiser4 context. See context.c for details. */
6618 +
6619 +#if !defined( __REISER4_CONTEXT_H__ )
6620 +#define __REISER4_CONTEXT_H__
6621 +
6622 +#include "forward.h"
6623 +#include "debug.h"
6624 +#include "dformat.h"
6625 +#include "tap.h"
6626 +#include "lock.h"
6627 +
6628 +#include <linux/types.h>       /* for __u??  */
6629 +#include <linux/fs.h>          /* for struct super_block  */
6630 +#include <linux/spinlock.h>
6631 +#include <linux/sched.h>       /* for struct task_struct */
6632 +
6633 +/* reiser4 per-thread context */
6634 +struct reiser4_context {
6635 +       /* magic constant. For identification of reiser4 contexts. */
6636 +       __u32 magic;
6637 +
6638 +       /* current lock stack. See lock.[ch]. This is where list of all
6639 +          locks taken by current thread is kept. This is also used in
6640 +          deadlock detection. */
6641 +       lock_stack stack;
6642 +
6643 +       /* current transcrash. */
6644 +       txn_handle *trans;
6645 +       /* transaction handle embedded into reiser4_context. ->trans points
6646 +        * here by default. */
6647 +       txn_handle trans_in_ctx;
6648 +
6649 +       /* super block we are working with.  To get the current tree
6650 +          use &get_super_private (reiser4_get_current_sb ())->tree. */
6651 +       struct super_block *super;
6652 +
6653 +       /* parent fs activation */
6654 +       struct fs_activation *outer;
6655 +
6656 +       /* per-thread grabbed (for further allocation) blocks counter */
6657 +       reiser4_block_nr grabbed_blocks;
6658 +
6659 +       /* list of taps currently monitored. See tap.c */
6660 +       struct list_head taps;
6661 +
6662 +       /* grabbing space is enabled */
6663 +       unsigned int grab_enabled:1;
6664 +       /* should be set when we are writing dirty nodes to disk in jnode_flush or
6665 +        * reiser4_write_logs() */
6666 +       unsigned int writeout_mode:1;
6667 +       /* true, if current thread is an ent thread */
6668 +       unsigned int entd:1;
6669 +       /* true, if balance_dirty_pages() should not be run when leaving this
6670 +        * context. This is used to avoid lengthy balance_dirty_pages()
6671 +        * operation when holding some important resource, like directory
6672 +        * ->i_mutex */
6673 +       unsigned int nobalance:1;
6674 +
6675 +       /* this bit is used in reiser4_done_context() to decide whether context is
6676 +          kmalloc-ed and has to be kfree-ed (it is kfree-ed when the bit is 0) */
6677 +       unsigned int on_stack:1;
6678 +
6679 +       /* count non-trivial jnode_set_dirty() calls */
6680 +       unsigned long nr_marked_dirty;
6681 +
6682 +       /* reiser4_writeback_inodes calls (via generic_writeback_sb_inodes)
6683 +        * reiser4_writepages for each of dirty inodes. Reiser4_writepages
6684 +        * captures pages. When number of pages captured in one
6685 +        * reiser4_sync_inodes reaches some threshold - some atoms get
6686 +        * flushed */
6687 +       int nr_captured;
6688 +       int nr_children;        /* number of child contexts */
6689 +#if REISER4_DEBUG
6690 +       /* debugging information about reiser4 locks held by the current
6691 +        * thread */
6692 +       reiser4_lock_cnt_info locks;
6693 +       struct task_struct *task;       /* so we can easily find owner of the stack */
6694 +
6695 +       /*
6696 +        * disk space grabbing debugging support
6697 +        */
6698 +       /* how many disk blocks were grabbed by the first call to
6699 +        * reiser4_grab_space() in this context */
6700 +       reiser4_block_nr grabbed_initially;
6701 +
6702 +       /* list of all threads doing flush currently */
6703 +       struct list_head flushers_link;
6704 +       /* information about last error encountered by reiser4 */
6705 +       err_site err;
6706 +#endif
6707 +       void *vp;
6708 +       gfp_t gfp_mask; /* allocation mask; see reiser4_ctx_gfp_mask_{get,set,force}() */
6709 +};
6710 +
6711 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6712 +
6713 +/* Debugging helps. */
6714 +#if REISER4_DEBUG
6715 +extern void print_contexts(void);
6716 +#endif
6717 +
6718 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
6719 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
6720 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
6721 +
6722 +extern reiser4_context *reiser4_init_context(struct super_block *);
6723 +extern void init_stack_context(reiser4_context *, struct super_block *);
6724 +extern void reiser4_exit_context(reiser4_context *);
6725 +
6726 +/* magic constant we store in reiser4_context allocated on the stack. Used to
6727 +   catch accesses to stale or uninitialized contexts. */
6728 +#define context_magic ((__u32) 0x4b1b5d0b)
6729 +
6730 +extern int is_in_reiser4_context(void);
6731 +
6732 +/*
6733 + * return reiser4_context for the thread @tsk
6734 + */
6735 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6736 +{
6737 +       assert("vs-1682",
6738 +              ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6739 +       return (reiser4_context *) tsk->journal_info;
6740 +}
6741 +
6742 +/*
6743 + * return reiser4 context of the current thread, or NULL if there is none.
6744 + */
6745 +static inline reiser4_context *get_current_context_check(void)
6746 +{
6747 +       if (is_in_reiser4_context())
6748 +               return get_context(current);
6749 +       else
6750 +               return NULL;
6751 +}
6752 +
6753 +static inline reiser4_context *get_current_context(void);      /* __attribute__((const)); */
6754 +
6755 +/* return context associated with current thread */
6756 +static inline reiser4_context *get_current_context(void)
6757 +{
6758 +       return get_context(current);
6759 +}
6760 +
6761 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6762 +{
6763 +       reiser4_context *ctx;
6764 +
6765 +       ctx = get_current_context_check();
6766 +       return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6767 +}
6768 +
6769 +void reiser4_ctx_gfp_mask_set(void);
6770 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6771 +
6772 +/*
6773 + * true if current thread is in the write-out mode. Thread enters write-out
6774 + * mode during jnode_flush and reiser4_write_logs().
6775 + */
6776 +static inline int is_writeout_mode(void)
6777 +{
6778 +       return get_current_context()->writeout_mode;
6779 +}
6780 +
6781 +/*
6782 + * enter write-out mode
6783 + */
6784 +static inline void writeout_mode_enable(void)
6785 +{
6786 +       assert("zam-941", !get_current_context()->writeout_mode);
6787 +       get_current_context()->writeout_mode = 1;
6788 +}
6789 +
6790 +/*
6791 + * leave write-out mode
6792 + */
6793 +static inline void writeout_mode_disable(void)
6794 +{
6795 +       assert("zam-942", get_current_context()->writeout_mode);
6796 +       get_current_context()->writeout_mode = 0;
6797 +}
6798 +
6799 +static inline void grab_space_enable(void)
6800 +{
6801 +       get_current_context()->grab_enabled = 1;
6802 +}
6803 +
6804 +static inline void grab_space_disable(void)
6805 +{
6806 +       get_current_context()->grab_enabled = 0;
6807 +}
6808 +
6809 +static inline void grab_space_set_enabled(int enabled)
6810 +{
6811 +       get_current_context()->grab_enabled = enabled;
6812 +}
6813 +
6814 +static inline int is_grab_enabled(reiser4_context * ctx)
6815 +{
6816 +       return ctx->grab_enabled;
6817 +}
6818 +
6819 +/* mark transaction handle in @ctx as TXNH_DONT_COMMIT, so that no commit or
6820 + * flush would be performed when it is closed. This is necessary when handle
6821 + * has to be closed under some coarse semaphore, like i_mutex of
6822 + * directory. Commit will be performed by ktxnmgrd. */
6823 +static inline void context_set_commit_async(reiser4_context * context)
6824 +{
6825 +       context->nobalance = 1;
6826 +       context->trans->flags |= TXNH_DONT_COMMIT;
6827 +}
6828 +
6829 +/* __REISER4_CONTEXT_H__ */
6830 +#endif
6831 +
6832 +/* Make Linus happy.
6833 +   Local variables:
6834 +   c-indentation-style: "K&R"
6835 +   mode-name: "LC"
6836 +   c-basic-offset: 8
6837 +   tab-width: 8
6838 +   fill-column: 120
6839 +   scroll-step: 1
6840 +   End:
6841 +*/
6842 diff -urN linux-2.6.35.orig/fs/reiser4/coord.c linux-2.6.35/fs/reiser4/coord.c
6843 --- linux-2.6.35.orig/fs/reiser4/coord.c        1970-01-01 01:00:00.000000000 +0100
6844 +++ linux-2.6.35/fs/reiser4/coord.c     2010-08-04 15:44:57.000000000 +0200
6845 @@ -0,0 +1,928 @@
6846 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
6847 +   reiser4/README */
6848 +
6849 +#include "forward.h"
6850 +#include "debug.h"
6851 +#include "dformat.h"
6852 +#include "tree.h"
6853 +#include "plugin/item/item.h"
6854 +#include "znode.h"
6855 +#include "coord.h"
6856 +
6857 +/* Internal constructor. */
6858 +static inline void
6859 +coord_init_values(coord_t *coord, const znode * node, pos_in_node_t item_pos,
6860 +                 pos_in_node_t unit_pos, between_enum between)
6861 +{
6862 +       coord->node = (znode *) node;
6863 +       coord_set_item_pos(coord, item_pos);
6864 +       coord->unit_pos = unit_pos;
6865 +       coord->between = between;
6866 +       ON_DEBUG(coord->plug_v = 0);
6867 +       ON_DEBUG(coord->body_v = 0);
6868 +
6869 +       /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord,
6870 + node, item_pos, unit_pos, coord_tween_tostring (between)); */
6871 +}
6872 +
6873 +/* after shifting of node content, coord previously set properly may become
6874 +   invalid, try to "normalize" it. */
6875 +void coord_normalize(coord_t *coord)
6876 +{
6877 +       znode *node;
6878 +
6879 +       node = coord->node;
6880 +       assert("vs-683", node);
6881 +
6882 +       coord_clear_iplug(coord);
6883 +
6884 +       if (node_is_empty(node)) {
6885 +               coord_init_first_unit(coord, node);
6886 +       } else if ((coord->between == AFTER_ITEM)
6887 +                  || (coord->between == AFTER_UNIT)) {
6888 +               return;
6889 +       } else if (coord->item_pos == coord_num_items(coord)
6890 +                  && coord->between == BEFORE_ITEM) {
6891 +               coord_dec_item_pos(coord);
6892 +               coord->between = AFTER_ITEM;
6893 +       } else if (coord->unit_pos == coord_num_units(coord)
6894 +                  && coord->between == BEFORE_UNIT) {
6895 +               coord->unit_pos--;
6896 +               coord->between = AFTER_UNIT;
6897 +       } else if (coord->item_pos == coord_num_items(coord)
6898 +                  && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6899 +               coord_dec_item_pos(coord);
6900 +               coord->unit_pos = 0;
6901 +               coord->between = AFTER_ITEM;
6902 +       }
6903 +}
6904 +
6905 +/* Copy a coordinate. */
6906 +void coord_dup(coord_t *coord, const coord_t *old_coord)
6907 +{
6908 +       assert("jmacd-9800", coord_check(old_coord));
6909 +       coord_dup_nocheck(coord, old_coord);
6910 +}
6911 +
6912 +/* Copy a coordinate without check. Useful when old_coord->node is not
6913 +   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6914 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord)
6915 +{
6916 +       coord->node = old_coord->node;
6917 +       coord_set_item_pos(coord, old_coord->item_pos);
6918 +       coord->unit_pos = old_coord->unit_pos;
6919 +       coord->between = old_coord->between;
6920 +       coord->iplugid = old_coord->iplugid;
6921 +       ON_DEBUG(coord->plug_v = old_coord->plug_v);
6922 +       ON_DEBUG(coord->body_v = old_coord->body_v);
6923 +}
6924 +
6925 +/* Initialize an invalid coordinate. */
6926 +void coord_init_invalid(coord_t *coord, const znode * node)
6927 +{
6928 +       coord_init_values(coord, node, 0, 0, INVALID_COORD);
6929 +}
6930 +
6931 +void coord_init_first_unit_nocheck(coord_t *coord, const znode * node)
6932 +{
6933 +       coord_init_values(coord, node, 0, 0, AT_UNIT);
6934 +}
6935 +
6936 +/* Initialize a coordinate to point at the first unit of the first item. If the
6937 +   node is empty, it is positioned at the EMPTY_NODE. */
6938 +void coord_init_first_unit(coord_t *coord, const znode * node)
6939 +{
6940 +       int is_empty = node_is_empty(node);
6941 +
6942 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6943 +
6944 +       assert("jmacd-9801", coord_check(coord));
6945 +}
6946 +
6947 +/* Initialize a coordinate to point at the last unit of the last item.  If the
6948 +   node is empty, it is positioned at the EMPTY_NODE. */
6949 +void coord_init_last_unit(coord_t *coord, const znode * node)
6950 +{
6951 +       int is_empty = node_is_empty(node);
6952 +
6953 +       coord_init_values(coord, node,
6954 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
6955 +                         (is_empty ? EMPTY_NODE : AT_UNIT));
6956 +       if (!is_empty)
6957 +               coord->unit_pos = coord_last_unit_pos(coord);
6958 +       assert("jmacd-9802", coord_check(coord));
6959 +}
6960 +
6961 +/* Initialize a coordinate to before the first item. If the node is empty, it is
6962 +   positioned at the EMPTY_NODE. */
6963 +void coord_init_before_first_item(coord_t *coord, const znode * node)
6964 +{
6965 +       int is_empty = node_is_empty(node);
6966 +
6967 +       coord_init_values(coord, node, 0, 0,
6968 +                         (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6969 +
6970 +       assert("jmacd-9803", coord_check(coord));
6971 +}
6972 +
6973 +/* Initialize a coordinate to after the last item. If the node is empty, it is
6974 +   positioned at the EMPTY_NODE. */
6975 +void coord_init_after_last_item(coord_t *coord, const znode * node)
6976 +{
6977 +       int is_empty = node_is_empty(node);
6978 +
6979 +       coord_init_values(coord, node,
6980 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
6981 +                         (is_empty ? EMPTY_NODE : AFTER_ITEM));
6982 +
6983 +       assert("jmacd-9804", coord_check(coord));
6984 +}
6985 +
6986 +/* Initialize a coordinate to after last unit in the item. Coord must be set
6987 +   already to existing item */
6988 +void coord_init_after_item_end(coord_t *coord)
6989 +{
6990 +       coord->between = AFTER_UNIT;
6991 +       coord->unit_pos = coord_last_unit_pos(coord);
6992 +}
6993 +
6994 +/* Initialize a coordinate to before the item. Coord must be set already to
6995 +   existing item */
6996 +void coord_init_before_item(coord_t *coord)
6997 +{
6998 +       coord->unit_pos = 0;
6999 +       coord->between = BEFORE_ITEM;
7000 +}
7001 +
7002 +/* Initialize a coordinate to after the item. Coord must be set already to
7003 +   existing item */
7004 +void coord_init_after_item(coord_t *coord)
7005 +{
7006 +       coord->unit_pos = 0;
7007 +       coord->between = AFTER_ITEM;
7008 +}
7009 +
7010 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7011 +   it was not clear how actually */
7012 +void coord_init_zero(coord_t *coord)
7013 +{
7014 +       memset(coord, 0, sizeof(*coord));
7015 +}
7016 +
7017 +/* Return the number of units at the present item.
7018 +   Asserts coord_is_existing_item(). */
7019 +unsigned coord_num_units(const coord_t *coord)
7020 +{
7021 +       assert("jmacd-9806", coord_is_existing_item(coord));
7022 +
7023 +       return item_plugin_by_coord(coord)->b.nr_units(coord);
7024 +}
7025 +
7026 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7027 +/* Audited by: green(2002.06.15) */
7028 +int coord_is_invalid(const coord_t *coord)
7029 +{
7030 +       return coord->between == INVALID_COORD;
7031 +}
7032 +
7033 +/* Returns true if the coordinate is positioned at an existing item, not before
7034 +   or after an item.  It may be placed at, before, or after any unit within the
7035 +   item, whether existing or not. */
7036 +int coord_is_existing_item(const coord_t *coord)
7037 +{
7038 +       switch (coord->between) {
7039 +       case EMPTY_NODE:
7040 +       case BEFORE_ITEM:
7041 +       case AFTER_ITEM:
7042 +       case INVALID_COORD:
7043 +               return 0;
7044 +
7045 +       case BEFORE_UNIT:
7046 +       case AT_UNIT:
7047 +       case AFTER_UNIT:
7048 +               return coord->item_pos < coord_num_items(coord);
7049 +       }
7050 +
7051 +       impossible("jmacd-9900", "unreachable coord: %p", coord);
7052 +       return 0;
7053 +}
7054 +
7055 +/* Returns true if the coordinate is positioned at an existing unit, not before
7056 +   or after a unit. */
7057 +/* Audited by: green(2002.06.15) */
7058 +int coord_is_existing_unit(const coord_t *coord)
7059 +{
7060 +       switch (coord->between) {
7061 +       case EMPTY_NODE:
7062 +       case BEFORE_UNIT:
7063 +       case AFTER_UNIT:
7064 +       case BEFORE_ITEM:
7065 +       case AFTER_ITEM:
7066 +       case INVALID_COORD:
7067 +               return 0;
7068 +
7069 +       case AT_UNIT:
7070 +               return (coord->item_pos < coord_num_items(coord)
7071 +                       && coord->unit_pos < coord_num_units(coord));
7072 +       }
7073 +
7074 +       impossible("jmacd-9902", "unreachable");
7075 +       return 0;
7076 +}
7077 +
7078 +/* Returns true if the coordinate is positioned at the first unit of the first
7079 +   item. Not true for empty nodes nor coordinates positioned before the first
7080 +   item. */
7081 +/* Audited by: green(2002.06.15) */
7082 +int coord_is_leftmost_unit(const coord_t *coord)
7083 +{
7084 +       return (coord->between == AT_UNIT && coord->item_pos == 0
7085 +               && coord->unit_pos == 0);
7086 +}
7087 +
7088 +#if REISER4_DEBUG
7089 +/* For assertions only, checks for a valid coordinate: returns 1 if valid, 0 if not. */
7090 +int coord_check(const coord_t *coord)
7091 +{
7092 +       if (coord->node == NULL)
7093 +               return 0;
7094 +       if (znode_above_root(coord->node))
7095 +               return 1;
7096 +
7097 +       switch (coord->between) {
7098 +       default:
7099 +       case INVALID_COORD:
7100 +               return 0;
7101 +       case EMPTY_NODE:
7102 +               if (!node_is_empty(coord->node))
7103 +                       return 0;
7104 +               return coord->item_pos == 0 && coord->unit_pos == 0;
7105 +
7106 +       case BEFORE_UNIT:
7107 +       case AFTER_UNIT:
7108 +               if (node_is_empty(coord->node) && (coord->item_pos == 0)
7109 +                   && (coord->unit_pos == 0))
7110 +                       return 1;
7111 +       case AT_UNIT:   /* BEFORE_UNIT and AFTER_UNIT fall through to here */
7112 +               break;
7113 +       case AFTER_ITEM:
7114 +       case BEFORE_ITEM:
7115 +               /* before/after item should not set unit_pos. */
7116 +               if (coord->unit_pos != 0)
7117 +                       return 0;
7118 +               break;
7119 +       }
7120 +
7121 +       if (coord->item_pos >= node_num_items(coord->node))
7122 +               return 0;
7123 +
7124 +       /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7125 +          between is set to either AFTER_ITEM or BEFORE_ITEM */
7126 +       if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7127 +               return 1;
7128 +
7129 +       if (coord_is_iplug_set(coord) &&
7130 +           coord->unit_pos >
7131 +           item_plugin_by_coord(coord)->b.nr_units(coord) - 1)
7132 +               return 0;
7133 +       return 1;
7134 +}
7135 +#endif
7136 +
7137 +/* Adjust coordinate boundaries based on the number of items prior to
7138 +   coord_next/prev. Returns 1 if the new position does not exist. */
7139 +static int coord_adjust_items(coord_t *coord, unsigned items, int is_next)
7140 +{
7141 +       /* If the coord is invalid, leave it. */
7142 +       if (coord->between == INVALID_COORD)
7143 +               return 1;
7144 +
7145 +       /* If the node is empty, set it appropriately. */
7146 +       if (items == 0) {
7147 +               coord->between = EMPTY_NODE;
7148 +               coord_set_item_pos(coord, 0);
7149 +               coord->unit_pos = 0;
7150 +               return 1;
7151 +       }
7152 +
7153 +       /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7154 +       if (coord->between == EMPTY_NODE) {
7155 +               coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7156 +               coord_set_item_pos(coord, 0);
7157 +               coord->unit_pos = 0;
7158 +               return 0;
7159 +       }
7160 +
7161 +       /* If the item_pos is out-of-range, set it appropriately. */
7162 +       if (coord->item_pos >= items) {
7163 +               coord->between = AFTER_ITEM;
7164 +               coord_set_item_pos(coord, items - 1);
7165 +               coord->unit_pos = 0;
7166 +               /* If is_next, return 1 (can't go any further). */
7167 +               return is_next;
7168 +       }
7169 +
7170 +       return 0;
7171 +}
7172 +
7173 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
7174 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
7175 +   position is an existing unit. */
7176 +int coord_next_unit(coord_t *coord)
7177 +{
7178 +       unsigned items = coord_num_items(coord);
7179 +
7180 +       if (coord_adjust_items(coord, items, 1) == 1)
7181 +               return 1;
7182 +
7183 +       switch (coord->between) {
7184 +       case BEFORE_UNIT:
7185 +               /* Now it is positioned at the same unit. */
7186 +               coord->between = AT_UNIT;
7187 +               return 0;
7188 +
7189 +       case AFTER_UNIT:
7190 +       case AT_UNIT:
7191 +               /* If it was at or after a unit and there are more units in this
7192 +                  item, advance to the next one. */
7193 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
7194 +                       coord->unit_pos += 1;
7195 +                       coord->between = AT_UNIT;
7196 +                       return 0;
7197 +               }
7198 +
7199 +               /* Otherwise, it is crossing an item boundary and treated as if
7200 +                  it was after the current item. */
7201 +               coord->between = AFTER_ITEM;
7202 +               coord->unit_pos = 0;
7203 +               /* FALLTHROUGH */
7204 +
7205 +       case AFTER_ITEM:
7206 +               /* Check for end-of-node. */
7207 +               if (coord->item_pos == items - 1)
7208 +                       return 1;
7209 +
7210 +               coord_inc_item_pos(coord);
7211 +               coord->unit_pos = 0;
7212 +               coord->between = AT_UNIT;
7213 +               return 0;
7214 +
7215 +       case BEFORE_ITEM:
7216 +               /* The adjust_items checks ensure that we are valid here. */
7217 +               coord->unit_pos = 0;
7218 +               coord->between = AT_UNIT;
7219 +               return 0;
7220 +
7221 +       case INVALID_COORD:
7222 +       case EMPTY_NODE:
7223 +               /* Handled in coord_adjust_items(). */
7224 +               break;
7225 +       }
7226 +
7227 +       impossible("jmacd-9902", "unreachable");
7228 +       return 0;
7229 +}
7230 +
7231 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
7232 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
7233 +   position is an existing item. */
7234 +int coord_next_item(coord_t *coord)
7235 +{
7236 +       unsigned items = coord_num_items(coord);
7237 +
7238 +       if (coord_adjust_items(coord, items, 1) == 1)
7239 +               return 1;
7240 +
7241 +       switch (coord->between) {
7242 +       case AFTER_UNIT:
7243 +       case AT_UNIT:
7244 +       case BEFORE_UNIT:
7245 +       case AFTER_ITEM:
7246 +               /* Check for end-of-node. */
7247 +               if (coord->item_pos == items - 1) {
7248 +                       coord->between = AFTER_ITEM;
7249 +                       coord->unit_pos = 0;
7250 +                       coord_clear_iplug(coord);
7251 +                       return 1;
7252 +               }
7253 +
7254 +               /* Anywhere in an item, go to the next one. */
7255 +               coord->between = AT_UNIT;
7256 +               coord_inc_item_pos(coord);
7257 +               coord->unit_pos = 0;
7258 +               return 0;
7259 +
7260 +       case BEFORE_ITEM:
7261 +               /* The out-of-range check ensures that we are valid here. */
7262 +               coord->unit_pos = 0;
7263 +               coord->between = AT_UNIT;
7264 +               return 0;
7265 +       case INVALID_COORD:
7266 +       case EMPTY_NODE:
7267 +               /* Handled in coord_adjust_items(). */
7268 +               break;
7269 +       }
7270 +
7271 +       impossible("jmacd-9903", "unreachable");
7272 +       return 0;
7273 +}
7274 +
7275 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
7276 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
7277 +   position is an existing unit. */
7278 +int coord_prev_unit(coord_t *coord)
7279 +{
7280 +       unsigned items = coord_num_items(coord);
7281 +
7282 +       if (coord_adjust_items(coord, items, 0) == 1)
7283 +               return 1;
7284 +
7285 +       switch (coord->between) {
7286 +       case AT_UNIT:
7287 +       case BEFORE_UNIT:
7288 +               if (coord->unit_pos > 0) {
7289 +                       coord->unit_pos -= 1;
7290 +                       coord->between = AT_UNIT;
7291 +                       return 0;
7292 +               }
7293 +
7294 +               if (coord->item_pos == 0) {
7295 +                       coord->between = BEFORE_ITEM;
7296 +                       return 1;
7297 +               }
7298 +
7299 +               coord_dec_item_pos(coord);
7300 +               coord->unit_pos = coord_last_unit_pos(coord);
7301 +               coord->between = AT_UNIT;
7302 +               return 0;
7303 +
7304 +       case AFTER_UNIT:
7305 +               /* What if unit_pos is out-of-range? */
7306 +               assert("jmacd-5442",
7307 +                      coord->unit_pos <= coord_last_unit_pos(coord));
7308 +               coord->between = AT_UNIT;
7309 +               return 0;
7310 +
7311 +       case BEFORE_ITEM:
7312 +               if (coord->item_pos == 0)
7313 +                       return 1;
7314 +
7315 +               coord_dec_item_pos(coord);
7316 +               /* FALLTHROUGH */
7317 +
7318 +       case AFTER_ITEM:
7319 +               coord->between = AT_UNIT;
7320 +               coord->unit_pos = coord_last_unit_pos(coord);
7321 +               return 0;
7322 +
7323 +       case INVALID_COORD:
7324 +       case EMPTY_NODE:
7325 +               break;
7326 +       }
7327 +
7328 +       impossible("jmacd-9904", "unreachable");
7329 +       return 0;
7330 +}
7331 +
7332 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
7333 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
7334 +   position is an existing item. */
7335 +int coord_prev_item(coord_t *coord)
7336 +{
7337 +       unsigned items = coord_num_items(coord);
7338 +
7339 +       if (coord_adjust_items(coord, items, 0) == 1)
7340 +               return 1;
7341 +
7342 +       switch (coord->between) {
7343 +       case AT_UNIT:
7344 +       case AFTER_UNIT:
7345 +       case BEFORE_UNIT:
7346 +       case BEFORE_ITEM:
7347 +
7348 +               if (coord->item_pos == 0) {
7349 +                       coord->between = BEFORE_ITEM;
7350 +                       coord->unit_pos = 0;
7351 +                       return 1;
7352 +               }
7353 +
7354 +               coord_dec_item_pos(coord);
7355 +               coord->unit_pos = 0;
7356 +               coord->between = AT_UNIT;
7357 +               return 0;
7358 +
7359 +       case AFTER_ITEM:
7360 +               coord->between = AT_UNIT;
7361 +               coord->unit_pos = 0;
7362 +               return 0;
7363 +
7364 +       case INVALID_COORD:
7365 +       case EMPTY_NODE:
7366 +               break;
7367 +       }
7368 +
7369 +       impossible("jmacd-9905", "unreachable");
7370 +       return 0;
7371 +}
7372 +
7373 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
7374 +   sideof argument. */
7375 +void coord_init_sideof_unit(coord_t *coord, const znode * node, sideof dir)
7376 +{
7377 +       assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7378 +       if (dir == LEFT_SIDE) {
7379 +               coord_init_first_unit(coord, node);
7380 +       } else {
7381 +               coord_init_last_unit(coord, node);
7382 +       }
7383 +}
7384 +
7385 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
7386 +   on sideof argument. */
7387 +/* Audited by: green(2002.06.15) */
7388 +int coord_is_after_sideof_unit(coord_t *coord, sideof dir)
7389 +{
7390 +       assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7391 +       if (dir == LEFT_SIDE) {
7392 +               return coord_is_before_leftmost(coord);
7393 +       } else {
7394 +               return coord_is_after_rightmost(coord);
7395 +       }
7396 +}
7397 +
7398 +/* Moves @coord one unit towards @dir: coord_prev_unit() for LEFT_SIDE,
7399 +   coord_next_unit() for RIGHT_SIDE; that call's result is returned. */
7400 +/* Audited by: green(2002.06.15) */
7401 +int coord_sideof_unit(coord_t *coord, sideof dir)
7402 +{
7403 +       assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7404 +       if (dir == LEFT_SIDE) {
7405 +               return coord_prev_unit(coord);
7406 +       } else {
7407 +               return coord_next_unit(coord);
7408 +       }
7409 +}
7410 +
7411 +#if REISER4_DEBUG
7412 +int coords_equal(const coord_t *c1, const coord_t *c2)
7413 +{
7414 +       assert("nikita-2840", c1 != NULL);
7415 +       assert("nikita-2841", c2 != NULL);
7416 +
7417 +       return  /* non-zero iff node, item_pos, unit_pos and between all match */
7418 +           c1->node == c2->node &&
7419 +           c1->item_pos == c2->item_pos &&
7420 +           c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7421 +}
7422 +#endif  /*  REISER4_DEBUG  */
7423 +
7424 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if
7425 +   coord_is_after_rightmost return COORD_ON_THE_RIGHT, otherwise return
7426 +   COORD_INSIDE. */
7427 +/* Audited by: green(2002.06.15) */
7428 +coord_wrt_node coord_wrt(const coord_t *coord)
7429 +{
7430 +       if (coord_is_before_leftmost(coord))
7431 +               return COORD_ON_THE_LEFT;
7432 +
7433 +       if (coord_is_after_rightmost(coord))
7434 +               return COORD_ON_THE_RIGHT;
7435 +
7436 +       return COORD_INSIDE;
7437 +}
7438 +
7439 +/* Returns true if the coordinate is positioned after the last item or after the
7440 +   last unit of the last item or it is an empty node. */
7441 +/* Audited by: green(2002.06.15) */
7442 +int coord_is_after_rightmost(const coord_t *coord)
7443 +{
7444 +       assert("jmacd-7313", coord_check(coord));
7445 +
7446 +       switch (coord->between) {
7447 +       case INVALID_COORD:     /* these positions are never past the right end */
7448 +       case AT_UNIT:
7449 +       case BEFORE_UNIT:
7450 +       case BEFORE_ITEM:
7451 +               return 0;
7452 +
7453 +       case EMPTY_NODE:
7454 +               return 1;
7455 +
7456 +       case AFTER_ITEM:        /* only "after the last item" qualifies */
7457 +               return (coord->item_pos == node_num_items(coord->node) - 1);
7458 +
7459 +       case AFTER_UNIT:        /* must be the last unit of the last item */
7460 +               return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7461 +                       coord->unit_pos == coord_last_unit_pos(coord));
7462 +       }
7463 +
7464 +       impossible("jmacd-9908", "unreachable");        /* NOTE(review): label is not unique in this file */
7465 +       return 0;
7466 +}
7467 +
7468 +/* Returns true if the coordinate is positioned before the first item, before
7469 +   the first unit of the first item, or it is an empty node. */
7470 +int coord_is_before_leftmost(const coord_t *coord)
7471 +{
7472 +       /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7473 +          necessary to check if coord is set before leftmost
7474 +          assert ("jmacd-7313", coord_check (coord)); */
7475 +       switch (coord->between) {
7476 +       case INVALID_COORD:     /* these positions are never before the left end */
7477 +       case AT_UNIT:
7478 +       case AFTER_ITEM:
7479 +       case AFTER_UNIT:
7480 +               return 0;
7481 +
7482 +       case EMPTY_NODE:
7483 +               return 1;
7484 +
7485 +       case BEFORE_ITEM:
7486 +       case BEFORE_UNIT:       /* both cases require item_pos == 0 and unit_pos == 0 */
7487 +               return (coord->item_pos == 0) && (coord->unit_pos == 0);
7488 +       }
7489 +
7490 +       impossible("jmacd-9908", "unreachable");
7491 +       return 0;
7492 +}
7493 +
7494 +/* Returns true if the coordinate is positioned after an item, before an item,
7495 +   after the last unit of an item, before the first unit of an item, or at an
7496 +   empty node. */
7497 +/* Audited by: green(2002.06.15) */
7498 +int coord_is_between_items(const coord_t *coord)
7499 +{
7500 +       assert("jmacd-7313", coord_check(coord));
7501 +
7502 +       switch (coord->between) {
7503 +       case INVALID_COORD:
7504 +       case AT_UNIT:
7505 +               return 0;
7506 +
7507 +       case AFTER_ITEM:
7508 +       case BEFORE_ITEM:
7509 +       case EMPTY_NODE:
7510 +               return 1;
7511 +
7512 +       case BEFORE_UNIT:       /* only before the first unit of the item */
7513 +               return coord->unit_pos == 0;
7514 +
7515 +       case AFTER_UNIT:        /* only after the last unit of the item */
7516 +               return coord->unit_pos == coord_last_unit_pos(coord);
7517 +       }
7518 +
7519 +       impossible("jmacd-9908", "unreachable");
7520 +       return 0;
7521 +}
7522 +
7523 +#if REISER4_DEBUG
7524 +/* Returns true if the coordinates are positioned at adjacent units of the same
7525 +   node, whether inside one item or across the boundary of adjacent items. */
7526 +int coord_are_neighbors(coord_t *c1, coord_t *c2)
7527 +{
7528 +       coord_t *left;
7529 +       coord_t *right;
7530 +
7531 +       assert("nikita-1241", c1 != NULL);
7532 +       assert("nikita-1242", c2 != NULL);
7533 +       assert("nikita-1243", c1->node == c2->node);
7534 +       assert("nikita-1244", coord_is_existing_unit(c1));
7535 +       assert("nikita-1245", coord_is_existing_unit(c2));
7536 +
7537 +       left = right = NULL;
7538 +       switch (coord_compare(c1, c2)) {        /* order the pair as left/right */
7539 +       case COORD_CMP_ON_LEFT:
7540 +               left = c1;
7541 +               right = c2;
7542 +               break;
7543 +       case COORD_CMP_ON_RIGHT:
7544 +               left = c2;
7545 +               right = c1;
7546 +               break;
7547 +       case COORD_CMP_SAME:    /* identical position: not neighbors */
7548 +               return 0;
7549 +       default:
7550 +               wrong_return_value("nikita-1246", "compare_coords()");
7551 +       }
7552 +       assert("vs-731", left && right);
7553 +       if (left->item_pos == right->item_pos) {
7554 +               return left->unit_pos + 1 == right->unit_pos;
7555 +       } else if (left->item_pos + 1 == right->item_pos) {
7556 +               return (left->unit_pos == coord_last_unit_pos(left))
7557 +                   && (right->unit_pos == 0);
7558 +       } else {
7559 +               return 0;
7560 +       }
7561 +}
7562 +#endif  /*  REISER4_DEBUG  */
7563 +
7564 +/* Assuming two coordinates are positioned in the same node, return
7565 +   COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
7566 +   position relative to c2 (item_pos first, then unit_pos; between ignored). */
7567 +/* Audited by: green(2002.06.15) */
7568 +coord_cmp coord_compare(coord_t *c1, coord_t *c2)
7569 +{
7570 +       assert("vs-209", c1->node == c2->node);
7571 +       assert("vs-194", coord_is_existing_unit(c1)
7572 +              && coord_is_existing_unit(c2));
7573 +
7574 +       if (c1->item_pos > c2->item_pos)
7575 +               return COORD_CMP_ON_RIGHT;
7576 +       if (c1->item_pos < c2->item_pos)
7577 +               return COORD_CMP_ON_LEFT;
7578 +       if (c1->unit_pos > c2->unit_pos)
7579 +               return COORD_CMP_ON_RIGHT;
7580 +       if (c1->unit_pos < c2->unit_pos)
7581 +               return COORD_CMP_ON_LEFT;
7582 +       return COORD_CMP_SAME;
7583 +}
7584 +
7585 +/* If the coordinate is between units or items, moves it to the existing unit on
7586 +   its right (AT_UNIT).  Returns 0 on success, 1 if there is none to the right. */
7587 +int coord_set_to_right(coord_t *coord)
7588 +{
7589 +       unsigned items = coord_num_items(coord);
7590 +
7591 +       if (coord_adjust_items(coord, items, 1) == 1)
7592 +               return 1;
7593 +
7594 +       switch (coord->between) {
7595 +       case AT_UNIT:   /* already at a unit: nothing to do */
7596 +               return 0;
7597 +
7598 +       case BEFORE_ITEM:
7599 +       case BEFORE_UNIT:       /* the unit at (item_pos, unit_pos) is the one to the right */
7600 +               coord->between = AT_UNIT;
7601 +               return 0;
7602 +
7603 +       case AFTER_UNIT:
7604 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
7605 +                       coord->unit_pos += 1;
7606 +                       coord->between = AT_UNIT;
7607 +                       return 0;
7608 +               } else {
7609 +
7610 +                       coord->unit_pos = 0;
7611 +
7612 +                       if (coord->item_pos == items - 1) {     /* was after the very last unit */
7613 +                               coord->between = AFTER_ITEM;
7614 +                               return 1;
7615 +                       }
7616 +
7617 +                       coord_inc_item_pos(coord);
7618 +                       coord->between = AT_UNIT;
7619 +                       return 0;
7620 +               }
7621 +
7622 +       case AFTER_ITEM:
7623 +               if (coord->item_pos == items - 1)
7624 +                       return 1;
7625 +
7626 +               coord_inc_item_pos(coord);
7627 +               coord->unit_pos = 0;
7628 +               coord->between = AT_UNIT;
7629 +               return 0;
7630 +
7631 +       case EMPTY_NODE:
7632 +               return 1;
7633 +
7634 +       case INVALID_COORD:
7635 +               break;
7636 +       }
7637 +
7638 +       impossible("jmacd-9920", "unreachable");
7639 +       return 0;
7640 +}
7641 +
7642 +/* If the coordinate is between units or items, moves it to the existing unit on
7643 +   its left (AT_UNIT).  Returns 0 on success, 1 if there is none to the left. */
7644 +int coord_set_to_left(coord_t *coord)
7645 +{
7646 +       unsigned items = coord_num_items(coord);
7647 +
7648 +       if (coord_adjust_items(coord, items, 0) == 1)
7649 +               return 1;
7650 +
7651 +       switch (coord->between) {
7652 +       case AT_UNIT:   /* already at a unit: nothing to do */
7653 +               return 0;
7654 +
7655 +       case AFTER_UNIT:        /* the unit at (item_pos, unit_pos) is the one to the left */
7656 +               coord->between = AT_UNIT;
7657 +               return 0;
7658 +
7659 +       case AFTER_ITEM:
7660 +               coord->between = AT_UNIT;
7661 +               coord->unit_pos = coord_last_unit_pos(coord);
7662 +               return 0;
7663 +
7664 +       case BEFORE_UNIT:
7665 +               if (coord->unit_pos > 0) {
7666 +                       coord->unit_pos -= 1;
7667 +                       coord->between = AT_UNIT;
7668 +                       return 0;
7669 +               } else {
7670 +
7671 +                       if (coord->item_pos == 0) {     /* was before the very first unit */
7672 +                               coord->between = BEFORE_ITEM;
7673 +                               return 1;
7674 +                       }
7675 +                       /* step to the previous item first, then take ITS last unit */
7676 +                       coord_dec_item_pos(coord);
7677 +                       coord->unit_pos = coord_last_unit_pos(coord);
7678 +                       coord->between = AT_UNIT;
7679 +                       return 0;
7680 +               }
7681 +
7682 +       case BEFORE_ITEM:
7683 +               if (coord->item_pos == 0)
7684 +                       return 1;
7685 +
7686 +               coord_dec_item_pos(coord);
7687 +               coord->unit_pos = coord_last_unit_pos(coord);
7688 +               coord->between = AT_UNIT;
7689 +               return 0;
7690 +
7691 +       case EMPTY_NODE:
7692 +               return 1;
7693 +
7694 +       case INVALID_COORD:
7695 +               break;
7696 +       }
7697 +
7698 +       impossible("jmacd-9920", "unreachable");
7699 +       return 0;
7700 +}
7701 +
7702 +static const char *coord_tween_tostring(between_enum n)
7703 +{      /* maps @n to a printable name; unknown values are formatted into buf */
7704 +       switch (n) {
7705 +       case BEFORE_UNIT:
7706 +               return "before unit";
7707 +       case BEFORE_ITEM:
7708 +               return "before item";
7709 +       case AT_UNIT:
7710 +               return "at unit";
7711 +       case AFTER_UNIT:
7712 +               return "after unit";
7713 +       case AFTER_ITEM:
7714 +               return "after item";
7715 +       case EMPTY_NODE:
7716 +               return "empty node";
7717 +       case INVALID_COORD:
7718 +               return "invalid";
7719 +       default:
7720 +       {
7721 +               static char buf[30];    /* static: shared by all callers, not reentrant */
7722 +
7723 +               sprintf(buf, "unknown: %i", n);
7724 +               return buf;
7725 +       }
7726 +       }
7727 +}
7728 +
7729 +void print_coord(const char *mes, const coord_t *coord, int node)
7730 +{      /* printk()s the fields of @coord prefixed by @mes; @node is not used */
7731 +       if (coord == NULL) {
7732 +               printk("%s: null\n", mes);
7733 +               return;
7734 +       }
7735 +       printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7736 +              mes, coord->item_pos, coord->unit_pos,
7737 +              coord_tween_tostring(coord->between), coord->iplugid);
7738 +}
7739 +
7740 +int
7741 +item_utmost_child_real_block(const coord_t *coord, sideof side,
7742 +                            reiser4_block_nr * blk)
7743 +{      /* delegates to ->f.utmost_child_real_block() of the item plugin at @coord */
7744 +       return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
7745 +                                                                     side,
7746 +                                                                     blk);
7747 +}
7748 +
7749 +int item_utmost_child(const coord_t *coord, sideof side, jnode ** child)
7750 +{      /* delegates to ->f.utmost_child() of the item plugin at @coord */
7751 +       return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7752 +}
7753 +
7754 +/* @count bytes of flow @f got written: advance f->data (when non-NULL) and the
7755 +   offset of f->key by @count, and shrink f->length by @count */
7756 +void move_flow_forward(flow_t *f, unsigned count)
7757 +{
7758 +       if (f->data)
7759 +               f->data += count;
7760 +       f->length -= count;
7761 +       set_key_offset(&f->key, get_key_offset(&f->key) + count);
7762 +}
7763 +
7764 +/*
7765 +   Local variables:
7766 +   c-indentation-style: "K&R"
7767 +   mode-name: "LC"
7768 +   c-basic-offset: 8
7769 +   tab-width: 8
7770 +   fill-column: 120
7771 +   scroll-step: 1
7772 +   End:
7773 +*/
7774 diff -urN linux-2.6.35.orig/fs/reiser4/coord.h linux-2.6.35/fs/reiser4/coord.h
7775 --- linux-2.6.35.orig/fs/reiser4/coord.h        1970-01-01 01:00:00.000000000 +0100
7776 +++ linux-2.6.35/fs/reiser4/coord.h     2010-08-04 15:44:57.000000000 +0200
7777 @@ -0,0 +1,399 @@
7778 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
7779 +   reiser4/README */
7780 +
7781 +/* Coords */
7782 +
7783 +#if !defined(__REISER4_COORD_H__)
7784 +#define __REISER4_COORD_H__
7785 +
7786 +#include "forward.h"
7787 +#include "debug.h"
7788 +#include "dformat.h"
7789 +#include "key.h"
7790 +
7791 +/* insertions happen between coords in the tree, so we need some means
7792 +   of specifying the sense of betweenness. */
7793 +typedef enum {
7794 +       BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
7795 +       AT_UNIT,        /* at the unit (item_pos, unit_pos) */
7796 +       AFTER_UNIT,     /* after the unit at unit_pos */
7797 +       BEFORE_ITEM,    /* before the item at item_pos */
7798 +       AFTER_ITEM,     /* after the item at item_pos */
7799 +       INVALID_COORD,  /* presumably what coord_init_invalid() sets - confirm */
7800 +       EMPTY_NODE,     /* used when the node is empty */
7801 +} between_enum;
7802 +
7803 +/* location of coord w.r.t. its node, as returned by coord_wrt() */
7804 +typedef enum {
7805 +       COORD_ON_THE_LEFT = -1,
7806 +       COORD_ON_THE_RIGHT = +1,
7807 +       COORD_INSIDE = 0
7808 +} coord_wrt_node;
7809 +
7810 +typedef enum { /* result of coord_compare(c1, c2): where c1 lies relative to c2 */
7811 +       COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7812 +} coord_cmp;
7813 +
7814 +struct coord {
7815 +       /* node in a tree. NOTE(review): the numbers before each field look like byte offsets for 4-byte pointers - confirm */
7816 +       /*  0 */ znode *node;
7817 +
7818 +       /* position of item within node */
7819 +       /*  4 */ pos_in_node_t item_pos;
7820 +       /* position of unit within item */
7821 +       /*  6 */ pos_in_node_t unit_pos;
7822 +       /* optimization: plugin of item is stored in coord_t. Until this was
7823 +          implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7824 +          is invalidated (set to 0xff) on each modification of ->item_pos,
7825 +          and all such modifications are funneled through coord_*_item_pos()
7826 +          functions below.
7827 +        */
7828 +       /*  8 */ char iplugid;
7829 +       /* position of coord w.r.t. neighboring items and/or units.
7830 +          Values are taken from &between_enum above.
7831 +        */
7832 +       /*  9 */ char between;
7833 +       /* padding. It will be added by the compiler anyway to conform to the
7834 +        * C language alignment requirements. We keep it here to be on the
7835 +        * safe side and to have a clear picture of the memory layout of this
7836 +        * structure. */
7837 +       /* 10 */ __u16 pad;
7838 +       /* 12 */ int offset;
7839 +#if REISER4_DEBUG
7840 +       unsigned long plug_v;
7841 +       unsigned long body_v;
7842 +#endif
7843 +};
7844 +
7845 +#define INVALID_PLUGID  ((char)((1 << 8) - 1)) /* 0xff: "no plugin id stored" marker for ->iplugid */
7846 +#define INVALID_OFFSET -1 /* value coord_clear_iplug() stores into ->offset */
7847 +
7848 +static inline void coord_clear_iplug(coord_t *coord)
7849 +{
7850 +       assert("nikita-2835", coord != NULL);
7851 +       coord->iplugid = INVALID_PLUGID;        /* forget the stored item plugin id */
7852 +       coord->offset = INVALID_OFFSET; /* and mark ->offset invalid as well */
7853 +}
7854 +
7855 +static inline int coord_is_iplug_set(const coord_t *coord)
7856 +{
7857 +       assert("nikita-2836", coord != NULL);
7858 +       return coord->iplugid != INVALID_PLUGID;        /* non-zero while a plugin id is stored */
7859 +}
7860 +
7861 +static inline void coord_set_item_pos(coord_t *coord, pos_in_node_t pos)
7862 +{
7863 +       assert("nikita-2478", coord != NULL);
7864 +       coord->item_pos = pos;
7865 +       coord_clear_iplug(coord);       /* stored plugin id belonged to the old item_pos */
7866 +}
7867 +
7868 +static inline void coord_dec_item_pos(coord_t *coord)
7869 +{
7870 +       assert("nikita-2480", coord != NULL);
7871 +       --coord->item_pos;      /* no check here that item_pos is non-zero */
7872 +       coord_clear_iplug(coord);       /* stored plugin id belonged to the old item_pos */
7873 +}
7874 +
7875 +static inline void coord_inc_item_pos(coord_t *coord)
7876 +{
7877 +       assert("nikita-2481", coord != NULL);
7878 +       ++coord->item_pos;      /* no check here against the number of items */
7879 +       coord_clear_iplug(coord);       /* stored plugin id belonged to the old item_pos */
7880 +}
7881 +
7882 +static inline void coord_add_item_pos(coord_t *coord, int delta)
7883 +{
7884 +       assert("nikita-2482", coord != NULL);
7885 +       coord->item_pos += delta;       /* @delta is a signed int; no range check here */
7886 +       coord_clear_iplug(coord);       /* stored plugin id belonged to the old item_pos */
7887 +}
7888 +
7889 +static inline void coord_invalid_item_pos(coord_t *coord)
7890 +{
7891 +       assert("nikita-2832", coord != NULL);
7892 +       coord->item_pos = (unsigned short)~0;   /* all-ones value used as the invalid position */
7893 +       coord_clear_iplug(coord);
7894 +}
7895 +
7896 +/* Reverse a direction: LEFT_SIDE becomes RIGHT_SIDE, anything else LEFT_SIDE. */
7897 +static inline sideof sideof_reverse(sideof side)
7898 +{
7899 +       return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
7900 +}
7901 +
7902 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7903 +
7904 +   "first" and "last"
7905 +   "next" and "prev"
7906 +   "before" and "after"
7907 +   "leftmost" and "rightmost"
7908 +
7909 +   But I think the chosen names are decent the way they are.
7910 +*/
7911 +
7912 +/* COORD INITIALIZERS */
7913 +
7914 +/* Initialize an invalid coordinate. */
7915 +extern void coord_init_invalid(coord_t *coord, const znode * node);
7916 +
7917 +extern void coord_init_first_unit_nocheck(coord_t *coord, const znode * node);
7918 +
7919 +/* Initialize a coordinate to point at the first unit of the first item. If the
7920 +   node is empty, it is positioned at the EMPTY_NODE. */
7921 +extern void coord_init_first_unit(coord_t *coord, const znode * node);
7922 +
7923 +/* Initialize a coordinate to point at the last unit of the last item. If the
7924 +   node is empty, it is positioned at the EMPTY_NODE. */
7925 +extern void coord_init_last_unit(coord_t *coord, const znode * node);
7926 +
7927 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7928 +   positioned at the EMPTY_NODE. */
7929 +extern void coord_init_before_first_item(coord_t *coord, const znode * node);
7930 +
7931 +/* Initialize a coordinate to after the last item. If the node is empty, it is
7932 +   positioned at the EMPTY_NODE. */
7933 +extern void coord_init_after_last_item(coord_t *coord, const znode * node);
7934 +
7935 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7936 +   already to existing item */
7937 +void coord_init_after_item_end(coord_t *coord);
7938 +
7939 +/* Initialize a coordinate to before the item. Coord must be set already to
7940 +   existing item */
7941 +void coord_init_before_item(coord_t *);
7942 +/* Initialize a coordinate to after the item. Coord must be set already to
7943 +   existing item */
7944 +void coord_init_after_item(coord_t *);
7945 +
7946 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on
7947 +   sideof argument. */
7948 +extern void coord_init_sideof_unit(coord_t *coord, const znode * node,
7949 +                                  sideof dir);
7950 +
7951 +/* Initialize a coordinate by 0s. Used in places where init_coord was used and
7952 +   it was not clear how actually
7953 +   FIXME-VS: added by vs (2002, june, 8) */
7954 +extern void coord_init_zero(coord_t *coord);
7955 +
7956 +/* COORD METHODS */
7957 +
7958 +/* after shifting of node content, coord previously set properly may become
7959 +   invalid, try to "normalize" it. */
7960 +void coord_normalize(coord_t *coord);
7961 +
7962 +/* Copy a coordinate. */
7963 +extern void coord_dup(coord_t *coord, const coord_t *old_coord);
7964 +
7965 +/* Copy a coordinate without check. */
7966 +void coord_dup_nocheck(coord_t *coord, const coord_t *old_coord);
7967 +
7968 +unsigned coord_num_units(const coord_t *coord);
7969 +
7970 +/* Return the last valid unit number at the present item (i.e.,
7971 +   coord_num_units() - 1; unsigned, so it wraps if that count is ever 0). */
7972 +static inline unsigned coord_last_unit_pos(const coord_t *coord)
7973 +{
7974 +       return coord_num_units(coord) - 1;
7975 +}
7976 +
7977 +#if REISER4_DEBUG
7978 +/* For assertions only, checks for a valid coordinate. */
7979 +extern int coord_check(const coord_t *coord);
7980 +
7981 +extern unsigned long znode_times_locked(const znode * z);
7982 +
7983 +static inline void coord_update_v(coord_t *coord)
7984 +{      /* debug only: stores znode_times_locked(coord->node) into plug_v and body_v */
7985 +       coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7986 +}
7987 +#endif
7988 +
7989 +extern int coords_equal(const coord_t *c1, const coord_t *c2);
7990 +
7991 +extern void print_coord(const char *mes, const coord_t *coord, int print_node);
7992 +
7993 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if
7994 +   coord_is_after_rightmost return COORD_ON_THE_RIGHT, otherwise return
7995 +   COORD_INSIDE. */
7996 +extern coord_wrt_node coord_wrt(const coord_t *coord);
7997 +
7998 +/* Returns true if the coordinates are positioned at adjacent units, regardless
7999 +   of before-after or item boundaries. */
8000 +extern int coord_are_neighbors(coord_t *c1, coord_t *c2);
8001 +
8002 +/* Assuming two coordinates are positioned in the same node, return
8003 +   COORD_CMP_ON_RIGHT, COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's
8004 +   position relative to c2.  */
8005 +extern coord_cmp coord_compare(coord_t *c1, coord_t *c2);
8006 +
8007 +/* COORD PREDICATES */
8008 +
8009 +/* Returns true if the coord was initialized by coord_init_invalid (). */
8010 +extern int coord_is_invalid(const coord_t *coord);
8011 +
8012 +/* Returns true if the coordinate is positioned at an existing item, not before
8013 +   or after an item. It may be placed at, before, or after any unit within the
8014 +   item, whether existing or not. If this is true you can call methods of the
8015 +   item plugin.  */
8016 +extern int coord_is_existing_item(const coord_t *coord);
8017 +
8018 +/* Returns true if the coordinate is positioned after an item, before an item,
8019 +   after the last unit of an item, before the first unit of an item, or at an
8020 +   empty node. */
8021 +extern int coord_is_between_items(const coord_t *coord);
8022 +
8023 +/* Returns true if the coordinate is positioned at an existing unit, not before
8024 +   or after a unit. */
8025 +extern int coord_is_existing_unit(const coord_t *coord);
8026 +
8027 +/* Returns true if the coordinate is positioned at an empty node. */
8028 +extern int coord_is_empty(const coord_t *coord);
8029 +
8030 +/* Returns true if the coordinate is positioned at the first unit of the first
8031 +   item. Not true for empty nodes nor coordinates positioned before the first
8032 +   item. */
8033 +extern int coord_is_leftmost_unit(const coord_t *coord);
8034 +
8035 +/* Returns true if the coordinate is positioned after the last item or after the
8036 +   last unit of the last item or it is an empty node. */
8037 +extern int coord_is_after_rightmost(const coord_t *coord);
8038 +
8039 +/* Returns true if the coordinate is positioned before the first item or it is
8040 +    an empty node. */
8041 +extern int coord_is_before_leftmost(const coord_t *coord);
8042 +
8043 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending
8044 +   on sideof argument. */
8045 +extern int coord_is_after_sideof_unit(coord_t *coord, sideof dir);
8046 +
8047 +/* COORD MODIFIERS */
8048 +
8049 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
8050 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
8051 +   position is an existing unit. */
8052 +extern int coord_next_unit(coord_t *coord);
8053 +
8054 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
8055 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new
8056 +   position is an existing item. */
8057 +extern int coord_next_item(coord_t *coord);
8058 +
8059 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
8060 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new
8061 +   position is an existing unit. */
8062 +extern int coord_prev_unit(coord_t *coord);
8063 +
8064 +/* Advances the coordinate by one item to the left.  If empty, no change. If
8065 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new
8066 +   position is an existing item. */
8067 +extern int coord_prev_item(coord_t *coord);
8068 +
8069 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on
8070 +   success and non-zero if there is no position to the right. */
8071 +extern int coord_set_to_right(coord_t *coord);
8072 +
8073 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on
8074 +   success and non-zero if there is no position to the left. */
8075 +extern int coord_set_to_left(coord_t *coord);
8076 +
8077 +/* If the coordinate is at an existing unit, set to after that unit.  Returns 0
8078 +   on success and non-zero if the unit did not exist. */
8079 +extern int coord_set_after_unit(coord_t *coord);
8080 +
8081 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof
8082 +   argument. */
8083 +extern int coord_sideof_unit(coord_t *coord, sideof dir);
8084 +
8085 +/* iterate over all units in @node; @coord is evaluated on every iteration */
8086 +#define for_all_units(coord, node)                                     \
8087 +       for (coord_init_before_first_item((coord), (node)) ;            \
8088 +            coord_next_unit(coord) == 0 ;)
8089 +
8090 +/* iterate over all items in @node; @coord is evaluated on every iteration */
8091 +#define for_all_items(coord, node)                                     \
8092 +       for (coord_init_before_first_item((coord), (node)) ;            \
8093 +            coord_next_item(coord) == 0 ;)
8094 +
8095 +/* COORD/ITEM METHODS */
8096 +
8097 +extern int item_utmost_child_real_block(const coord_t *coord, sideof side,
8098 +                                       reiser4_block_nr * blk);
8099 +extern int item_utmost_child(const coord_t *coord, sideof side,
8100 +                            jnode ** child);
8101 +
8102 +/* a flow is a sequence of bytes being written to or read from the tree.  The
8103 +   tree will slice the flow into items while storing it into nodes, but all of
8104 +   that is hidden from anything outside the tree.  */
8105 +
8106 +struct flow {
8107 +       reiser4_key key;        /* key of start of flow's sequence of bytes */
8108 +       loff_t length;          /* length of flow's sequence of bytes */
8109 +       char *data;             /* start of flow's sequence of bytes; may be NULL */
8110 +       int user;               /* if 1 data is user space, 0 - kernel space */
8111 +       rw_op op;               /* presumably read vs. write; NIKITA-FIXME-HANS: comment is where?  */
8112 +};
8113 +
8114 +void move_flow_forward(flow_t *f, unsigned count);
8115 +
8116 +/* &reiser4_item_data - description of data to be inserted or pasted
8117 +
8118 +   Q: articulate the reasons for the difference between this and flow.
8119 +
8120 +   A: Besides flow we insert into tree other things: stat data, directory
8121 +   entry, etc.  To insert them into tree one has to provide this structure. If
8122 +   one is going to insert flow - he can use insert_flow, where this structure
8123 +   does not have to be created
8124 +*/
8125 +struct reiser4_item_data {
8126 +       /* actual data to be inserted. If NULL, ->create_item() will not
8127 +          do xmemcpy itself, leaving this up to the caller. This can
8128 +          save some amount of unnecessary memory copying, for example,
8129 +          during insertion of stat data.
8130 +
8131 +        */
8132 +       char *data;
8133 +       /* 1 if 'char * data' contains pointer to user space and 0 if it is
8134 +          kernel space */
8135 +       int user;
8136 +       /* amount of data we are going to insert or paste */
8137 +       int length;
8138 +       /* "Arg" is opaque data that is passed down to the
8139 +          ->create_item() method of node layout, which in turn
8140 +          hands it to the ->create_hook() of item being created. This
8141 +          arg is currently used by:
8142 +
8143 +          .  ->create_hook() of internal item
8144 +          (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8145 +          . ->paste() method of directory item.
8146 +          . ->create_hook() of extent item
8147 +
8148 +          For internal item, this is left "brother" of new node being
8149 +          inserted and it is used to add new node into sibling list
8150 +          after parent to it was just inserted into parent.
8151 +
8152 +          While ->arg does look somewhat of unnecessary complication,
8153 +          it actually saves a lot of headache in many places, because
8154 +          all data necessary to insert or paste new data into tree are
8155 +          collected in one place, and this eliminates a lot of extra
8156 +          argument passing and storing everywhere.
8157 +
8158 +        */
8159 +       void *arg;
8160 +       /* plugin of item we are inserting */
8161 +       item_plugin *iplug;
8162 +};
8163 +
8164 +/* __REISER4_COORD_H__ */
8165 +#endif
8166 +
8167 +/* Make Linus happy.
8168 +   Local variables:
8169 +   c-indentation-style: "K&R"
8170 +   mode-name: "LC"
8171 +   c-basic-offset: 8
8172 +   tab-width: 8
8173 +   fill-column: 120
8174 +   scroll-step: 1
8175 +   End:
8176 +*/
8177 diff -urN linux-2.6.35.orig/fs/reiser4/debug.c linux-2.6.35/fs/reiser4/debug.c
8178 --- linux-2.6.35.orig/fs/reiser4/debug.c        1970-01-01 01:00:00.000000000 +0100
8179 +++ linux-2.6.35/fs/reiser4/debug.c     2010-08-04 15:44:57.000000000 +0200
8180 @@ -0,0 +1,308 @@
8181 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8182 + * reiser4/README */
8183 +
8184 +/* Debugging facilities. */
8185 +
8186 +/*
8187 + * This file contains generic debugging functions used by reiser4. Roughly
8188 + * following:
8189 + *
8190 + *     panicking: reiser4_do_panic(), reiser4_print_prefix().
8191 + *
8192 + *     locking:
8193 + *     reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8194 + *     reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8195 + *
8196 + *     error code monitoring (see comment before RETERR macro):
8197 + *     reiser4_return_err(), reiser4_report_err().
8198 + *
8199 + *     stack back-tracing: fill_backtrace()
8200 + *
8201 + *     miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8202 + *     reiser4_debugtrap().
8203 + *
8204 + */
8205 +
8206 +#include "reiser4.h"
8207 +#include "context.h"
8208 +#include "super.h"
8209 +#include "txnmgr.h"
8210 +#include "znode.h"
8211 +
8212 +#include <linux/sysfs.h>
8213 +#include <linux/slab.h>
8214 +#include <linux/types.h>
8215 +#include <linux/fs.h>
8216 +#include <linux/spinlock.h>
8217 +#include <linux/kallsyms.h>
8218 +#include <linux/vmalloc.h>
8219 +#include <linux/ctype.h>
8220 +#include <linux/sysctl.h>
8221 +#include <linux/hardirq.h>
8222 +
8223 +#if 0
8224 +#if REISER4_DEBUG
8225 +static void reiser4_report_err(void);
8226 +#else
8227 +#define reiser4_report_err() noop
8228 +#endif
8229 +#endif  /*  0  */
8230 +
8231 +/*
8232 + * global buffer where message given to reiser4_panic is formatted.
8233 + */
8234 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8235 +
8236 +/*
8237 + * lock protecting consistency of panic_buf under concurrent panics
8238 + */
8239 +static DEFINE_SPINLOCK(panic_guard);
8240 +
8241 +/* Formats the message into panic_buf, printk()s it and calls panic().  This is
8242 +    called by fs/reiser4/debug.h:reiser4_panic(). */
8243 +void reiser4_do_panic(const char *format/* format string */ , ... /* rest */)
8244 +{
8245 +       static int in_panic = 0;        /* set on first entry; a nested call skips straight to panic() */
8246 +       va_list args;
8247 +
8248 +       /*
8249 +        * check for recursive panic.
8250 +        */
8251 +       if (in_panic == 0) {
8252 +               in_panic = 1;
8253 +
8254 +               spin_lock(&panic_guard);
8255 +               va_start(args, format);
8256 +               vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8257 +               va_end(args);
8258 +               printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8259 +               spin_unlock(&panic_guard);
8260 +
8261 +               /*
8262 +                * if kernel debugger is configured---drop in. Early dropping
8263 +                * into kgdb is not always convenient, because panic message
8264 +                * is not yet printed most of the times. But:
8265 +                *
8266 +                *     (1) message can be extracted from printk_buf[]
8267 +                *     (declared static inside of printk()), and
8268 +                *
8269 +                *     (2) sometimes serial/kgdb combo dies while printing
8270 +                *     long panic message, so it's more prudent to break into
8271 +                *     debugger earlier.
8272 +                *
8273 +                */
8274 +               DEBUGON(1);
8275 +       }
8276 +       /* to make gcc happy about noreturn attribute */
8277 +       panic("%s", panic_buf);
8278 +}
8279 +
8280 +#if 0
8281 +void
8282 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8283 +                    const char *function, const char *file, int lineno)
8284 +{
8285 +       const char *comm;
8286 +       int pid;
8287 +
8288 +       if (unlikely(in_interrupt() || in_irq())) {
8289 +               comm = "interrupt";
8290 +               pid = 0;
8291 +       } else {
8292 +               comm = current->comm;
8293 +               pid = current->pid;
8294 +       }
8295 +       printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8296 +              level, comm, pid, function, file, lineno, mid);
8297 +       if (reperr)
8298 +               reiser4_report_err();
8299 +}
8300 +#endif  /*  0  */
8301 +
8302 +/* Preemption point: this should be called periodically during long running
8303 +   operations (carry, allocate, and squeeze are best examples) */
8304 +int reiser4_preempt_point(void)
8305 +{
8306 +       assert("nikita-3008", reiser4_schedulable());
8307 +       cond_resched();
8308 +       return signal_pending(current);
8309 +}
8310 +
8311 +#if REISER4_DEBUG
8312 +/* Debugging aid: return struct where information about locks taken by current
8313 +   thread is accumulated. This can be used to formulate lock ordering
8314 +   constraints and various assertions.
8315 +
8316 +*/
8317 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
8318 +{
8319 +       reiser4_context *ctx = get_current_context();
8320 +       assert("jmacd-1123", ctx != NULL);
8321 +       return &ctx->locks;
8322 +}
8323 +
8324 +/*
8325 + * print all lock counters of @info in human readable form, tagged by @prefix.
8326 + */
8327 +static void print_lock_counters(const char *prefix,
8328 +                               const reiser4_lock_cnt_info * info)
8329 +{
8330 +       printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8331 +              "jload: %i, "
8332 +              "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8333 +              "ktxnmgrd: %i, fq: %i\n"
8334 +              "inode: %i, "
8335 +              "cbk_cache: %i (r:%i,w:%i), "
8336 +              "eflush: %i, "
8337 +              "zlock: %i,\n"
8338 +              "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8339 +              "d: %i, x: %i, t: %i\n", prefix,
8340 +              info->spin_locked_jnode,
8341 +              info->rw_locked_tree, info->read_locked_tree,
8342 +              info->write_locked_tree,
8343 +              info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8344 +              info->spin_locked_jload,
8345 +              info->spin_locked_txnh,
8346 +              info->spin_locked_atom, info->spin_locked_stack,
8347 +              info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8348 +              info->spin_locked_fq,
8349 +              info->spin_locked_inode,
8350 +              info->rw_locked_cbk_cache,
8351 +              info->read_locked_cbk_cache,
8352 +              info->write_locked_cbk_cache,
8353 +              info->spin_locked_super_eflush,
8354 +              info->spin_locked_zlock,
8355 +              info->spin_locked,
8356 +              info->long_term_locked_znode,
8357 +              info->inode_sem_r, info->inode_sem_w,
8358 +              info->d_refs, info->x_refs, info->t_refs);
8359 +}
8360 +
8361 +/* check that no spinlocks are held */
8362 +int reiser4_schedulable(void)
8363 +{
8364 +       if (get_current_context_check() != NULL) {
8365 +               if (!LOCK_CNT_NIL(spin_locked)) {
8366 +                       print_lock_counters("in atomic", reiser4_lock_counters());
8367 +                       return 0;
8368 +               }
8369 +       }
8370 +       might_sleep();
8371 +       return 1;
8372 +}
8373 +/*
8374 + * return true, iff no locks are held.
8375 + */
8376 +int reiser4_no_counters_are_held(void)
8377 +{
8378 +       reiser4_lock_cnt_info *counters;
8379 +
8380 +       counters = reiser4_lock_counters();
8381 +       return
8382 +           (counters->spin_locked_zlock == 0) &&
8383 +           (counters->spin_locked_jnode == 0) &&
8384 +           (counters->rw_locked_tree == 0) &&
8385 +           (counters->read_locked_tree == 0) &&
8386 +           (counters->write_locked_tree == 0) &&
8387 +           (counters->rw_locked_dk == 0) &&
8388 +           (counters->read_locked_dk == 0) &&
8389 +           (counters->write_locked_dk == 0) &&
8390 +           (counters->spin_locked_txnh == 0) &&
8391 +           (counters->spin_locked_atom == 0) &&
8392 +           (counters->spin_locked_stack == 0) &&
8393 +           (counters->spin_locked_txnmgr == 0) &&
8394 +           (counters->spin_locked_inode == 0) &&
8395 +           (counters->spin_locked == 0) &&
8396 +           (counters->long_term_locked_znode == 0) &&
8397 +           (counters->inode_sem_r == 0) &&
8398 +           (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8399 +}
8400 +
8401 +/*
8402 + * return true, iff transaction commit can be done under locks held by the
8403 + * current thread.
8404 + */
8405 +int reiser4_commit_check_locks(void)
8406 +{
8407 +       reiser4_lock_cnt_info *counters;
8408 +       int inode_sem_r;
8409 +       int inode_sem_w;
8410 +       int result;
8411 +
8412 +       /*
8413 +        * inode's read/write semaphore is the only reiser4 lock that can be
8414 +        * held during commit.
8415 +        */
8416 +
8417 +       counters = reiser4_lock_counters();
8418 +       inode_sem_r = counters->inode_sem_r;
8419 +       inode_sem_w = counters->inode_sem_w;
8420 +
8421 +       counters->inode_sem_r = counters->inode_sem_w = 0;
8422 +       result = reiser4_no_counters_are_held();
8423 +       counters->inode_sem_r = inode_sem_r;
8424 +       counters->inode_sem_w = inode_sem_w;
8425 +       return result;
8426 +}
8427 +
8428 +/*
8429 + * fill "error site" in the current reiser4 context. See comment before RETERR
8430 + * macro for more details.
8431 + */
8432 +void reiser4_return_err(int code, const char *file, int line)
8433 +{
8434 +       if (code < 0 && is_in_reiser4_context()) {
8435 +               reiser4_context *ctx = get_current_context();
8436 +
8437 +               if (ctx != NULL) {
8438 +                       ctx->err.code = code;
8439 +                       ctx->err.file = file;
8440 +                       ctx->err.line = line;
8441 +               }
8442 +       }
8443 +}
8444 +
8445 +#if 0
8446 +/*
8447 + * report error information recorded by reiser4_return_err().
8448 + */
8449 +static void reiser4_report_err(void)
8450 +{
8451 +       reiser4_context *ctx = get_current_context_check();
8452 +
8453 +       if (ctx != NULL) {
8454 +               if (ctx->err.code != 0) {
8455 +                       printk("code: %i at %s:%i\n",
8456 +                              ctx->err.code, ctx->err.file, ctx->err.line);
8457 +               }
8458 +       }
8459 +}
8460 +#endif  /*  0  */
8461 +
8462 +#endif                         /* REISER4_DEBUG */
8463 +
8464 +#if KERNEL_DEBUGGER
8465 +
8466 +/*
8467 + * this function just drops into kernel debugger. It is a convenient place to
8468 + * put breakpoint in.
8469 + */
8470 +void reiser4_debugtrap(void)
8471 +{
8472 +       /* do nothing. Put break point here. */
8473 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8474 +       extern void kgdb_breakpoint(void);
8475 +       kgdb_breakpoint();
8476 +#endif
8477 +}
8478 +#endif
8479 +
8480 +/* Make Linus happy.
8481 +   Local variables:
8482 +   c-indentation-style: "K&R"
8483 +   mode-name: "LC"
8484 +   c-basic-offset: 8
8485 +   tab-width: 8
8486 +   fill-column: 120
8487 +   End:
8488 +*/
8489 diff -urN linux-2.6.35.orig/fs/reiser4/debug.h linux-2.6.35/fs/reiser4/debug.h
8490 --- linux-2.6.35.orig/fs/reiser4/debug.h        1970-01-01 01:00:00.000000000 +0100
8491 +++ linux-2.6.35/fs/reiser4/debug.h     2010-08-04 15:44:57.000000000 +0200
8492 @@ -0,0 +1,351 @@
8493 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8494 +   reiser4/README */
8495 +
8496 +/* Declarations of debug macros. */
8497 +
8498 +#if !defined(__FS_REISER4_DEBUG_H__)
8499 +#define __FS_REISER4_DEBUG_H__
8500 +
8501 +#include "forward.h"
8502 +#include "reiser4.h"
8503 +
8504 +/* generic function to produce formatted output, decorating it with
8505 +   whatever standard prefixes/postfixes we want. "Fun" is a function
8506 +   that will be actually called, can be printk, panic etc.
8507 +   This is for use by other debugging macros, not by users. */
8508 +#define DCALL(lev, fun, reperr, label, format, ...)                    \
8509 +({                                                                     \
8510 +       fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,   \
8511 +           current->comm, current->pid, __FUNCTION__,                  \
8512 +           __FILE__, __LINE__, label, ## __VA_ARGS__);                 \
8513 +})
8514 +
8515 +/*
8516 + * cause kernel to crash
8517 + */
8518 +#define reiser4_panic(mid, format, ...)                                \
8519 +       DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8520 +
8521 +/* print message with indication of current process, file, line and
8522 +   function */
8523 +#define reiser4_log(label, format, ...)                                \
8524 +       DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8525 +
8526 +/* Assertion checked during compilation.
8527 +    If "cond" is false (0) we get duplicate case label in switch.
8528 +    Use this to check something like famous
8529 +       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8530 +    in 3.x journal.c. If cassertion fails you get compiler error,
8531 +    so no "maintainer-id".
8532 +*/
8533 +#define cassert(cond) ({ switch (-1) { case (cond): case 0: break; } })
8534 +
8535 +#define noop   do {; } while (0)
8536 +
8537 +#if REISER4_DEBUG
8538 +/* version of info that only actually prints anything when _d_ebugging
8539 +    is on */
8540 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8541 +/* macro to catch logical errors. Put it into `default' clause of
8542 +    switch() statement. */
8543 +#define impossible(label, format, ...)                         \
8544 +       reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8545 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8546 +   called. Use this for checking logical consistency and _never_ call
8547 +   this to check correctness of external data: disk blocks and user-input . */
8548 +#define assert(label, cond)                                            \
8549 +({                                                                     \
8550 +       /* call_on_each_assert(); */                                    \
8551 +       if (cond) {                                                     \
8552 +               /* put negated check to avoid using !(cond) that would lose \
8553 +                * warnings for things like assert(a = b); */           \
8554 +               ;                                                       \
8555 +       } else {                                                        \
8556 +               DEBUGON(1);                                             \
8557 +               reiser4_panic(label, "assertion failed: %s", #cond);    \
8558 +       }                                                               \
8559 +})
8560 +
8561 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8562 +#define check_me(label, expr)  assert(label, (expr))
8563 +
8564 +#define ON_DEBUG(exp) exp
8565 +
8566 +extern int reiser4_schedulable(void);
8567 +extern void call_on_each_assert(void);
8568 +
8569 +#else
8570 +
8571 +#define dinfo(format, args...) noop
8572 +#define impossible(label, format, args...) noop
8573 +#define assert(label, cond) noop
8574 +#define check_me(label, expr)  ((void) (expr))
8575 +#define ON_DEBUG(exp)
8576 +#define reiser4_schedulable() might_sleep()
8577 +
8578 +/* REISER4_DEBUG */
8579 +#endif
8580 +
8581 +#if REISER4_DEBUG
8582 +/* per-thread information about lock acquired by this thread. Used by lock
8583 + * ordering checking in spin_macros.h */
8584 +typedef struct reiser4_lock_cnt_info {
8585 +       int rw_locked_tree;
8586 +       int read_locked_tree;
8587 +       int write_locked_tree;
8588 +
8589 +       int rw_locked_dk;
8590 +       int read_locked_dk;
8591 +       int write_locked_dk;
8592 +
8593 +       int rw_locked_cbk_cache;
8594 +       int read_locked_cbk_cache;
8595 +       int write_locked_cbk_cache;
8596 +
8597 +       int spin_locked_zlock;
8598 +       int spin_locked_jnode;
8599 +       int spin_locked_jload;
8600 +       int spin_locked_txnh;
8601 +       int spin_locked_atom;
8602 +       int spin_locked_stack;
8603 +       int spin_locked_txnmgr;
8604 +       int spin_locked_ktxnmgrd;
8605 +       int spin_locked_fq;
8606 +       int spin_locked_inode;
8607 +       int spin_locked_super_eflush;
8608 +       int spin_locked;
8609 +       int long_term_locked_znode;
8610 +
8611 +       int inode_sem_r;
8612 +       int inode_sem_w;
8613 +
8614 +       int d_refs;
8615 +       int x_refs;
8616 +       int t_refs;
8617 +} reiser4_lock_cnt_info;
8618 +
8619 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8620 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8621 +
8622 +/* increment lock-counter @counter, if present */
8623 +#define LOCK_CNT_INC(counter)                                  \
8624 +       IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8625 +
8626 +/* decrement lock-counter @counter, if present */
8627 +#define LOCK_CNT_DEC(counter)                                  \
8628 +       IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8629 +
8630 +/* check that lock-counter is zero. This is for use in assertions */
8631 +#define LOCK_CNT_NIL(counter)                                  \
8632 +       IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8633 +
8634 +/* check that lock-counter is greater than zero. This is for use in
8635 + * assertions */
8636 +#define LOCK_CNT_GTZ(counter)                                  \
8637 +       IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8638 +#define LOCK_CNT_LT(counter,n)  /* counter < @n; 1 if no context */  \
8639 +       IN_CONTEXT(reiser4_lock_counters()->counter < (n), 1)
8640 +
8641 +#else                          /* REISER4_DEBUG */
8642 +
8643 +/* no-op versions of the above */
8644 +
8645 +typedef struct reiser4_lock_cnt_info {
8646 +} reiser4_lock_cnt_info;
8647 +
8648 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8649 +#define LOCK_CNT_INC(counter) noop
8650 +#define LOCK_CNT_DEC(counter) noop
8651 +#define LOCK_CNT_NIL(counter) (1)
8652 +#define LOCK_CNT_GTZ(counter) (1)
8653 +#define LOCK_CNT_LT(counter, n) (1)
8654 +
8655 +#endif                         /* REISER4_DEBUG */
8656 +
8657 +#define assert_spin_not_locked(lock) BUG_ON(0)
8658 +#define assert_rw_write_locked(lock) BUG_ON(0)
8659 +#define assert_rw_read_locked(lock) BUG_ON(0)
8660 +#define assert_rw_locked(lock) BUG_ON(0)
8661 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8662 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8663 +#define assert_rw_not_locked(lock) BUG_ON(0)
8664 +
8665 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
8666 +   option. */
8667 +typedef enum {
8668 +       /* print a lot of information during panic. When this is on all jnodes
8669 +        * are listed. This can be *very* large output. Usually you don't want
8670 +        * this. Especially over serial line. */
8671 +       REISER4_VERBOSE_PANIC = 0x00000001,
8672 +       /* print a lot of information during umount */
8673 +       REISER4_VERBOSE_UMOUNT = 0x00000002,
8674 +       /* print gathered statistics on umount */
8675 +       REISER4_STATS_ON_UMOUNT = 0x00000004,
8676 +       /* check node consistency */
8677 +       REISER4_CHECK_NODE = 0x00000008
8678 +} reiser4_debug_flags;
8679 +
8680 +extern int is_in_reiser4_context(void);
8681 +
8682 +/*
8683 + * evaluate expression @e only if with reiser4 context
8684 + */
8685 +#define ON_CONTEXT(e)  do {                    \
8686 +       if (is_in_reiser4_context()) {          \
8687 +               e;                              \
8688 +       } } while (0)
8689 +
8690 +/*
8691 + * evaluate expression @e only when within reiser4_context and debugging is
8692 + * on.
8693 + */
8694 +#define ON_DEBUG_CONTEXT(e) ON_DEBUG(ON_CONTEXT(e))
8695 +
8696 +/*
8697 + * complain about unexpected function result and crash. Used in "default"
8698 + * branches of switch statements and alike to assert that invalid results are
8699 + * not silently ignored.
8700 + */
8701 +#define wrong_return_value(label, function)                            \
8702 +       impossible(label, "wrong return value from " function)
8703 +
8704 +/* Issue different types of reiser4 messages to the console */
8705 +#define warning(label, format, ...)                                    \
8706 +       DCALL(KERN_WARNING,                                             \
8707 +              printk, 1, label, "WARNING: " format , ## __VA_ARGS__)
8708 +#define notice(label, format, ...)                                     \
8709 +       DCALL(KERN_NOTICE,                                              \
8710 +              printk, 1, label, "NOTICE: " format , ## __VA_ARGS__)
8711 +
8712 +/* mark not yet implemented functionality */
8713 +#define not_yet(label, format, ...)                            \
8714 +       reiser4_panic(label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__)
8715 +
8716 +extern void reiser4_do_panic(const char *format, ...)
8717 +    __attribute__ ((noreturn, format(printf, 1, 2)));
8718 +
8719 +extern int reiser4_preempt_point(void);
8720 +extern void reiser4_print_stats(void);
8721 +
8722 +#if REISER4_DEBUG
8723 +extern int reiser4_no_counters_are_held(void);
8724 +extern int reiser4_commit_check_locks(void);
8725 +#else
8726 +#define reiser4_no_counters_are_held() (1)
8727 +#define reiser4_commit_check_locks() (1)
8728 +#endif
8729 +
8730 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8731 +#define IS_POW(i)                              \
8732 +({                                             \
8733 +       typeof(i) __i;                          \
8734 +                                               \
8735 +       __i = (i);                              \
8736 +       !(__i & (__i - 1));                     \
8737 +})
8738 +
8739 +#define KERNEL_DEBUGGER (1)
8740 +
8741 +#if KERNEL_DEBUGGER
8742 +
8743 +extern void reiser4_debugtrap(void);
8744 +
8745 +/*
8746 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8747 + * kgdb is not compiled in, do nothing.
8748 + */
8749 +#define DEBUGON(cond)                                  \
8750 +({                                                     \
8751 +       if (unlikely(cond))                             \
8752 +               reiser4_debugtrap();                    \
8753 +})
8754 +#else
8755 +#define DEBUGON(cond) noop
8756 +#endif
8757 +
8758 +/*
8759 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8760 + *
8761 + * Suppose some strange and/or unexpected code is returned from some function
8762 + * (for example, write(2) returns -EEXIST). It is possible to place a
8763 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8764 + * in what particular place -EEXIST was generated first?
8765 + *
8766 + * In reiser4 all places where actual error codes are produced (that is,
8767 + * statements of the form
8768 + *
8769 + *     return -EFOO;        // (1), or
8770 + *
8771 + *     result = -EFOO;      // (2)
8772 + *
8773 + * are replaced with
8774 + *
8775 + *     return RETERR(-EFOO);        // (1a), and
8776 + *
8777 + *     result = RETERR(-EFOO);      // (2a) respectively
8778 + *
8779 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8780 + * printed in error and warning messages. Moreover, it's possible to put a
8781 + * conditional breakpoint in reiser4_return_err (low-level function called
8782 + * by RETERR() to do the actual work) to break into debugger immediately
8783 + * when particular error happens.
8784 + *
8785 + */
8786 +
8787 +#if REISER4_DEBUG
8788 +
8789 +/*
8790 + * data-type to store information about where error happened ("error site").
8791 + */
8792 +typedef struct err_site {
8793 +       int code;               /* error code */
8794 +       const char *file;       /* source file, filled by __FILE__ */
8795 +       int line;               /* source file line, filled by __LINE__ */
8796 +} err_site;
8797 +
8798 +extern void reiser4_return_err(int code, const char *file, int line);
8799 +
8800 +/*
8801 + * fill &get_current_context()->err with error information.
8802 + */
8803 +#define RETERR(code)                                   \
8804 +({                                                     \
8805 +       typeof(code) __code;                            \
8806 +                                                       \
8807 +       __code = (code);                                \
8808 +       reiser4_return_err(__code, __FILE__, __LINE__); \
8809 +       __code;                                         \
8810 +})
8811 +
8812 +#else
8813 +
8814 +/*
8815 + * no-op versions of the above: RETERR() yields its parenthesized argument
8816 + */
8817 +
8818 +typedef struct err_site {
8819 +} err_site;
8820 +#define RETERR(code) (code)
8821 +#endif
8822 +
8823 +#if REISER4_LARGE_KEY
8824 +/*
8825 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8826 + */
8827 +#define ON_LARGE_KEY(...) __VA_ARGS__
8828 +#else
8829 +#define ON_LARGE_KEY(...)
8830 +#endif
8831 +
8832 +/* __FS_REISER4_DEBUG_H__ */
8833 +#endif
8834 +
8835 +/* Make Linus happy.
8836 +   Local variables:
8837 +   c-indentation-style: "K&R"
8838 +   mode-name: "LC"
8839 +   c-basic-offset: 8
8840 +   tab-width: 8
8841 +   fill-column: 120
8842 +   End:
8843 +*/
8844 diff -urN linux-2.6.35.orig/fs/reiser4/dformat.h linux-2.6.35/fs/reiser4/dformat.h
8845 --- linux-2.6.35.orig/fs/reiser4/dformat.h      1970-01-01 01:00:00.000000000 +0100
8846 +++ linux-2.6.35/fs/reiser4/dformat.h   2010-08-04 15:44:57.000000000 +0200
8847 @@ -0,0 +1,71 @@
8848 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8849 +   reiser4/README */
8850 +
8851 +/* Formats of on-disk data and conversion functions. */
8852 +
8853 +/* put all item formats in the files describing the particular items,
8854 +   our model is, everything you need to do to add an item to reiser4,
8855 +   (excepting the changes to the plugin that uses the item which go
8856 +   into the file defining that plugin), you put into one file. */
8857 +/* Data on disk are stored in little-endian format.
8858 +   To declare fields of on-disk structures, use d8, d16, d32 and d64.
8859 +   d??tocpu() and cputod??() to convert. */
8860 +
8861 +#if !defined(__FS_REISER4_DFORMAT_H__)
8862 +#define __FS_REISER4_DFORMAT_H__
8863 +
8864 +#include <asm/byteorder.h>
8865 +#include <asm/unaligned.h>
8866 +#include <linux/types.h>
8867 +
8868 +typedef __u8 d8;
8869 +typedef __le16 d16;
8870 +typedef __le32 d32;
8871 +typedef __le64 d64;
8872 +
8873 +#define PACKED __attribute__((packed))
8874 +
8875 +/* data-type for block number */
8876 +typedef __u64 reiser4_block_nr;
8877 +
8878 +/* data-type for block number on disk, disk format */
8879 +typedef __le64 reiser4_dblock_nr;
8880 +
8881 +/**
8882 + * disk_addr_eq - compare disk addresses
8883 + * @b1: pointer to block number to compare
8884 + * @b2: pointer to block number to compare
8885 + *
8886 + * Returns true if disk addresses are the same
8887 + */
8888 +static inline int disk_addr_eq(const reiser4_block_nr * b1,
8889 +                              const reiser4_block_nr * b2)
8890 +{
8891 +       assert("nikita-1033", b1 != NULL);
8892 +       assert("nikita-1266", b2 != NULL);
8893 +
8894 +       return !memcmp(b1, b2, sizeof *b1);
8895 +}
8896 +
8897 +/* structure of master reiser4 super block */
8898 +typedef struct reiser4_master_sb {
8899 +       char magic[16];         /* "ReIsEr4" */
8900 +       __le16 disk_plugin_id;  /* id of disk layout plugin */
8901 +       __le16 blocksize;
8902 +       char uuid[16];          /* unique id */
8903 +       char label[16];         /* filesystem label */
8904 +       __le64 diskmap;         /* location of the diskmap. 0 if not present */
8905 +} reiser4_master_sb;
8906 +
8907 +/* __FS_REISER4_DFORMAT_H__ */
8908 +#endif
8909 +
8910 +/*
8911 + * Local variables:
8912 + * c-indentation-style: "K&R"
8913 + * mode-name: "LC"
8914 + * c-basic-offset: 8
8915 + * tab-width: 8
8916 + * fill-column: 79
8917 + * End:
8918 + */
8919 diff -urN linux-2.6.35.orig/fs/reiser4/dscale.c linux-2.6.35/fs/reiser4/dscale.c
8920 --- linux-2.6.35.orig/fs/reiser4/dscale.c       1970-01-01 01:00:00.000000000 +0100
8921 +++ linux-2.6.35/fs/reiser4/dscale.c    2010-08-04 15:44:57.000000000 +0200
8922 @@ -0,0 +1,192 @@
8923 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8924 + * reiser4/README */
8925 +
8926 +/* Scalable on-disk integers */
8927 +
8928 +/*
8929 + * Various on-disk structures contain integer-like structures. Stat-data
8930 + * contain [yes, "data" is plural, check the dictionary] file size, link
8931 + * count; extent unit contains extent width etc. To accommodate for general
8932 + * case enough space is reserved to keep largest possible value. 64 bits in
8933 + * all cases above. But in overwhelming majority of cases numbers actually
8934 + * stored in these fields will be comparatively small and reserving 8 bytes is
8935 + * a waste of precious disk bandwidth.
8936 + *
8937 + * Scalable integers are one way to solve this problem. dscale_write()
8938 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8939 + * depending on the magnitude of the value supplied. dscale_read() reads value
8940 + * previously stored by dscale_write().
8941 + *
8942 + * dscale_write() produces format not completely unlike UTF: two highest
8943 + * bits of the first byte are used to store "tag". One of 4 possible tag
8944 + * values is chosen depending on the number being encoded:
8945 + *
8946 + *           0 ... 0x3f               => 0           [table 1]
8947 + *        0x40 ... 0x3fff             => 1
8948 + *      0x4000 ... 0x3fffffff         => 2
8949 + *  0x40000000 ... 0xffffffffffffffff => 3
8950 + *
8951 + * (see dscale_range() function)
8952 + *
8953 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8954 + * to be stored, so in this case there is no place in the first byte to store
8955 + * tag. For such values tag is stored in an extra 9th byte.
8956 + *
8957 + * As _highest_ bits are used for the test (which is natural) scaled integers
8958 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8959 + * uses LITTLE-ENDIAN.
8960 + *
8961 + */
8962 +
8963 +#include "debug.h"
8964 +#include "dscale.h"
8965 +
8966 +/* return tag of scaled integer stored at @address */
8967 +static int gettag(const unsigned char *address)
8968 +{
8969 +       /* tag is stored in two highest bits */
8970 +       return (*address) >> 6;
8971 +}
8972 +
8973 +/* Clear the tag bits embedded into @value, which was decoded with tag @tag. */
8974 +static void cleartag(__u64 *value, int tag)
8975 +{
8976 +       /*
8977 +        * W-w-what ?!
8978 +        *
8979 +        * Actually, this is rather simple: @value passed here was read by
8980 +        * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8981 +        * zeroes. Tag is still stored in the highest (arithmetically)
8982 +        * non-zero bits of @value, but relative position of tag within __u64
8983 +        * depends on @tag.
8984 +        *
8985 +        * For example if @tag is 0, it's stored in 2 highest bits of lowest
8986 +        * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8987 +        *
8988 +        * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8989 +        * and its offset is (2 * 8) - 2 == 14 bits.
8990 +        *
8991 +        * See table 1 above for details.
8992 +        *
8993 +        * One formula covers all cases (in __u64: 3 << 30 overflows int):
8994 +        */
8995 +       *value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
8996 +       /*
8997 +        * That is, clear two (3 == 0b11) bits at the offset
8998 +        *
8999 +        *                  8 * (2 ^ tag) - 2,
9000 +        *
9001 +        * that is, two highest bits of (2 ^ tag)-th byte of @value.
9002 +        */
9003 +}
9004 +
9005 +/* return tag for @value. See table 1 above for details. */
9006 +static int dscale_range(__u64 value)
9007 +{
9008 +       if (value > 0x3fffffff)
9009 +               return 3;
9010 +       if (value > 0x3fff)
9011 +               return 2;
9012 +       if (value > 0x3f)
9013 +               return 1;
9014 +       return 0;
9015 +}
9016 +
9017 +/* restore value stored at @address by dscale_write() and return number of
9018 + * bytes consumed */
9019 +int dscale_read(unsigned char *address, __u64 *value)
9020 +{
9021 +       int tag;
9022 +
9023 +       /* read tag */
9024 +       tag = gettag(address);
9025 +       switch (tag) {
9026 +       case 3:
9027 +               /* In this case tag is stored in an extra byte, skip this byte
9028 +                * and decode value stored in the next 8 bytes.*/
9029 +               *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9030 +               /* worst case: 8 bytes for value itself plus one byte for
9031 +                * tag. */
9032 +               return 9;
9033 +       case 0:
9034 +               *value = get_unaligned(address);
9035 +               break;
9036 +       case 1:
9037 +               *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9038 +               break;
9039 +       case 2:
9040 +               *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9041 +               break;
9042 +       default:
9043 +               return RETERR(-EIO);
9044 +       }
9045 +       /* clear tag embedded into @value */
9046 +       cleartag(value, tag);
9047 +       /* number of bytes consumed is (2 ^ tag)---see table 1. */
9048 +       return 1 << tag;
9049 +}
9050 +
9051 +/* number of bytes dscale_read() would consume at @address, judging by its tag */
9052 +int dscale_bytes_to_read(unsigned char *address)
9053 +{
9054 +       int tag;
9055 +
9056 +       tag = gettag(address);
9057 +       switch (tag) {
9058 +       case 0:
9059 +       case 1:
9060 +       case 2:
9061 +               return 1 << tag;
9062 +       case 3:
9063 +               return 9;       /* 8 bytes of value plus one byte of tag */
9064 +       default:
9065 +               return RETERR(-EIO);
9066 +       }
9067 +}
9068 +
9069 +/* store @value at @address and return number of bytes consumed */
9070 +int dscale_write(unsigned char *address, __u64 value)
9071 +{
9072 +       int tag, shift;
9073 +       __be64 v;
9074 +       unsigned char *valarr;
9075 +
9076 +       tag = dscale_range(value);
9077 +       v = __cpu_to_be64(value);
9078 +       valarr = (unsigned char *)&v;
9079 +       shift = (tag == 3) ? 1 : 0;
9080 +       memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9081 +       /* tag 3 has its own byte that memcpy() skipped: don't OR into stale data */
9082 +       *address = (shift ? 0 : *address) | (tag << 6);
9083 +       return shift + (1 << tag);
9084 +}
9085 +
9086 +/* number of bytes dscale_write() needs to store @value (1, 2, 4 or 9) */
9087 +int dscale_bytes_to_write(__u64 value)
9088 +{
9089 +       int bytes;
9090 +
9091 +       bytes = 1 << dscale_range(value);
9092 +       if (bytes == 8)         /* tag 3: tag occupies an extra byte */
9093 +               ++bytes;
9094 +       return bytes;
9095 +}
9096 +
9097 +/* returns true if @value and @other require the same number of bytes to be
9098 + * stored. Used to detect when data structure (like stat-data) has to be
9099 + * expanded or contracted. */
9100 +int dscale_fit(__u64 value, __u64 other)
9101 +{
9102 +       return dscale_range(value) == dscale_range(other);
9103 +}
9104 +
9105 +/* Make Linus happy.
9106 +   Local variables:
9107 +   c-indentation-style: "K&R"
9108 +   mode-name: "LC"
9109 +   c-basic-offset: 8
9110 +   tab-width: 8
9111 +   fill-column: 120
9112 +   scroll-step: 1
9113 +   End:
9114 +*/
9115 diff -urN linux-2.6.35.orig/fs/reiser4/dscale.h linux-2.6.35/fs/reiser4/dscale.h
9116 --- linux-2.6.35.orig/fs/reiser4/dscale.h       1970-01-01 01:00:00.000000000 +0100
9117 +++ linux-2.6.35/fs/reiser4/dscale.h    2010-08-04 15:44:57.000000000 +0200
9118 @@ -0,0 +1,28 @@
9119 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9120 + * reiser4/README */
9121 +
9122 +/* Scalable on-disk integers. See dscale.c for details. */
9123 +
9124 +#if !defined(__FS_REISER4_DSCALE_H__)
9125 +#define __FS_REISER4_DSCALE_H__
9126 +
9127 +#include "dformat.h"
9128 +
9129 +extern int dscale_read(unsigned char *address, __u64 *value);
9130 +extern int dscale_write(unsigned char *address, __u64 value);
9131 +extern int dscale_bytes_to_read(unsigned char *address);
9132 +extern int dscale_bytes_to_write(__u64 value);
9133 +extern int dscale_fit(__u64 value, __u64 other);
9134 +
9135 +/* __FS_REISER4_DSCALE_H__ */
9136 +#endif
9137 +
9138 +/* Make Linus happy.
9139 +   Local variables:
9140 +   c-indentation-style: "K&R"
9141 +   mode-name: "LC"
9142 +   c-basic-offset: 8
9143 +   tab-width: 8
9144 +   fill-column: 120
9145 +   End:
9146 +*/
9147 diff -urN linux-2.6.35.orig/fs/reiser4/entd.c linux-2.6.35/fs/reiser4/entd.c
9148 --- linux-2.6.35.orig/fs/reiser4/entd.c 1970-01-01 01:00:00.000000000 +0100
9149 +++ linux-2.6.35/fs/reiser4/entd.c      2010-08-04 16:58:38.000000000 +0200
9150 @@ -0,0 +1,337 @@
9151 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9152 + * reiser4/README */
9153 +
9154 +/* Ent daemon. */
9155 +
9156 +#include "debug.h"
9157 +#include "txnmgr.h"
9158 +#include "tree.h"
9159 +#include "entd.h"
9160 +#include "super.h"
9161 +#include "context.h"
9162 +#include "reiser4.h"
9163 +#include "vfs_ops.h"
9164 +#include "page_cache.h"
9165 +#include "inode.h"
9166 +
9167 +#include <linux/sched.h>       /* struct task_struct */
9168 +#include <linux/suspend.h>
9169 +#include <linux/kernel.h>
9170 +#include <linux/writeback.h>
9171 +#include <linux/time.h>                /* INITIAL_JIFFIES */
9172 +#include <linux/backing-dev.h> /* bdi_write_congested */
9173 +#include <linux/wait.h>
9174 +#include <linux/kthread.h>
9175 +#include <linux/freezer.h>
9176 +
9177 +#define DEF_PRIORITY 12
9178 +#define MAX_ENTD_ITERS 10
9179 +
9180 +static void entd_flush(struct super_block *, struct wbq *);
9181 +static int entd(void *arg);
9182 +
9183 +/*
9184 + * set ->comm field of ent thread to make its state visible to the user level.
9185 + * Expands to code that references a variable named "super" at the call site. */
9186 +#define entd_set_comm(state)                                   \
9187 +       snprintf(current->comm, sizeof(current->comm),  \
9188 +               "ent:%s%s", super->s_id, (state))
9189 +
9190 +/**
9191 + * reiser4_init_entd - initialize entd context and start kernel daemon
9192 + * @super: super block to start ent thread for
9193 + *
9194 + * Resets the entd context of @super and starts the ent kernel thread.
9195 + * Returns 0 on success, PTR_ERR() of the kthread_run() result on failure.
9196 + */
9197 +int reiser4_init_entd(struct super_block *super)
9198 +{
9199 +       entd_context *ctx;
9200 +
9201 +       assert("nikita-3104", super != NULL);
9202 +
9203 +       ctx = get_entd_context(super);
9204 +
9205 +       memset(ctx, 0, sizeof *ctx);
9206 +       spin_lock_init(&ctx->guard);
9207 +       init_waitqueue_head(&ctx->wait);
9208 +#if REISER4_DEBUG
9209 +       INIT_LIST_HEAD(&ctx->flushers_list);
9210 +#endif
9211 +       /* lists of writepage requests */
9212 +       INIT_LIST_HEAD(&ctx->todo_list);
9213 +       INIT_LIST_HEAD(&ctx->done_list);
9214 +       /* start entd; thread is named after the super block id */
9215 +       ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9216 +       if (IS_ERR(ctx->tsk))
9217 +               return PTR_ERR(ctx->tsk);
9218 +       return 0;
9219 +}
9220 +
9221 +static void put_wbq(struct wbq *rq)
9222 +{
9223 +       iput(rq->mapping->host);        /* pairs with igrab() in write_page_by_ent() */
9224 +       complete(&rq->completion);      /* wake requestor waiting on this wbq */
9225 +}
9226 +
9227 +/* unlink and return first request of ent->todo_list, NULL if empty. ent should be locked */
9228 +static struct wbq *__get_wbq(entd_context * ent)
9229 +{
9230 +       struct wbq *wbq;
9231 +
9232 +       if (list_empty(&ent->todo_list))
9233 +               return NULL;
9234 +
9235 +       ent->nr_todo_reqs--;    /* keep counter in sync with the list */
9236 +       wbq = list_entry(ent->todo_list.next, struct wbq, link);
9237 +       list_del_init(&wbq->link);
9238 +       return wbq;
9239 +}
9240 +
9241 +/* ent thread function: serves requests from ent->todo_list until asked to stop */
9242 +static int entd(void *arg)
9243 +{
9244 +       struct super_block *super;
9245 +       entd_context *ent;
9246 +       int done = 0;
9247 +
9248 +       super = arg;
9249 +       /* do_fork() just copies task_struct into the new
9250 +          thread. ->fs_context shouldn't be copied of course. This shouldn't
9251 +          be a problem for the rest of the code though.
9252 +        */
9253 +       current->journal_info = NULL;
9254 +
9255 +       ent = get_entd_context(super);
9256 +
9257 +       while (!done) {
9258 +               try_to_freeze();
9259 +
9260 +               spin_lock(&ent->guard);
9261 +               while (ent->nr_todo_reqs != 0) {
9262 +                       struct wbq *rq;
9263 +
9264 +                       assert("", list_empty(&ent->done_list));
9265 +
9266 +                       /* take request from the queue head */
9267 +                       rq = __get_wbq(ent);
9268 +                       assert("", rq != NULL);
9269 +                       ent->cur_request = rq;
9270 +                       spin_unlock(&ent->guard);
9271 +
9272 +                       entd_set_comm("!");     /* thread name while serving a request */
9273 +                       entd_flush(super, rq);
9274 +
9275 +                       put_wbq(rq);    /* iput inode and complete the request just served */
9276 +
9277 +                       /*
9278 +                        * wakeup all requestors and iput their inodes
9279 +                        */
9280 +                       spin_lock(&ent->guard);
9281 +                       while (!list_empty(&ent->done_list)) {
9282 +                               rq = list_entry(ent->done_list.next, struct wbq, link);
9283 +                               list_del_init(&rq->link);
9284 +                               ent->nr_done_reqs--;
9285 +                               spin_unlock(&ent->guard);
9286 +                               assert("", rq->written == 1);
9287 +                               put_wbq(rq);
9288 +                               spin_lock(&ent->guard);
9289 +                       }
9290 +               }
9291 +               spin_unlock(&ent->guard);
9292 +
9293 +               entd_set_comm(".");     /* thread name while waiting for work */
9294 +
9295 +               {
9296 +                       DEFINE_WAIT(__wait);
9297 +
9298 +                       do {
9299 +                               prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9300 +                               if (kthread_should_stop()) {
9301 +                                       done = 1;
9302 +                                       break;
9303 +                               }
9304 +                               if (ent->nr_todo_reqs != 0)
9305 +                                       break;
9306 +                               schedule();
9307 +                       } while (0);    /* single pass; "break" only skips schedule() */
9308 +                       finish_wait(&ent->wait, &__wait);
9309 +               }
9310 +       }
9311 +       BUG_ON(ent->nr_todo_reqs != 0);
9312 +       return 0;
9313 +}
9314 +
9315 +/**
9316 + * reiser4_done_entd - stop entd kernel thread
9317 + * @super: super block to stop ent thread for
9318 + *
9319 + * It is called on umount. Asks entd to stop via kthread_stop(), which waits
9320 + * for the thread to exit.
9321 + */
9322 +void reiser4_done_entd(struct super_block *super)
9323 +{
9324 +       entd_context *ent;
9325 +
9326 +       assert("nikita-3103", super != NULL);
9327 +
9328 +       ent = get_entd_context(super);
9329 +       assert("zam-1055", ent->tsk != NULL);
9330 +       kthread_stop(ent->tsk);
9331 +}
9332 +
9333 +/* called at the beginning of jnode_flush to register flusher thread with ent
9334 + * daemon; paired with reiser4_leave_flush() */
9335 +void reiser4_enter_flush(struct super_block *super)
9336 +{
9337 +       entd_context *ent;
9338 +
9339 +       assert("zam-1029", super != NULL);
9340 +       ent = get_entd_context(super);
9341 +
9342 +       assert("zam-1030", ent != NULL);
9343 +
9344 +       spin_lock(&ent->guard);
9345 +       ent->flushers++;        /* decremented in reiser4_leave_flush() */
9346 +#if REISER4_DEBUG
9347 +       list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9348 +#endif
9349 +       spin_unlock(&ent->guard);
9350 +}
9351 +
9352 +/* called at the end of jnode_flush; wakes entd if last flusher left and requests are queued */
9353 +void reiser4_leave_flush(struct super_block *super)
9354 +{
9355 +       entd_context *ent;
9356 +       int wake_up_ent;
9357 +
9358 +       assert("zam-1027", super != NULL);
9359 +       ent = get_entd_context(super);
9360 +
9361 +       assert("zam-1028", ent != NULL);
9362 +
9363 +       spin_lock(&ent->guard);
9364 +       ent->flushers--;
9365 +       wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9366 +#if REISER4_DEBUG
9367 +       list_del_init(&get_current_context()->flushers_link);
9368 +#endif
9369 +       spin_unlock(&ent->guard);
9370 +       if (wake_up_ent)        /* decided under ->guard, wakeup done after unlock */
9371 +               wake_up_process(ent->tsk);
9372 +}
9373 +
9374 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9375 +
9376 +static void entd_flush(struct super_block *super, struct wbq *rq)
9377 +{
9378 +       reiser4_context ctx;
9379 +       int tmp;        /* NOTE(review): assigned below but never read */
9380 +
9381 +       init_stack_context(&ctx, super);
9382 +       ctx.entd = 1;
9383 +       ctx.gfp_mask = GFP_NOFS;
9384 +
9385 +       rq->wbc->range_start = page_offset(rq->page);   /* start at the requested page */
9386 +       rq->wbc->range_end = rq->wbc->range_start +
9387 +               (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9388 +       tmp = rq->wbc->nr_to_write;
9389 +
9390 +       rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9391 +
9392 +       if (rq->wbc->nr_to_write > 0) { /* budget left: retry with unbounded range */
9393 +               rq->wbc->range_start = 0;
9394 +               rq->wbc->range_end = LLONG_MAX;
9395 +               writeback_inodes_wb(&rq->mapping->backing_dev_info->wb,
9396 +                                   rq->wbc);
9397 +       }
9398 +       rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9399 +
9400 +       reiser4_writeout(super, rq->wbc);
9401 +       context_set_commit_async(&ctx);
9402 +       reiser4_exit_context(&ctx);
9403 +}
9404 +
9405 +/**
9406 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9407 + * @page: page to be written
9408 + * @wbc: writeback control passed to reiser4_writepage
9409 + *
9410 + * Creates a request, puts it on entd list of requests, wakes up entd if
9411 + * necessary, waits until entd completes the request. Always returns 0.
9412 + */
9413 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9414 +{
9415 +       struct super_block *sb;
9416 +       struct inode *inode;
9417 +       entd_context *ent;
9418 +       struct wbq rq;
9419 +
9420 +       assert("", PageLocked(page));
9421 +       assert("", page->mapping != NULL);
9422 +
9423 +       sb = page->mapping->host->i_sb;
9424 +       ent = get_entd_context(sb);
9425 +       assert("", ent && ent->done == 0);
9426 +
9427 +       /*
9428 +        * we are going to unlock page and ask ent thread to write the
9429 +        * page. Re-dirty page before unlocking so that if ent thread fails to
9430 +        * write it - it will remain dirty
9431 +        */
9432 +       set_page_dirty_notag(page);
9433 +
9434 +       /*
9435 +        * pin inode in memory, unlock page, entd will iput (see put_wbq()). We
9436 +        * can not iput here because we can not allow delete_inode to be called here
9437 +        */
9438 +       inode = igrab(page->mapping->host);
9439 +       unlock_page(page);
9440 +       if (inode == NULL)
9441 +               /* inode is getting freed */
9442 +               return 0;
9443 +
9444 +       /* init wbq; it lives on this stack until entd completes it */
9445 +       INIT_LIST_HEAD(&rq.link);
9446 +       rq.magic = WBQ_MAGIC;
9447 +       rq.wbc = wbc;
9448 +       rq.page = page;
9449 +       rq.mapping = inode->i_mapping;
9450 +       rq.node = NULL;
9451 +       rq.written = 0;
9452 +       init_completion(&rq.completion);
9453 +
9454 +       /* add request to entd's list of writepage requests */
9455 +       spin_lock(&ent->guard);
9456 +       ent->nr_todo_reqs++;
9457 +       list_add_tail(&rq.link, &ent->todo_list);
9458 +       if (ent->nr_todo_reqs == 1)     /* queue was empty before this request */
9459 +               wake_up_process(ent->tsk);
9460 +
9461 +       spin_unlock(&ent->guard);
9462 +
9463 +       /* wait until entd finishes */
9464 +       wait_for_completion(&rq.completion);
9465 +
9466 +       if (rq.written)
9467 +               /* Eventually ENTD has written the page to disk. */
9468 +               return 0;
9469 +       return 0;       /* NOTE(review): same result whether or not page was written */
9470 +}
9471 +
9472 +int wbq_available(void)
9473 +{
9474 +       struct super_block *sb = reiser4_get_current_sb();
9475 +       entd_context *ent = get_entd_context(sb);
9476 +       return ent->nr_todo_reqs;       /* queued requests; read without ->guard */
9477 +}
9478 +
9479 +/*
9480 + * Local variables:
9481 + * c-indentation-style: "K&R"
9482 + * mode-name: "LC"
9483 + * c-basic-offset: 8
9484 + * tab-width: 8
9485 + * fill-column: 79
9486 + * End:
9487 + */
9488 diff -urN linux-2.6.35.orig/fs/reiser4/entd.h linux-2.6.35/fs/reiser4/entd.h
9489 --- linux-2.6.35.orig/fs/reiser4/entd.h 1970-01-01 01:00:00.000000000 +0100
9490 +++ linux-2.6.35/fs/reiser4/entd.h      2010-08-04 15:44:57.000000000 +0200
9491 @@ -0,0 +1,90 @@
9492 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9493 +
9494 +/* Ent daemon. */
9495 +
9496 +#ifndef __ENTD_H__
9497 +#define __ENTD_H__
9498 +
9499 +#include "context.h"
9500 +
9501 +#include <linux/fs.h>
9502 +#include <linux/completion.h>
9503 +#include <linux/wait.h>
9504 +#include <linux/spinlock.h>
9505 +#include <linux/sched.h>       /* for struct task_struct */
9506 +
9507 +#define WBQ_MAGIC 0x7876dc76
9508 +
9509 +/* write-back request, filled in by write_page_by_ent() and served by entd. */
9510 +struct wbq {
9511 +       int magic; /* set to WBQ_MAGIC */
9512 +       struct list_head link; /* list head of this list is in entd context */
9513 +       struct writeback_control *wbc; /* writeback control of the requestor */
9514 +       struct page *page; /* page the requestor wants written */
9515 +       struct address_space *mapping; /* mapping of the pinned inode */
9516 +       struct completion completion; /* requestor sleeps on this */
9517 +       jnode *node; /* set if ent thread captured requested page */
9518 +       int written; /* set if ent thread wrote requested page */
9519 +};
9520 +
9521 +/* ent-thread context. This is used to synchronize starting/stopping ent
9522 + * threads. */
9523 +typedef struct entd_context {
9524 +        /* wait queue that ent thread waits on for more work. It's
9525 +         * signaled by write_page_by_ent(). */
9526 +       wait_queue_head_t wait;
9527 +       /* spinlock protecting other fields */
9528 +       spinlock_t guard;
9529 +       /* ent thread */
9530 +       struct task_struct *tsk;
9531 +       /* set to indicate that ent thread should leave. */
9532 +       int done;
9533 +       /* counter of active flushers */
9534 +       int flushers;
9535 +       /*
9536 +        * when reiser4_writepage asks entd to write a page - it adds struct
9537 +        * wbq to this list
9538 +        */
9539 +       struct list_head todo_list;
9540 +       /* number of elements on the above list */
9541 +       int nr_todo_reqs;
9542 +
9543 +       struct wbq *cur_request; /* request most recently taken off todo_list by entd */
9544 +       /*
9545 +        * when entd writes a page it moves write-back request from todo_list
9546 +        * to done_list. This list is used at the end of entd iteration to
9547 +        * wakeup requestors and iput inodes.
9548 +        */
9549 +       struct list_head done_list;
9550 +       /* number of elements on the above list */
9551 +       int nr_done_reqs;
9552 +
9553 +#if REISER4_DEBUG
9554 +       /* list of all active flushers */
9555 +       struct list_head flushers_list;
9556 +#endif
9557 +} entd_context;
9558 +
9559 +extern int  reiser4_init_entd(struct super_block *);
9560 +extern void reiser4_done_entd(struct super_block *);
9561 +
9562 +extern void reiser4_enter_flush(struct super_block *);
9563 +extern void reiser4_leave_flush(struct super_block *);
9564 +
9565 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9566 +extern int wbq_available(void);
9567 +extern void ent_writes_page(struct super_block *, struct page *);
9568 +
9569 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9570 +/* __ENTD_H__ */
9571 +#endif
9572 +
9573 +/* Make Linus happy.
9574 +   Local variables:
9575 +   c-indentation-style: "K&R"
9576 +   mode-name: "LC"
9577 +   c-basic-offset: 8
9578 +   tab-width: 8
9579 +   fill-column: 120
9580 +   End:
9581 +*/
9582 diff -urN linux-2.6.35.orig/fs/reiser4/eottl.c linux-2.6.35/fs/reiser4/eottl.c
9583 --- linux-2.6.35.orig/fs/reiser4/eottl.c        1970-01-01 01:00:00.000000000 +0100
9584 +++ linux-2.6.35/fs/reiser4/eottl.c     2010-08-04 15:44:57.000000000 +0200
9585 @@ -0,0 +1,510 @@
9586 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9587 +   reiser4/README */
9588 +
9589 +#include "forward.h"
9590 +#include "debug.h"
9591 +#include "key.h"
9592 +#include "coord.h"
9593 +#include "plugin/item/item.h"
9594 +#include "plugin/node/node.h"
9595 +#include "znode.h"
9596 +#include "block_alloc.h"
9597 +#include "tree_walk.h"
9598 +#include "tree_mod.h"
9599 +#include "carry.h"
9600 +#include "tree.h"
9601 +#include "super.h"
9602 +
9603 +#include <linux/types.h>       /* for __u??  */
9604 +
9605 +/*
9606 + * Extents on the twig level (EOTTL) handling.
9607 + *
9608 + * EOTTL poses some problems to the tree traversal, that are better explained
9609 + * by example.
9610 + *
9611 + * Suppose we have block B1 on the twig level with the following items:
9612 + *
9613 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9614 + * offset)
9615 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9616 + * 2. internal item I2 with key (10:0:0:0)
9617 + *
9618 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9619 + * then intra-node lookup is done. This lookup finished on the E1, because the
9620 + * key we are looking for is larger than the key of E1 and is smaller than the
9621 + * key of I2.
9622 + *
9623 + * Here search is stuck.
9624 + *
9625 + * After some thought it is clear what is wrong here: extents on the twig level
9626 + * break some basic property of the *search* tree (on the pretext, that they
9627 + * restore property of balanced tree).
9628 + *
9629 + * Said property is the following: if in the internal node of the search tree
9630 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9631 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9632 + * through the Pointer.
9633 + *
9634 + * This is not true, when Pointer is Extent-Pointer, simply because extent
9635 + * cannot expand indefinitely to the right to include any item with
9636 + *
9637 + *   Key1 <= Key <= Key2.
9638 + *
9639 + * For example, our E1 extent is only responsible for the data with keys
9640 + *
9641 + *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9642 + *
9643 + * so, key range
9644 + *
9645 + *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9646 + *
9647 + * is orphaned: there is no way to get there from the tree root.
9648 + *
9649 + * In other words, extent pointers are different than normal child pointers as
9650 + * far as search tree is concerned, and this creates such problems.
9651 + *
9652 + * Possible solution for this problem is to insert our item into node pointed
9653 + * to by I2. There are some problems though:
9654 + *
9655 + * (1) I2 can be in a different node.
9656 + * (2) E1 can be immediately followed by another extent E2.
9657 + *
9658 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9659 + * for locks/coords as necessary.
9660 + *
9661 + * (2) is more complex. Solution here is to insert new empty leaf node and
9662 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9663 + * further complicated by possibility that E2 is in a different node, etc.
9664 + *
9665 + * Problems:
9666 + *
9667 + * (1) if there was internal item I2 immediately on the right of an extent E1
9668 + * and we decided to insert new item S1 into node N2 pointed to by I2, then
9669 + * key of S1 will be less than smallest key in the N2. Normally, search key
9670 + * checks that key we are looking for is in the range of keys covered by the
9671 + * node key is being looked in. To work around this situation, while
9672 + * preserving useful consistency check, new flag CBK_TRUST_DK was added to the
9673 + * cbk flags bitmask. This flag is automatically set on entrance to the
9674 + * coord_by_key() and is only cleared when we are about to enter situation
9675 + * described above.
9676 + *
9677 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9678 + * searching for the key that is between E1 and E2 we only have to insert new
9679 + * empty leaf node when coord_by_key was called for insertion, rather than just
9680 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9681 + * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9682 + * performed by insert_by_key() and friends.
9683 + *
9684 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9685 + * case it requires modification of node content which is only possible under
9686 + * write lock. It may well happen that we only have read lock on the node where
9687 + * new internal pointer is to be inserted (common case: lookup of non-existent
9688 + * stat-data that falls between two extents). If only read lock is held, tree
9689 + * traversal is restarted with lock_level modified so that next time we hit
9690 + * this problem, write lock will be held. Once we have write lock, balancing
9691 + * will be performed.
9692 + */
9693 +
9694 +/**
9695 + * is_next_item_internal - check whether next item is internal
9696 + * @coord: coordinate of extent item in twig node
9697 + * @key: search key
9698 + * @lh: twig node lock handle
9699 + *
9700 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
9701 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
9702 + * to that node, @coord is set to its first unit. If next item is not internal
9703 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
9704 + * is returned if search restart has to be done, negative error code on failure.
9705 + */
9706 +static int
9707 +is_next_item_internal(coord_t *coord, const reiser4_key * key,
9708 +                     lock_handle * lh)
9709 +{
9710 +       coord_t next;
9711 +       lock_handle rn;
9712 +       int result;
9713 +
9714 +       coord_dup(&next, coord);
9715 +       if (coord_next_unit(&next) == 0) {
9716 +               /* next unit is in this node */
9717 +               if (item_is_internal(&next)) {
9718 +                       coord_dup(coord, &next);
9719 +                       return 1;
9720 +               }
9721 +               assert("vs-3", item_is_extent(&next));
9722 +               return 0;
9723 +       }
9724 +
9725 +       /*
9726 +        * next unit either does not exist or is in right neighbor. If it is in
9727 +        * right neighbor we have to check right delimiting key because
9728 +        * concurrent thread could get there first and insert item with a key
9729 +        * smaller than @key
9730 +        */
9731 +       read_lock_dk(current_tree);
9732 +       result = keycmp(key, znode_get_rd_key(coord->node));
9733 +       read_unlock_dk(current_tree);
9734 +       assert("vs-6", result != EQUAL_TO);
9735 +       if (result == GREATER_THAN)
9736 +               return 2;
9737 +
9738 +       /* lock right neighbor with the same lock mode as coord->node */
9739 +       init_lh(&rn);
9740 +       result = reiser4_get_right_neighbor(&rn, coord->node,
9741 +                                           znode_is_wlocked(coord->node) ?
9742 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9743 +                                           GN_CAN_USE_UPPER_LEVELS);
9744 +       if (result == -E_NO_NEIGHBOR) {
9745 +               /* we are on the rightmost edge of the tree */
9746 +               done_lh(&rn);
9747 +               return 0;
9748 +       }
9749 +
9750 +       if (result) {
9751 +               assert("vs-4", result < 0);
9752 +               done_lh(&rn);
9753 +               return result;
9754 +       }
9755 +
9756 +       /*
9757 +        * check whether concurrent thread managed to insert item with a key
9758 +        * smaller than @key
9759 +        */
9760 +       read_lock_dk(current_tree);
9761 +       result = keycmp(key, znode_get_ld_key(rn.node));
9762 +       read_unlock_dk(current_tree);
9763 +       assert("vs-6", result != EQUAL_TO);     /* NOTE(review): label "vs-6" is used three times here */
9764 +       if (result == GREATER_THAN) {
9765 +               done_lh(&rn);
9766 +               return 2;
9767 +       }
9768 +
9769 +       result = zload(rn.node);
9770 +       if (result) {
9771 +               assert("vs-5", result < 0);
9772 +               done_lh(&rn);
9773 +               return result;
9774 +       }
9775 +
9776 +       coord_init_first_unit(&next, rn.node);
9777 +       if (item_is_internal(&next)) {
9778 +               /*
9779 +                * next unit is in right neighbor and it is an unit of internal
9780 +                * item. Unlock coord->node. Move @lh to right neighbor. @coord
9781 +                * is set to the first unit of right neighbor.
9782 +                */
9783 +               coord_dup(coord, &next);
9784 +               zrelse(rn.node);
9785 +               done_lh(lh);
9786 +               move_lh(lh, &rn);
9787 +               return 1;
9788 +       }
9789 +
9790 +       /*
9791 +        * next unit is unit of extent item. Return without changing @lh and
9792 +        * @coord.
9793 +        */
9794 +       assert("vs-6", item_is_extent(&next));
9795 +       zrelse(rn.node);
9796 +       done_lh(&rn);
9797 +       return 0;
9798 +}
9799 +
9800 +/**
9801 + * rd_key - calculate key of an item next to the given one
9802 + * @coord: position in a node
9803 + * @key: storage for result key
9804 + *
9805 + * @coord is set between items or after the last item in a node. Calculate key
9806 + * of item to the right of @coord, store it in @key and return @key.
9807 + */
9808 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9809 +{
9810 +       coord_t dup;
9811 +
9812 +       assert("nikita-2281", coord_is_between_items(coord));
9813 +       coord_dup(&dup, coord); /* work on a copy, @coord is const */
9814 +
9815 +       if (coord_set_to_right(&dup) == 0)
9816 +               /* next item is in this node. Return its key. */
9817 +               unit_key_by_coord(&dup, key);
9818 +       else {
9819 +               /*
9820 +                * next item either does not exist or is in right
9821 +                * neighbor. Return znode's right delimiting key.
9822 +                */
9823 +               read_lock_dk(current_tree);
9824 +               *key = *znode_get_rd_key(coord->node);
9825 +               read_unlock_dk(current_tree);
9826 +       }
9827 +       return key;
9828 +}
9829 +
9830 +/**
9831 + * add_empty_leaf - insert empty leaf between two extents
9832 + * @insert_coord: position in twig node between two extents
9833 + * @lh: twig node lock handle
9834 + * @key: left delimiting key of new node
9835 + * @rdkey: right delimiting key of new node
9836 + *
9837 + * Inserts empty leaf node between two extent items. It is necessary when we
9838 + * have to insert an item on leaf level between two extents (items on the twig
9839 + * level).
9840 + */
9841 +static int
9842 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9843 +              const reiser4_key *key, const reiser4_key *rdkey)
9844 +{
9845 +       int result;
9846 +       carry_pool *pool;
9847 +       carry_level *todo;
9848 +       reiser4_item_data *item;
9849 +       carry_insert_data *cdata;
9850 +       carry_op *op;
9851 +       znode *node;
9852 +       reiser4_tree *tree;
9853 +
9854 +       assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9855 +       tree = znode_get_tree(insert_coord->node);
9856 +       node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9857 +       if (IS_ERR(node))
9858 +               return PTR_ERR(node);
9859 +
9860 +       /* setup delimiting keys for node being inserted */
9861 +       write_lock_dk(tree);
9862 +       znode_set_ld_key(node, key);
9863 +       znode_set_rd_key(node, rdkey);
9864 +       ON_DEBUG(node->creator = current);
9865 +       ON_DEBUG(node->first_key = *key);
9866 +       write_unlock_dk(tree);
9867 +
9868 +       ZF_SET(node, JNODE_ORPHAN);
9869 +
9870 +       /*
9871 +        * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9872 +        * carry_insert_data
9873 +        */
9874 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9875 +                              sizeof(*item) + sizeof(*cdata));
9876 +       if (IS_ERR(pool))
9877 +               return PTR_ERR(pool);
9878 +       todo = (carry_level *) (pool + 1);
9879 +       init_carry_level(todo, pool);
9880 +
9881 +       item = (reiser4_item_data *) (todo + 3);
9882 +       cdata = (carry_insert_data *) (item + 1);
9883 +
9884 +       op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9885 +       if (!IS_ERR(op)) {
9886 +               cdata->coord = insert_coord;
9887 +               cdata->key = key;
9888 +               cdata->data = item;
9889 +               op->u.insert.d = cdata;
9890 +               op->u.insert.type = COPT_ITEM_DATA;
9891 +               build_child_ptr_data(node, item);
9892 +               item->arg = NULL;
9893 +               /* have @insert_coord to be set at inserted item after
9894 +                  insertion is done */
9895 +               todo->track_type = CARRY_TRACK_CHANGE;
9896 +               todo->tracked = lh;
9897 +
9898 +               result = reiser4_carry(todo, NULL);
9899 +               if (result == 0) {
9900 +                       /*
9901 +                        * pin node in memory. This is necessary for
9902 +                        * znode_make_dirty() below.
9903 +                        */
9904 +                       result = zload(node);
9905 +                       if (result == 0) {
9906 +                               lock_handle local_lh;
9907 +
9908 +                               /*
9909 +                                * if we inserted new child into tree we have
9910 +                                * to mark it dirty so that flush will be able
9911 +                                * to process it.
9912 +                                */
9913 +                               init_lh(&local_lh);
9914 +                               result = longterm_lock_znode(&local_lh, node,
9915 +                                                            ZNODE_WRITE_LOCK,
9916 +                                                            ZNODE_LOCK_LOPRI);
9917 +                               if (result == 0) {
9918 +                                       znode_make_dirty(node);
9919 +
9920 +                                       /*
9921 +                                        * when internal item pointing to @node
9922 +                                        * was inserted into twig node
9923 +                                        * create_hook_internal did not connect
9924 +                                        * it properly because its right
9925 +                                        * neighbor was not known. Do it
9926 +                                        * here
9927 +                                        */
9928 +                                       write_lock_tree(tree);
9929 +                                       assert("nikita-3312",
9930 +                                              znode_is_right_connected(node));
9931 +                                       assert("nikita-2984",
9932 +                                              node->right == NULL);
9933 +                                       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9934 +                                       write_unlock_tree(tree);
9935 +                                       result =
9936 +                                           connect_znode(insert_coord, node);
9937 +                                       ON_DEBUG(if (result == 0) check_dkeys(node););
9938 +
9939 +                                       done_lh(lh);
9940 +                                       move_lh(lh, &local_lh);
9941 +                                       assert("vs-1676", node_is_empty(node));
9942 +                                       coord_init_first_unit(insert_coord,
9943 +                                                             node);
9944 +                               } else {
9945 +                                       warning("nikita-3136",
9946 +                                               "Cannot lock child");
9947 +                               }
9948 +                               done_lh(&local_lh);
9949 +                               zrelse(node);
9950 +                       }
9951 +               }
9952 +       } else
9953 +               result = PTR_ERR(op);
9954 +       zput(node);
9955 +       done_carry_pool(pool);
9956 +       return result;
9957 +}
9958 +
9959 +/**
9960 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9961 + * @h: search handle
9962 + * @outcome: flag saying whether search has to restart or is done
9963 + *
9964 + * Handles search on twig level. If this function completes search itself then
9965 + * it returns 1. If search has to go one level down then 0 is returned. If
9966 + * error happens then LOOKUP_DONE is returned via @outcome and error code is
9967 + * saved in @h->result.
9968 + */
9969 +int handle_eottl(cbk_handle *h, int *outcome)
9970 +{
9971 +       int result;
9972 +       reiser4_key key;
9973 +       coord_t *coord;
9974 +
9975 +       coord = h->coord;
9976 +
9977 +       if (h->level != TWIG_LEVEL ||
9978 +           (coord_is_existing_item(coord) && item_is_internal(coord))) {
9979 +               /* Continue to traverse tree downward. */
9980 +               return 0;
9981 +       }
9982 +
9983 +       /*
9984 +        * make sure that @h->coord is set to twig node and that it is either
9985 +        * set to extent item or after extent item
9986 +        */
9987 +       assert("vs-356", h->level == TWIG_LEVEL);
9988 +       assert("vs-357", ({
9989 +                         coord_t lcoord;
9990 +                         coord_dup(&lcoord, coord);
9991 +                         check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9992 +                         item_is_extent(&lcoord);
9993 +                         }
9994 +              ));
9995 +
9996 +       if (*outcome == NS_FOUND) {
9997 +               /* we have found desired key on twig level in extent item */
9998 +               h->result = CBK_COORD_FOUND;
9999 +               *outcome = LOOKUP_DONE;
10000 +               return 1;
10001 +       }
10002 +
10003 +       if (!(h->flags & CBK_FOR_INSERT)) {
10004 +               /* tree traversal is not for insertion. Just return
10005 +                  CBK_COORD_NOTFOUND. */
10006 +               h->result = CBK_COORD_NOTFOUND;
10007 +               *outcome = LOOKUP_DONE;
10008 +               return 1;
10009 +       }
10010 +
10011 +       /* take a look at the item to the right of h -> coord */
10012 +       result = is_next_item_internal(coord, h->key, h->active_lh);
10013 +       if (unlikely(result < 0)) {
10014 +               h->error = "get_right_neighbor failed";
10015 +               h->result = result;
10016 +               *outcome = LOOKUP_DONE;
10017 +               return 1;
10018 +       }
10019 +       if (result == 0) {
10020 +               /*
10021 +                * item to the right is also an extent one. Allocate a new node
10022 +                * and insert pointer to it after item h -> coord.
10023 +                *
10024 +                * This is a result of extents being located at the twig
10025 +                * level. For explanation, see comment just above
10026 +                * is_next_item_internal().
10027 +                */
10028 +               znode *loaded;
10029 +
10030 +               if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10031 +                       /*
10032 +                        * we got node read locked, restart coord_by_key to
10033 +                        * have write lock on twig level
10034 +                        */
10035 +                       h->lock_level = TWIG_LEVEL;
10036 +                       h->lock_mode = ZNODE_WRITE_LOCK;
10037 +                       *outcome = LOOKUP_REST;
10038 +                       return 1;
10039 +               }
10040 +
10041 +               loaded = coord->node;
10042 +               result =
10043 +                   add_empty_leaf(coord, h->active_lh, h->key,
10044 +                                  rd_key(coord, &key));
10045 +               if (result) {
10046 +                       h->error = "could not add empty leaf";
10047 +                       h->result = result;
10048 +                       *outcome = LOOKUP_DONE;
10049 +                       return 1;
10050 +               }
10051 +               /* added empty leaf is locked (h->active_lh), its parent node
10052 +                  is unlocked, h->coord is set as EMPTY */
10053 +               assert("vs-13", coord->between == EMPTY_NODE);
10054 +               assert("vs-14", znode_is_write_locked(coord->node));
10055 +               assert("vs-15",
10056 +                      WITH_DATA(coord->node, node_is_empty(coord->node)));
10057 +               assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10058 +               assert("vs-17", coord->node == h->active_lh->node);
10059 +               *outcome = LOOKUP_DONE;
10060 +               h->result = CBK_COORD_NOTFOUND;
10061 +               return 1;
10062 +       } else if (result == 1) {
10063 +               /*
10064 +                * this is special case mentioned in the comment on
10065 +                * tree.h:cbk_flags. We have found internal item immediately on
10066 +                * the right of extent, and we are going to insert new item
10067 +                * there. Key of item we are going to insert is smaller than
10068 +                * leftmost key in the node pointed to by said internal item
10069 +                * (otherwise search wouldn't come to the extent in the first
10070 +                * place).
10071 +                *
10072 +                * This is a result of extents being located at the twig
10073 +                * level. For explanation, see comment just above
10074 +                * is_next_item_internal().
10075 +                */
10076 +               h->flags &= ~CBK_TRUST_DK;
10077 +       } else {
10078 +               assert("vs-8", result == 2);
10079 +               *outcome = LOOKUP_REST;
10080 +               return 1;
10081 +       }
10082 +       assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10083 +       return 0;
10084 +}
10085 +
10086 +/*
10087 + * Local variables:
10088 + * c-indentation-style: "K&R"
10089 + * mode-name: "LC"
10090 + * c-basic-offset: 8
10091 + * tab-width: 8
10092 + * fill-column: 120
10093 + * scroll-step: 1
10094 + * End:
10095 + */
10096 diff -urN linux-2.6.35.orig/fs/reiser4/estimate.c linux-2.6.35/fs/reiser4/estimate.c
10097 --- linux-2.6.35.orig/fs/reiser4/estimate.c     1970-01-01 01:00:00.000000000 +0100
10098 +++ linux-2.6.35/fs/reiser4/estimate.c  2010-08-04 15:44:57.000000000 +0200
10099 @@ -0,0 +1,129 @@
10100 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10101 +   reiser4/README */
10102 +
10103 +#include "debug.h"
10104 +#include "dformat.h"
10105 +#include "tree.h"
10106 +#include "carry.h"
10107 +#include "inode.h"
10108 +#include "plugin/cluster.h"
10109 +#include "plugin/item/ctail.h"
10110 +
10111 +/* This returns how many nodes might get dirty and added nodes if @children
10112 +   nodes are dirtied
10113 +
10114 +   Amount of internals which will get dirty or get allocated we estimate as 10%
10115 +   of the children + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and
10116 +   the current block on the leaf level, 2 neighbour nodes + the current (or 1
10117 +   neighbour and 1 new and the current) on twig level, 2 neighbour nodes on
10118 +   upper levels and 1 for a new root. So 5 for leaf level, 3 for twig level,
10119 +   2 on upper + 1 for root.
10120 +
10121 +   Do not calculate the current node of the lowest level here - this is overhead
10122 +   only.
10123 +
10124 +   children is almost always 1 here. Exception is flow insertion
10125 +*/
10126 +static reiser4_block_nr
10127 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10128 +{
10129 +       reiser4_block_nr ten_percent;
10130 +
10131 +       ten_percent = ((103 * childen) >> 10);
10132 +
10133 +       /* If we have too many balancings at a time, tree height can rise by
10134 +          more than 1. Assume that if tree_height is 5, it can rise by 1 only.
10135 +       */
10136 +       return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10137 +}
10138 +
10139 +/* this returns maximal possible number of nodes which can be modified plus
10140 +   number of new nodes which can be required to perform insertion of one item
10141 +   into the tree */
10142 +/* it is only called when tree height changes, or gets initialized */
10143 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10144 +{
10145 +       return 1 + max_balance_overhead(1, height);
10146 +}
10147 +
10148 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10149 +{
10150 +       return tree->estimate_one_insert;
10151 +}
10152 +
10153 +/* this returns maximal possible number of nodes which can be modified plus
10154 +   number of new nodes which can be required to perform insertion of one unit
10155 +   into an item in the tree */
10156 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10157 +{
10158 +       /* estimate insert into item just like item insertion */
10159 +       return tree->estimate_one_insert;
10160 +}
10161 +
10162 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10163 +{
10164 +       /* on item removal reiser4 does not try to pack nodes more compactly, so,
10165 +          only one node may be dirtied on leaf level */
10166 +       return tree->estimate_one_insert;
10167 +}
10168 +
10169 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and
10170 +   dirty 3 existing nodes (insert point and both its neighbors).
10171 +   Max_balance_overhead should estimate number of blocks which may change/get
10172 +   added on internal levels */
10173 +reiser4_block_nr estimate_insert_flow(tree_level height)
10174 +{
10175 +       return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10176 +                                                                    CARRY_FLOW_NEW_NODES_LIMIT,
10177 +                                                                    height);
10178 +}
10179 +
10180 +/* returns max number of nodes that can be occupied by a disk cluster */
10181 +static reiser4_block_nr estimate_cluster(struct inode *inode, int unprepped)
10182 +{
10183 +       int per_cluster;
10184 +       per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10185 +       return 3 + per_cluster +
10186 +               max_balance_overhead(3 + per_cluster,
10187 +                                    REISER4_MAX_ZTREE_HEIGHT);
10188 +}
10189 +
10190 +/* how many nodes might get dirty and added
10191 +   during insertion of a disk cluster */
10192 +reiser4_block_nr estimate_insert_cluster(struct inode *inode)
10193 +{
10194 +       return estimate_cluster(inode, 1); /* 24 */
10195 +}
10196 +
10197 +/* how many nodes might get dirty and added
10198 +   during update of a (prepped or unprepped) disk cluster */
10199 +reiser4_block_nr estimate_update_cluster(struct inode *inode)
10200 +{
10201 +       return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10202 +}
10203 +
10204 +/* How many nodes occupied by a disk cluster might get dirty.
10205 +   Note that this estimation is not precise (i.e. disk cluster
10206 +   can occupy more nodes).
10207 +   Q: Why don't we use precise estimation?
10208 +   A: 1.Because precise estimation is fairly bad: 65536 nodes
10209 +       for 64K logical cluster, it means 256M of dead space on
10210 +       a partition
10211 +      2.It is a very rare case when disk cluster occupies more
10212 +       nodes than this estimation returns.
10213 +*/
10214 +reiser4_block_nr estimate_dirty_cluster(struct inode *inode)
10215 +{
10216 +       return cluster_nrpages(inode) + 4;
10217 +}
10218 +
10219 +/* Make Linus happy.
10220 +   Local variables:
10221 +   c-indentation-style: "K&R"
10222 +   mode-name: "LC"
10223 +   c-basic-offset: 8
10224 +   tab-width: 8
10225 +   fill-column: 120
10226 +   scroll-step: 1
10227 +   End:
10228 +*/
10229 diff -urN linux-2.6.35.orig/fs/reiser4/export_ops.c linux-2.6.35/fs/reiser4/export_ops.c
10230 --- linux-2.6.35.orig/fs/reiser4/export_ops.c   1970-01-01 01:00:00.000000000 +0100
10231 +++ linux-2.6.35/fs/reiser4/export_ops.c        2010-08-04 15:44:57.000000000 +0200
10232 @@ -0,0 +1,328 @@
10233 +/* Copyright 2005 by Hans Reiser, licensing governed by
10234 + * reiser4/README */
10235 +
10236 +#include "inode.h"
10237 +#include "plugin/plugin.h"
10238 +
10239 +/*
10240 + * Supported file-handle types
10241 + */
10242 +typedef enum {
10243 +       FH_WITH_PARENT = 0x10,  /* file handle with parent */
10244 +       FH_WITHOUT_PARENT = 0x11        /* file handle without parent */
10245 +} reiser4_fhtype;
10246 +
10247 +#define NFSERROR (255)
10248 +
10249 +/* initialize place-holder for object */
10250 +static void object_on_wire_init(reiser4_object_on_wire *o)
10251 +{
10252 +       o->plugin = NULL;
10253 +}
10254 +
10255 +/* finish with @o */
10256 +static void object_on_wire_done(reiser4_object_on_wire *o)
10257 +{
10258 +       if (o->plugin != NULL)
10259 +               o->plugin->wire.done(o);
10260 +}
10261 +
10262 +/*
10263 + * read serialized object identity from @addr and store information about
10264 + * object in @obj. This is dual to encode_inode().
10265 + */
10266 +static char *decode_inode(struct super_block *s, char *addr,
10267 +                         reiser4_object_on_wire * obj)
10268 +{
10269 +       file_plugin *fplug;
10270 +
10271 +       /* identifier of object plugin is stored in the first two bytes,
10272 +        * followed by... */
10273 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10274 +       if (fplug != NULL) {
10275 +               addr += sizeof(d16);
10276 +               obj->plugin = fplug;
10277 +               assert("nikita-3520", fplug->wire.read != NULL);
10278 +               /* plugin specific encoding of object identity. */
10279 +               addr = fplug->wire.read(addr, obj);
10280 +       } else
10281 +               addr = ERR_PTR(RETERR(-EINVAL));
10282 +       return addr;
10283 +}
10284 +
10285 +static struct dentry *reiser4_get_dentry(struct super_block *super,
10286 +                                        void *data);
10287 +/**
10288 + * reiser4_decode_fh: decode on-wire object - helper function
10289 + * for fh_to_dentry, fh_to_parent export operations;
10290 + * @super: super block;
10291 + * @addr: onwire object to be decoded;
10292 + *
10293 + * Returns dentry referring to the object being decoded.
10294 + */
10295 +static struct dentry *reiser4_decode_fh(struct super_block * super,
10296 +                                       char * addr)
10297 +{
10298 +       reiser4_object_on_wire object;
10299 +
10300 +       object_on_wire_init(&object);
10301 +
10302 +       addr = decode_inode(super, addr, &object);
10303 +       if (!IS_ERR(addr)) {
10304 +               struct dentry *d;
10305 +               d = reiser4_get_dentry(super, &object);
10306 +               if (d != NULL && !IS_ERR(d))
10307 +                       /* FIXME check for -ENOMEM */
10308 +                       reiser4_get_dentry_fsdata(d)->stateless = 1;
10309 +               addr = (char *)d;
10310 +       }
10311 +       object_on_wire_done(&object);
10312 +       return (void *)addr;
10313 +}
10314 +
10315 +static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
10316 +                                          struct fid *fid,
10317 +                                          int fh_len, int fh_type)
10318 +{
10319 +       reiser4_context *ctx;
10320 +       struct dentry *d;
10321 +
10322 +       assert("edward-1536",
10323 +              fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
10324 +
10325 +       ctx = reiser4_init_context(sb);
10326 +       if (IS_ERR(ctx))
10327 +               return (struct dentry *)ctx;
10328 +
10329 +       d = reiser4_decode_fh(sb, (char *)fid->raw);
10330 +
10331 +       reiser4_exit_context(ctx);
10332 +       return d;
10333 +}
10334 +
10335 +static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
10336 +                                          struct fid *fid,
10337 +                                          int fh_len, int fh_type)
10338 +{
10339 +       char * addr;
10340 +       struct dentry * d;
10341 +       reiser4_context *ctx;
10342 +       file_plugin *fplug;
10343 +
10344 +       if (fh_type == FH_WITHOUT_PARENT)
10345 +               return NULL;
10346 +       assert("edward-1537", fh_type == FH_WITH_PARENT);
10347 +
10348 +       ctx = reiser4_init_context(sb);
10349 +       if (IS_ERR(ctx))
10350 +               return (struct dentry *)ctx;
10351 +       addr = (char *)fid->raw;
10352 +       /* extract 2-bytes file plugin id */
10353 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr);
10354 +       if (fplug == NULL) {
10355 +               d = ERR_PTR(RETERR(-EINVAL));
10356 +               goto exit;
10357 +       }
10358 +       addr += sizeof(d16);
10359 +       /* skip previously encoded object */
10360 +       addr = fplug->wire.read(addr, NULL /* skip */);
10361 +       if (IS_ERR(addr)) {
10362 +               d = (struct dentry *)addr;
10363 +               goto exit;
10364 +       }
10365 +       /* @extract and decode parent object */
10366 +       d = reiser4_decode_fh(sb, addr);
10367 + exit:
10368 +       reiser4_exit_context(ctx);
10369 +       return d;
10370 +}
10371 +
10372 +/*
10373 + * Object serialization support.
10374 + *
10375 + * To support knfsd file system provides export_operations that are used to
10376 + * construct and interpret NFS file handles. As a generalization of this,
10377 + * reiser4 object plugins have serialization support: it provides methods to
10378 + * create on-wire representation of identity of reiser4 object, and
10379 + * re-create/locate object given its on-wire identity.
10380 + *
10381 + */
10382 +
10383 +/*
10384 + * return number of bytes that on-wire representation of @inode's identity
10385 + * consumes.
10386 + */
10387 +static int encode_inode_size(struct inode *inode)
10388 +{
10389 +       assert("nikita-3514", inode != NULL);
10390 +       assert("nikita-3515", inode_file_plugin(inode) != NULL);
10391 +       assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10392 +
10393 +       return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10394 +}
10395 +
10396 +/*
10397 + * store on-wire representation of @inode's identity at the area beginning at
10398 + * @start.
10399 + */
10400 +static char *encode_inode(struct inode *inode, char *start)
10401 +{
10402 +       assert("nikita-3517", inode != NULL);
10403 +       assert("nikita-3518", inode_file_plugin(inode) != NULL);
10404 +       assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10405 +
10406 +       /*
10407 +        * first, store two-byte identifier of object plugin, then
10408 +        */
10409 +       save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10410 +                      (d16 *) start);
10411 +       start += sizeof(d16);
10412 +       /*
10413 +        * call plugin to serialize object's identity
10414 +        */
10415 +       return inode_file_plugin(inode)->wire.write(inode, start);
10416 +}
10417 +
10418 +/* returns file handle type, or NFSERROR (255) if the handle can not be
10419 + * stored; number of 32 bit words used is returned in @lenp */
10420 +/**
10421 + * reiser4_encode_fh - encode_fh of export operations
10422 + * @dentry: dentry of the object to serialize
10423 + * @fh: buffer that receives the file handle
10424 + * @lenp: in: size of @fh in 32 bit words; out: words used by the handle
10425 + * @need_parent: if not zero, identity of the parent is encoded as well
10426 + *
10427 + */
10428 +static int
10429 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10430 +                 int need_parent)
10431 +{
10432 +       struct inode *inode;
10433 +       struct inode *parent;
10434 +       char *addr;
10435 +       int need;
10436 +       int delta;
10437 +       int result;
10438 +       reiser4_context *ctx;
10439 +
10440 +       /*
10441 +        * knfsd asks us to serialize object in @dentry, and, optionally its
10442 +        * parent (if need_parent != 0).
10443 +        *
10444 +        * encode_inode() and encode_inode_size() are used to build
10445 +        * representation of object and its parent. All hard work is done by
10446 +        * object plugins.
10447 +        */
10448 +       inode = dentry->d_inode;
10449 +       parent = dentry->d_parent->d_inode;
10450 +
10451 +       addr = (char *)fh;
10452 +
10453 +       need = encode_inode_size(inode);
10454 +       if (need < 0)
10455 +               return NFSERROR;
10456 +       if (need_parent) {
10457 +               delta = encode_inode_size(parent);
10458 +               if (delta < 0)
10459 +                       return NFSERROR;
10460 +               need += delta;
10461 +       }
10462 +       /* result is a handle type, so failure is NFSERROR, not -errno */
10463 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
10464 +       if (IS_ERR(ctx))
10465 +               return NFSERROR;
10466 +
10467 +       if (need <= sizeof(__u32) * (*lenp)) {
10468 +               addr = encode_inode(inode, addr);
10469 +               if (need_parent)
10470 +                       addr = encode_inode(parent, addr);
10471 +
10472 +               /* store in lenp number of 32bit words required for file
10473 +                * handle. */
10474 +               *lenp = (need + sizeof(__u32) - 1) >> 2;
10475 +               result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10476 +       } else
10477 +               /* not enough space in file handle */
10478 +               result = NFSERROR;
10479 +       reiser4_exit_context(ctx);
10480 +       return result;
10481 +}
10482 +
10483 +/**
10484 + * reiser4_get_dentry_parent - get_parent of export operations
10485 + * @child:
10486 + *
10487 + */
10488 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10489 +{
10490 +       struct inode *dir;
10491 +       dir_plugin *dplug;
10492 +       struct dentry *result;
10493 +       reiser4_context *ctx;
10494 +
10495 +       assert("nikita-3527", child != NULL);
10496 +
10497 +       dir = child->d_inode;
10498 +       assert("nikita-3529", dir != NULL);
10499 +
10500 +       ctx = reiser4_init_context(dir->i_sb);
10501 +       if (IS_ERR(ctx))
10502 +               return (void *)ctx;
10503 +
10504 +       dplug = inode_dir_plugin(dir);
10505 +       assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10506 +
10507 +       if (unlikely(dplug == NULL)) {
10508 +               reiser4_exit_context(ctx);
10509 +               return ERR_PTR(RETERR(-ENOTDIR));
10510 +       }
10511 +       result = dplug->get_parent(dir);
10512 +       reiser4_exit_context(ctx);
10513 +       return result;
10514 +}
10515 +
10516 +/**
10517 + * reiser4_get_dentry - get_dentry of export operations
10518 + * @super:
10519 + * @data:
10520 + *
10521 + *
10522 + */
10523 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10524 +{
10525 +       reiser4_object_on_wire *o;
10526 +
10527 +       assert("nikita-3522", super != NULL);
10528 +       assert("nikita-3523", data != NULL);
10529 +       /*
10530 +        * this is only supposed to be called by
10531 +        *
10532 +        *     reiser4_decode_fh->find_exported_dentry
10533 +        *
10534 +        * so, reiser4_context should be here already.
10535 +        */
10536 +       assert("nikita-3526", is_in_reiser4_context());
10537 +
10538 +       o = (reiser4_object_on_wire *)data;
10539 +       assert("nikita-3524", o->plugin != NULL);
10540 +       assert("nikita-3525", o->plugin->wire.get != NULL);
10541 +
10542 +       return o->plugin->wire.get(super, o);
10543 +}
10544 +
10545 +struct export_operations reiser4_export_operations = {
10546 +       .encode_fh = reiser4_encode_fh,
10547 +       .fh_to_dentry = reiser4_fh_to_dentry,
10548 +       .fh_to_parent = reiser4_fh_to_parent,
10549 +       .get_parent = reiser4_get_dentry_parent,
10550 +};
10551 +
10552 +/*
10553 + * Local variables:
10554 + * c-indentation-style: "K&R"
10555 + * mode-name: "LC"
10556 + * c-basic-offset: 8
10557 + * tab-width: 8
10558 + * fill-column: 79
10559 + * End:
10560 + */
10561 diff -urN linux-2.6.35.orig/fs/reiser4/flush.c linux-2.6.35/fs/reiser4/flush.c
10562 --- linux-2.6.35.orig/fs/reiser4/flush.c        1970-01-01 01:00:00.000000000 +0100
10563 +++ linux-2.6.35/fs/reiser4/flush.c     2010-08-04 15:44:57.000000000 +0200
10564 @@ -0,0 +1,3703 @@
10565 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
10566 +   reiser4/README */
10567 +
10568 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10569 +
10570 +#include "forward.h"
10571 +#include "debug.h"
10572 +#include "dformat.h"
10573 +#include "key.h"
10574 +#include "coord.h"
10575 +#include "plugin/item/item.h"
10576 +#include "plugin/plugin.h"
10577 +#include "plugin/object.h"
10578 +#include "txnmgr.h"
10579 +#include "jnode.h"
10580 +#include "znode.h"
10581 +#include "block_alloc.h"
10582 +#include "tree_walk.h"
10583 +#include "carry.h"
10584 +#include "tree.h"
10585 +#include "vfs_ops.h"
10586 +#include "inode.h"
10587 +#include "page_cache.h"
10588 +#include "wander.h"
10589 +#include "super.h"
10590 +#include "entd.h"
10591 +#include "reiser4.h"
10592 +#include "flush.h"
10593 +#include "writeout.h"
10594 +
10595 +#include <asm/atomic.h>
10596 +#include <linux/fs.h>          /* for struct super_block  */
10597 +#include <linux/mm.h>          /* for struct page */
10598 +#include <linux/bio.h>         /* for struct bio */
10599 +#include <linux/pagemap.h>
10600 +#include <linux/blkdev.h>
10601 +
10602 +/* IMPLEMENTATION NOTES */
10603 +
10604 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of
10605 +   assigning a total order to the nodes of the tree in which the parent is
10606 +   placed before its children, which are ordered (recursively) in left-to-right
10607 +   order. When we speak of a "parent-first preceder", it describes the node that
10608 +   "came before in forward parent-first order". When we speak of a "parent-first
10609 +   follower", it describes the node that "comes next in parent-first order"
10610 +   (alternatively the node that "came before in reverse parent-first order").
10611 +
10612 +   The following pseudo-code prints the nodes of a tree in forward parent-first
10613 +   order:
10614 +
10615 +   void parent_first (node)
10616 +   {
10617 +     print_node (node);
10618 +     if (node->level > leaf) {
10619 +       for (i = 0; i < num_children; i += 1) {
10620 +        parent_first (node->child[i]);
10621 +       }
10622 +     }
10623 +   }
10624 +*/
10625 +
10626 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block
10627 +   allocation so that a left-to-right scan of the tree's data (i.e., the leaves
10628 +   in left-to-right order) can be accomplished with sequential reads, which
10629 +   results in reading nodes in their parent-first order. This is a
10630 +   read-optimization aspect of the flush algorithm, and there is also a
10631 +   write-optimization aspect, which is that we wish to make large sequential
10632 +   writes to the disk by allocating or reallocating blocks so that they can be
10633 +   written in sequence. Sometimes the read-optimization and write-optimization
10634 +   goals conflict with each other, as we discuss in more detail below.
10635 +*/
10636 +
10637 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.
10638 +   Here are the relevant jnode->state bits and their relevance to flush:
10639 +
10640 +     JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be
10641 +     written it must be allocated first. In order to be considered allocated,
10642 +     the jnode must have exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These
10643 +     two bits are exclusive, and all dirtied jnodes eventually have one of these
10644 +     bits set during each transaction.
10645 +
10646 +     JNODE_CREATED: The node was freshly created in its transaction and has no
10647 +     previous block address, so it is unconditionally assigned to be relocated,
10648 +     although this is mainly for code-convenience. It is not being 'relocated'
10649 +     from anything, but in almost every regard it is treated as part of the
10650 +     relocate set. The JNODE_CREATED bit remains set even after JNODE_RELOC is
10651 +     set, so the actual relocate can be distinguished from the
10652 +     created-and-allocated set easily: relocate-set members (belonging to the
10653 +     preserve-set) have (JNODE_RELOC) set and created-set members which have no
10654 +     previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10655 +
10656 +     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm
10657 +     made the decision to maintain the pre-existing location for this node and
10658 +     it will be written to the wandered-log.
10659 +
10660 +     JNODE_RELOC: The flush algorithm made the decision to relocate this block
10661 +     (if it was not created, see note above). A block with JNODE_RELOC set is
10662 +     eligible for early-flushing and may be submitted during flush_empty_queues.
10663 +     When the JNODE_RELOC bit is set on a znode, the parent node's internal item
10664 +     is modified and the znode is rehashed.
10665 +
10666 +     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm
10667 +     scans the node and calls plugin->f.squeeze() method for its items. By this
10668 +     technology we update disk clusters of cryptcompress objects. Also if
10669 +     leftmost point that was found by flush scan has this flag (races with
10670 +     write(), rare case) the flush algorithm makes the decision to pass it to
10671 +     squalloc() in spite of its flushprepped status for squeezing, not for
10672 +     repeated allocation.
10673 +
10674 +     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode
10675 +     into its flush queue. This means the jnode is not on any clean or dirty
10676 +     list, instead it is moved to one of the flush queue (see flush_queue.h)
10677 +     object private list. This prevents multiple concurrent flushes from
10678 +     attempting to start flushing from the same node.
10679 +
10680 +     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10681 +     squeeze-and-allocate on a node while its children are actively being
10682 +     squeezed and allocated. This flag was created to avoid submitting a write
10683 +     request for a node while its children are still being allocated and
10684 +     squeezed. Then flush queue was re-implemented to allow unlimited number of
10685 +     nodes be queued. This flag support was commented out in source code because
10686 +     we decided that there was no reason to submit queued nodes before
10687 +     jnode_flush() finishes.  However, current code calls fq_write() during a
10688 +     slum traversal and may submit "busy nodes" to disk. Probably we can
10689 +     re-enable the JNODE_FLUSH_BUSY bit support in future.
10690 +
10691 +   With these state bits, we describe a test used frequently in the code below,
10692 +   jnode_is_flushprepped()(and the spin-lock-taking jnode_check_flushprepped()).
10693 +   The test for "flushprepped" returns true if any of the following are true:
10694 +
10695 +     - The node is not dirty
10696 +     - The node has JNODE_RELOC set
10697 +     - The node has JNODE_OVRWR set
10698 +
10699 +   If either the node is not dirty or it has already been processed by flush
10700 +   (and assigned JNODE_OVRWR or JNODE_RELOC), then it is prepped. If
10701 +   jnode_is_flushprepped() returns false then flush has work to do on that node.
10702 +*/
10703 +
10704 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10705 +   flushprepped twice (unless an explicit call to flush_unprep is made as
10706 +   described in detail below). For example a node is dirtied, allocated, and
10707 +   then early-flushed to disk and set clean. Before the transaction commits, the
10708 +   page is dirtied again and, due to memory pressure, the node is flushed again.
10709 +   The flush algorithm will not relocate the node to a new disk location, it
10710 +   will simply write it to the same, previously relocated position again.
10711 +*/
10712 +
10713 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm
10714 +   where we start at a leaf node and allocate in parent-first order by iterating
10715 +   to the right. At each step of the iteration, we check for the right neighbor.
10716 +   Before advancing to the right neighbor, we check if the current position and
10717 +   the right neighbor share the same parent. If they do not share the same
10718 +   parent, the parent is allocated before the right neighbor.
10719 +
10720 +   This process goes recursively up the tree and squeeze nodes level by level as
10721 +   long as the right neighbor and the current position have different parents,
10722 +   then it allocates the right-neighbors-with-different-parents on the way back
10723 +   down. This process is described in more detail in
10724 +   flush_squalloc_changed_ancestor and the recursive function
10725 +   squalloc_one_changed_ancestor. But the purpose here is not to discuss the
10726 +   specifics of the bottom-up approach as it is to contrast the bottom-up and
10727 +   top-down approaches.
10728 +
10729 +   The top-down algorithm was implemented earlier (April-May 2002). In the
10730 +   top-down approach, we find a starting point by scanning left along each level
10731 +   past dirty nodes, then going up and repeating the process until the left node
10732 +   and the parent node are clean. We then perform a parent-first traversal from
10733 +   the starting point, which makes allocating in parent-first order trivial.
10734 +   After one subtree has been allocated in this manner, we move to the right,
10735 +   try moving upward, then repeat the parent-first traversal.
10736 +
10737 +   Both approaches have problems that need to be addressed. Both are
10738 +   approximately the same amount of code, but the bottom-up approach has
10739 +   advantages in the order it acquires locks which, at the very least, make it
10740 +   the better approach. At first glance each one makes the other one look
10741 +   simpler, so it is important to remember a few of the problems with each one.
10742 +
10743 +   Main problem with the top-down approach: When you encounter a clean child
10744 +   during the parent-first traversal, what do you do? You would like to avoid
10745 +   searching through a large tree of nodes just to find a few dirty leaves at
10746 +   the bottom, and there is not an obvious solution. One of the advantages of
10747 +   the top-down approach is that during the parent-first traversal you check
10748 +   every child of a parent to see if it is dirty. In this way, the top-down
10749 +   approach easily handles the main problem of the bottom-up approach:
10750 +   unallocated children.
10751 +
10752 +   The unallocated children problem is that before writing a node to disk we
10753 +   must make sure that all of its children are allocated. Otherwise, writing
10754 +   the node means extra I/O because the node will have to be written again when
10755 +   the child is finally allocated.
10756 +
10757 +   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs,
10758 +   this should not cause any file system corruption, it only degrades I/O
10759 +   performance because a node may be written when it is sure to be written at
10760 +   least one more time in the same transaction when the remaining children are
10761 +   allocated. What follows is a description of how we will solve the problem.
10762 +*/
10763 +
10764 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node,
10765 +   then proceeding in parent first order, allocate some of its left-children,
10766 +   then encounter a clean child in the middle of the parent. We do not allocate
10767 +   the clean child, but there may remain unallocated (dirty) children to the
10768 +   right of the clean child. If we were to stop flushing at this moment and
10769 +   write everything to disk, the parent might still contain unallocated
10770 +   children.
10771 +
10772 +   We could try to allocate all the descendents of every node that we allocate,
10773 +   but this is not necessary. Doing so could result in allocating the entire
10774 +   tree: if the root node is allocated then every unallocated node would have to
10775 +   be allocated before flushing. Actually, we do not have to write a node just
10776 +   because we allocate it. It is possible to allocate but not write a node
10777 +   during flush, when it still has unallocated children. However, this approach
10778 +   is probably not optimal for the following reason.
10779 +
10780 +   The flush algorithm is designed to allocate nodes in parent-first order in an
10781 +   attempt to optimize reads that occur in the same order. Thus we are
10782 +   read-optimizing for a left-to-right scan through all the leaves in the
10783 +   system, and we are hoping to write-optimize at the same time because those
10784 +   nodes will be written together in batch. What happens, however, if we assign
10785 +   a block number to a node in its read-optimized order but then avoid writing
10786 +   it because it has unallocated children? In that situation, we lose out on the
10787 +   write-optimization aspect because a node will have to be written again to
10788 +   its location on the device, later, which likely means seeking back to that
10789 +   location.
10790 +
10791 +   So there are tradeoffs. We can choose either:
10792 +
10793 +   A. Allocate all unallocated children to preserve both write-optimization and
10794 +   read-optimization, but this is not always desirable because it may mean
10795 +   having to allocate and flush very many nodes at once.
10796 +
10797 +   B. Defer writing nodes with unallocated children, keep their read-optimized
10798 +   locations, but sacrifice write-optimization because those nodes will be
10799 +   written again.
10800 +
10801 +   C. Defer writing nodes with unallocated children, but do not keep their
10802 +   read-optimized locations. Instead, choose to write-optimize them later, when
10803 +   they are written. To facilitate this, we "undo" the read-optimized allocation
10804 +   that was given to the node so that later it can be write-optimized, thus
10805 +   "unpreparing" the flush decision. This is a case where we disturb the
10806 +   FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a call to
10807 +   flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10808 +   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate
10809 +   its block location, and set the JNODE_CREATED bit, effectively setting the
10810 +   node back to an unallocated state.
10811 +
10812 +   We will take the following approach in v4.0: for twig nodes we will always
10813 +   finish allocating unallocated children (A).  For nodes with (level > TWIG)
10814 +   we will defer writing and choose write-optimization (C).
10815 +
10816 +   To summarize, there are several parts to a solution that avoids the problem
10817 +   with unallocated children:
10818 +
10819 +   FIXME-ZAM: Still no one approach is implemented to eliminate the
10820 +   "UNALLOCATED CHILDREN" problem because an experiment which was done
10821 +   showed that we have 1-2 nodes with unallocated children for thousands of
10822 +   written nodes. The experiment was simple: copying/deletion of linux kernel
10823 +   sources. However the problem can arise in more complex tests. I think we have
10824 +   jnode_io_hook to insert a check for unallocated children and see what kind of
10825 +   problem we have.
10826 +
10827 +   1. When flush reaches a stopping point (e.g. a clean node) it should continue
10828 +   calling squeeze-and-allocate on any remaining unallocated children.
10829 +   FIXME: Difficulty to implement: should be simple -- amounts to adding a while
10830 +   loop to jnode_flush, see comments in that function.
10831 +
10832 +   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes
10833 +   may still have unallocated children. If the twig level has unallocated
10834 +   children it is an assertion failure. If a higher-level node has unallocated
10835 +   children, then it should be explicitly de-allocated by a call to
10836 +   flush_unprep().
10837 +   FIXME: Difficulty to implement: should be simple.
10838 +
10839 +   3. (CPU-Optimization) Checking whether a node has unallocated children may
10840 +   consume more CPU cycles than we would like, and it is possible (but medium
10841 +   complexity) to optimize this somewhat in the case where large sub-trees are
10842 +   flushed. The following observation helps: if both the left- and
10843 +   right-neighbor of a node are processed by the flush algorithm then the node
10844 +   itself is guaranteed to have all of its children allocated. However, the cost
10845 +   of this check may not be so expensive after all: it is not needed for leaves
10846 +   and flush can guarantee this property for twigs. That leaves only (level >
10847 +   TWIG) nodes that have to be checked, so this optimization only helps if at
10848 +   least three (level > TWIG) nodes are flushed in one pass, and the savings
10849 +   will be very small unless there are many more (level > TWIG) nodes. But if
10850 +   there are many (level > TWIG) nodes then the number of blocks being written
10851 +   will be very large, so the savings may be insignificant. That said, the idea
10852 +   is to maintain both the left and right edges of nodes that are processed in
10853 +   flush.  When flush_empty_queue() is called, a relatively simple test will
10854 +   tell whether the (level > TWIG) node is on the edge. If it is on the edge,
10855 +   the slow check is necessary, but if it is in the interior then it can be
10856 +   assumed to have all of its children allocated. FIXME: medium complexity to
10857 +   implement, but simple to verify given that we must have a slow check anyway.
10858 +
10859 +   4. (Optional) This part is optional, not for v4.0--flush should work
10860 +   independently of whether this option is used or not. Called RAPID_SCAN, the
10861 +   idea is to amend the left-scan operation to take unallocated children into
10862 +   account. Normally, the left-scan operation goes left as long as adjacent
10863 +   nodes are dirty up until some large maximum value (FLUSH_SCAN_MAXNODES) at
10864 +   which point it stops and begins flushing. But scan-left may stop at a
10865 +   position where there are unallocated children to the left with the same
10866 +   parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops
10867 +   after FLUSH_RELOCATE_THRESHOLD, which is much smaller than
10868 +   FLUSH_SCAN_MAXNODES, then proceeds with a rapid scan. The rapid scan skips
10869 +   all the interior children of a node--if the leftmost child of a twig is
10870 +   dirty, check its left neighbor (the rightmost child of the twig to the left).
10871 +   If the left neighbor of the leftmost child is also dirty, then continue the
10872 +   scan at the left twig and repeat.  This option will cause flush to allocate
10873 +   more twigs in a single pass, but it also has the potential to write many more
10874 +   nodes than would otherwise be written without the RAPID_SCAN option.
10875 +   RAPID_SCAN was partially implemented, code removed August 12, 2002 by JMACD.
10876 +*/
10877 +
10878 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that
10879 +   the starting point for flush is a leaf node, but actually the flush code
10880 +   cares very little about whether or not this is true.  It is possible that all
10881 +   the leaf nodes are flushed and dirty parent nodes still remain, in which case
10882 +   jnode_flush() is called on a non-leaf argument. Flush doesn't care--it treats
10883 +   the argument node as if it were a leaf, even when it is not. This is a simple
10884 +   approach, and there may be a more optimal policy but until a problem with
10885 +   this approach is discovered, simplest is probably best.
10886 +
10887 +   NOTE: In this case, the ordering produced by flush is parent-first only if
10888 +   you ignore the leaves. This is done as a matter of simplicity and there is
10889 +   only one (shaky) justification. When an atom commits, it flushes all leaf
10890 +   level nodes first, followed by twigs, and so on. With flushing done in this
10891 +   order, if flush is eventually called on a non-leaf node it means that
10892 +   (somehow) we reached a point where all leaves are clean and only internal
10893 +   nodes need to be flushed. If that is the case, then it means there were no
10894 +   leaves that were the parent-first preceder/follower of the parent. This is
10895 +   expected to be a rare case, which is why we do nothing special about it.
10896 +   However, memory pressure may pass an internal node to flush when there are
10897 +   still dirty leaf nodes that need to be flushed, which could prove our
10898 +   original assumptions "inoperative". If this needs to be fixed, then
10899 +   scan_left/right should have special checks for the non-leaf levels. For
10900 +   example, instead of passing from a node to the left neighbor, it should pass
10901 +   from the node to the left neighbor's rightmost descendent (if dirty).
10902 +
10903 +*/
10904 +
10905 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB
10906 +   chunks, dirtying everything and putting it into a transaction. We tell the
10907 +   allocator to allocate the blocks as far as possible towards one end of the
10908 +   logical device--the left (starting) end of the device if we are walking from
10909 +   left to right, the right end of the device if we are walking from right to
10910 +   left.  We then make passes in alternating directions, and as we do this the
10911 +   device becomes sorted such that tree order and block number order fully
10912 +   correlate.
10913 +
10914 +   Resizing is done by shifting everything either all the way to the left or all
10915 +   the way to the right, and then reporting the last block.
10916 +*/
10917 +
10918 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.
10919 +   This describes the policy from the highest level:
10920 +
10921 +   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive
10922 +   nodes on the leaf level during flush-scan (right, left), then we
10923 +   unconditionally decide to relocate leaf nodes.
10924 +
10925 +   Otherwise, there are two contexts in which we make a decision to relocate:
10926 +
10927 +   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10928 +   During the initial stages of flush, after scan-right completes, we want to
10929 +   ask the question: should we relocate this leaf node and thus dirty the parent
10930 +   node. Then if the node is a leftmost child its parent is its own parent-first
10931 +   preceder, thus we repeat the question at the next level up, and so on. In
10932 +   these cases we are moving in the reverse-parent first direction.
10933 +
10934 +   There is another case which is considered the reverse direction, which comes
10935 +   at the end of a twig in reverse_relocate_end_of_twig(). As we finish
10936 +   processing a twig we may reach a point where there is a clean twig to the
10937 +   right with a dirty leftmost child. In this case, we may wish to relocate the
10938 +   child by testing if it should be relocated relative to its parent.
10939 +
10940 +   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done
10941 +   in allocate_znode. What distinguishes the forward parent-first case from the
10942 +   reverse-parent first case is that the preceder has already been allocated in
10943 +   the forward case, whereas in the reverse case we don't know what the preceder
10944 +   is until we finish "going in reverse". That simplifies the forward case
10945 +   considerably, and there we actually use the block allocator to determine
10946 +   whether, e.g., a block closer to the preceder is available.
10947 +*/
10948 +
10949 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is,
10950 +   once we finish scan-left and find a starting point, if the parent's left
10951 +   neighbor is dirty then squeeze the parent's left neighbor and the parent.
10952 +   This may change the flush-starting-node's parent. Repeat until the child's
10953 +   parent is stable. If the child is a leftmost child, repeat this left-edge
10954 +   squeezing operation at the next level up. Note that we cannot allocate
10955 +   extents during this or they will be out of parent-first order. There are also
10956 +   some difficult coordinate maintenance issues.  We can't do a tree search to
10957 +   find coordinates again (because we hold locks), we have to determine them
10958 +   from the two nodes being squeezed. Looks difficult, but has potential to
10959 +   increase space utilization. */
10960 +
10961 +/* Flush-scan helper functions. */
10962 +static void scan_init(flush_scan * scan);
10963 +static void scan_done(flush_scan * scan);
10964 +
10965 +/* Flush-scan algorithm. */
10966 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10967 +                    unsigned limit);
10968 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10969 +static int scan_common(flush_scan * scan, flush_scan * other);
10970 +static int scan_formatted(flush_scan * scan);
10971 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
10972 +static int scan_by_coord(flush_scan * scan);
10973 +
10974 +/* Initial flush-point ancestor allocation. */
10975 +static int alloc_pos_and_ancestors(flush_pos_t *pos);
10976 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos);
10977 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos);
10978 +
10979 +/* Main flush algorithm.
10980 +   Note on abbreviation: "squeeze and allocate" == "squalloc". */
10981 +static int squalloc(flush_pos_t *pos);
10982 +
10983 +/* Flush squeeze implementation. */
10984 +static int squeeze_right_non_twig(znode * left, znode * right);
10985 +static int shift_one_internal_unit(znode * left, znode * right);
10986 +
10987 +/* Flush reverse parent-first relocation routines. */
10988 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10989 +                                           const reiser4_block_nr * nblk);
10990 +static int reverse_relocate_test(jnode * node, const coord_t *parent_coord,
10991 +                                flush_pos_t *pos);
10992 +static int reverse_relocate_check_dirty_parent(jnode * node,
10993 +                                              const coord_t *parent_coord,
10994 +                                              flush_pos_t *pos);
10995 +
10996 +/* Flush allocate write-queueing functions: */
10997 +static int allocate_znode(znode * node, const coord_t *parent_coord,
10998 +                         flush_pos_t *pos);
10999 +static int allocate_znode_update(znode * node, const coord_t *parent_coord,
11000 +                                flush_pos_t *pos);
11001 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11002 +
11003 +/* Flush helper functions: */
11004 +static int jnode_lock_parent_coord(jnode * node,
11005 +                                  coord_t *coord,
11006 +                                  lock_handle * parent_lh,
11007 +                                  load_count * parent_zh,
11008 +                                  znode_lock_mode mode, int try);
11009 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11010 +                          znode_lock_mode mode, int check_dirty, int expected);
11011 +static int znode_same_parents(znode * a, znode * b);
11012 +/* znode flavour of jnode_check_flushprepped(): checks ZJNODE(@node) */
11013 +static int znode_check_flushprepped(znode * node)
11014 +{
11015 +       return jnode_check_flushprepped(ZJNODE(node));
11016 +}
11017 +
11018 +/* Flush position functions */
11019 +static void pos_init(flush_pos_t *pos);
11020 +static int pos_valid(flush_pos_t *pos);
11021 +static void pos_done(flush_pos_t *pos);
11022 +static int pos_stop(flush_pos_t *pos);
11023 +
11024 +/* check that @scan->node is the first jnode of its extent unit if the extent
11025 + * is unallocated: jnodes of such an extent are all dirty and of one atom. */
11026 +#define checkchild(scan)                                               \
11027 +assert("nikita-3435", ergo((scan)->direction == LEFT_SIDE &&           \
11028 +           (scan)->parent_coord.node->level == TWIG_LEVEL &&           \
11029 +           jnode_is_unformatted((scan)->node) &&                       \
11030 +           extent_is_unallocated(&(scan)->parent_coord),               \
11031 +           extent_unit_index(&(scan)->parent_coord) ==                 \
11032 +           index_jnode((scan)->node)))
11033 +
11034 +/* flush_cnt tracks the number of concurrent flush operations, which is useful
11035 +   for debugging; its definition is wrapped in ON_DEBUG(). It is initialized in
11036 +   txnmgr.c out of laziness (flush has no static initializer function...) */
11037 +ON_DEBUG(atomic_t flush_cnt;
11038 +    )
11039 +
11040 +/* tell whether the backing device of the current fs is write-congested */
11041 +static int check_write_congestion(void)
11042 +{
11043 +       struct super_block *super = reiser4_get_current_sb();
11044 +       struct backing_dev_info *info;
11045 +
11046 +       info = reiser4_get_super_fake(super)->i_mapping->backing_dev_info;
11047 +
11048 +       return bdi_write_congested(info);
11049 +}
11050 +
11051 +/* write the flush queue, unless writing is disabled or device is congested */
11052 +static int write_prepped_nodes(flush_pos_t *pos)
11053 +{
11054 +       int may_write;
11055 +
11056 +       assert("zam-831", pos);
11057 +       assert("zam-832", pos->fq);
11058 +
11059 +       may_write = (pos->flags & JNODE_FLUSH_WRITE_BLOCKS) &&
11060 +                   !check_write_congestion();
11061 +       if (!may_write)
11062 +               return 0;
11063 +
11064 +       /* both checks passed: write the queue and report its result */
11065 +       return reiser4_write_fq(pos->fq, pos->nr_written,
11066 +                               WRITEOUT_SINGLE_STREAM |
11067 +                               WRITEOUT_FOR_PAGE_RECLAIM);
11068 +}
11069 +
11070 +/* Release everything the flush position currently holds, then re-point it at
11071 +   the newly locked node (at @new_coord, or at its first unit if NULL) */
11072 +static void move_flush_pos(flush_pos_t *pos, lock_handle * new_lock,
11073 +                          load_count * new_load, const coord_t *new_coord)
11074 +{
11075 +       assert("zam-857", new_lock->node == new_load->node);
11076 +
11077 +       if (new_coord == NULL) {
11078 +               coord_init_first_unit(&pos->coord, new_lock->node);
11079 +       } else {
11080 +               assert("zam-858", new_coord->node == new_lock->node);
11081 +               coord_dup(&pos->coord, new_coord);
11082 +       }
11083 +
11084 +       if (pos->child != NULL) {
11085 +               jput(pos->child);
11086 +               pos->child = NULL;
11087 +       }
11088 +
11089 +       move_load_count(&pos->load, new_load);
11090 +       done_lh(&pos->lock);
11091 +       move_lh(&pos->lock, new_lock);
11092 +}
11093 +
11094 +/* remove an empty, write-locked node whose link from the parent still exists */
11095 +static int delete_empty_node(znode * node)
11096 +{
11097 +       reiser4_key removed_key;
11098 +
11099 +       assert("zam-1019", node != NULL);
11100 +       assert("zam-1020", node_is_empty(node));
11101 +       assert("zam-1023", znode_is_wlocked(node));
11102 +
11103 +       return reiser4_delete_node(node, &removed_key, NULL, 1);
11104 +}
11105 +
11106 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11107 +static int prepare_flush_pos(flush_pos_t *pos, jnode * org)
11108 +{
11109 +       int ret;
11110 +       load_count load;
11111 +       lock_handle lock;
11112 +
11113 +       init_lh(&lock);
11114 +       init_load_count(&load);
11115 +
11116 +       if (jnode_is_znode(org)) {
11117 +               ret = longterm_lock_znode(&lock, JZNODE(org),
11118 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11119 +               if (ret)
11120 +                       return ret;
11121 +               /* the node is locked now: failures must unlock via done */
11122 +               ret = incr_load_count_znode(&load, JZNODE(org));
11123 +               if (ret)
11124 +                       goto done;
11125 +
11126 +               pos->state =
11127 +                   (jnode_get_level(org) ==
11128 +                    LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11129 +               move_flush_pos(pos, &lock, &load, NULL);
11130 +       } else {
11131 +               coord_t parent_coord;
11132 +               ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11133 +                                             &load, ZNODE_WRITE_LOCK, 0);
11134 +               if (ret)
11135 +                       goto done;
11136 +               if (!item_is_extent(&parent_coord)) {
11137 +                       /* file was converted to tail, org became HB, we found
11138 +                          internal item */
11139 +                       ret = -EAGAIN;
11140 +                       goto done;
11141 +               }
11142 +
11143 +               pos->state = POS_ON_EPOINT;
11144 +               move_flush_pos(pos, &lock, &load, &parent_coord);
11145 +               pos->child = jref(org);
11146 +               if (extent_is_unallocated(&parent_coord)
11147 +                   && extent_unit_index(&parent_coord) != index_jnode(org)) {
11148 +                       /* @org is not first child of its parent unit. This may
11149 +                          happen because longterm lock of its parent node was
11150 +                          released between scan_left and scan_right. For now
11151 +                          work around this having flush to repeat */
11152 +                       ret = -EAGAIN;
11153 +               }
11154 +       }
11155 +
11156 +done:
11157 +       done_load_count(&load);
11158 +       done_lh(&lock);
11159 +       return ret;
11160 +}
11161 +
11162 +/* TODO LIST (no particular order): */
11163 +/* I have labelled most of the legitimate FIXME comments in this file with
11164 +   letters to indicate which issue they relate to. There are a few miscellaneous
11165 +   FIXMEs with specific names mentioned instead that need to be
11166 +   inspected/resolved. */
11167 +/* B. There is an issue described in reverse_relocate_test having to do with an
11168 +   imprecise is_preceder? check having to do with partially-dirty extents. The
11169 +   code that sets preceder hints and computes the preceder is basically
11170 +   untested. Careful testing needs to be done that preceder calculations are
11171 +   done correctly, since if it doesn't affect correctness we will not catch this
11172 +   stuff during regular testing. */
11173 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of
11174 +   these are considered expected but unlikely conditions. Flush currently
11175 +   returns 0 (i.e., success but no progress, i.e., restart) whenever it receives
11176 +   any of these in jnode_flush(). Many of the calls that may produce one of
11177 +   these return values (i.e., longterm_lock_znode, reiser4_get_parent,
11178 +   reiser4_get_neighbor, ...) check some of these values themselves and, for
11179 +   instance, stop flushing instead of resulting in a restart. If any of these
11180 +   results are true error conditions then flush will go into a busy-loop, as we
11181 +   noticed during testing when a corrupt tree caused find_child_ptr to return
11182 +   ENOENT. It needs careful thought and testing of corner conditions.
11183 +*/
11184 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a
11185 +   created block is assigned a block number then early-flushed to disk. It is
11186 +   dirtied again and flush is called again. Concurrently, that block is deleted,
11187 +   and the de-allocation of its block number does not need to be deferred, since
11188 +   it is not part of the preserve set (i.e., it didn't exist before the
11189 +   transaction). I think there may be a race condition where flush writes the
11190 +   dirty, created block after the non-deferred deallocated block number is
11191 +   re-allocated, making it possible to write deleted data on top of non-deleted
11192 +   data. It's just a theory, but it needs to be thought out. */
11193 +/* F. bio_alloc() failure is not handled gracefully. */
11194 +/* G. Unallocated children. */
11195 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered
11196 +   blocks. */
11197 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11198 +
11199 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11200 +/* This is the main entry point for flushing a jnode and its dirty neighborhood
11201 +   (dirty neighborhood is named "slum"). Jnode_flush() is called if reiser4 has
11202 +   to write dirty blocks to disk; this happens when the Linux VM decides to
11203 +   reduce the number of dirty pages or as a part of transaction commit.
11204 +
11205 +   Our objective here is to prep and flush the slum the jnode belongs to. We
11206 +   want to squish the slum together, and allocate the nodes in it as we squish
11207 +   because allocation of children affects squishing of parents.
11208 +
11209 +   The "argument" @node tells flush where to start. From there, flush finds the
11210 +   left edge of the slum, and calls squalloc (in which nodes are squeezed and
11211 +   allocated). To find a "better place" to start squalloc first we perform a
11212 +   flush_scan.
11213 +
11214 +   Flush-scanning may be performed in both left and right directions, but for
11215 +   different purposes. When scanning to the left, we are searching for a node
11216 +   that precedes a sequence of parent-first-ordered nodes which we will then
11217 +   flush in parent-first order. During flush-scanning, we also take the
11218 +   opportunity to count the number of consecutive leaf nodes. If this number is
11219 +   past some threshold (FLUSH_RELOCATE_THRESHOLD), then we make a decision to
11220 +   reallocate leaf nodes (thus favoring write-optimization).
11221 +
11222 +   Since the flush argument node can be anywhere in a sequence of dirty leaves,
11223 +   there may also be dirty nodes to the right of the argument. If the scan-left
11224 +   operation does not count at least FLUSH_RELOCATE_THRESHOLD nodes then we
11225 +   follow it with a right-scan operation to see whether there are, in fact,
11226 +   enough nodes to meet the relocate threshold. Each right- and left-scan
11227 +   operation uses a single flush_scan object.
11228 +
11229 +   After left-scan and possibly right-scan, we prepare a flush_position object
11230 +   with the starting flush point or parent coordinate, which was determined
11231 +   using scan-left.
11232 +
11233 +   Next we call the main flush routine, squalloc, which iterates along the leaf
11234 +   level, squeezing and allocating nodes (and placing them into the flush
11235 +   queue).
11236 +
11237 +   After squalloc returns we take extra steps to ensure that all the children
11238 +   of the final twig node are allocated--this involves repeating squalloc
11239 +   until we finish at a twig with no unallocated children.
11240 +
11241 +   Finally, we call flush_empty_queue to submit write-requests to disk. If we
11242 +   encounter any above-twig nodes during flush_empty_queue that still have
11243 +   unallocated children, we flush_unprep them.
11244 +
11245 +   Flush treats several "failure" cases as non-failures, essentially causing
11246 +   them to start over. E_DEADLOCK is one example.
11247 +   FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should probably be handled
11248 +   properly rather than restarting, but there are a bunch of cases to audit.
11249 +*/
11250 +
11251 +static int
11252 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11253 +           flush_queue_t *fq, int flags)
11254 +{
11255 +       long ret = 0;   /* NOTE(review): declared long; function returns int */
11256 +       flush_scan *right_scan;
11257 +       flush_scan *left_scan;
11258 +       flush_pos_t *flush_pos;
11259 +       int todo;
11260 +       struct super_block *sb;
11261 +       reiser4_super_info_data *sbinfo;
11262 +       jnode *leftmost_in_slum = NULL;
11263 +
11264 +       assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11265 +       assert("nikita-3022", reiser4_schedulable());
11266 +
11267 +       assert("nikita-3185",
11268 +              get_current_super_private()->delete_mutex_owner != current);
11269 +
11270 +       /* one allocation holds right_scan, left_scan and flush_pos */
11271 +       right_scan =
11272 +           kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11273 +                   reiser4_ctx_gfp_mask_get());
11274 +       if (right_scan == NULL)
11275 +               return RETERR(-ENOMEM);
11276 +       left_scan = right_scan + 1;
11277 +       flush_pos = (flush_pos_t *) (left_scan + 1);
11278 +
11279 +       sb = reiser4_get_current_sb();
11280 +       sbinfo = get_super_private(sb);
11281 +
11282 +       /* Flush-concurrency debug code */
11283 +#if REISER4_DEBUG
11284 +       atomic_inc(&flush_cnt);
11285 +#endif
11286 +
11287 +       reiser4_enter_flush(sb);
11288 +
11289 +       /* Initialize a flush position. */
11290 +       pos_init(flush_pos);
11291 +
11292 +       flush_pos->nr_written = nr_written;
11293 +       flush_pos->fq = fq;
11294 +       flush_pos->flags = flags;
11295 +       flush_pos->nr_to_write = nr_to_write;
11296 +
11297 +       scan_init(right_scan);
11298 +       scan_init(left_scan);
11299 +
11300 +       /* First scan left and remember the leftmost scan position. If the
11301 +          leftmost position is unformatted we remember its parent_coord. We
11302 +          scan until counting FLUSH_SCAN_MAXNODES.
11303 +
11304 +          If starting @node is unformatted, at the beginning of left scan its
11305 +          parent (twig level node, containing extent item) will be long term
11306 +          locked and lock handle will be stored in the
11307 +          @right_scan->parent_lock. This lock is used to start the rightward
11308 +          scan without redoing the tree traversal (necessary to find parent)
11309 +          and, hence, is kept during leftward scan. As a result, we have to
11310 +          use try-lock when taking long term locks during the leftward scan.
11311 +        */
11312 +       ret = scan_left(left_scan, right_scan,
11313 +                       node, sbinfo->flush.scan_maxnodes);
11314 +       if (ret != 0)
11315 +               goto failed;
11316 +
11317 +       leftmost_in_slum = jref(left_scan->node);
11318 +       scan_done(left_scan);
11319 +
11320 +       /* Then possibly go right to decide if we will use a policy of
11321 +          relocating leaves. This is only done if we did not scan past (and
11322 +          count) enough nodes during the leftward scan. If we do scan right,
11323 +          we only care to go far enough to establish that at least
11324 +          FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The scan
11325 +          limit is the difference between left_scan.count and the threshold. */
11326 +
11327 +       todo = sbinfo->flush.relocate_threshold - left_scan->count;
11328 +       /* scan right is inherently deadlock prone, because we are
11329 +        * (potentially) holding a lock on the twig node at this moment.
11330 +        * FIXME: this is incorrect comment: lock is not held */
11331 +       if (todo > 0) {
11332 +               ret = scan_right(right_scan, node, (unsigned)todo);
11333 +               if (ret != 0)
11334 +                       goto failed;
11335 +       }
11336 +
11337 +       /* Only the right-scan count is needed, release any rightward locks
11338 +          right away. */
11339 +       scan_done(right_scan);
11340 +
11341 +       /* ... and the answer is: we should relocate leaf nodes if at least
11342 +          FLUSH_RELOCATE_THRESHOLD nodes were found. */
11343 +       flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11344 +           (left_scan->count + right_scan->count >=
11345 +            sbinfo->flush.relocate_threshold);
11346 +
11347 +       /* Funny business here.  We set the 'point' in the flush_position
11348 +          prior to starting squalloc regardless of whether the first point is
11349 +          formatted or unformatted. Without this there would be an invariant,
11350 +          in the rest of the code, that if the flush_position is unformatted
11351 +          then flush_position->point is NULL and
11352 +          flush_position->parent_{lock,coord} is set, and if the flush_position
11353 +          is formatted then flush_position->point is non-NULL and no parent
11354 +          info is set.
11355 +
11356 +          This seems lazy, but it makes the initial calls to
11357 +          reverse_relocate_test (which ask "is it the pos->point the leftmost
11358 +          child of its parent") much easier because we know the first child
11359 +          already.  Nothing is broken by this, but the reasoning is subtle.
11360 +          Holding an extra reference on a jnode during flush can cause us to
11361 +          see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11362 +          removed from sibling lists until they have zero reference count.
11363 +          Flush would never observe a HEARD_BANSHEE node on the left-edge of
11364 +          flush, nodes are only deleted to the right. So if nothing is broken,
11365 +          why fix it?
11366 +
11367 +          NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11368 +          point and in any moment, because of the concurrent file system
11369 +          activity (for example, truncate). */
11370 +
11371 +       /* Check jnode state after flush_scan completed. Having a lock on this
11372 +          node or its parent (in case of unformatted) helps us in case of
11373 +          concurrent flushing. */
11374 +       if (jnode_check_flushprepped(leftmost_in_slum)
11375 +           && !jnode_convertible(leftmost_in_slum)) {
11376 +               ret = 0;
11377 +               goto failed;
11378 +       }
11379 +
11380 +       /* Now setup flush_pos using scan_left's endpoint. */
11381 +       ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11382 +       if (ret)
11383 +               goto failed;
11384 +
11385 +       if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11386 +           && node_is_empty(flush_pos->coord.node)) {
11387 +               znode *empty = flush_pos->coord.node;
11388 +
11389 +               assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11390 +               ret = delete_empty_node(empty);
11391 +               goto failed;
11392 +       }
11393 +
11394 +       if (jnode_check_flushprepped(leftmost_in_slum)
11395 +           && !jnode_convertible(leftmost_in_slum)) {
11396 +               ret = 0;
11397 +               goto failed;
11398 +       }
11399 +
11400 +       /* Set pos->preceder and (re)allocate pos and its ancestors if it is
11401 +          needed  */
11402 +       ret = alloc_pos_and_ancestors(flush_pos);
11403 +       if (ret)
11404 +               goto failed;
11405 +
11406 +       /* Do the main rightward-bottom-up squeeze and allocate loop. */
11407 +       ret = squalloc(flush_pos);
11408 +       pos_stop(flush_pos);
11409 +       if (ret)
11410 +               goto failed;
11411 +
11412 +       /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated
11413 +          children. First, the pos_stop() and pos_valid() routines should be
11414 +          modified so that pos_stop() sets a flush_position->stop flag to 1
11415 +          without releasing the current position immediately--instead release
11416 +          it in pos_done(). This is a better implementation than the current
11417 +          one anyway.
11418 +
11419 +          It is not clear that all fields of the flush_position should not be
11420 +          released, but at the very least the parent_lock, parent_coord, and
11421 +          parent_load should remain held because they hold the last twig
11422 +          when pos_stop() is called.
11423 +
11424 +          When we reach this point in the code, if the parent_coord is set to
11425 +          after the last item then we know that flush reached the end of a twig
11426 +          (and according to the new flush queueing design, we will return now).
11427 +          If parent_coord is not past the last item, we should check if the
11428 +          current twig has any unallocated children to the right (we are not
11429 +          concerned with unallocated children to the left--in that case the
11430 +          twig itself should not have been allocated). If the twig has
11431 +          unallocated children to the right, set the parent_coord to that
11432 +          position and then repeat the call to squalloc.
11433 +
11434 +          Testing for unallocated children may be defined in two ways: if any
11435 +          internal item has a fake block number, it is unallocated; if any
11436 +          extent item is unallocated then all of its children are unallocated.
11437 +          But there is a more aggressive approach: if there are any dirty
11438 +          children of the twig to the right of the current position, we may
11439 +          wish to relocate those nodes now. Checking for potential relocation
11440 +          is more expensive as it requires knowing whether there are any dirty
11441 +          children that are not unallocated. The extent_needs_allocation should
11442 +          be used after setting the correct preceder.
11443 +
11444 +          When we reach the end of a twig at this point in the code, if the
11445 +          flush can continue (when the queue is ready) it will need some
11446 +          information on the future starting point. That should be stored away
11447 +          in the flush_handle using a seal, I believe. Holding a jref() on the
11448 +          future starting point may break other code that deletes that node.
11449 +        */
11450 +
11451 +       /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is
11452 +          called above the twig level.  If the VM calls flush above the twig
11453 +          level, do nothing and return (but figure out why this happens). The
11454 +          txnmgr should be modified to only flush its leaf-level dirty list.
11455 +          This will do all the necessary squeeze and allocate steps but leave
11456 +          unallocated branches and possibly unallocated twigs (when the twig's
11457 +          leftmost child is not dirty). After flushing the leaf level, the
11458 +          remaining unallocated nodes should be given write-optimized
11459 +          locations. (Possibly, the remaining unallocated twigs should be
11460 +          allocated just before their leftmost child.)
11461 +        */
11462 +
11463 +       /* Any failure reaches this point, and so does the success path. */
11464 +failed:
11465 +
11466 +       switch (ret) {
11467 +       case -E_REPEAT:
11468 +       case -EINVAL:
11469 +       case -E_DEADLOCK:
11470 +       case -E_NO_NEIGHBOR:
11471 +       case -ENOENT:
11472 +               /* FIXME(C): Except for E_DEADLOCK, these should probably be
11473 +                  handled properly in each case. They already are handled in
11474 +                  many cases. */
11475 +               /* Something bad happened, but difficult to avoid... Try again!
11476 +               */
11477 +               ret = 0;
11478 +       }
11479 +
11480 +       if (leftmost_in_slum)
11481 +               jput(leftmost_in_slum);
11482 +
11483 +       pos_done(flush_pos);
11484 +       scan_done(left_scan);
11485 +       scan_done(right_scan);
11486 +       kfree(right_scan);
11487 +
11488 +       ON_DEBUG(atomic_dec(&flush_cnt));
11489 +
11490 +       reiser4_leave_flush(sb);
11491 +
11492 +       return ret;
11493 +}
11494 +
11495 +/* The reiser4 flush subsystem can be switched into "rapid flush mode", in which
11496 + * the flusher should submit all prepped nodes immediately without keeping them
11497 + * in flush queues for a long time.  The reason for rapid flush mode is to free
11498 + * memory as fast as possible. */
11499 +
11500 +#if REISER4_USE_RAPID_FLUSH
11501 +
11502 +/**
11503 + * Submit all prepped nodes with write_prepped_nodes() if wbq_available(),
11504 + * else return 0. NOTE(review): no "turn rapid flush mode off" step is seen here.
11505 + */
11506 +
11507 +static int rapid_flush(flush_pos_t *pos)
11508 +{
11509 +       if (!wbq_available())
11510 +               return 0;
11511 +
11512 +       return write_prepped_nodes(pos);
11513 +}
11514 +
11515 +#else
11516 +
11517 +#define rapid_flush(pos) (0)
11518 +
11519 +#endif                         /* REISER4_USE_RAPID_FLUSH */
11520 +
11521 +static jnode *find_flush_start_jnode(jnode *start, txn_atom * atom,
11522 +                                    flush_queue_t *fq, int *nr_queued,
11523 +                                    int flags)
11524 +{
11525 +       jnode * node;
11526 +
11527 +       if (start != NULL) {
11528 +               spin_lock_jnode(start);
11529 +               if (!jnode_is_flushprepped(start)) {
11530 +                       assert("zam-1056", start->atom == atom);
11531 +                       node = start;
11532 +                       goto enter;
11533 +               }
11534 +               spin_unlock_jnode(start);
11535 +       }
11536 +       /*
11537 +        * In this loop we process all nodes that were already prepped (RELOC
11538 +        * or OVRWR) and then dirtied again. The atom spin lock is not released
11539 +        * until all dirty nodes are processed or a not-yet-prepped node is
11540 +        * found in the atom dirty lists.
11541 +        */
11542 +       while ((node = find_first_dirty_jnode(atom, flags))) {
11543 +               spin_lock_jnode(node);
11544 +enter:
11545 +               assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11546 +               assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11547 +
11548 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
11549 +                       /* move node to the end of atom's writeback list */
11550 +                       list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11551 +
11552 +                       /*
11553 +                        * jnode is not necessarily on dirty list: if it was
11554 +                        * dirtied when it was on flush queue - it does not get
11555 +                        * moved to dirty list
11556 +                        */
11557 +                       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11558 +                                            WB_LIST, 1));
11559 +
11560 +               } else if (jnode_is_znode(node)
11561 +                          && znode_above_root(JZNODE(node))) {
11562 +                       /*
11563 +                        * A special case for znode-above-root. The above-root
11564 +                        * (fake) znode is captured and dirtied when the tree
11565 +                        * height changes or when the root node is relocated.
11566 +                        * This causes atoms to fuse so that changes at the root
11567 +                        * are serialized.  However, this node is never flushed.
11568 +                        * This special case used to be in lock.c to prevent the
11569 +                        * above-root node from ever being captured, but now
11570 +                        * that it is captured we simply prevent it from
11571 +                        * flushing. The log-writer code relies on this to
11572 +                        * properly log superblock modifications of the tree
11573 +                        * height.
11574 +                        */
11575 +                       jnode_make_wander_nolock(node);
11576 +               } else if (JF_ISSET(node, JNODE_RELOC)) {
11577 +                       queue_jnode(fq, node);
11578 +                       ++(*nr_queued);
11579 +               } else
11580 +                       break;  /* leaves @node spin-locked for the caller */
11581 +
11582 +               spin_unlock_jnode(node);
11583 +       }
11584 +       return node;    /* NULL, or a jnode left spin-locked by the break */
11585 +}
11586 +
11587 +/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if
11588 + * there are more nodes to flush, return 0 if the atom's dirty lists are empty
11589 + * (keeping the current atom locked), return other errors as they are. */
11590 +int
11591 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11592 +                  txn_atom ** atom, jnode *start)
11593 +{
11594 +       reiser4_super_info_data *sinfo = get_current_super_private();
11595 +       flush_queue_t *fq = NULL;
11596 +       jnode *node;
11597 +       int nr_queued;
11598 +       int ret;
11599 +
11600 +       assert("zam-889", atom != NULL && *atom != NULL);
11601 +       assert_spin_locked(&((*atom)->alock));
11602 +       assert("zam-892", get_current_context()->trans->atom == *atom);
11603 +
11604 +       nr_to_write = LONG_MAX; /* NOTE(review): the caller's value is overridden */
11605 +       while (1) {
11606 +               ret = reiser4_fq_by_atom(*atom, &fq);
11607 +               if (ret != -E_REPEAT)
11608 +                       break;
11609 +               *atom = get_current_atom_locked();
11610 +       }
11611 +       if (ret)
11612 +               return ret;
11613 +
11614 +       assert_spin_locked(&((*atom)->alock));
11615 +
11616 +       /* parallel flushers limit */
11617 +       if (sinfo->tmgr.atom_max_flushers != 0) {
11618 +               while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11619 +                       /* A reiser4_atom_send_event() call is inside
11620 +                          reiser4_fq_put_nolock() which is called when flush is
11621 +                          finished and nr_flushers is decremented. */
11622 +                       reiser4_atom_wait_event(*atom);
11623 +                       *atom = get_current_atom_locked();
11624 +               }
11625 +       }
11626 +
11627 +       /* count ourselves as a flusher */
11628 +       (*atom)->nr_flushers++;
11629 +
11630 +       writeout_mode_enable();
11631 +
11632 +       nr_queued = 0;
11633 +       node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11634 +
11635 +       if (node == NULL) {
11636 +               if (nr_queued == 0) {
11637 +                       (*atom)->nr_flushers--;
11638 +                       reiser4_fq_put_nolock(fq);
11639 +                       reiser4_atom_send_event(*atom);
11640 +                       /* current atom remains locked */
11641 +                       writeout_mode_disable();
11642 +                       return 0;
11643 +               }
11644 +               spin_unlock_atom(*atom);
11645 +       } else {
11646 +               jref(node);
11647 +               BUG_ON((*atom)->super != node->tree->super);
11648 +               spin_unlock_atom(*atom);
11649 +               spin_unlock_jnode(node);
11650 +               BUG_ON(nr_to_write == 0);
11651 +               ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11652 +               jput(node);
11653 +       }
11654 +
11655 +       ret =   /* NOTE(review): overwrites any jnode_flush() result from above */
11656 +           reiser4_write_fq(fq, nr_submitted,
11657 +                    WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11658 +
11659 +       *atom = get_current_atom_locked();
11660 +       (*atom)->nr_flushers--;
11661 +       reiser4_fq_put_nolock(fq);
11662 +       reiser4_atom_send_event(*atom);
11663 +       spin_unlock_atom(*atom);
11664 +
11665 +       writeout_mode_disable();
11666 +
11667 +       if (ret == 0)
11668 +               ret = -E_REPEAT;
11669 +
11670 +       return ret;
11671 +}
11672 +
11673 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11674 +
11675 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation
11676 +   in the reverse parent-first relocate context. Here all we know is the
11677 +   preceder and the block number. Since we are going in reverse, the preceder
11678 +   may still be relocated as well, so we can't ask the block allocator "is there
11679 +   a closer block available to relocate?" here. In the _forward_ parent-first
11680 +   relocate context (not here) we actually call the block allocator to try and
11681 +   find a closer location. */
11682 +static int
11683 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11684 +                                const reiser4_block_nr * nblk)
11685 +{
11686 +       reiser4_block_nr dist;
11687 +
11688 +       assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11689 +       assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11690 +       assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11691 +
11692 +       /* Distance is the absolute value. */
11693 +       dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11694 +
11695 +       /* If the block is at most flush.relocate_distance blocks away from
11696 +          its preceder block, do not relocate. */
11697 +       if (dist <= get_current_super_private()->flush.relocate_distance)
11698 +               return 0;       /* close enough: do not relocate */
11699 +
11700 +       return 1;       /* too far from the preceder: relocate */
11701 +}
11702 +
11703 +/* Predicate that tests for relocation: returns 1 to relocate, 0 otherwise.
11704 +   Always called in the reverse-parent-first context, when we are asking whether
11705 +   the current node should be relocated in order to expand the flush by dirtying
11706 +   the parent level (and thus proceeding to flush that level). In the forward
11707 +   parent-first direction (not here), relocation decisions are handled in two
11708 +   places: allocate_znode() and extent_needs_allocation(). */
11709 +static int
11710 +reverse_relocate_test(jnode * node, const coord_t *parent_coord,
11711 +                     flush_pos_t *pos)
11712 +{
11713 +       reiser4_block_nr pblk = 0;
11714 +       reiser4_block_nr nblk = 0;
11715 +
11716 +       assert("jmacd-8989", !jnode_is_root(node));
11717 +
11718 +       /*
11719 +        * This function is called only from
11720 +        * reverse_relocate_check_dirty_parent() and only if the parent
11721 +        * node is clean. This implies that the parent has the real (i.e., not
11722 +        * fake) block number, and, so does the child, because otherwise the
11723 +        * parent would be dirty.
11724 +        */
11725 +
11726 +       /* New nodes are treated as if they are being relocated. */
11727 +       if (JF_ISSET(node, JNODE_CREATED) ||
11728 +           (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL))
11729 +               return 1;
11730 +
11731 +       /* Find the preceder. FIXME(B): When the child is an unformatted,
11732 +          previously existing node, the coord may be leftmost even though the
11733 +          child is not the parent-first preceder of the parent. If the first
11734 +          dirty node appears somewhere in the middle of the first extent unit,
11735 +          this preceder calculation is wrong.
11736 +          Needs more logic in here. */
11737 +       if (coord_is_leftmost_unit(parent_coord)) {
11738 +               pblk = *znode_get_block(parent_coord->node);
11739 +       } else {
11740 +               pblk = pos->preceder.blk;
11741 +       }
11742 +       check_preceder(pblk);
11743 +
11744 +       /* If (pblk == 0) then the preceder isn't allocated or isn't known:
11745 +          relocate. */
11746 +       if (pblk == 0)
11747 +               return 1;
11748 +
11749 +       nblk = *jnode_get_block(node);
11750 +
11751 +       if (reiser4_blocknr_is_fake(&nblk))
11752 +               /* child is unallocated, mark parent dirty */
11753 +               return 1;
11754 +
11755 +       return reverse_relocate_if_close_enough(&pblk, &nblk);
11756 +}
11757 +
11758 +/* For a clean parent only: ask reverse_relocate_test() whether to relocate
11759 +   @node (reverse-parent-first) and, if yes, mark the parent dirty. */
11760 +static int
11761 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t *parent_coord,
11762 +                                   flush_pos_t *pos)
11763 +{
11764 +       int ret;
11765 +
11766 +       if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11767 +
11768 +               ret = reverse_relocate_test(node, parent_coord, pos);
11769 +               if (ret < 0)
11770 +                       return ret;
11771 +
11772 +               /* FIXME-ZAM
11773 +                 if parent is already relocated - we do not want to grab space,
11774 +                 right? */
11775 +               if (ret == 1) {
11776 +                       int grabbed;
11777 +
11778 +                       grabbed = get_current_context()->grabbed_blocks;
11779 +                       if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11780 +                           0)
11781 +                               reiser4_panic("umka-1250",
11782 +                                             "No space left during flush.");
11783 +
11784 +                       assert("jmacd-18923",
11785 +                              znode_is_write_locked(parent_coord->node));
11786 +                       znode_make_dirty(parent_coord->node);
11787 +                       grabbed2free_mark(grabbed);
11788 +               }
11789 +       }
11790 +
11791 +       return 0;       /* parent already dirty, or decision handled above */
11792 +}
11793 +
11794 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE
11795 +   FORWARD PARENT-FIRST LOOP BEGINS) */
11796 +
11797 +/* Store the leftmost child of @coord's unit in @child; 0 on success. */
11798 +static int get_leftmost_child_of_unit(const coord_t *coord, jnode ** child)
11799 +{
11800 +       int ret;
11801 +
11802 +       ret = item_utmost_child(coord, LEFT_SIDE, child);
11803 +
11804 +       if (ret)
11805 +               return ret;
11806 +
11807 +       if (IS_ERR(*child))     /* failure may also arrive as an ERR_PTR */
11808 +               return PTR_ERR(*child);
11809 +
11810 +       return 0;
11811 +}
11812 +
11813 +/* This step occurs after the left- and right-scans are completed, before
11814 +   starting the forward parent-first traversal. Here we attempt to allocate
11815 +   ancestors of the starting flush point, which means continuing in the reverse
11816 +   parent-first direction to the parent, grandparent, and so on (as long as the
11817 +   child is a leftmost child). This routine calls a recursive process,
11818 +   alloc_one_ancestor, which does the real work, except there is special-case
11819 +   handling here for the first ancestor, which may be a twig. At each level
11820 +   (here and alloc_one_ancestor), we check for relocation and then, if the child
11821 +   is a leftmost child, repeat at the next level. On the way back down (the
11822 +   recursion), we allocate the ancestors in parent-first order. */
11823 +static int alloc_pos_and_ancestors(flush_pos_t *pos)
11824 +{
11825 +       int ret = 0;
11826 +       lock_handle plock;
11827 +       load_count pload;
11828 +       coord_t pcoord;
11829 +
11830 +       if (znode_check_flushprepped(pos->lock.node))
11831 +               return 0;       /* already flushprepped: nothing to do here */
11832 +
11833 +       coord_init_invalid(&pcoord, NULL);
11834 +       init_lh(&plock);
11835 +       init_load_count(&pload);
11836 +
11837 +       if (pos->state == POS_ON_EPOINT) {
11838 +               /* a special case for pos on twig level, where we already have
11839 +                  a lock on parent node. */
11840 +               /* The parent may not be dirty, in which case we should decide
11841 +                  whether to relocate the child now. If decision is made to
11842 +                  relocate the child, the parent is marked dirty. */
11843 +               ret =
11844 +                   reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11845 +                                                       pos);
11846 +               if (ret)
11847 +                       goto exit;
11848 +
11849 +               /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11850 +                  is leftmost) and the leaf/child, so recursion is not needed.
11851 +                  Levels above the twig will be allocated for
11852 +                  write-optimization before the transaction commits.  */
11853 +
11854 +               /* Do the recursive step, allocating zero or more of our
11855 +                * ancestors. */
11856 +               ret = alloc_one_ancestor(&pos->coord, pos);
11857 +
11858 +       } else {
11859 +               if (!znode_is_root(pos->lock.node)) {
11860 +                       /* all formatted nodes except tree root */
11861 +                       ret =
11862 +                           reiser4_get_parent(&plock, pos->lock.node,
11863 +                                              ZNODE_WRITE_LOCK);
11864 +                       if (ret)
11865 +                               goto exit;
11866 +
11867 +                       ret = incr_load_count_znode(&pload, plock.node);
11868 +                       if (ret)
11869 +                               goto exit;
11870 +
11871 +                       ret =
11872 +                           find_child_ptr(plock.node, pos->lock.node, &pcoord);
11873 +                       if (ret)
11874 +                               goto exit;
11875 +
11876 +                       ret =
11877 +                           reverse_relocate_check_dirty_parent(ZJNODE
11878 +                                                               (pos->lock.
11879 +                                                                node), &pcoord,
11880 +                                                               pos);
11881 +                       if (ret)
11882 +                               goto exit;
11883 +
11884 +                       ret = alloc_one_ancestor(&pcoord, pos);
11885 +                       if (ret)
11886 +                               goto exit;
11887 +               }
11888 +               /* for the tree root, pcoord is still the invalid coord here */
11889 +               ret = allocate_znode(pos->lock.node, &pcoord, pos);
11890 +       }
11891 +exit:
11892 +       done_load_count(&pload);
11893 +       done_lh(&plock);
11894 +       return ret;
11895 +}
11896 +
11897 +/* Recursive step of alloc_pos_and_ancestors, above. First sets the preceder
11898 +   hint via set_preceder if pos->preceder.blk is still 0. Returns 0 at once if
11899 +   coord->node is flushprepped or @coord is not a leftmost unit. Otherwise,
11900 +   unless coord->node is the root, it write-locks the parent, checks for
11901 +   relocation and recurses upward; finally it allocates coord->node. */
11902 +static int alloc_one_ancestor(const coord_t *coord, flush_pos_t *pos)
11903 +{
11904 +       int ret = 0;
11905 +       lock_handle alock;      /* write lock on parent of coord->node */
11906 +       load_count aload;       /* passed to jnode_lock_parent_coord() */
11907 +       coord_t acoord;         /* set by jnode_lock_parent_coord() */
11908 +
11909 +       /* As we ascend at the left-edge of the region to flush, take this
11910 +          opportunity at the twig level to find our parent-first preceder
11911 +          unless we have already set it. */
11912 +       if (pos->preceder.blk == 0) {
11913 +               ret = set_preceder(coord, pos);
11914 +               if (ret != 0)
11915 +                       return ret;
11916 +       }
11917 +
11918 +       /* If the ancestor is clean or already allocated, or if the child is not
11919 +          a leftmost child, stop going up, even leaving coord->node not
11920 +          flushprepped. */
11921 +       if (znode_check_flushprepped(coord->node)
11922 +           || !coord_is_leftmost_unit(coord))
11923 +               return 0;
11924 +
11925 +       init_lh(&alock);
11926 +       init_load_count(&aload);
11927 +       coord_init_invalid(&acoord, NULL);
11928 +
11929 +       /* Only ascend to the next level if it is a leftmost child, but
11930 +          write-lock the parent in case we will relocate the child. */
11931 +       if (!znode_is_root(coord->node)) {
11932 +
11933 +               ret =
11934 +                   jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11935 +                                           &alock, &aload, ZNODE_WRITE_LOCK,
11936 +                                           0);
11937 +               if (ret != 0) {
11938 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
11939 +                       goto exit;
11940 +               }
11941 +
11942 +               ret =
11943 +                   reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11944 +                                                       &acoord, pos);
11945 +               if (ret != 0)
11946 +                       goto exit;
11947 +
11948 +               /* Recursive call, skipped if the parent is flushprepped. */
11949 +               if (!znode_check_flushprepped(acoord.node)) {
11950 +                       ret = alloc_one_ancestor(&acoord, pos);
11951 +                       if (ret)
11952 +                               goto exit;
11953 +               }
11954 +       }
11955 +
11956 +       /* Note: we call allocate with the parent write-locked (except at the
11957 +          root, where acoord stays invalid) in case we relocate the child, in
11958 +          which case it will modify the parent during this call. */
11959 +       ret = allocate_znode(coord->node, &acoord, pos);
11960 +
11961 +exit:
11962 +       done_load_count(&aload);
11963 +       done_lh(&alock);
11964 +       return ret;
11965 +}
11966 +
11967 +/* During the reverse parent-first alloc_pos_and_ancestors process described
11968 +   above there is a call to this function at the twig level. During
11969 +   alloc_pos_and_ancestors we may ask: should this node be relocated (in reverse
11970 +   parent-first context)?  We repeat this process as long as the child is the
11971 +   leftmost child, eventually reaching an ancestor of the flush point that is
11972 +   not a leftmost child. The preceder of that ancestor, which is not a leftmost
11973 +   child, is actually on the leaf level. The preceder of that block is the
11974 +   left-neighbor of the flush point. The preceder of that block is the rightmost
11975 +   child of the twig on the left. So, when alloc_pos_and_ancestors passes upward
11976 +   through the twig level, it stops momentarily to remember the block of the
11977 +   rightmost child of the twig on the left and stores it in the flush
11978 +   position's preceder hint (pos->preceder.blk).
11979 +
11980 +   There is one other place where we may set the flush position's preceder hint,
11981 +   which is during scan-left.
11982 +*/
11983 +static int set_preceder(const coord_t *coord_in, flush_pos_t *pos)
11984 +{
11985 +       int ret;
11986 +       coord_t coord;          /* working copy of @coord_in */
11987 +       lock_handle left_lock;  /* read lock on left neighbor of coord.node */
11988 +       load_count left_load;   /* load count taken on left_lock.node */
11989 +
11990 +       coord_dup(&coord, coord_in);
11991 +
11992 +       init_lh(&left_lock);
11993 +       init_load_count(&left_load);
11994 +
11995 +       /* FIXME(B): Same FIXME as in "Find the preceder" in
11996 +          reverse_relocate_test. coord_is_leftmost_unit is not the right test
11997 +          if the unformatted child is in the middle of the first extent unit.*/
11998 +       if (!coord_is_leftmost_unit(&coord)) {
11999 +               coord_prev_unit(&coord);
12000 +       } else {
12001 +               ret =
12002 +                   reiser4_get_left_neighbor(&left_lock, coord.node,
12003 +                                             ZNODE_READ_LOCK, GN_SAME_ATOM);
12004 +               if (ret) {
12005 +                       /* The preceder is only a hint, so the lookup failures
12006 +                          listed below are not errors: return 0 and leave the
12007 +                          hint unset. Any other error goes to the caller. */
12008 +                       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12009 +                           ret == -ENOENT || ret == -EINVAL
12010 +                           || ret == -E_DEADLOCK)
12011 +                               ret = 0;
12012 +                       goto exit;
12013 +               }
12014 +
12015 +               ret = incr_load_count_znode(&left_load, left_lock.node);
12016 +               if (ret)
12017 +                       goto exit;
12018 +
12019 +               coord_init_last_unit(&coord, left_lock.node);
12020 +       }
12021 +       /* coord is now the unit preceding @coord_in; take its rightmost child */
12022 +       ret =
12023 +           item_utmost_child_real_block(&coord, RIGHT_SIDE,
12024 +                                        &pos->preceder.blk);
12025 +exit:
12026 +       check_preceder(pos->preceder.blk);
12027 +       done_load_count(&left_load);
12028 +       done_lh(&left_lock);
12029 +       return ret;
12030 +}
12031 +
12032 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12033 +
12034 +/* This procedure implements the outer loop of the flush algorithm. To put this
12035 +   in context, here is the general list of steps taken by the flush routine as a
12036 +   whole:
12037 +
12038 +   1. Scan-left
12039 +   2. Scan-right (maybe)
12040 +   3. Allocate initial flush position and its ancestors
12041 +   4. <handle extents>
12042 +   5. <squeeze and allocate the next position and its ancestors
12043 +       to-the-right, then update position to-the-right>
12044 +   6. <repeat from #4 until flush is stopped>
12045 +
12046 +   This procedure implements the loop in steps 4 through 6 in the above listing.
12047 +
12048 +   Step 4: if the current flush position is an extent item (position on the twig
12049 +   level), it allocates the extent (allocate_extent_item_in_place) then shifts
12050 +   to the next coordinate. If the next coordinate's leftmost child needs
12051 +   flushprep, we will continue. If the next coordinate is an internal item, we
12052 +   descend back to the leaf level, otherwise we repeat step #4 (labeled
12053 +   ALLOC_EXTENTS below). If the "next coordinate" brings us past the end of the
12054 +   twig level, then we call reverse_relocate_end_of_twig to possibly dirty the
12055 +   next (right) twig, prior to step #5 which moves to the right.
12056 +
12057 +   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up
12058 +   the tree to allocate any ancestors of the next-right flush position that are
12059 +   not also ancestors of the current position. Those ancestors (in top-down
12060 +   order) are the next in parent-first order. We squeeze adjacent nodes on the
12061 +   way up until the right node and current node share the same parent, then
12062 +   allocate on the way back down. Finally, this step sets the flush position to
12063 +   the next-right node.  Then repeat steps 4 and 5.
12064 +*/
12065 +
12066 +/* SQUEEZE CODE */
12067 +
12068 +/* squeeze_right_twig() helper: cut the content of to->node from its first
12069 +   unit up to coord @to (key @to_key) via cut_node_content. @left is unused. */
12070 +static int squalloc_right_twig_cut(coord_t *to, reiser4_key * to_key,
12071 +                                  znode * left)
12072 +{
12073 +       coord_t from;           /* first unit of to->node */
12074 +       reiser4_key from_key;   /* item key at @from */
12075 +
12076 +       coord_init_first_unit(&from, to->node);
12077 +       item_key_by_coord(&from, &from_key);
12078 +
12079 +       return cut_node_content(&from, to, &from_key, to_key, NULL);
12080 +}
12081 +
12082 +/* Copy as many of the leading extents as possible from @right to @left,
12083 +   allocating unallocated extents as they are copied and cutting each copied
12084 +   range out of @right.  If @right then starts with an internal item, one unit
12085 +   is shifted with shift_one_internal_unit.  Returns a negative error,
12086 +   SQUEEZE_TARGET_FULL, SQUEEZE_SOURCE_EMPTY or SUBTREE_MOVED. */
12087 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t *pos)
12088 +{
12089 +       int ret = SUBTREE_MOVED;
12090 +       coord_t coord;          /* used to iterate over items */
12091 +       reiser4_key stop_key;
12092 +
12093 +       assert("jmacd-2008", !node_is_empty(right));
12094 +       coord_init_first_unit(&coord, right);
12095 +
12096 +       /* FIXME: can be optimized to cut once */
12097 +       while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12098 +               ON_DEBUG(void *vp);
12099 +
12100 +               assert("vs-1468", coord_is_leftmost_unit(&coord));
12101 +               ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12102 +
12103 +               /* stop_key is used to find what was copied and what to cut */
12104 +               stop_key = *reiser4_min_key();
12105 +               ret = squalloc_extent(left, &coord, pos, &stop_key);
12106 +               if (ret != SQUEEZE_CONTINUE) {
12107 +                       ON_DEBUG(kfree(vp));
12108 +                       break;
12109 +               }
12110 +               assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
12111 +
12112 +               /* Cut the copied range: up to the offset just before stop_key. */
12113 +               set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12114 +               check_me("vs-1466",
12115 +                        squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12116 +
12117 +               ON_DEBUG(shift_check(vp, left, coord.node));
12118 +       }
12119 +       /* An emptied source overrides whatever squalloc_extent returned. */
12120 +       if (node_is_empty(coord.node))
12121 +               ret = SQUEEZE_SOURCE_EMPTY;
12122 +
12123 +       if (ret == SQUEEZE_TARGET_FULL)
12124 +               goto out;
12125 +
12126 +       if (node_is_empty(right)) {
12127 +               /* The whole right node was copied into @left. */
12128 +               assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12129 +               goto out;
12130 +       }
12131 +
12132 +       coord_init_first_unit(&coord, right);
12133 +
12134 +       if (!item_is_internal(&coord)) {
12135 +               /* we do not want to squeeze anything else to left neighbor
12136 +                  because "slum" is over */
12137 +               ret = SQUEEZE_TARGET_FULL;
12138 +               goto out;
12139 +       }
12140 +       assert("jmacd-433", item_is_internal(&coord));
12141 +
12142 +       /* Shift an internal unit.  The child must be allocated before shifting
12143 +          any more extents, so we stop here. */
12144 +       ret = shift_one_internal_unit(left, right);
12145 +
12146 +out:
12147 +       assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12148 +              || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12149 +
12150 +       if (ret == SQUEEZE_TARGET_FULL) {
12151 +               /* We submit prepped nodes here and expect that this @left twig
12152 +                * will not be modified again during this jnode_flush() call. */
12153 +               int ret1;
12154 +
12155 +               /* NOTE: seems like io is done under long term locks. */
12156 +               ret1 = write_prepped_nodes(pos);
12157 +               if (ret1 < 0)
12158 +                       return ret1;    /* write error replaces TARGET_FULL */
12159 +       }
12160 +
12161 +       return ret;
12162 +}
12163 +
12164 +#if REISER4_DEBUG      /* debug-only check of pos, called from convert_node() */
12165 +static void item_convert_invariant(flush_pos_t *pos)
12166 +{
12167 +       assert("edward-1225", coord_is_existing_item(&pos->coord));
12168 +       if (chaining_data_present(pos)) {
12169 +               item_plugin *iplug = item_convert_plug(pos);
12170 +               /* chained convert plugin must be the plugin of the item at pos */
12171 +               assert("edward-1000",
12172 +                      iplug == item_plugin_by_coord(&pos->coord));
12173 +               assert("edward-1001", iplug->f.convert != NULL);
12174 +       } else
12175 +               assert("edward-1226", pos->child == NULL);
12176 +}
12177 +#else  /* !REISER4_DEBUG */
12178 +/* Without REISER4_DEBUG the invariant check expands to noop. */
12179 +#define item_convert_invariant(pos) noop
12180 +
12181 +#endif /* REISER4_DEBUG */
12182 +
12183 +/* Scan the items of leaf @node starting from the first one and call, for each
12184 +   item, its flush ->convert() method (if any) with pos->coord set on it. This
12185 +   method may resize/kill the item so the tree will be changed. Nodes above
12186 +   the leaf level are unsupported: they are left untouched and 0 is returned. */
12187 +static int convert_node(flush_pos_t *pos, znode * node)
12188 +{
12189 +       int ret = 0;
12190 +       item_plugin *iplug;
12191 +
12192 +       assert("edward-304", pos != NULL);
12193 +       assert("edward-305", pos->child == NULL);
12194 +       assert("edward-475", znode_convertible(node));
12195 +       assert("edward-669", znode_is_wlocked(node));
12196 +       assert("edward-1210", !node_is_empty(node));
12197 +
12198 +       if (znode_get_level(node) != LEAF_LEVEL)
12199 +               /* unsupported */
12200 +               goto exit;
12201 +
12202 +       coord_init_first_unit(&pos->coord, node);
12203 +
12204 +       while (1) {
12205 +               ret = 0;
12206 +               coord_set_to_left(&pos->coord);
12207 +               item_convert_invariant(pos);
12208 +
12209 +               iplug = item_plugin_by_coord(&pos->coord);
12210 +               assert("edward-844", iplug != NULL);
12211 +
12212 +               if (iplug->f.convert) {
12213 +                       ret = iplug->f.convert(pos);
12214 +                       if (ret)
12215 +                               goto exit;
12216 +               }
12217 +               assert("edward-307", pos->child == NULL);
12218 +
12219 +               if (coord_next_item(&pos->coord)) {
12220 +                       /* node is over */
12221 +
12222 +                       if (!chaining_data_present(pos))
12223 +                               /* finished this node */
12224 +                               break;
12225 +                       if (should_chain_next_node(pos)) {
12226 +                               /* go to next node */
12227 +                               move_chaining_data(pos, 0/* to next node */);
12228 +                               break;
12229 +                       }
12230 +                       /* repeat this node */
12231 +                       move_chaining_data(pos, 1/* this node */);
12232 +                       continue;
12233 +               }
12234 +               /* Node is not over.
12235 +                  Check if there is attached convert data.
12236 +                  If so roll one item position back and repeat
12237 +                  on this node
12238 +                */
12239 +               if (chaining_data_present(pos)) {
12240 +                       /* plugin changed at the next item: reset the count */
12241 +                       if (iplug != item_plugin_by_coord(&pos->coord))
12242 +                               set_item_convert_count(pos, 0);
12243 +
12244 +                       ret = coord_prev_item(&pos->coord);
12245 +                       assert("edward-1003", !ret);
12246 +
12247 +                       move_chaining_data(pos, 1/* this node */);
12248 +               }
12249 +       }
12250 +       JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12251 +       znode_make_dirty(node);
12252 +exit:  /* NOTE(review): a failed ->convert() arrives here with ret != 0 */
12253 +       assert("edward-1004", !ret);
12254 +       return ret;
12255 +}
12256 +
12257 +/* Squeeze and allocate the right neighbor.  This is called after @left and
12258 +   its current children have been squeezed and allocated already.  This
12259 +   procedure's job is to squeeze items from @right to @left.
12260 +
12261 +   If at the leaf level, use the shift_everything_left memcpy-optimized
12262 +   version of shifting (done through squeeze_right_non_twig).
12263 +
12264 +   If at the twig level, extents are allocated as they are shifted from @right
12265 +   to @left (squeeze_right_twig).
12266 +
12267 +   At any other level, shift one internal item and return to the caller so
12268 +   that the shifted-subtree can be processed in parent-first order (this also
12269 +   goes through squeeze_right_non_twig).
12270 +
12271 +   When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED
12272 +   is returned.  When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY
12273 +   is returned.  If nothing can be moved into @left anymore,
12274 +   SQUEEZE_TARGET_FULL is returned.  Negative values are errors.
12275 +*/
12276 +
12277 +static int squeeze_right_neighbor(flush_pos_t *pos, znode * left,
12278 +                                 znode * right)
12279 +{
12280 +       int ret;
12281 +
12282 +       /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12283 +        * tree owing to error (for example, ENOSPC) in write */
12284 +       /* assert("jmacd-9321", !node_is_empty(left)); */
12285 +       assert("jmacd-9322", !node_is_empty(right));
12286 +       assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12287 +
12288 +       switch (znode_get_level(left)) {
12289 +       case TWIG_LEVEL:
12290 +               /* Shift with extent allocating until either an internal item
12291 +                  is encountered or everything is shifted or no free space
12292 +                  left in @left */
12293 +               ret = squeeze_right_twig(left, right, pos);
12294 +               break;
12295 +
12296 +       default:
12297 +               /* All other levels can use shift_everything until we implement
12298 +                  per-item flush plugins. */
12299 +               ret = squeeze_right_non_twig(left, right);
12300 +               break;
12301 +       }
12302 +
12303 +       assert("jmacd-2011", (ret < 0 ||
12304 +                             ret == SQUEEZE_SOURCE_EMPTY
12305 +                             || ret == SQUEEZE_TARGET_FULL
12306 +                             || ret == SUBTREE_MOVED));
12307 +       return ret;
12308 +}
12309 +
12310 +static int squeeze_right_twig_and_advance_coord(flush_pos_t *pos,
12311 +                                               znode * right)
12312 +{
12313 +       int ret;
12314 +       /* Squeeze @right into pos->lock.node, then reposition pos->coord. */
12315 +       ret = squeeze_right_twig(pos->lock.node, right, pos);
12316 +       if (ret < 0)    /* error: pos->coord is not touched */
12317 +               return ret;
12318 +       if (ret > 0) {  /* positive status: coord goes after the last item */
12319 +               coord_init_after_last_item(&pos->coord, pos->lock.node);
12320 +               return ret;
12321 +       }
12322 +       /* ret == 0: put pos->coord on the last unit of pos->lock.node */
12323 +       coord_init_last_unit(&pos->coord, pos->lock.node);
12324 +       return 0;
12325 +}
12326 +
12327 +/* forward declaration: the inline wrapper below and this call each other */
12328 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12329 +
12330 +/* Fast path: return 0 when znode_same_parents(@left, @right) holds; otherwise
12331 + * return the result of squalloc_upper_levels(). */
12332 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t *pos,
12333 +                                                         znode * left,
12334 +                                                         znode * right)
12335 +{
12336 +       if (znode_same_parents(left, right))
12337 +               return 0;
12338 +
12339 +       return squalloc_upper_levels(pos, left, right);
12340 +}
12341 +
12342 +/* Check whether the parent of given @right node needs to be processed
12343 +   ((re)allocated) prior to processing of the child.  If @left and @right do not
12344 +   share a parent, the parent of @right is after @left but before @right in
12345 +   parent-first order, so we have to (re)allocate it before @right gets
12346 +   (re)allocated. */
12347 +static int squalloc_upper_levels(flush_pos_t *pos, znode * left, znode * right)
12348 +{
12349 +       int ret;
12350 +
12351 +       lock_handle left_parent_lock;
12352 +       lock_handle right_parent_lock;
12353 +
12354 +       load_count left_parent_load;
12355 +       load_count right_parent_load;
12356 +
12357 +       init_lh(&left_parent_lock);
12358 +       init_lh(&right_parent_lock);
12359 +
12360 +       init_load_count(&left_parent_load);
12361 +       init_load_count(&right_parent_load);
12362 +
12363 +       ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12364 +       if (ret)
12365 +               goto out;
12366 +
12367 +       ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12368 +       if (ret)
12369 +               goto out;
12370 +
12371 +       /* Check for same parents (ret is 0 here, so 0 is returned) */
12372 +       if (left_parent_lock.node == right_parent_lock.node)
12373 +               goto out;
12374 +
12375 +       if (znode_check_flushprepped(right_parent_lock.node)) {
12376 +               /* Keep parent-first order.  In that order, the right parent
12377 +                  stands before the @right node.  If it is already allocated,
12378 +                  we set the preceder (next block search start point) to its
12379 +                  block number, @right node should be allocated after it.
12380 +
12381 +                  However, preceder is set only if the right parent is on twig
12382 +                  level. The explanation is the following: new branch nodes are
12383 +                  allocated over already allocated children while the tree
12384 +                  grows, it is difficult to keep tree ordered, we assume that
12385 +                  only leaves and twigs are correctly allocated. So, only
12386 +                  twigs are used as a preceder for allocating of the rest of
12387 +                  the slum. */
12388 +               if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12389 +                       pos->preceder.blk =
12390 +                           *znode_get_block(right_parent_lock.node);
12391 +                       check_preceder(pos->preceder.blk);
12392 +               }
12393 +               goto out;
12394 +       }
12395 +
12396 +       ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12397 +       if (ret)
12398 +               goto out;
12399 +
12400 +       ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12401 +       if (ret)
12402 +               goto out;
12403 +
12404 +       ret =
12405 +           squeeze_right_neighbor(pos, left_parent_lock.node,
12406 +                                  right_parent_lock.node);
12407 +       /* Stop on error (ret < 0). Also stop if some items/units were shifted
12408 +        * (ret == 0): @right has then changed its parent, so (presumably)
12409 +        * right_parent need not be processed prior to processing of @right.
12410 +        * Positive return values say that no shifting happened because of
12411 +        * "empty source" or "target full" conditions. */
12412 +       if (ret <= 0)
12413 +               goto out;
12414 +
12415 +       /* parent(@left) and parent(@right) may have different parents also. We
12416 +        * do a recursive call for checking that. */
12417 +       ret =
12418 +           check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12419 +                                                   right_parent_lock.node);
12420 +       if (ret)
12421 +               goto out;
12422 +
12423 +       /* allocate znode when going down */
12424 +       ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12425 +
12426 +out:
12427 +       done_load_count(&left_parent_load);
12428 +       done_load_count(&right_parent_load);
12429 +
12430 +       done_lh(&left_parent_lock);
12431 +       done_lh(&right_parent_lock);
12432 +
12433 +       return ret;
12434 +}
12435 +
12436 +/* Return the "flushprepped" status of the leftmost child of the unit at
12437 + * @coord (1 if no child was returned), or the lookup's non-zero result.  */
12438 +static int leftmost_child_of_unit_check_flushprepped(const coord_t *coord)
12439 +{
12440 +       int ret;
12441 +       int prepped;
12442 +
12443 +       jnode *child;
12444 +
12445 +       ret = get_leftmost_child_of_unit(coord, &child);
12446 +       /* NOTE(review): callers must tell this error apart from "prepped" */
12447 +       if (ret)
12448 +               return ret;
12449 +
12450 +       if (child) {
12451 +               prepped = jnode_check_flushprepped(child);
12452 +               jput(child);    /* drop the child handed back by the lookup */
12453 +       } else {
12454 +               /* We consider not existing child as a node which slum
12455 +                  processing should not continue to.  Not cached node is clean,
12456 +                  so it is flushprepped. */
12457 +               prepped = 1;
12458 +       }
12459 +
12460 +       return prepped;
12461 +}
12462 +
12463 +/* (Re)allocate @node via allocate_znode() after write-locking its parent. */
12464 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t *pos)
12465 +{
12466 +       int ret;
12467 +       lock_handle parent_lock;        /* write lock on the parent of @node */
12468 +       load_count parent_load;         /* load count on parent_lock.node */
12469 +       coord_t pcoord;                 /* set by find_child_ptr() below */
12470 +
12471 +       assert("zam-851", znode_is_write_locked(node));
12472 +
12473 +       init_lh(&parent_lock);
12474 +       init_load_count(&parent_load);
12475 +
12476 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12477 +       if (ret)
12478 +               goto out;
12479 +
12480 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
12481 +       if (ret)
12482 +               goto out;
12483 +
12484 +       ret = find_child_ptr(parent_lock.node, node, &pcoord);
12485 +       if (ret)
12486 +               goto out;
12487 +
12488 +       ret = allocate_znode(node, &pcoord, pos);
12489 +
12490 +out:   /* parent load count and lock are dropped on every path */
12491 +       done_load_count(&parent_load);
12492 +       done_lh(&parent_lock);
12493 +       return ret;
12494 +}
12495 +
12496 +/* Convert, squeeze and allocate the right neighbors of pos->lock.node one by
12497 + * one, moving pos to the right, until the slum ends, pos stops or an error. */
12498 +static int handle_pos_on_formatted(flush_pos_t *pos)
12499 +{
12500 +       int ret;
12501 +       lock_handle right_lock; /* write lock on the current right neighbor */
12502 +       load_count right_load;  /* load count taken on right_lock.node */
12503 +
12504 +       init_lh(&right_lock);
12505 +       init_load_count(&right_load);
12506 +
12507 +       if (should_convert_node(pos, pos->lock.node)) {
12508 +               ret = convert_node(pos, pos->lock.node);
12509 +               if (ret)
12510 +                       return ret;     /* right_lock is not taken yet */
12511 +       }
12512 +
12513 +       while (1) {
12514 +               int expected;
12515 +               expected = should_convert_next_node(pos);
12516 +               ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12517 +                                      ZNODE_WRITE_LOCK, !expected, expected);
12518 +               if (ret) {
12519 +                       if (expected)
12520 +                               warning("edward-1495",
12521 +                               "Expected neighbor not found (ret = %d). Fsck?",
12522 +                                       ret);
12523 +                       break;
12524 +               }
12525 +
12526 +               /* we don't prep(allocate) nodes for flushing twice. This can be
12527 +                * suboptimal, or it can be optimal. For now we choose to live
12528 +                * with the risk that it will be suboptimal because it would be
12529 +                * quite complex to code it to be smarter. */
12530 +               if (znode_check_flushprepped(right_lock.node)
12531 +                   && !znode_convertible(right_lock.node)) {
12532 +                       assert("edward-1005", !should_convert_next_node(pos));
12533 +                       pos_stop(pos);
12534 +                       break;
12535 +               }
12536 +
12537 +               ret = incr_load_count_znode(&right_load, right_lock.node);
12538 +               if (ret)
12539 +                       break;
12540 +               if (should_convert_node(pos, right_lock.node)) {
12541 +                       ret = convert_node(pos, right_lock.node);
12542 +                       if (ret)
12543 +                               break;
12544 +                       if (node_is_empty(right_lock.node)) {
12545 +                               /* node became empty after converting, repeat */
12546 +                               done_load_count(&right_load);
12547 +                               done_lh(&right_lock);
12548 +                               continue;
12549 +                       }
12550 +               }
12551 +
12552 +               /* squeeze _before_ going upward. */
12553 +               ret =
12554 +                   squeeze_right_neighbor(pos, pos->lock.node,
12555 +                                          right_lock.node);
12556 +               if (ret < 0)
12557 +                       break;
12558 +
12559 +               if (znode_check_flushprepped(right_lock.node)) {
12560 +                       if (should_convert_next_node(pos)) {
12561 +                               /* in spite of flushprepped status of the node,
12562 +                                  its right slum neighbor should be converted*/
12563 +                               assert("edward-953", convert_data(pos));
12564 +                               assert("edward-954", item_convert_data(pos));
12565 +
12566 +                               if (node_is_empty(right_lock.node)) {
12567 +                                       done_load_count(&right_load);
12568 +                                       done_lh(&right_lock);
12569 +                               } else
12570 +                                       move_flush_pos(pos, &right_lock,
12571 +                                                      &right_load, NULL);
12572 +                               continue;
12573 +                       }
12574 +                       pos_stop(pos);
12575 +                       break;
12576 +               }
12577 +
12578 +               if (node_is_empty(right_lock.node)) {
12579 +                       /* repeat if right node was squeezed completely */
12580 +                       done_load_count(&right_load);
12581 +                       done_lh(&right_lock);
12582 +                       continue;
12583 +               }
12584 +
12585 +               /* parent(right_lock.node) has to be processed before
12586 +                * (right_lock.node) due to "parent-first" allocation order. */
12587 +               ret =
12588 +                   check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12589 +                                                           right_lock.node);
12590 +               if (ret)
12591 +                       break;
12592 +               /* (re)allocate _after_ going upward */
12593 +               ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12594 +               if (ret)
12595 +                       break;
12596 +               if (should_terminate_squalloc(pos)) {
12597 +                       set_item_convert_count(pos, 0);
12598 +                       break;
12599 +               }
12600 +
12601 +               /* advance the flush position to the right neighbor */
12602 +               move_flush_pos(pos, &right_lock, &right_load, NULL);
12603 +
12604 +               ret = rapid_flush(pos);
12605 +               if (ret)
12606 +                       break;
12607 +       }
12608 +       check_convert_info(pos);
12609 +       done_load_count(&right_load);
12610 +       done_lh(&right_lock);
12611 +
12612 +       /* This function indicates via pos whether to stop or go to twig or
12613 +        * continue on current level. ret is the last status set in the loop. */
12614 +       return ret;
12615 +
12616 +}
12617 +
12618 +/* Process formatted nodes on the leaf level via handle_pos_on_formatted().
12619 + * -E_NO_NEIGHBOR becomes a POS_TO_TWIG transition; other codes pass through. */
12620 +static int handle_pos_on_leaf(flush_pos_t *pos)
12621 +{
12622 +       int ret;
12623 +
12624 +       assert("zam-845", pos->state == POS_ON_LEAF);
12625 +
12626 +       ret = handle_pos_on_formatted(pos);
12627 +
12628 +       if (ret == -E_NO_NEIGHBOR) {
12629 +               /* no right neighbor: not an error here, go process extents. */
12630 +               pos->state = POS_TO_TWIG;
12631 +               return 0;
12632 +       }
12633 +
12634 +       return ret;
12635 +}
12636 +
12637 +/* Process slum on level > 1; simply delegates to handle_pos_on_formatted() */
12638 +static int handle_pos_on_internal(flush_pos_t *pos)
12639 +{
12640 +       assert("zam-850", pos->state == POS_ON_INTERNAL);
12641 +       return handle_pos_on_formatted(pos);
12642 +}
12643 +
12644 +/* Return nonzero if squalloc should stop before the extent at pos->coord */
12645 +static int squalloc_extent_should_stop(flush_pos_t *pos)
12646 +{
12647 +       assert("zam-869", item_is_extent(&pos->coord));
12648 +
12649 +       /* If set, pos->child is the jnode to start with instead of the first
12650 +        * child of the extent unit; it is consumed (jput, reset to NULL). */
12651 +       if (pos->child) {
12652 +               int prepped;
12653 +
12654 +               assert("vs-1383", jnode_is_unformatted(pos->child));
12655 +               prepped = jnode_check_flushprepped(pos->child);
12656 +               pos->pos_in_unit =
12657 +                   jnode_get_index(pos->child) -
12658 +                   extent_unit_index(&pos->coord);
12659 +               assert("vs-1470",
12660 +                      pos->pos_in_unit < extent_unit_width(&pos->coord));
12661 +               assert("nikita-3434",
12662 +                      ergo(extent_is_unallocated(&pos->coord),
12663 +                           pos->pos_in_unit == 0));
12664 +               jput(pos->child);
12665 +               pos->child = NULL;
12666 +
12667 +               return prepped;
12668 +       }
12669 +
12670 +       pos->pos_in_unit = 0;
12671 +       if (extent_is_unallocated(&pos->coord))
12672 +               return 0;       /* unallocated extent: never stop here */
12673 +
12674 +       return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12675 +}
12676 +
12677 +/* Handle the case when the regular reiser4 tree (znodes connected to their
12678 + * neighbors by sibling pointers) is interrupted on the leaf level by one or
12679 + * more unformatted nodes.  By holding a lock on the twig level and using the
12680 + * extent code to process unformatted nodes we swim around an irregular part of
12681 + * the reiser4 tree. */
12682 +static int handle_pos_on_twig(flush_pos_t *pos)
12683 +{
12684 +       int ret;
12685 +
12686 +       assert("zam-844", pos->state == POS_ON_EPOINT);
12687 +       assert("zam-843", item_is_extent(&pos->coord));
12688 +
12689 +       /* Decide whether to continue slum processing with the current extent
12690 +          unit: if the leftmost child of the current extent unit is
12691 +          flushprepped (i.e. clean or already processed by flush) we stop
12692 +          squalloc().  There is a fast check for unallocated extents, which
12693 +          we assume contain only not flushprepped nodes. */
12694 +       /* FIXME: Here we implement a simple check, we are only looking at the
12695 +          leftmost child. */
12696 +       ret = squalloc_extent_should_stop(pos);
12697 +       if (ret != 0) {
12698 +               pos_stop(pos);
12699 +               return ret;
12700 +       }
12701 +
12702 +       while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12703 +              && item_is_extent(&pos->coord)) {
12704 +               ret = reiser4_alloc_extent(pos);
12705 +               if (ret)
12706 +                       break;  /* NOTE(review): ret is not returned below */
12707 +               coord_next_unit(&pos->coord);
12708 +       }
12709 +
12710 +       if (coord_is_after_rightmost(&pos->coord)) {
12711 +               pos->state = POS_END_OF_TWIG;
12712 +               return 0;
12713 +       }
12714 +       if (item_is_internal(&pos->coord)) {
12715 +               pos->state = POS_TO_LEAF;
12716 +               return 0;
12717 +       }
12718 +
12719 +       assert("zam-860", item_is_extent(&pos->coord));
12720 +
12721 +       /* "slum" is over */
12722 +       pos->state = POS_INVALID;
12723 +       return 0;
12724 +}
12725 +
12726 +/* When we are about to return the flush position from twig to leaf level we
12727 + * can either process the right twig node or move the position to the leaf.
12728 + * This processes the right twig if possible and jumps to leaf level if not. */
12729 +static int handle_pos_end_of_twig(flush_pos_t *pos)
12730 +{
12731 +       int ret;
12732 +       lock_handle right_lock;
12733 +       load_count right_load;
12734 +       coord_t at_right;
12735 +       jnode *child = NULL;
12736 +
12737 +       assert("zam-848", pos->state == POS_END_OF_TWIG);
12738 +       assert("zam-849", coord_is_after_rightmost(&pos->coord));
12739 +
12740 +       init_lh(&right_lock);
12741 +       init_load_count(&right_load);
12742 +
12743 +       /* We lock the right twig node even if it is not dirty, because the
12744 +        * slum continues or discontinues on leaf level, not on next twig. This
12745 +        * lock on the right twig is needed for getting its leftmost child. */
12746 +       ret =
12747 +           reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12748 +                                      ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12749 +       if (ret)
12750 +               goto out;
12751 +
12752 +       ret = incr_load_count_znode(&right_load, right_lock.node);
12753 +       if (ret)
12754 +               goto out;
12755 +
12756 +       /* right twig could be not dirty */
12757 +       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12758 +               /* If right twig node is dirty we always attempt to squeeze its
12759 +                * content to the left... */
12760 +became_dirty:
12761 +               ret =
12762 +                   squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12763 +               if (ret <= 0) {
12764 +                       /* pos->coord is on internal item, go to leaf level, or
12765 +                        * we have an error which will be caught in squalloc()
12766 +                        */
12767 +                       pos->state = POS_TO_LEAF;
12768 +                       goto out;
12769 +               }
12770 +
12771 +               /* If right twig was squeezed completely we have to re-lock
12772 +                * right twig. Now it is done through the top-level squalloc
12773 +                * routine. */
12774 +               if (node_is_empty(right_lock.node))
12775 +                       goto out;
12776 +
12777 +               /* ... and prep it if it is not yet prepped */
12778 +               if (!znode_check_flushprepped(right_lock.node)) {
12779 +                       /* As usual, process parent before ... */
12780 +                       ret =
12781 +                           check_parents_and_squalloc_upper_levels(pos,
12782 +                                                                   pos->lock.
12783 +                                                                   node,
12784 +                                                                   right_lock.
12785 +                                                                   node);
12786 +                       if (ret)
12787 +                               goto out;
12788 +
12789 +                       /* ... processing the child */
12790 +                       ret =
12791 +                           lock_parent_and_allocate_znode(right_lock.node,
12792 +                                                          pos);
12793 +                       if (ret)
12794 +                               goto out;
12795 +               }
12796 +       } else {
12797 +               coord_init_first_unit(&at_right, right_lock.node);
12798 +
12799 +               /* check first child of next twig, should we continue there ? */
12800 +               ret = get_leftmost_child_of_unit(&at_right, &child);
12801 +               if (ret || child == NULL || jnode_check_flushprepped(child)) {
12802 +                       pos_stop(pos);
12803 +                       goto out;
12804 +               }
12805 +
12806 +               /* check clean twig for possible relocation */
12807 +               if (!znode_check_flushprepped(right_lock.node)) {
12808 +                       ret =
12809 +                           reverse_relocate_check_dirty_parent(child,
12810 +                                                               &at_right, pos);
12811 +                       if (ret)
12812 +                               goto out;
12813 +                       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12814 +                               goto became_dirty;
12815 +               }
12816 +       }
12817 +
12818 +       assert("zam-875", znode_check_flushprepped(right_lock.node));
12819 +
12820 +       /* Update the preceder by a block number of just processed right twig
12821 +        * node. The code above could miss the preceder updating because
12822 +        * allocate_znode() could not be called for this node. */
12823 +       pos->preceder.blk = *znode_get_block(right_lock.node);
12824 +       check_preceder(pos->preceder.blk);
12825 +
12826 +       coord_init_first_unit(&at_right, right_lock.node);
12827 +       assert("zam-868", coord_is_existing_unit(&at_right));
12828 +
12829 +       pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12830 +       move_flush_pos(pos, &right_lock, &right_load, &at_right);
12831 +
12832 +out:
12833 +       done_load_count(&right_load);
12834 +       done_lh(&right_lock);
12835 +
12836 +       if (child)
12837 +               jput(child);
12838 +
12839 +       return ret;
12840 +}
12841 +
12842 +/* Move pos->lock to the leaf node pointed to by pos->coord and check whether
12843 + * we should continue there. */
12844 +static int handle_pos_to_leaf(flush_pos_t *pos)
12845 +{
12846 +       int ret;
12847 +       lock_handle child_lock;
12848 +       load_count child_load;
12849 +       jnode *child;
12850 +
12851 +       assert("zam-846", pos->state == POS_TO_LEAF);
12852 +       assert("zam-847", item_is_internal(&pos->coord));
12853 +
12854 +       init_lh(&child_lock);
12855 +       init_load_count(&child_load);
12856 +
12857 +       ret = get_leftmost_child_of_unit(&pos->coord, &child);
12858 +       if (ret)
12859 +               return ret;     /* returns without the cleanup at out: */
12860 +       if (child == NULL) {
12861 +               pos_stop(pos);
12862 +               return 0;
12863 +       }
12864 +
12865 +       if (jnode_check_flushprepped(child)) {
12866 +               pos->state = POS_INVALID;       /* already prepped: stop */
12867 +               goto out;
12868 +       }
12869 +
12870 +       ret =
12871 +           longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12872 +                               ZNODE_LOCK_LOPRI);
12873 +       if (ret)
12874 +               goto out;
12875 +
12876 +       ret = incr_load_count_znode(&child_load, JZNODE(child));
12877 +       if (ret)
12878 +               goto out;
12879 +
12880 +       ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12881 +       if (ret)
12882 +               goto out;
12883 +
12884 +       /* move flush position to leaf level */
12885 +       pos->state = POS_ON_LEAF;
12886 +       move_flush_pos(pos, &child_lock, &child_load, NULL);
12887 +
12888 +       if (node_is_empty(JZNODE(child))) {
12889 +               ret = delete_empty_node(JZNODE(child));
12890 +               pos->state = POS_INVALID;
12891 +       }
12892 +out:
12893 +       done_load_count(&child_load);
12894 +       done_lh(&child_lock);
12895 +       jput(child);
12896 +
12897 +       return ret;
12898 +}
12899 +
12900 +/* Move pos and its lock from the leaf to its parent (twig).  The next state
12901 + * is chosen by the item following the leaf's pointer in the parent. */
12902 +static int handle_pos_to_twig(flush_pos_t *pos)
12903 +{
12904 +       int ret;
12905 +
12906 +       lock_handle parent_lock;
12907 +       load_count parent_load;
12908 +       coord_t pcoord;
12909 +
12910 +       assert("zam-852", pos->state == POS_TO_TWIG);
12911 +
12912 +       init_lh(&parent_lock);
12913 +       init_load_count(&parent_load);
12914 +
12915 +       ret =
12916 +           reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12917 +       if (ret)
12918 +               goto out;
12919 +
12920 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
12921 +       if (ret)
12922 +               goto out;
12923 +
12924 +       ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12925 +       if (ret)
12926 +               goto out;
12927 +
12928 +       assert("zam-870", item_is_internal(&pcoord));
12929 +       coord_next_item(&pcoord);
12930 +
12931 +       if (coord_is_after_rightmost(&pcoord))
12932 +               pos->state = POS_END_OF_TWIG;
12933 +       else if (item_is_extent(&pcoord))
12934 +               pos->state = POS_ON_EPOINT;
12935 +       else {
12936 +               /* Here we understand that getting -E_NO_NEIGHBOR in
12937 +                * handle_pos_on_leaf() was just because the edge of the slum
12938 +                * was reached */
12939 +               pos_stop(pos);
12940 +               goto out;
12941 +       }
12942 +
12943 +       move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12944 +
12945 +out:
12946 +       done_load_count(&parent_load);
12947 +       done_lh(&parent_lock);
12948 +
12949 +       return ret;
12950 +}
12951 +
12952 +typedef int (*pos_state_handle_t) (flush_pos_t *);
12953 +static pos_state_handle_t flush_pos_handlers[] = {
12954 +       /* process formatted nodes on leaf level, keep lock on a leaf node */
12955 +       [POS_ON_LEAF] = handle_pos_on_leaf,
12956 +       /* process unformatted nodes, keep lock on twig node, pos->coord points
12957 +        * to the extent currently being processed */
12958 +       [POS_ON_EPOINT] = handle_pos_on_twig,
12959 +       /* move the lock from a leaf node to its parent for further processing
12960 +          of unformatted nodes */
12961 +       [POS_TO_TWIG] = handle_pos_to_twig,
12962 +       /* move the lock from twig to leaf level when processing of unformatted
12963 +        * nodes finishes, pos->coord is the internal item of the target leaf */
12964 +       [POS_TO_LEAF] = handle_pos_to_leaf,
12965 +       /* after processing the last extent in the twig node, attempt to shift
12966 +        * items from the twig's right neighbor, processing them on the way */
12967 +       [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12968 +       /* process formatted nodes on internal level, keep lock on an internal
12969 +          node */
12970 +       [POS_ON_INTERNAL] = handle_pos_on_internal
12971 +};
12972 +
12973 +/* Advance the flush position horizontally, preparing ((re)allocate, squeeze,
12974 + * encrypt) nodes and their ancestors for flushing in "parent-first" order */
12975 +static int squalloc(flush_pos_t *pos)
12976 +{
12977 +       int ret = 0;
12978 +
12979 +       /* maybe needs to be made a case statement with handle_pos_on_leaf as
12980 +        * first case, for greater CPU efficiency? Measure and see.... -Hans */
12981 +       while (pos_valid(pos)) {
12982 +               ret = flush_pos_handlers[pos->state] (pos);
12983 +               if (ret < 0)
12984 +                       break;  /* positive codes keep the loop running */
12985 +
12986 +               ret = rapid_flush(pos);
12987 +               if (ret)
12988 +                       break;
12989 +       }
12990 +
12991 +       /* any positive value and -E_NO_NEIGHBOR are legal return codes of the
12992 +          handle_pos* routines (-E_NO_NEIGHBOR means that the slum edge was
12993 +          reached), so both are reported to the caller as success */
12994 +       if (ret > 0 || ret == -E_NO_NEIGHBOR)
12995 +               ret = 0;
12996 +
12997 +       return ret;
12998 +}
12999 +
13000 +static void update_ldkey(znode * node)
13001 +{
13002 +       reiser4_key ldkey;
13003 +
13004 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13005 +       if (node_is_empty(node))
13006 +               return; /* no items: left delimiting key is left as is */
13007 +       /* set the left delimiting key to what leftmost_key_in_node() yields */
13008 +       znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13009 +}
13010 +
13011 +/* To be called after the node plugin's shift method moved data from @right to
13012 +   @left.  Sets the left delimiting keys of non-empty @left and @right from
13013 +   their leftmost keys; the right delimiting key of @left becomes the left
13014 +   delimiting key of @right, or the right one of @right if @right is empty */
13015 +static void update_znode_dkeys(znode * left, znode * right)
13016 +{
13017 +       assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13018 +       assert("vs-1629", (znode_is_write_locked(left) &&
13019 +                          znode_is_write_locked(right)));
13020 +
13021 +       /* we need to update the left delimiting key of left if it was empty
13022 +          before shift */
13023 +       update_ldkey(left);
13024 +       update_ldkey(right);
13025 +       if (node_is_empty(right))
13026 +               znode_set_rd_key(left, znode_get_rd_key(right));
13027 +       else
13028 +               znode_set_rd_key(left, znode_get_ld_key(right));
13029 +}
13030 +
13031 +/* Try to shift everything from @right to @left, with carry work put on @todo.
13032 +   If all was shifted @right is removed from the tree.  Result: bytes shifted. */
13033 +static int
13034 +shift_everything_left(znode * right, znode * left, carry_level * todo)
13035 +{
13036 +       coord_t from;
13037 +       node_plugin *nplug;
13038 +       carry_plugin_info info;
13039 +
13040 +       coord_init_after_last_item(&from, right);
13041 +
13042 +       nplug = node_plugin_by_node(right);
13043 +       info.doing = NULL;
13044 +       info.todo = todo;
13045 +       return nplug->shift(&from, left, SHIFT_LEFT,
13046 +                           1 /* delete @right if it becomes empty */ ,
13047 +                           1
13048 +                           /* move coord @from to node @left if everything
13049 +                              gets shifted */
13050 +                           ,
13051 +                           &info);
13052 +}
13053 +
13054 +/* Shift as much as possible from @right to @left using the memcpy-optimized
13055 +   shift_everything_left.  @left and @right are formatted neighboring nodes of
13056 +   the same level; nothing is shifted unless both of them are dirty. */
13057 +static int squeeze_right_non_twig(znode * left, znode * right)
13058 +{
13059 +       int ret;
13060 +       carry_pool *pool;
13061 +       carry_level *todo;
13062 +
13063 +       assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13064 +
13065 +       if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13066 +           !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13067 +               return SQUEEZE_TARGET_FULL;
13068 +
13069 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13070 +       if (IS_ERR(pool))
13071 +               return PTR_ERR(pool);
13072 +       todo = (carry_level *) (pool + 1);
13073 +       init_carry_level(todo, pool);
13074 +
13075 +       ret = shift_everything_left(right, left, todo);
13076 +       if (ret > 0) {
13077 +               /* something was shifted */
13078 +               reiser4_tree *tree;
13079 +               __u64 grabbed;
13080 +
13081 +               znode_make_dirty(left);
13082 +               znode_make_dirty(right);
13083 +
13084 +               /* update delimiting keys of the nodes which participated in
13085 +                  the shift. FIXME: it would be better to have this in the
13086 +                  node's shift operation. But it can not be done there. Nobody
13087 +                  remembers why, though */
13088 +               tree = znode_get_tree(left);
13089 +               write_lock_dk(tree);
13090 +               update_znode_dkeys(left, right);
13091 +               write_unlock_dk(tree);
13092 +
13093 +               /* Carry is called to update delimiting key and, maybe, to
13094 +                  remove empty node. */
13095 +               grabbed = get_current_context()->grabbed_blocks;
13096 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13097 +               assert("nikita-3003", ret == 0);        /* reserved space is
13098 +                                                       exhausted. Ask Hans. */
13099 +               ret = reiser4_carry(todo, NULL/* previous level */);
13100 +               grabbed2free_mark(grabbed);
13101 +       } else {
13102 +               /* Nothing was shifted, return the appropriate result code */
13103 +               ret =
13104 +                   node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13105 +                   SQUEEZE_TARGET_FULL;
13106 +       }
13107 +
13108 +       done_carry_pool(pool);
13109 +
13110 +       return ret;
13111 +}
13112 +
13113 +#if REISER4_DEBUG
13114 +static int sibling_link_is_ok(const znode *left, const znode *right)
13115 +{
13116 +       int result;     /* nonzero iff the links agree in both directions */
13117 +
13118 +       read_lock_tree(znode_get_tree(left));
13119 +       result = (left->right == right && left == right->left);
13120 +       read_unlock_tree(znode_get_tree(left));
13121 +       return result;
13122 +}
13123 +#endif /* REISER4_DEBUG */
13124 +
13125 +/* Shift first unit of first item if it is an internal one.  Return
13126 +   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13127 +   SUBTREE_MOVED. */
13128 +static int shift_one_internal_unit(znode * left, znode * right)
13129 +{
13130 +       int ret;
13131 +       carry_pool *pool;
13132 +       carry_level *todo;
13133 +       coord_t *coord;
13134 +       carry_plugin_info *info;
13135 +       int size, moved;
13136 +
13137 +       assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13138 +       assert("nikita-2435", znode_is_write_locked(left));
13139 +       assert("nikita-2436", znode_is_write_locked(right));
13140 +       assert("nikita-2434", sibling_link_is_ok(left, right));
13141 +
13142 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13143 +                              sizeof(*coord) + sizeof(*info)
13144 +#if REISER4_DEBUG
13145 +                              + sizeof(*coord) + 2 * sizeof(reiser4_key)
13146 +#endif
13147 +           );
13148 +       if (IS_ERR(pool))
13149 +               return PTR_ERR(pool);
13150 +       todo = (carry_level *) (pool + 1);
13151 +       init_carry_level(todo, pool);
13152 +
13153 +       coord = (coord_t *) (todo + 3);
13154 +       coord_init_first_unit(coord, right);
13155 +       info = (carry_plugin_info *) (coord + 1);
13156 +
13157 +#if REISER4_DEBUG
13158 +       if (!node_is_empty(left)) {
13159 +               coord_t *last;
13160 +               reiser4_key *right_key;
13161 +               reiser4_key *left_key;
13162 +
13163 +               last = (coord_t *) (info + 1);
13164 +               right_key = (reiser4_key *) (last + 1);
13165 +               left_key = right_key + 1;
13166 +               coord_init_last_unit(last, left);
13167 +
13168 +               assert("nikita-2463",
13169 +                      keyle(item_key_by_coord(last, left_key),
13170 +                            item_key_by_coord(coord, right_key)));
13171 +       }
13172 +#endif
13173 +
13174 +       assert("jmacd-2007", item_is_internal(coord));
13175 +
13176 +       size = item_length_by_coord(coord);
13177 +       info->todo = todo;
13178 +       info->doing = NULL;
13179 +
13180 +       ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13181 +                                              1
13182 +                                              /* delete @right if it becomes
13183 +                                                 empty */
13184 +                                              ,
13185 +                                              0
13186 +                                              /* do not move coord @coord to
13187 +                                                 node @left */
13188 +                                              ,
13189 +                                              info);
13190 +
13191 +       /* If shift returns positive, then we shifted the item. */
13192 +       assert("vs-423", ret <= 0 || size == ret);
13193 +       moved = (ret > 0);
13194 +
13195 +       if (moved) {
13196 +               /* something was moved */
13197 +               reiser4_tree *tree;
13198 +               int grabbed;
13199 +
13200 +               znode_make_dirty(left);
13201 +               znode_make_dirty(right);
13202 +               tree = znode_get_tree(left);
13203 +               write_lock_dk(tree);
13204 +               update_znode_dkeys(left, right);
13205 +               write_unlock_dk(tree);
13206 +
13207 +               /* reserve space for delimiting keys after shifting */
13208 +               grabbed = get_current_context()->grabbed_blocks;
13209 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13210 +               assert("nikita-3003", ret == 0);        /* reserved space is
13211 +                                                       exhausted. Ask Hans. */
13212 +
13213 +               ret = reiser4_carry(todo, NULL/* previous level */);
13214 +               grabbed2free_mark(grabbed);
13215 +       }
13216 +
13217 +       done_carry_pool(pool);
13218 +
13219 +       if (ret != 0) {
13220 +               /* Shift or carry operation failed. */
13221 +               assert("jmacd-7325", ret < 0);
13222 +               return ret;
13223 +       }
13224 +
13225 +       return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13226 +}
13227 +
13228 +/* Make the final relocate/wander decision during forward parent-first squalloc
13229 +   for a znode. For unformatted nodes this is done in
13230 +   plugin/item/extent.c:extent_needs_allocation(). */
13231 +static int
13232 +allocate_znode_loaded(znode * node,
13233 +                     const coord_t *parent_coord, flush_pos_t *pos)
13234 +{
13235 +       int ret;
13236 +       reiser4_super_info_data *sbinfo = get_current_super_private();
13237 +       /* FIXME(D): We have the node write-locked and should have checked for !
13238 +          allocated() somewhere before reaching this point, but there can be a
13239 +          race, so this assertion is bogus. */
13240 +       assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13241 +       assert("jmacd-7988", znode_is_write_locked(node));
13242 +       assert("jmacd-7989", coord_is_invalid(parent_coord)
13243 +              || znode_is_write_locked(parent_coord->node));
13244 +
13245 +       if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13246 +           znode_is_root(node) ||
13247 +           /* We have enough nodes to relocate no matter what. */
13248 +           (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13249 +               /* No need to decide with new nodes, they are treated the same
13250 +                  as relocate. If the root node is dirty, relocate. */
13251 +               if (pos->preceder.blk == 0) {
13252 +                       /* preceder is unknown and we have decided to relocate
13253 +                          the node -- using the default value as search start
13254 +                          is better than searching from block #0. */
13255 +                       get_blocknr_hint_default(&pos->preceder.blk);
13256 +                       check_preceder(pos->preceder.blk);
13257 +               }
13258 +
13259 +               goto best_reloc;
13260 +
13261 +       } else if (pos->preceder.blk == 0) {
13262 +               /* If we don't know the preceder, leave it where it is. */
13263 +               jnode_make_wander(ZJNODE(node));
13264 +       } else {
13265 +               /* Make a decision based on block distance. */
13266 +               reiser4_block_nr dist;
13267 +               reiser4_block_nr nblk = *znode_get_block(node);
13268 +
13269 +               assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13270 +               assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13271 +               assert("jmacd-6174", pos->preceder.blk != 0);
13272 +
13273 +               if (pos->preceder.blk == nblk - 1) {
13274 +                       /* Ideal. */
13275 +                       jnode_make_wander(ZJNODE(node));
13276 +               } else {
13277 +
13278 +                       dist =
13279 +                           (nblk <
13280 +                            pos->preceder.blk) ? (pos->preceder.blk -
13281 +                                                  nblk) : (nblk -
13282 +                                                           pos->preceder.blk);
13283 +
13284 +                       /* See if we can find a closer block
13285 +                          (forward direction only). */
13286 +                       pos->preceder.max_dist =
13287 +                           min((reiser4_block_nr) sbinfo->flush.
13288 +                               relocate_distance, dist);
13289 +                       pos->preceder.level = znode_get_level(node);
13290 +
13291 +                       ret = allocate_znode_update(node, parent_coord, pos);
13292 +
13293 +                       pos->preceder.max_dist = 0;
13294 +
13295 +                       if (ret && (ret != -ENOSPC))
13296 +                               return ret;
13297 +
13298 +                       if (ret == 0) {
13299 +                               /* Got a better allocation. */
13300 +                               znode_make_reloc(node, pos->fq);
13301 +                       } else if (dist < sbinfo->flush.relocate_distance) {
13302 +                               /* The present allocation is good enough. */
13303 +                               jnode_make_wander(ZJNODE(node));
13304 +                       } else {
13305 +                               /* Otherwise (and for the forced-relocate
13306 +                                  goto above) try the best position. */
13307 +best_reloc:
13308 +                               ret =
13309 +                                   allocate_znode_update(node, parent_coord,
13310 +                                                         pos);
13311 +                               if (ret != 0)
13312 +                                       return ret;
13313 +
13314 +                               /* set JNODE_RELOC bit _after_ node gets
13315 +                                  allocated */
13316 +                               znode_make_reloc(node, pos->fq);
13317 +                       }
13318 +               }
13319 +       }
13320 +
13321 +       /* This is the new preceder. */
13322 +       pos->preceder.blk = *znode_get_block(node);
13323 +       check_preceder(pos->preceder.blk);
13324 +       pos->alloc_cnt += 1;
13325 +
13326 +       assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13327 +
13328 +       return 0;
13329 +}
13330 +
13331 +static int
13332 +allocate_znode(znode * node, const coord_t *parent_coord, flush_pos_t *pos)
13333 +{
13334 +       /*
13335 +        * Run allocate_znode_loaded() under WITH_DATA(), i.e. with the znode
13336 +        * pinned in memory, to avoid races with asynchronous emergency flush
13337 +        * (which plays with the JNODE_FLUSH_RESERVED bit).
13338 +        */
13339 +       return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13340 +}
13341 +
13342 +/* A subroutine of allocate_znode, this is called first to see if there is a
13343 +   close position to relocate to. It may return -ENOSPC if there is no close
13344 +   position; on any error the node is not relocated. On success this updates
13345 +   the parent node (or the tree root block) with the relocated block address. */
13346 +static int
13347 +allocate_znode_update(znode * node, const coord_t *parent_coord,
13348 +                     flush_pos_t *pos)
13349 +{
13350 +       int ret;
13351 +       reiser4_block_nr blk;
13352 +       lock_handle uber_lock;
13353 +       int flush_reserved_used = 0;
13354 +       int grabbed;
13355 +       reiser4_context *ctx;
13356 +       reiser4_super_info_data *sbinfo;
13357 +
13358 +       init_lh(&uber_lock);
13359 +
13360 +       ctx = get_current_context();
13361 +       sbinfo = get_super_private(ctx->super);
13362 +
13363 +       grabbed = ctx->grabbed_blocks;
13364 +
13365 +       /* discard e-flush allocation */
13366 +       ret = zload(node);
13367 +       if (ret)
13368 +               return ret;
13369 +
13370 +       if (ZF_ISSET(node, JNODE_CREATED)) {
13371 +               assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13372 +               pos->preceder.block_stage = BLOCK_UNALLOCATED;
13373 +       } else {
13374 +               pos->preceder.block_stage = BLOCK_GRABBED;
13375 +
13376 +               /* The disk space for relocating the @node is already reserved
13377 +                * in "flush reserved" counter if @node is leaf, otherwise we
13378 +                * grab space using BA_RESERVED (means grab space from whole
13379 +                * disk not from only 95%). */
13380 +               if (znode_get_level(node) == LEAF_LEVEL) {
13381 +                       /*
13382 +                        * earlier (during do_jnode_make_dirty()) we decided
13383 +                        * that @node can possibly go into overwrite set and
13384 +                        * reserved block for its wandering location.
13385 +                        */
13386 +                       txn_atom *atom = get_current_atom_locked();
13387 +                       assert("nikita-3449",
13388 +                              ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13389 +                       flush_reserved2grabbed(atom, (__u64) 1);
13390 +                       spin_unlock_atom(atom);
13391 +                       /*
13392 +                        * we are trying to move node into relocate
13393 +                        * set. Allocation of relocated position "uses"
13394 +                        * reserved block.
13395 +                        */
13396 +                       ZF_CLR(node, JNODE_FLUSH_RESERVED);
13397 +                       flush_reserved_used = 1;
13398 +               } else {
13399 +                       ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13400 +                       if (ret != 0)
13401 +                               goto exit;
13402 +               }
13403 +       }
13404 +
13405 +       /* We may not use the 5% of reserved disk space here, so flush will not
13406 +          pack tightly. */
13407 +       ret = reiser4_alloc_block(&pos->preceder, &blk,
13408 +                                 BA_FORMATTED | BA_PERMANENT);
13409 +       if (ret)
13410 +               goto exit;
13411 +
13412 +       if (!ZF_ISSET(node, JNODE_CREATED) &&
13413 +           (ret =
13414 +            reiser4_dealloc_block(znode_get_block(node), 0,
13415 +                                  BA_DEFER | BA_FORMATTED)))
13416 +               goto exit;
13417 +
13418 +       if (likely(!znode_is_root(node))) {
13419 +               item_plugin *iplug;
13420 +
13421 +               iplug = item_plugin_by_coord(parent_coord);
13422 +               assert("nikita-2954", iplug->f.update != NULL);
13423 +               iplug->f.update(parent_coord, &blk);
13424 +
13425 +               znode_make_dirty(parent_coord->node);
13426 +
13427 +       } else {
13428 +               reiser4_tree *tree = znode_get_tree(node);
13429 +               znode *uber;
13430 +
13431 +               /* We take a longterm lock on the fake node in order to change
13432 +                  the root block number.  This may cause atom fusion. */
13433 +               ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13434 +                                    &uber_lock);
13435 +               /* The fake node cannot be deleted, and we must have priority
13436 +                  here, and may not be confused with ENOSPC. */
13437 +               assert("jmacd-74412",
13438 +                      ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13439 +
13440 +               if (ret)
13441 +                       goto exit;
13442 +
13443 +               uber = uber_lock.node;
13444 +
13445 +               write_lock_tree(tree);
13446 +               tree->root_block = blk;
13447 +               write_unlock_tree(tree);
13448 +
13449 +               znode_make_dirty(uber);
13450 +       }
13451 +
13452 +       ret = znode_rehash(node, &blk);
13453 +exit:
13454 +       if (ret) {
13455 +               /* Get flush reserved block back if something fails, because
13456 +                * callers assume that on error block wasn't relocated and its
13457 +                * flush reserved block wasn't used. */
13458 +               if (flush_reserved_used) {
13459 +                       /*
13460 +                        * ok, we failed to move node into relocate
13461 +                        * set. Restore status quo.
13462 +                        */
13463 +                       grabbed2flush_reserved((__u64) 1);
13464 +                       ZF_SET(node, JNODE_FLUSH_RESERVED);
13465 +               }
13466 +       }
13467 +       zrelse(node);
13468 +       done_lh(&uber_lock);
13469 +       grabbed2free_mark(grabbed);
13470 +       return ret;
13471 +}
13472 +
13473 +/* JNODE INTERFACE */
13474 +
13475 +/* Lock a node (if formatted) and then get its parent locked, set the child's
13476 +   coordinate in the parent.  If the child is the root node, the above_root
13477 +   znode is returned but the coord is not set.  This function may cause atom
13478 +   fusion, but it is only used for read locks (at this point) and therefore
13479 +   fusion only occurs when the parent is already dirty. */
13480 +/* Hans adds this note: remember to ask how expensive this operation is vs.
13481 +   storing parent pointer in jnodes. */
13482 +static int
13483 +jnode_lock_parent_coord(jnode * node,
13484 +                       coord_t *coord,
13485 +                       lock_handle * parent_lh,
13486 +                       load_count * parent_zh,
13487 +                       znode_lock_mode parent_mode, int try)
13488 +{
13489 +       int ret;
13490 +
13491 +       assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13492 +       assert("edward-54", jnode_is_unformatted(node)
13493 +              || znode_is_any_locked(JZNODE(node)));
13494 +
13495 +       if (!jnode_is_znode(node)) {
13496 +               reiser4_key key;
13497 +               tree_level stop_level = TWIG_LEVEL;
13498 +               lookup_bias bias = FIND_EXACT;
13499 +
13500 +               assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13501 +
13502 +               /* The case when node is not znode, but can have parent coord
13503 +                  (unformatted node, node which represents cluster page,
13504 +                  etc..).  Generate a key for the appropriate entry, search
13505 +                  in the tree using coord_by_key, which handles locking for
13506 +                  us. */
13507 +
13508 +               /*
13509 +                * nothing is locked at this moment, so, nothing prevents
13510 +                * concurrent truncate from removing jnode from inode. To
13511 +                * prevent this spin-lock jnode. jnode can be truncated just
13512 +                * after call to the jnode_build_key(), but this is ok,
13513 +                * because coord_by_key() will just fail to find appropriate
13514 +                * extent.
13515 +                */
13516 +               spin_lock_jnode(node);
13517 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13518 +                       jnode_build_key(node, &key);
13519 +                       ret = 0;
13520 +               } else
13521 +                       ret = RETERR(-ENOENT);
13522 +               spin_unlock_jnode(node);
13523 +
13524 +               if (ret != 0)
13525 +                       return ret;
13526 +
13527 +               if (jnode_is_cluster_page(node))
13528 +                       stop_level = LEAF_LEVEL;
13529 +
13530 +               assert("jmacd-1812", coord != NULL);
13531 +
13532 +               ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13533 +                                  parent_mode, bias, stop_level, stop_level,
13534 +                                  CBK_UNIQUE, NULL/*ra_info */);
13535 +               switch (ret) {
13536 +               case CBK_COORD_NOTFOUND:
13537 +                       assert("edward-1038",
13538 +                              ergo(jnode_is_cluster_page(node),
13539 +                                   JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13540 +                       if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13541 +                               warning("nikita-3177", "Parent not found");
13542 +                       return ret;
13543 +               case CBK_COORD_FOUND:
13544 +                       if (coord->between != AT_UNIT) {
13545 +                               /* Not exactly at a unit: drop the parent lock, fail with -ENOENT */
13546 +                               done_lh(parent_lh);
13547 +                               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13548 +                                       warning("nikita-3178",
13549 +                                               "Found but not happy: %i",
13550 +                                               coord->between);
13551 +                               }
13552 +                               return RETERR(-ENOENT);
13553 +                       }
13554 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13555 +                       if (ret != 0)
13556 +                               return ret;
13557 +                       /* if (jnode_is_cluster_page(node)) {
13558 +                          races with write() are possible
13559 +                          check_child_cluster (parent_lh->node);
13560 +                          }
13561 +                        */
13562 +                       break;
13563 +               default:
13564 +                       return ret;
13565 +               }
13566 +
13567 +       } else {
13568 +               int flags;
13569 +               znode *z;
13570 +
13571 +               z = JZNODE(node);
13572 +               /* Formatted node case: */
13573 +               assert("jmacd-2061", !znode_is_root(z));
13574 +
13575 +               flags = GN_ALLOW_NOT_CONNECTED;
13576 +               if (try)
13577 +                       flags |= GN_TRY_LOCK;
13578 +
13579 +               ret =
13580 +                   reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13581 +               if (ret != 0)
13582 +                       /* -E_REPEAT is ok here, it is handled by the caller. */
13583 +                       return ret;
13584 +
13585 +               /* Make the child's position "hint" up-to-date.  (Unless above
13586 +                  root, which caller must check.) */
13587 +               if (coord != NULL) {
13588 +
13589 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13590 +                       if (ret != 0) {
13591 +                               warning("jmacd-976812386",
13592 +                                       "incr_load_count_znode failed: %d",
13593 +                                       ret);
13594 +                               return ret;
13595 +                       }
13596 +
13597 +                       ret = find_child_ptr(parent_lh->node, z, coord);
13598 +                       if (ret != 0) {
13599 +                               warning("jmacd-976812",
13600 +                                       "find_child_ptr failed: %d", ret);
13601 +                               return ret;
13602 +                       }
13603 +               }
13604 +       }
13605 +
13606 +       return 0;
13607 +}
13608 +
13609 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the
13610 +   same atom. If there is no next neighbor or the neighbor is not in memory or
13611 +   if there is a neighbor but it is not dirty or not in the same atom,
13612 +   -E_NO_NEIGHBOR is returned. In some cases the slum may include nodes which
13613 +   are not dirty, if so @check_dirty should be 0 */
13614 +static int neighbor_in_slum(znode * node,      /* starting point */
13615 +                           lock_handle * lock, /* lock on starting point */
13616 +                           sideof side,        /* left or right direction we
13617 +                                                  seek the next node in */
13618 +                           znode_lock_mode mode, /* kind of lock we want */
13619 +                           int check_dirty,    /* true if the neighbor should
13620 +                                                  be dirty */
13621 +                           int use_upper_levels /* get neighbor by going through
13622 +                                                   upper levels */)
13623 +{
13624 +       int ret;
13625 +       int flags;
13626 +
13627 +       assert("jmacd-6334", znode_is_connected(node));
13628 +
13629 +       flags =  GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13630 +       if (use_upper_levels)
13631 +               flags |= GN_CAN_USE_UPPER_LEVELS;
13632 +
13633 +       ret = reiser4_get_neighbor(lock, node, mode, flags);
13634 +       if (ret) {
13635 +               /* May return -ENOENT or -E_NO_NEIGHBOR. */
13636 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
13637 +               if (ret == -ENOENT)
13638 +                       ret = RETERR(-E_NO_NEIGHBOR);
13639 +               return ret;
13640 +       }
13641 +       if (!check_dirty)
13642 +               return 0;
13643 +       /* Check dirty bit of locked znode, no races here */
13644 +       if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13645 +               return 0;
13646 +
13647 +       done_lh(lock);
13648 +       return RETERR(-E_NO_NEIGHBOR);
13649 +}
13650 +
13651 +/* Return true if two znodes have the same parent. Both nodes must be
13652 +   write-locked (asserted); the parents are compared under the tree read lock. */
13653 +static int znode_same_parents(znode * a, znode * b)
13654 +{
13655 +       int result;
13656 +
13657 +       assert("jmacd-7011", znode_is_write_locked(a));
13658 +       assert("jmacd-7012", znode_is_write_locked(b));
13659 +
13660 +       /* We lock the whole tree for this check.... I really don't like whole
13661 +        * tree locks... -Hans */
13662 +       read_lock_tree(znode_get_tree(a));
13663 +       result = (znode_parent(a) == znode_parent(b));
13664 +       read_unlock_tree(znode_get_tree(a));
13665 +       return result;
13666 +}
13667 +
13668 +/* FLUSH SCAN */
13669 +
13670 +/* Initialize the flush_scan: zero it, then init its lock handles, load counts and parent coord. */
13671 +static void scan_init(flush_scan * scan)
13672 +{
13673 +       memset(scan, 0, sizeof(*scan));
13674 +       init_lh(&scan->node_lock);
13675 +       init_lh(&scan->parent_lock);
13676 +       init_load_count(&scan->parent_load);
13677 +       init_load_count(&scan->node_load);
13678 +       coord_init_invalid(&scan->parent_coord, NULL);
13679 +}
13680 +
13681 +/* Release the resources held by the flush scan: the node and parent load
13682 +   counts, the reference on scan->node (if any) and both lock handles. */
13683 +static void scan_done(flush_scan * scan)
13684 +{
13685 +       done_load_count(&scan->node_load);
13686 +       if (scan->node != NULL) {
13687 +               jput(scan->node);
13688 +               scan->node = NULL;
13689 +       }
13690 +       done_load_count(&scan->parent_load);
13691 +       done_lh(&scan->parent_lock);
13692 +       done_lh(&scan->node_lock);
13693 +}
13694 +
13695 +/* Returns true if scanning is finished: stop is set, or a rightward scan reached max_count. */
13696 +int reiser4_scan_finished(flush_scan * scan)
13697 +{
13698 +       return scan->stop || (scan->direction == RIGHT_SIDE &&
13699 +                             scan->count >= scan->max_count);
13700 +}
13701 +
13702 +/* Return true if the scan should continue to the @tonode. True if the node
13703 +   meets the same_slum_check condition. If not, drop the reference on @tonode
13704 +   (jput) and stop the scan. */
13705 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13706 +{
13707 +       int go = same_slum_check(scan->node, tonode, 1, 0);
13708 +
13709 +       if (!go) {
13710 +               scan->stop = 1;
13711 +               jput(tonode);
13712 +       }
13713 +
13714 +       return go;
13715 +}
13716 +
13717 +/* Set the current scan->node to @node, taking over the reference the caller
13718 +   already holds on it, increment count by the @add_count (number to count,
13719 +   e.g., skipped unallocated nodes), deref previous current, and copy @parent. */
13720 +int
13721 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13722 +                const coord_t *parent)
13723 +{
13724 +       /* Release the old references, take the new reference. */
13725 +       done_load_count(&scan->node_load);
13726 +
13727 +       if (scan->node != NULL)
13728 +               jput(scan->node);
13729 +       scan->node = node;
13730 +       scan->count += add_count;
13731 +
13732 +       /* This next stmt is somewhat inefficient.  The reiser4_scan_extent()
13733 +          code could delay this update step until it finishes and update the
13734 +          parent_coord only once. It did that before, but there was a bug and
13735 +          this was the easiest way to make it correct. */
13736 +       if (parent != NULL)
13737 +               coord_dup(&scan->parent_coord, parent);
13738 +
13739 +       /* Failure may happen at the incr_load_count call, but the caller can
13740 +          assume the reference is safely taken. */
13741 +       return incr_load_count_jnode(&scan->node_load, node);
13742 +}
13743 +
13744 +/* Return true if @scan is scanning in the leftward direction (LEFT_SIDE). */
13745 +int reiser4_scanning_left(flush_scan * scan)
13746 +{
13747 +       return scan->direction == LEFT_SIDE;
13748 +}
13749 +
13750 +/* Performs leftward scanning starting from either kind of node. Counts the
13751 +   starting node. The right-scan object is passed in for the left-scan in order
13752 +   to copy the parent of an unformatted starting position. This way we avoid
13753 +   searching for the unformatted node's parent when scanning in each direction.
13754 +   If we search for the parent once it is set in both scan objects. The limit
13755 +   parameter tells flush-scan when to stop.
13756 +
13757 +   Rapid scanning is used only during scan_left, where we are interested in
13758 +   finding the 'leftpoint' where we begin flushing. We are interested in
13759 +   stopping at the left child of a twig that does not have a dirty left
13760 +   neighbour. THIS IS A SPECIAL CASE. The problem is finding a way to flush only
13761 +   those nodes without unallocated children, and it is difficult to solve in the
13762 +   bottom-up flushing algorithm we are currently using. The problem can be
13763 +   solved by scanning left at every level as we go upward, but this would
13764 +   basically bring us back to using a top-down allocation strategy, which we
13765 +   already tried (see BK history from May 2002), and has a different set of
13766 +   problems. The top-down strategy makes avoiding unallocated children easier,
13767 +   but makes it difficult to properly flush dirty children with clean parents
13768 +   that would otherwise stop the top-down flush, only later to dirty the parent
13769 +   once the children are flushed. So we solve the problem in the bottom-up
13770 +   algorithm with a special case for twigs and leaves only.
13771 +
13772 +   The first step in solving the problem is this rapid leftward scan.  After we
13773 +   determine that there are at least enough nodes counted to qualify for
13774 +   FLUSH_RELOCATE_THRESHOLD we are no longer interested in the exact count, we
13775 +   are only interested in finding the best place to start the flush.
13776 +
13777 +   We could choose one of two possibilities:
13778 +
13779 +   1. Stop at the leftmost child (of a twig) that does not have a dirty left
13780 +   neighbor. This requires checking one leaf per rapid-scan twig
13781 +
13782 +   2. Stop at the leftmost child (of a twig) where there are no dirty children
13783 +   of the twig to the left. This requires checking possibly all of the in-memory
13784 +   children of each twig during the rapid scan.
13785 +
13786 +   For now we implement the first policy.
13787 +*/
13788 +static int
13789 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13790 +{
13791 +       int ret = 0;
13792 +
13793 +       scan->max_count = limit;
13794 +       scan->direction = LEFT_SIDE;
13795 +
13796 +       ret = scan_set_current(scan, jref(node), 1, NULL);
13797 +       if (ret != 0)
13798 +               return ret;
13799 +
13800 +       ret = scan_common(scan, right);
13801 +       if (ret != 0)
13802 +               return ret;
13803 +
13804 +       /* Before rapid scanning, we need a lock on scan->node so that we can
13805 +          get its parent, only if formatted. */
13806 +       if (jnode_is_znode(scan->node)) {
13807 +               ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13808 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13809 +       }
13810 +
13811 +       /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD)
13812 +       */
13813 +       return ret;
13814 +}
13815 +
13816 +/* Performs rightward scanning... Does not count the starting node. The limit
13817 +   parameter is described in scan_left. If the starting node is unformatted then
13818 +   the parent_coord was already set during scan_left. There is no rapid scan
13819 +   during right-scanning.
13820 +
13821 +   scan_right is only called if the scan_left operation does not count at least
13822 +   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter
13823 +   is set to the difference between scan-left's count and
13824 +   FLUSH_RELOCATE_THRESHOLD, meaning scan-right counts as high as
13825 +   FLUSH_RELOCATE_THRESHOLD and then stops. */
13826 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13827 +{
13828 +       int ret;
13829 +
13830 +       scan->max_count = limit;
13831 +       scan->direction = RIGHT_SIDE;
13832 +
13833 +       ret = scan_set_current(scan, jref(node), 0, NULL);
13834 +       if (ret != 0)
13835 +               return ret;
13836 +
13837 +       return scan_common(scan, NULL);
13838 +}
13839 +
13840 +/* Common code to perform left or right scanning; @other is handed on to scan_unformatted(). */
13841 +static int scan_common(flush_scan * scan, flush_scan * other)
13842 +{
13843 +       int ret;
13844 +
13845 +       assert("nikita-2376", scan->node != NULL);
13846 +       assert("edward-54", jnode_is_unformatted(scan->node)
13847 +              || jnode_is_znode(scan->node));
13848 +
13849 +       /* Special case for starting at an unformatted node. Optimization: we
13850 +          only want to search for the parent (which requires a tree traversal)
13851 +          once. Obviously, we shouldn't have to call it once for the left scan
13852 +          and once for the right scan. For this reason, if we search for the
13853 +          parent during scan-left we then duplicate the coord/lock/load into
13854 +          the scan-right object. */
13855 +       if (jnode_is_unformatted(scan->node)) {
13856 +               ret = scan_unformatted(scan, other);
13857 +               if (ret != 0)
13858 +                       return ret;
13859 +       }
13860 +       /* This loop expects to start at a formatted position and performs
13861 +          chaining of formatted regions */
13862 +       while (!reiser4_scan_finished(scan)) {
13863 +
13864 +               ret = scan_formatted(scan);
13865 +               if (ret != 0)
13866 +                       return ret;
13867 +       }
13868 +
13869 +       return 0;
13870 +}
13871 +
13872 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13873 +{
13874 +       int ret = 0;
13875 +       int try = 0;
13876 +
13877 +       if (!coord_is_invalid(&scan->parent_coord))
13878 +               goto scan;
13879 +
13880 +       /* set parent coord from either a formatted or an unformatted position */
13881 +       if (!jnode_is_unformatted(scan->node)) {
13882 +               /* formatted position */
13883 +
13884 +               lock_handle lock;
13885 +               assert("edward-301", jnode_is_znode(scan->node));
13886 +               init_lh(&lock);
13887 +
13888 +               /*
13889 +                * when flush starts from unformatted node, first thing it
13890 +                * does is tree traversal to find formatted parent of starting
13891 +                * node. This parent is then kept locked across scans to the
13892 +                * left and to the right. This means that during scan to the
13893 +                * left we cannot take left-ward lock, because this is
13894 +                * dead-lock prone. So, if we are scanning to the left and
13895 +                * there is already lock held by this thread,
13896 +                * jnode_lock_parent_coord() should use try-lock.
13897 +                */
13898 +               try = reiser4_scanning_left(scan)
13899 +                   && !lock_stack_isclean(get_current_lock_stack());
13900 +               /* Need the node locked to get the parent lock. We have to
13901 +                  take write lock since there is at least one call path
13902 +                  where this znode is already write-locked by us. */
13903 +               ret =
13904 +                   longterm_lock_znode(&lock, JZNODE(scan->node),
13905 +                                       ZNODE_WRITE_LOCK,
13906 +                                       reiser4_scanning_left(scan) ?
13907 +                                       ZNODE_LOCK_LOPRI :
13908 +                                       ZNODE_LOCK_HIPRI);
13909 +               if (ret != 0)
13910 +                       /* EINVAL or E_DEADLOCK here mean... try again!  At this
13911 +                          point we've scanned too far and can't back out, just
13912 +                          start over. */
13913 +                       return ret;
13914 +
13915 +               ret = jnode_lock_parent_coord(scan->node,
13916 +                                             &scan->parent_coord,
13917 +                                             &scan->parent_lock,
13918 +                                             &scan->parent_load,
13919 +                                             ZNODE_WRITE_LOCK, try);
13920 +
13921 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
13922 +               done_lh(&lock);
13923 +               if (ret == -E_REPEAT) {
13924 +                       scan->stop = 1;
13925 +                       return 0;
13926 +               }
13927 +               if (ret)
13928 +                       return ret;
13929 +
13930 +       } else {
13931 +               /* unformatted position */
13932 +
13933 +               ret =
13934 +                   jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13935 +                                           &scan->parent_lock,
13936 +                                           &scan->parent_load,
13937 +                                           ZNODE_WRITE_LOCK, try);
13938 +
13939 +               if (IS_CBKERR(ret))
13940 +                       return ret;
13941 +
13942 +               if (ret == CBK_COORD_NOTFOUND)
13943 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
13944 +                       return ret;
13945 +
13946 +               /* parent was found */
13947 +               assert("jmacd-8661", other != NULL);
13948 +               /* Duplicate the reference into the other flush_scan. */
13949 +               coord_dup(&other->parent_coord, &scan->parent_coord);
13950 +               copy_lh(&other->parent_lock, &scan->parent_lock);
13951 +               copy_load_count(&other->parent_load, &scan->parent_load);
13952 +       }
13953 +scan:
13954 +       return scan_by_coord(scan);
13955 +}
13956 +
13957 +/* Performs left- or rightward scanning starting from a formatted node. Follow
13958 +   left/right sibling pointers under tree lock as long as:
13959 +
13960 +   - node->left/right is non-NULL
13961 +   - node->left/right is connected, dirty
13962 +   - node->left/right belongs to the same atom
13963 +   - scan has not reached maximum count
13964 +*/
13965 +static int scan_formatted(flush_scan * scan)
13966 +{
13967 +       int ret;
13968 +       znode *neighbor = NULL;
13969 +
13970 +       assert("jmacd-1401", !reiser4_scan_finished(scan));
13971 +
13972 +       do {
13973 +               znode *node = JZNODE(scan->node);
13974 +
13975 +               /* Node should be connected, but if not stop the scan. */
13976 +               if (!znode_is_connected(node)) {
13977 +                       scan->stop = 1;
13978 +                       break;
13979 +               }
13980 +
13981 +               /* Lock the tree, check-for and reference the next sibling. */
13982 +               read_lock_tree(znode_get_tree(node));
13983 +
13984 +               /* It may be that a node is inserted or removed between a node
13985 +                  and its left sibling while the tree lock is released, but the
13986 +                  flush-scan count does not need to be precise. Thus, we
13987 +                  release the tree lock as soon as we get the neighboring node.
13988 +               */
13989 +               neighbor =
13990 +                       reiser4_scanning_left(scan) ? node->left : node->right;
13991 +               if (neighbor != NULL)
13992 +                       zref(neighbor);
13993 +
13994 +               read_unlock_tree(znode_get_tree(node));
13995 +
13996 +               /* If neighbor is NULL at the leaf level, need to check for an
13997 +                  unformatted sibling using the parent--break in any case. */
13998 +               if (neighbor == NULL)
13999 +                       break;
14000 +
14001 +               /* Check the condition for going on, break if it is not met.
14002 +                  This also releases (jputs) the neighbor if false. */
14003 +               if (!reiser4_scan_goto(scan, ZJNODE(neighbor)))
14004 +                       break;
14005 +
14006 +               /* Advance the flush_scan state to the neighbor, repeat. */
14007 +               ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14008 +               if (ret != 0)
14009 +                       return ret;
14010 +
14011 +       } while (!reiser4_scan_finished(scan));
14012 +
14013 +       /* If neighbor is NULL then we reached the end of a formatted region, or
14014 +          else the sibling is out of memory, now check for an extent in the
14015 +          scan direction (as long as LEAF_LEVEL). */
14016 +       if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14017 +           || reiser4_scan_finished(scan)) {
14018 +               scan->stop = 1;
14019 +               return 0;
14020 +       }
14021 +       /* Otherwise, calls scan_by_coord for the right(left)most item of the
14022 +          left(right) neighbor on the parent level, then possibly continue. */
14023 +
14024 +       coord_init_invalid(&scan->parent_coord, NULL);
14025 +       return scan_unformatted(scan, NULL);
14026 +}
14027 +
14028 +/* NOTE-EDWARD:
14029 +   This scans adjacent items of the same type and calls the item plugin's
14030 +   scan method for each one. Performs left(right)ward scanning starting from a
14031 +   (possibly) unformatted node. If we start from an unformatted node, then we
14032 +   continue only if the next neighbor is also unformatted. When called from
14033 +   scan_formatted, we skip the first iteration (to make sure that the
14034 +   right(left)most item of the left(right) neighbor on the parent level is of
14035 +   the same type and to set the appropriate coord). */
14036 +static int scan_by_coord(flush_scan * scan)
14037 +{
14038 +       int ret = 0;
14039 +       int scan_this_coord;
14040 +       lock_handle next_lock;
14041 +       load_count next_load;
14042 +       coord_t next_coord;
14043 +       jnode *child;
14044 +       item_plugin *iplug;
14045 +
14046 +       init_lh(&next_lock);
14047 +       init_load_count(&next_load);
14048 +       scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14049 +
14050 +       /* set initial item plugin */
14051 +       iplug = item_plugin_by_coord(&scan->parent_coord);
14052 +
14053 +       for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
14054 +               if (scan_this_coord) {
14055 +                       /* Here we expect that the unit is scannable. It may not
14056 +                        * be so due to a race with extent->tail conversion. */
14057 +                       if (iplug->f.scan == NULL) {
14058 +                               scan->stop = 1;
14059 +                               ret = -E_REPEAT;
14060 +                               /* skip the check at the end. */
14061 +                               goto race;
14062 +                       }
14063 +
14064 +                       ret = iplug->f.scan(scan);
14065 +                       if (ret != 0)
14066 +                               goto exit;
14067 +
14068 +                       if (reiser4_scan_finished(scan)) {
14069 +                               checkchild(scan);
14070 +                               break;
14071 +                       }
14072 +               } else {
14073 +                       /* the same race against truncate as above is possible
14074 +                        * here, it seems */
14075 +
14076 +                       /* NOTE-JMACD: In this case, apply the same end-of-node
14077 +                          logic but don't scan the first coordinate. */
14078 +                       assert("jmacd-1231",
14079 +                              item_is_internal(&scan->parent_coord));
14080 +               }
14081 +
14082 +               if (iplug->f.utmost_child == NULL
14083 +                   || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14084 +                       /* stop this coord and continue on parent level */
14085 +                       ret =
14086 +                           scan_set_current(scan,
14087 +                                            ZJNODE(zref
14088 +                                                   (scan->parent_coord.node)),
14089 +                                            1, NULL);
14090 +                       if (ret != 0)
14091 +                               goto exit;
14092 +                       break;
14093 +               }
14094 +
14095 +               /* Either way, the invariant is that scan->parent_coord is set
14096 +                  to the parent of scan->node. Now get the next unit. */
14097 +               coord_dup(&next_coord, &scan->parent_coord);
14098 +               coord_sideof_unit(&next_coord, scan->direction);
14099 +
14100 +               /* If off-the-end of the twig, try the next twig. */
14101 +               if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14102 +                       /* We take the write lock because we may start flushing
14103 +                        * from this coordinate. */
14104 +                       ret = neighbor_in_slum(next_coord.node,
14105 +                                              &next_lock,
14106 +                                              scan->direction,
14107 +                                              ZNODE_WRITE_LOCK,
14108 +                                              1 /* check dirty */,
14109 +                                              0 /* don't go through upper
14110 +                                                   levels */);
14111 +                       if (ret == -E_NO_NEIGHBOR) {
14112 +                               scan->stop = 1;
14113 +                               ret = 0;
14114 +                               break;
14115 +                       }
14116 +
14117 +                       if (ret != 0)
14118 +                               goto exit;
14119 +
14120 +                       ret = incr_load_count_znode(&next_load, next_lock.node);
14121 +                       if (ret != 0)
14122 +                               goto exit;
14123 +
14124 +                       coord_init_sideof_unit(&next_coord, next_lock.node,
14125 +                                              sideof_reverse(scan->direction));
14126 +               }
14127 +
14128 +               iplug = item_plugin_by_coord(&next_coord);
14129 +
14130 +               /* Get the next child. */
14131 +               ret =
14132 +                   iplug->f.utmost_child(&next_coord,
14133 +                                         sideof_reverse(scan->direction),
14134 +                                         &child);
14135 +               if (ret != 0)
14136 +                       goto exit;
14137 +               /* If the next child is not in memory, or the utmost_child
14138 +                  method failed (due to a race with unlink, most probably),
14139 +                  stop here. */
14140 +               if (child == NULL || IS_ERR(child)) {
14141 +                       scan->stop = 1;
14142 +                       checkchild(scan);
14143 +                       break;
14144 +               }
14145 +
14146 +               assert("nikita-2374", jnode_is_unformatted(child)
14147 +                      || jnode_is_znode(child));
14148 +
14149 +               /* See if it is dirty, part of the same atom. */
14150 +               if (!reiser4_scan_goto(scan, child)) {
14151 +                       checkchild(scan);
14152 +                       break;
14153 +               }
14154 +
14155 +               /* If so, make this child current. */
14156 +               ret = scan_set_current(scan, child, 1, &next_coord);
14157 +               if (ret != 0)
14158 +                       goto exit;
14159 +
14160 +               /* Now continue.  If formatted we release the parent lock and
14161 +                  return, then proceed. */
14162 +               if (jnode_is_znode(child))
14163 +                       break;
14164 +
14165 +               /* Otherwise, repeat the above loop with next_coord. */
14166 +               if (next_load.node != NULL) {
14167 +                       done_lh(&scan->parent_lock);
14168 +                       move_lh(&scan->parent_lock, &next_lock);
14169 +                       move_load_count(&scan->parent_load, &next_load);
14170 +               }
14171 +       }
14172 +
14173 +       assert("jmacd-6233",
14174 +              reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14175 +exit:
14176 +       checkchild(scan);
14177 +race:                  /* skip the above check  */
14178 +       if (jnode_is_znode(scan->node)) {
14179 +               done_lh(&scan->parent_lock);
14180 +               done_load_count(&scan->parent_load);
14181 +       }
14182 +
14183 +       done_load_count(&next_load);
14184 +       done_lh(&next_lock);
14185 +       return ret;
14186 +}
14187 +
14188 +/* FLUSH POS HELPERS */
14189 +
14190 +/* Initialize the fields of a flush_position. */
14191 +static void pos_init(flush_pos_t *pos)
14192 +{
14193 +       memset(pos, 0, sizeof *pos);
14194 +
14195 +       pos->state = POS_INVALID;
14196 +       coord_init_invalid(&pos->coord, NULL);
14197 +       init_lh(&pos->lock);
14198 +       init_load_count(&pos->load);
14199 +
14200 +       reiser4_blocknr_hint_init(&pos->preceder);
14201 +}
14202 +
14203 +/* The flush loop inside squalloc periodically checks pos_valid to determine
14204 +   when "enough flushing" has been performed. This will return true until one
14205 +   of the following conditions is met:
14206 +
14207 +   1. the number of flush-queued nodes has reached the kernel-supplied
14208 +   "int *nr_to_flush" parameter, meaning we have flushed as many blocks as the
14209 +   kernel requested. When flushing to commit, this parameter is NULL.
14210 +
14211 +   2. pos_stop() is called because squalloc discovers that the "next" node in
14212 +   the flush order is either non-existent, not dirty, or not in the same atom.
14213 +*/
14214 +
14215 +static int pos_valid(flush_pos_t *pos)
14216 +{
14217 +       return pos->state != POS_INVALID;
14218 +}
14219 +
14220 +/* Release any resources of a flush_position. Called when jnode_flush
14221 +   finishes. */
14222 +static void pos_done(flush_pos_t *pos)
14223 +{
14224 +       pos_stop(pos);
14225 +       reiser4_blocknr_hint_done(&pos->preceder);
14226 +       if (convert_data(pos))
14227 +               free_convert_data(pos);
14228 +}
14229 +
14230 +/* Reset the point and parent.  Called during flush subroutines to terminate the
14231 +   squalloc loop. */
14232 +static int pos_stop(flush_pos_t *pos)
14233 +{
14234 +       pos->state = POS_INVALID;
14235 +       done_lh(&pos->lock);
14236 +       done_load_count(&pos->load);
14237 +       coord_init_invalid(&pos->coord, NULL);
14238 +
14239 +       if (pos->child) {
14240 +               jput(pos->child);
14241 +               pos->child = NULL;
14242 +       }
14243 +
14244 +       return 0;
14245 +}
14246 +
14247 +/* Return the flush_position's block allocator hint. */
14248 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos)
14249 +{
14250 +       return &pos->preceder;
14251 +}
14252 +
14253 +flush_queue_t *reiser4_pos_fq(flush_pos_t *pos)
14254 +{
14255 +       return pos->fq;
14256 +}
14257 +
14258 +/* Make Linus happy.
14259 +   Local variables:
14260 +   c-indentation-style: "K&R"
14261 +   mode-name: "LC"
14262 +   c-basic-offset: 8
14263 +   tab-width: 8
14264 +   fill-column: 90
14265 +   LocalWords:  preceder
14266 +   End:
14267 +*/
14268 diff -urN linux-2.6.35.orig/fs/reiser4/flush.h linux-2.6.35/fs/reiser4/flush.h
14269 --- linux-2.6.35.orig/fs/reiser4/flush.h        1970-01-01 01:00:00.000000000 +0100
14270 +++ linux-2.6.35/fs/reiser4/flush.h     2010-08-04 15:44:57.000000000 +0200
14271 @@ -0,0 +1,300 @@
14272 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14273 +
14274 +/* DECLARATIONS: */
14275 +
14276 +#if !defined(__REISER4_FLUSH_H__)
14277 +#define __REISER4_FLUSH_H__
14278 +
14279 +#include "plugin/cluster.h"
14280 +
14281 +/* The flush_scan data structure maintains the state of an in-progress
14282 +   flush-scan on a single level of the tree. A flush-scan is used for counting
14283 +   the number of adjacent nodes to flush, which is used to determine whether we
14284 +   should relocate, and it is also used to find a starting point for flush. A
14285 +   flush-scan object can scan in both right and left directions via the
14286 +   scan_left() and scan_right() interfaces. The right- and left-variations are
14287 +   similar but perform different functions. When scanning left we (optionally
14288 +   perform rapid scanning and then) longterm-lock the endpoint node. When
14289 +   scanning right we are simply counting the number of adjacent, dirty nodes. */
14290 +struct flush_scan {
14291 +
14292 +       /* The current number of nodes scanned on this level. */
14293 +       unsigned count;
14294 +
14295 +       /* There may be a maximum number of nodes for a scan on any single
14296 +          level. When going leftward, max_count is determined by
14297 +          FLUSH_SCAN_MAXNODES (see reiser4.h) */
14298 +       unsigned max_count;
14299 +
14300 +       /* Direction: Set to one of the sideof enumeration:
14301 +          { LEFT_SIDE, RIGHT_SIDE }. */
14302 +       sideof direction;
14303 +
14304 +       /* Initially @stop is set to false then set true once some condition
14305 +          stops the search (e.g., we found a clean node before reaching
14306 +          max_count or we found a node belonging to another atom). */
14307 +       int stop;
14308 +
14309 +       /* The current scan position.  If @node is non-NULL then its reference
14310 +          count has been incremented to reflect this reference. */
14311 +       jnode *node;
14312 +
14313 +       /* A handle for zload/zrelse of current scan position node. */
14314 +       load_count node_load;
14315 +
14316 +       /* During left-scan, if the final position (a.k.a. endpoint node) is
14317 +          formatted the node is locked using this lock handle. The endpoint
14318 +          needs to be locked for transfer to the flush_position object after
14319 +          scanning finishes. */
14320 +       lock_handle node_lock;
14321 +
14322 +       /* When the position is unformatted, its parent, coordinate, and parent
14323 +          zload/zrelse handle. */
14324 +       lock_handle parent_lock;
14325 +       coord_t parent_coord;
14326 +       load_count parent_load;
14327 +
14328 +       /* The block allocator preceder hint.  Sometimes flush_scan determines
14329 +          what the preceder is and if so it sets it here, after which it is
14330 +          copied into the flush_position. Otherwise, the preceder is computed
14331 +          later. */
14332 +       reiser4_block_nr preceder_blk;
14333 +};
14334 +
14335 +struct convert_item_info {
14336 +       dc_item_stat d_cur;     /* disk cluster state of the current item */
14337 +       dc_item_stat d_next;    /* disk cluster state of the next slum item */
14338 +       int cluster_shift;      /* disk cluster shift */
14339 +       flow_t flow;            /* disk cluster data */
14340 +};
14341 +
14342 +struct convert_info {
14343 +       int count;              /* for squalloc terminating */
14344 +       item_plugin *iplug;     /* current item plugin */
14345 +       struct convert_item_info *itm;  /* current item info */
14346 +       struct cluster_handle clust;    /* transform cluster */
14347 +};
14348 +
14349 +typedef enum flush_position_state {
14350 +       POS_INVALID,            /* Invalid or stopped pos, do not continue slum
14351 +                                * processing */
14352 +       POS_ON_LEAF,            /* pos points to already prepped, locked
14353 +                                * formatted node at leaf level */
14354 +       POS_ON_EPOINT,          /* pos keeps a lock on twig level, "coord" field
14355 +                                * is used to traverse unformatted nodes */
14356 +       POS_TO_LEAF,            /* pos is being moved to leaf level */
14357 +       POS_TO_TWIG,            /* pos is being moved to twig level */
14358 +       POS_END_OF_TWIG,        /* special case of POS_ON_EPOINT, when coord is
14359 +                                * after rightmost unit of the current twig */
14360 +       POS_ON_INTERNAL         /* same as POS_ON_LEAF, but points to internal
14361 +                                * node */
14362 +} flushpos_state_t;
14363 +
14364 +/* An encapsulation of the current flush point and all the parameters that are
14365 +   passed through the entire squeeze-and-allocate stage of the flush routine.
14366 +   A single flush_position object is constructed after left- and right-scanning
14367 +   finishes. */
14368 +struct flush_position {
14369 +       flushpos_state_t state;
14370 +
14371 +       coord_t coord;          /* coord to traverse unformatted nodes */
14372 +       lock_handle lock;       /* current lock we hold */
14373 +       load_count load;        /* load status for current locked formatted node
14374 +                               */
14375 +       jnode *child;           /* for passing a reference to unformatted child
14376 +                                * across pos state changes */
14377 +
14378 +       reiser4_blocknr_hint preceder;  /* The flush 'hint' state. */
14379 +       int leaf_relocate;      /* True if enough leaf-level nodes were
14380 +                                * found to suggest a relocate policy. */
14381 +       int alloc_cnt;          /* The number of nodes allocated during squeeze
14382 +                                  and allocate. */
14383 +       int prep_or_free_cnt;   /* The number of nodes prepared for write
14384 +                                  (allocate) or squeezed and freed. */
14385 +       flush_queue_t *fq;
14386 +       long *nr_written;       /* number of nodes submitted to disk */
14387 +       int flags;              /* a copy of jnode_flush flags argument */
14388 +
14389 +       znode *prev_twig;       /* previous parent pointer value, used to catch
14390 +                                * processing of new twig node */
14391 +       struct convert_info *sq;        /* convert info */
14392 +
14393 +       unsigned long pos_in_unit;      /* for extents only. Position
14394 +                                          within an extent unit of first
14395 +                                          jnode of slum */
14396 +       long nr_to_write;       /* number of unformatted nodes to handle on
14397 +                                  flush */
14398 +};
14399 +
14400 +static inline int item_convert_count(flush_pos_t *pos)
14401 +{
14402 +       return pos->sq->count;
14403 +}
14404 +static inline void inc_item_convert_count(flush_pos_t *pos)
14405 +{
14406 +       pos->sq->count++;
14407 +}
14408 +static inline void set_item_convert_count(flush_pos_t *pos, int count)
14409 +{
14410 +       pos->sq->count = count;
14411 +}
14412 +static inline item_plugin *item_convert_plug(flush_pos_t *pos)
14413 +{
14414 +       return pos->sq->iplug;
14415 +}
14416 +
14417 +static inline struct convert_info *convert_data(flush_pos_t *pos)
14418 +{
14419 +       return pos->sq;
14420 +}
14421 +
14422 +static inline struct convert_item_info *item_convert_data(flush_pos_t *pos)
14423 +{
14424 +       assert("edward-955", convert_data(pos));
14425 +       return pos->sq->itm;
14426 +}
14427 +
14428 +static inline struct tfm_cluster *tfm_cluster_sq(flush_pos_t *pos)
14429 +{
14430 +       return &pos->sq->clust.tc;
14431 +}
14432 +
14433 +static inline struct tfm_stream *tfm_stream_sq(flush_pos_t *pos,
14434 +                                               tfm_stream_id id)
14435 +{
14436 +       assert("edward-854", pos->sq != NULL);
14437 +       return get_tfm_stream(tfm_cluster_sq(pos), id);
14438 +}
14439 +
14440 +static inline int chaining_data_present(flush_pos_t *pos)
14441 +{
14442 +       return convert_data(pos) && item_convert_data(pos);
14443 +}
14444 +
14445 +/* Returns true if next node contains next item of the disk cluster
14446 +   so item convert data should be moved to the right slum neighbor.
14447 +*/
14448 +static inline int should_chain_next_node(flush_pos_t *pos)
14449 +{
14450 +       int result = 0;
14451 +
14452 +       assert("edward-1007", chaining_data_present(pos));
14453 +
14454 +       switch (item_convert_data(pos)->d_next) {
14455 +       case DC_CHAINED_ITEM:
14456 +               result = 1;
14457 +               break;
14458 +       case DC_AFTER_CLUSTER:
14459 +               break;
14460 +       default:
14461 +               impossible("edward-1009", "bad state of next slum item");
14462 +       }
14463 +       return result;
14464 +}
14465 +
14466 +/* update item state in a disk cluster to assign conversion mode */
14467 +static inline void
14468 +move_chaining_data(flush_pos_t *pos, int this_node/* where is next item */)
14469 +{
14470 +
14471 +       assert("edward-1010", chaining_data_present(pos));
14472 +
14473 +       if (this_node == 0) {
14474 +               /* next item is on the right neighbor */
14475 +               assert("edward-1011",
14476 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14477 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14478 +               assert("edward-1012",
14479 +                      item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14480 +
14481 +               item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14482 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14483 +       } else {
14484 +               /* next item is on the same node */
14485 +               assert("edward-1013",
14486 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14487 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14488 +               assert("edward-1227",
14489 +                      item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14490 +                      item_convert_data(pos)->d_next == DC_INVALID_STATE);
14491 +
14492 +               item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14493 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14494 +       }
14495 +}
14496 +
14497 +static inline int should_convert_node(flush_pos_t *pos, znode * node)
14498 +{
14499 +       return znode_convertible(node);
14500 +}
14501 +
14502 +/* true if there is attached convert item info */
14503 +static inline int should_convert_next_node(flush_pos_t *pos)
14504 +{
14505 +       return convert_data(pos) && item_convert_data(pos);
14506 +}
14507 +
14508 +#define SQUALLOC_THRESHOLD 256
14509 +
14510 +static inline int should_terminate_squalloc(flush_pos_t *pos)
14511 +{
14512 +       return convert_data(pos) &&
14513 +           !item_convert_data(pos) &&
14514 +           item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14515 +}
14516 +
14517 +#if 1
14518 +#define check_convert_info(pos)                                                \
14519 +do {                                                                   \
14520 +       if (unlikely(should_convert_next_node(pos))) {                  \
14521 +               warning("edward-1006", "unprocessed chained data");     \
14522 +               printk("d_cur = %d, d_next = %d, flow.len = %llu\n",    \
14523 +                      item_convert_data(pos)->d_cur,                   \
14524 +                      item_convert_data(pos)->d_next,                  \
14525 +                      item_convert_data(pos)->flow.length);            \
14526 +       }                                                               \
14527 +} while (0)
14528 +#else
14529 +#define check_convert_info(pos)
14530 +#endif /* 1 (not REISER4_DEBUG: the check is always compiled in) */
14531 +
14532 +void free_convert_data(flush_pos_t *pos);
14533 +/* used in extent.c */
14534 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14535 +                    const coord_t *parent);
14536 +int reiser4_scan_finished(flush_scan * scan);
14537 +int reiser4_scanning_left(flush_scan * scan);
14538 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14539 +txn_atom *atom_locked_by_fq(flush_queue_t *fq);
14540 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14541 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14542 +                              reiser4_key *stop_key);
14543 +extern int reiser4_init_fqs(void);
14544 +extern void reiser4_done_fqs(void);
14545 +
14546 +#if REISER4_DEBUG
14547 +
14548 +extern void reiser4_check_fq(const txn_atom *atom);
14549 +extern atomic_t flush_cnt;
14550 +/* check_preceder(): no trailing ';' here, callers supply it (as for noop) */
14551 +#define check_preceder(blk) \
14552 +assert("nikita-2588", (blk) < reiser4_block_count(reiser4_get_current_sb()))
14553 +extern void check_pos(flush_pos_t *pos);
14554 +#else
14555 +#define check_preceder(b) noop
14556 +#define check_pos(pos) noop
14557 +#endif
14558 +
14559 +/* __REISER4_FLUSH_H__ */
14560 +#endif
14561 +
14562 +/* Make Linus happy.
14563 +   Local variables:
14564 +   c-indentation-style: "K&R"
14565 +   mode-name: "LC"
14566 +   c-basic-offset: 8
14567 +   tab-width: 8
14568 +   fill-column: 90
14569 +   LocalWords:  preceder
14570 +   End:
14571 +*/
14572 diff -urN linux-2.6.35.orig/fs/reiser4/flush_queue.c linux-2.6.35/fs/reiser4/flush_queue.c
14573 --- linux-2.6.35.orig/fs/reiser4/flush_queue.c  1970-01-01 01:00:00.000000000 +0100
14574 +++ linux-2.6.35/fs/reiser4/flush_queue.c       2010-08-04 15:44:57.000000000 +0200
14575 @@ -0,0 +1,678 @@
14576 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
14577 +   reiser4/README */
14578 +
14579 +#include "debug.h"
14580 +#include "super.h"
14581 +#include "txnmgr.h"
14582 +#include "jnode.h"
14583 +#include "znode.h"
14584 +#include "page_cache.h"
14585 +#include "wander.h"
14586 +#include "vfs_ops.h"
14587 +#include "writeout.h"
14588 +#include "flush.h"
14589 +
14590 +#include <linux/bio.h>
14591 +#include <linux/mm.h>
14592 +#include <linux/pagemap.h>
14593 +#include <linux/blkdev.h>
14594 +#include <linux/writeback.h>
14595 +
14596 +/* A flush queue object is an accumulator for keeping jnodes prepared
14597 +   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14598 +   kept on the flush queue until memory pressure or atom commit asks
14599 +   flush queues to write some or all of their jnodes. */
14600 +
14601 +/*
14602 +   LOCKING:
14603 +
14604 +   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
14605 +   list is protected by atom spin lock.  fq->prepped list uses the following
14606 +   locking:
14607 +
14608 +   two ways to protect fq->prepped list for read-only list traversal:
14609 +
14610 +   1. atom is spin-locked.
14611 +   2. fq is IN_USE, atom->nr_running_queues increased.
14612 +
14613 +   and one for list modification:
14614 +
14615 +   1. atom is spin-locked and one condition is true: fq is IN_USE or
14616 +      atom->nr_running_queues == 0.
14617 +
14618 +   The deadlock-safe order for flush queues and atoms is: first lock atom, then
14619 +   lock flush queue, then lock jnode.
14620 +*/
14621 +
14622 +#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
14623 +#define fq_ready(fq)           (!fq_in_use(fq))
14624 +
14625 +#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
14626 +#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
14627 +
14628 +/* get lock on atom from locked flush queue object */
14629 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t *fq)
14630 +{
14631 +       /* This code is similar to jnode_get_atom(), look at it for the
14632 +        * explanation. */
14633 +       txn_atom *atom;
14634 +
14635 +       assert_spin_locked(&(fq->guard));
14636 +
14637 +       while (1) {
14638 +               atom = fq->atom;
14639 +               if (atom == NULL)
14640 +                       break;
14641 +
14642 +               if (spin_trylock_atom(atom))
14643 +                       break;
14644 +
14645 +               atomic_inc(&atom->refcount);
14646 +               spin_unlock(&(fq->guard));
14647 +               spin_lock_atom(atom);
14648 +               spin_lock(&(fq->guard));
14649 +
14650 +               if (fq->atom == atom) {
14651 +                       atomic_dec(&atom->refcount);
14652 +                       break;
14653 +               }
14654 +
14655 +               spin_unlock(&(fq->guard));
14656 +               atom_dec_and_unlock(atom);
14657 +               spin_lock(&(fq->guard));
14658 +       }
14659 +
14660 +       return atom;
14661 +}
14662 +
14663 +txn_atom *atom_locked_by_fq(flush_queue_t *fq)
14664 +{
14665 +       txn_atom *atom;
14666 +
14667 +       spin_lock(&(fq->guard));
14668 +       atom = atom_locked_by_fq_nolock(fq);
14669 +       spin_unlock(&(fq->guard));
14670 +       return atom;
14671 +}
14672 +
14673 +static void init_fq(flush_queue_t *fq)
14674 +{
14675 +       memset(fq, 0, sizeof *fq);
14676 +
14677 +       atomic_set(&fq->nr_submitted, 0);
14678 +
14679 +       INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14680 +
14681 +       init_waitqueue_head(&fq->wait);
14682 +       spin_lock_init(&fq->guard);
14683 +}
14684 +
14685 +/* slab for flush queues */
14686 +static struct kmem_cache *fq_slab;
14687 +
14688 +/**
14689 + * reiser4_init_fqs - create flush queue cache
14690 + *
14691 + * Initializes slab cache of flush queues. It is part of reiser4 module
14692 + * initialization.
14693 + */
14694 +int reiser4_init_fqs(void)
14695 +{
14696 +       fq_slab = kmem_cache_create("fq",
14697 +                                   sizeof(flush_queue_t),
14698 +                                   0, SLAB_HWCACHE_ALIGN, NULL);
14699 +       if (fq_slab == NULL)
14700 +               return RETERR(-ENOMEM);
14701 +       return 0;
14702 +}
14703 +
14704 +/**
14705 + * reiser4_done_fqs - delete flush queue cache
14706 + *
14707 + * This is called on reiser4 module unloading or system shutdown.
14708 + */
14709 +void reiser4_done_fqs(void)
14710 +{
14711 +       destroy_reiser4_cache(&fq_slab);
14712 +}
14713 +
14714 +/* create new flush queue object */
14715 +static flush_queue_t *create_fq(gfp_t gfp)
14716 +{
14717 +       flush_queue_t *fq;
14718 +
14719 +       fq = kmem_cache_alloc(fq_slab, gfp);
14720 +       if (fq)
14721 +               init_fq(fq);
14722 +
14723 +       return fq;
14724 +}
14725 +
14726 +/* increment the atom's count of queued nodes; the update is inside ON_DEBUG() */
14727 +static void count_enqueued_node(flush_queue_t *fq)
14728 +{
14729 +       ON_DEBUG(fq->atom->num_queued++);
14730 +}
14731 +
14732 +static void count_dequeued_node(flush_queue_t *fq)
14733 +{
14734 +       assert("zam-993", fq->atom->num_queued > 0);
14735 +       ON_DEBUG(fq->atom->num_queued--);
14736 +}
14737 +
14738 +/* attach flush queue object to the atom */
14739 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14740 +{
14741 +       assert_spin_locked(&(atom->alock));
14742 +       list_add(&fq->alink, &atom->flush_queues);
14743 +       fq->atom = atom;
14744 +       ON_DEBUG(atom->nr_flush_queues++);
14745 +}
14746 +
14747 +static void detach_fq(flush_queue_t *fq)
14748 +{
14749 +       assert_spin_locked(&(fq->atom->alock));
14750 +
14751 +       spin_lock(&(fq->guard));
14752 +       list_del_init(&fq->alink);
14753 +       assert("vs-1456", fq->atom->nr_flush_queues > 0);
14754 +       ON_DEBUG(fq->atom->nr_flush_queues--);
14755 +       fq->atom = NULL;
14756 +       spin_unlock(&(fq->guard));
14757 +}
14758 +
14759 +/* destroy flush queue object */
14760 +static void done_fq(flush_queue_t *fq)
14761 +{
14762 +       assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14763 +       assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14764 +
14765 +       kmem_cache_free(fq_slab, fq);
14766 +}
14767 +
14768 +/* set JNODE_FLUSH_QUEUED on @node and count it as enqueued for @fq's atom */
14769 +static void mark_jnode_queued(flush_queue_t *fq, jnode * node)
14770 +{
14771 +       JF_SET(node, JNODE_FLUSH_QUEUED);
14772 +       count_enqueued_node(fq);
14773 +}
14774 +
14775 +/* Putting jnode into the flush queue. Both atom and jnode should be
14776 +   spin-locked. */
14777 +void queue_jnode(flush_queue_t *fq, jnode * node)
14778 +{
14779 +       assert_spin_locked(&(node->guard));
14780 +       assert("zam-713", node->atom != NULL);
14781 +       assert_spin_locked(&(node->atom->alock));
14782 +       assert("zam-716", fq->atom != NULL);
14783 +       assert("zam-717", fq->atom == node->atom);
14784 +       assert("zam-907", fq_in_use(fq));
14785 +
14786 +       assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14787 +       assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14788 +       assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14789 +       assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14790 +
14791 +       mark_jnode_queued(fq, node);
14792 +       list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14793 +
14794 +       ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14795 +                            FQ_LIST, 1));
14796 +}
14797 +
14798 +/* repeatable process for waiting io completion on a flush queue object */
14799 +static int wait_io(flush_queue_t *fq, int *nr_io_errors)
14800 +{
14801 +       assert("zam-738", fq->atom != NULL);
14802 +       assert_spin_locked(&(fq->atom->alock));
14803 +       assert("zam-736", fq_in_use(fq));
14804 +       assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14805 +
14806 +       if (atomic_read(&fq->nr_submitted) != 0) {
14807 +               struct super_block *super;
14808 +
14809 +               spin_unlock_atom(fq->atom);
14810 +
14811 +               assert("nikita-3013", reiser4_schedulable());
14812 +
14813 +               super = reiser4_get_current_sb();
14814 +
14815 +               /* FIXME: this is instead of blk_run_queues() */
14816 +               blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14817 +
14818 +               if (!(super->s_flags & MS_RDONLY))
14819 +                       wait_event(fq->wait,
14820 +                                  atomic_read(&fq->nr_submitted) == 0);
14821 +
14822 +               /* Ask the caller to re-acquire the locks and call this
14823 +                  function again. Note: this technique is commonly used in
14824 +                  the txnmgr code. */
14825 +               return -E_REPEAT;
14826 +       }
14827 +
14828 +       *nr_io_errors += atomic_read(&fq->nr_errors);
14829 +       return 0;
14830 +}
14831 +
14832 +/* wait for i/o on @fq to complete, then detach @fq from its atom and free it */
14833 +static int finish_fq(flush_queue_t *fq, int *nr_io_errors)
14834 +{
14835 +       int ret;
14836 +       txn_atom *atom = fq->atom; /* cached: detach_fq() clears fq->atom */
14837 +
14838 +       assert("zam-801", atom != NULL);
14839 +       assert_spin_locked(&(atom->alock));
14840 +       assert("zam-762", fq_in_use(fq));
14841 +
14842 +       ret = wait_io(fq, nr_io_errors);
14843 +       if (ret)
14844 +               return ret; /* -E_REPEAT: wait_io() dropped the atom lock */
14845 +
14846 +       detach_fq(fq);
14847 +       done_fq(fq); /* returns @fq to fq_slab; @fq is invalid from here on */
14848 +
14849 +       reiser4_atom_send_event(atom);
14850 +
14851 +       return 0;
14852 +}
14853 +
14854 +/* wait for all i/o for given atom to be completed; actually does one iteration
14855 +   of that and returns -E_REPEAT if there are more iterations needed */
14856 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14857 +{
14858 +       flush_queue_t *fq;
14859 +
14860 +       assert_spin_locked(&(atom->alock));
14861 +
14862 +       if (list_empty_careful(&atom->flush_queues))
14863 +               return 0; /* nothing to wait for; atom remains locked */
14864 +
14865 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
14866 +               if (fq_ready(fq)) {
14867 +                       int ret;
14868 +
14869 +                       mark_fq_in_use(fq);
14870 +                       assert("vs-1247", fq->owner == NULL);
14871 +                       ON_DEBUG(fq->owner = current);
14872 +                       ret = finish_fq(fq, nr_io_errors);
14873 +
14874 +                       if (*nr_io_errors)
14875 +                               reiser4_handle_error();
14876 +
14877 +                       if (ret) {
14878 +                               reiser4_fq_put(fq);
14879 +                               return ret;
14880 +                       }
14881 +
14882 +                       spin_unlock_atom(atom); /* finish_fq() succeeded, so @fq is already freed */
14883 +
14884 +                       return -E_REPEAT; /* atom is unlocked; caller re-locks and calls again */
14885 +               }
14886 +       }
14887 +
14888 +       /* All flush queues are in use; atom remains locked */
14889 +       return -EBUSY;
14890 +}
14891 +
14892 +/* wait for all i/o of the current atom to complete */
14893 +int current_atom_finish_all_fq(void)
14894 +{
14895 +       txn_atom *atom;
14896 +       int nr_io_errors = 0;
14897 +       int ret = 0;
14898 +
14899 +       do {
14900 +               while (1) {
14901 +                       atom = get_current_atom_locked();
14902 +                       ret = finish_all_fq(atom, &nr_io_errors);
14903 +                       if (ret != -EBUSY)
14904 +                               break;
14905 +                       reiser4_atom_wait_event(atom);
14906 +               }
14907 +       } while (ret == -E_REPEAT);
14908 +
14909 +       /* we do not need locked atom after this function finishes, SUCCESS or
14910 +          -EBUSY are two return codes when atom remains locked after
14911 +          finish_all_fq */
14912 +       if (!ret)
14913 +               spin_unlock_atom(atom);
14914 +
14915 +       assert_spin_not_locked(&(atom->alock));
14916 +
14917 +       if (ret)
14918 +               return ret;
14919 +
14920 +       if (nr_io_errors)
14921 +               return RETERR(-EIO);
14922 +
14923 +       return 0;
14924 +}
14925 +
14926 +/* change node->atom field for all jnode from given list */
14927 +static void
14928 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14929 +{
14930 +       jnode *cur;
14931 +
14932 +       list_for_each_entry(cur, list, capture_link) {
14933 +               spin_lock_jnode(cur);
14934 +               cur->atom = atom;
14935 +               spin_unlock_jnode(cur);
14936 +       }
14937 +}
14938 +
14939 +/* support for atom fusion operation */
14940 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14941 +{
14942 +       flush_queue_t *fq;
14943 +
14944 +       assert_spin_locked(&(to->alock));
14945 +       assert_spin_locked(&(from->alock));
14946 +
14947 +       list_for_each_entry(fq, &from->flush_queues, alink) {
14948 +               scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14949 +               spin_lock(&(fq->guard));
14950 +               fq->atom = to;
14951 +               spin_unlock(&(fq->guard));
14952 +       }
14953 +
14954 +       list_splice_init(&from->flush_queues, to->flush_queues.prev);
14955 +
14956 +#if REISER4_DEBUG
14957 +       to->num_queued += from->num_queued;
14958 +       to->nr_flush_queues += from->nr_flush_queues;
14959 +       from->nr_flush_queues = 0;
14960 +#endif
14961 +}
14962 +
14963 +#if REISER4_DEBUG
14964 +int atom_fq_parts_are_clean(txn_atom * atom)
14965 +{
14966 +       assert("zam-915", atom != NULL);
14967 +       return list_empty_careful(&atom->flush_queues);
14968 +}
14969 +#endif
14970 +/* Bio i/o completion routine for reiser4 write operations. */
14971 +static void
14972 +end_io_handler(struct bio *bio, int err)
14973 +{
14974 +       int i;
14975 +       int nr_errors = 0;
14976 +       flush_queue_t *fq;
14977 +
14978 +       assert("zam-958", bio->bi_rw & WRITE);
14979 +
14980 +       if (err == -EOPNOTSUPP)
14981 +               set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14982 +
14983 +       /* we expect that bio->bi_private is set to NULL or fq object which is used
14984 +        * for synchronization and error counting. */
14985 +       fq = bio->bi_private;
14986 +       /* Check all elements of io_vec for correct write completion. */
14987 +       for (i = 0; i < bio->bi_vcnt; i += 1) {
14988 +               struct page *pg = bio->bi_io_vec[i].bv_page;
14989 +
14990 +               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14991 +                       SetPageError(pg);
14992 +                       nr_errors++;
14993 +               }
14994 +
14995 +               {
14996 +                       /* jnode WRITEBACK ("write is in progress bit") is
14997 +                        * atomically cleared here. */
14998 +                       jnode *node;
14999 +
15000 +                       assert("zam-736", pg != NULL);
15001 +                       assert("zam-736", PagePrivate(pg));
15002 +                       node = jprivate(pg);
15003 +
15004 +                       JF_CLR(node, JNODE_WRITEBACK);
15005 +               }
15006 +
15007 +               end_page_writeback(pg);
15008 +               page_cache_release(pg);
15009 +       }
15010 +
15011 +       if (fq) {
15012 +               /* count i/o error in fq object */
15013 +               atomic_add(nr_errors, &fq->nr_errors);
15014 +
15015 +               /* If all write requests registered in this "fq" are done we
15016 +                * wake up the waiter. */
15017 +               if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15018 +                       wake_up(&fq->wait);
15019 +       }
15020 +
15021 +       bio_put(bio);
15022 +}
15023 +
15024 +/* Count I/O requests which will be submitted by @bio in given flush queues
15025 +   @fq */
15026 +void add_fq_to_bio(flush_queue_t *fq, struct bio *bio)
15027 +{
15028 +       bio->bi_private = fq;
15029 +       bio->bi_end_io = end_io_handler;
15030 +
15031 +       if (fq)
15032 +               atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15033 +}
15034 +
15035 +/* Move all queued nodes out from @fq->prepped list. */
15036 +static void release_prepped_list(flush_queue_t *fq)
15037 +{
15038 +       txn_atom *atom;
15039 +
15040 +       assert("zam-904", fq_in_use(fq));
15041 +       atom = atom_locked_by_fq(fq);
15042 +
15043 +       while (!list_empty(ATOM_FQ_LIST(fq))) {
15044 +               jnode *cur;
15045 +
15046 +               cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15047 +               list_del_init(&cur->capture_link);
15048 +
15049 +               count_dequeued_node(fq);
15050 +               spin_lock_jnode(cur);
15051 +               assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15052 +               assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15053 +               assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15054 +               JF_CLR(cur, JNODE_FLUSH_QUEUED);
15055 +
15056 +               if (JF_ISSET(cur, JNODE_DIRTY)) {
15057 +                       list_add_tail(&cur->capture_link,
15058 +                                     ATOM_DIRTY_LIST(atom,
15059 +                                                     jnode_get_level(cur)));
15060 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15061 +                                            DIRTY_LIST, 1));
15062 +               } else {
15063 +                       list_add_tail(&cur->capture_link,
15064 +                                     ATOM_CLEAN_LIST(atom));
15065 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15066 +                                            CLEAN_LIST, 1));
15067 +               }
15068 +
15069 +               spin_unlock_jnode(cur);
15070 +       }
15071 +
15072 +       if (--atom->nr_running_queues == 0)
15073 +               reiser4_atom_send_event(atom);
15074 +
15075 +       spin_unlock_atom(atom);
15076 +}
15077 +
15078 +/* Submit write requests for nodes on the already filled flush queue @fq.
15079 +
15080 +   @fq: flush queue object which contains jnodes we can (and will) write.
15081 +   @return: number of submitted blocks (>=0) if success, otherwise -- an error
15082 +           code (<0). */
15083 +int reiser4_write_fq(flush_queue_t *fq, long *nr_submitted, int flags)
15084 +{
15085 +       int ret;
15086 +       txn_atom *atom;
15087 +
15088 +       while (1) {
15089 +               atom = atom_locked_by_fq(fq);
15090 +               assert("zam-924", atom);
15091 +               /* do not write fq in parallel. */
15092 +               if (atom->nr_running_queues == 0
15093 +                   || !(flags & WRITEOUT_SINGLE_STREAM))
15094 +                       break;
15095 +               reiser4_atom_wait_event(atom);
15096 +       }
15097 +
15098 +       atom->nr_running_queues++;
15099 +       spin_unlock_atom(atom);
15100 +
15101 +       ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15102 +       release_prepped_list(fq);
15103 +
15104 +       return ret;
15105 +}
15106 +
15107 +/* Getting flush queue object for exclusive use by one thread. May require
15108 +   several iterations which is indicated by -E_REPEAT return code.
15109 +
15110 +   This function does not contain code for obtaining an atom lock because an
15111 +   atom lock is obtained in different ways in different parts of reiser4,
15112 +   usually it is current atom, but we need a possibility for getting fq for the
15113 +   atom of given jnode. */
15114 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15115 +{
15116 +       flush_queue_t *fq;
15117 +
15118 +       assert_spin_locked(&(atom->alock));
15119 +
15120 +       fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15121 +       while (&atom->flush_queues != &fq->alink) {
15122 +               spin_lock(&(fq->guard));
15123 +
15124 +               if (fq_ready(fq)) {
15125 +                       mark_fq_in_use(fq);
15126 +                       assert("vs-1246", fq->owner == NULL);
15127 +                       ON_DEBUG(fq->owner = current);
15128 +                       spin_unlock(&(fq->guard));
15129 +
15130 +                       if (*new_fq)
15131 +                               done_fq(*new_fq);
15132 +
15133 +                       *new_fq = fq;
15134 +
15135 +                       return 0;
15136 +               }
15137 +
15138 +               spin_unlock(&(fq->guard));
15139 +
15140 +               fq = list_entry(fq->alink.next, flush_queue_t, alink);
15141 +       }
15142 +
15143 +       /* Use previously allocated fq object */
15144 +       if (*new_fq) {
15145 +               mark_fq_in_use(*new_fq);
15146 +               assert("vs-1248", (*new_fq)->owner == 0);
15147 +               ON_DEBUG((*new_fq)->owner = current);
15148 +               attach_fq(atom, *new_fq);
15149 +
15150 +               return 0;
15151 +       }
15152 +
15153 +       spin_unlock_atom(atom);
15154 +
15155 +       *new_fq = create_fq(gfp);
15156 +
15157 +       if (*new_fq == NULL)
15158 +               return RETERR(-ENOMEM);
15159 +
15160 +       return RETERR(-E_REPEAT);
15161 +}
15162 +
15163 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t **new_fq)
15164 +{
15165 +       return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15166 +}
15167 +
15168 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15169 +   object for current atom; on success fq->atom remains locked. */
15170 +flush_queue_t *get_fq_for_current_atom(void)
15171 +{
15172 +       flush_queue_t *fq = NULL;
15173 +       txn_atom *atom;
15174 +       int ret;
15175 +
15176 +       do {
15177 +               atom = get_current_atom_locked();
15178 +               ret = reiser4_fq_by_atom(atom, &fq);
15179 +       } while (ret == -E_REPEAT);
15180 +
15181 +       if (ret)
15182 +               return ERR_PTR(ret);
15183 +       return fq;
15184 +}
15185 +
15186 +/* Releasing flush queue object after exclusive use */
15187 +void reiser4_fq_put_nolock(flush_queue_t *fq)
15188 +{
15189 +       assert("zam-747", fq->atom != NULL);
15190 +       assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15191 +       mark_fq_ready(fq);
15192 +       assert("vs-1245", fq->owner == current);
15193 +       ON_DEBUG(fq->owner = NULL);
15194 +}
15195 +
15196 +void reiser4_fq_put(flush_queue_t *fq)
15197 +{
15198 +       txn_atom *atom;
15199 +
15200 +       spin_lock(&(fq->guard));
15201 +       atom = atom_locked_by_fq_nolock(fq);
15202 +
15203 +       assert("zam-746", atom != NULL);
15204 +
15205 +       reiser4_fq_put_nolock(fq);
15206 +       reiser4_atom_send_event(atom);
15207 +
15208 +       spin_unlock(&(fq->guard));
15209 +       spin_unlock_atom(atom);
15210 +}
15211 +
15212 +/* A part of atom object initialization related to the embedded flush queue
15213 +   list head */
15214 +
15215 +void init_atom_fq_parts(txn_atom *atom)
15216 +{
15217 +       INIT_LIST_HEAD(&atom->flush_queues);
15218 +}
15219 +
15220 +#if REISER4_DEBUG
15221 +
15222 +void reiser4_check_fq(const txn_atom *atom)
15223 +{
15224 +       /* check number of nodes on all atom's flush queues */
15225 +       flush_queue_t *fq;
15226 +       int count;
15227 +       struct list_head *pos;
15228 +
15229 +       count = 0;
15230 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
15231 +               spin_lock(&(fq->guard));
15232 +               /* calculate number of jnodes on fq's list of prepped jnodes */
15233 +               list_for_each(pos, ATOM_FQ_LIST(fq))
15234 +                       count++;
15235 +               spin_unlock(&(fq->guard));
15236 +       }
15237 +       if (count != atom->fq)
15238 +               warning("", "fq counter %d, real %d\n", atom->fq, count);
15239 +
15240 +}
15241 +
15242 +#endif
15243 +
15244 +/*
15245 + * Local variables:
15246 + * c-indentation-style: "K&R"
15247 + * mode-name: "LC"
15248 + * c-basic-offset: 8
15249 + * tab-width: 8
15250 + * fill-column: 79
15251 + * scroll-step: 1
15252 + * End:
15253 + */
15254 diff -urN linux-2.6.35.orig/fs/reiser4/forward.h linux-2.6.35/fs/reiser4/forward.h
15255 --- linux-2.6.35.orig/fs/reiser4/forward.h      1970-01-01 01:00:00.000000000 +0100
15256 +++ linux-2.6.35/fs/reiser4/forward.h   2010-08-04 15:44:57.000000000 +0200
15257 @@ -0,0 +1,256 @@
15258 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
15259 +   reiser4/README */
15260 +
15261 +/* Forward declarations. Thank you Kernighan. */
15262 +
15263 +#if !defined(__REISER4_FORWARD_H__)
15264 +#define __REISER4_FORWARD_H__
15265 +
15266 +#include <asm/errno.h>
15267 +#include <linux/types.h>
15268 +
15269 +typedef struct zlock zlock;
15270 +typedef struct lock_stack lock_stack;
15271 +typedef struct lock_handle lock_handle;
15272 +typedef struct znode znode;
15273 +typedef struct flow flow_t;
15274 +typedef struct coord coord_t;
15275 +typedef struct tree_access_pointer tap_t;
15276 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15277 +typedef union reiser4_plugin reiser4_plugin;
15278 +typedef __u16 reiser4_plugin_id;
15279 +typedef __u64 reiser4_plugin_groups;
15280 +typedef struct item_plugin item_plugin;
15281 +typedef struct jnode_plugin jnode_plugin;
15282 +typedef struct reiser4_item_data reiser4_item_data;
15283 +typedef union reiser4_key reiser4_key;
15284 +typedef struct reiser4_tree reiser4_tree;
15285 +typedef struct carry_cut_data carry_cut_data;
15286 +typedef struct carry_kill_data carry_kill_data;
15287 +typedef struct carry_tree_op carry_tree_op;
15288 +typedef struct carry_tree_node carry_tree_node;
15289 +typedef struct carry_plugin_info carry_plugin_info;
15290 +typedef struct reiser4_journal reiser4_journal;
15291 +typedef struct txn_atom txn_atom;
15292 +typedef struct txn_handle txn_handle;
15293 +typedef struct txn_mgr txn_mgr;
15294 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15295 +typedef struct reiser4_context reiser4_context;
15296 +typedef struct carry_level carry_level;
15297 +typedef struct blocknr_set_entry blocknr_set_entry;
15298 +/* super_block->s_fs_info points to this */
15299 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15300 +/* next two objects are fields of reiser4_super_info_data */
15301 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15302 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15303 +
15304 +typedef struct flush_scan flush_scan;
15305 +typedef struct flush_position flush_pos_t;
15306 +
15307 +typedef unsigned short pos_in_node_t;
15308 +#define MAX_POS_IN_NODE 65535
15309 +
15310 +typedef struct jnode jnode;
15311 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15312 +
15313 +typedef struct uf_coord uf_coord_t;
15314 +typedef struct hint hint_t;
15315 +
15316 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15317 +
15318 +struct inode;
15319 +struct page;
15320 +struct file;
15321 +struct dentry;
15322 +struct super_block;
15323 +
15324 +/* return values of coord_by_key(). cbk == coord_by_key */
15325 +typedef enum {
15326 +       CBK_COORD_FOUND = 0,
15327 +       CBK_COORD_NOTFOUND = -ENOENT,
15328 +} lookup_result;
15329 +
15330 +/* results of lookup with directory file */
15331 +typedef enum {
15332 +       FILE_NAME_FOUND = 0,
15333 +       FILE_NAME_NOTFOUND = -ENOENT,
15334 +       FILE_IO_ERROR = -EIO,   /* FIXME: it seems silly to have special OOM,
15335 +                                  IO_ERROR return codes for each search. */
15336 +       FILE_OOM = -ENOMEM      /* FIXME: it seems silly to have special OOM,
15337 +                                  IO_ERROR return codes for each search. */
15338 +} file_lookup_result;
15339 +
15340 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
15341 +    both coincide. */
15342 +typedef enum {
15343 +       /* search exactly for the coord with key given */
15344 +       FIND_EXACT,
15345 +       /* search for coord with the maximal key not greater than one
15346 +          given */
15347 +       FIND_MAX_NOT_MORE_THAN  /*LEFT_SLANT_BIAS */
15348 +} lookup_bias;
15349 +
15350 +typedef enum {
15351 +       /* number of leaf level of the tree
15352 +          The fake root has (tree_level=0). */
15353 +       LEAF_LEVEL = 1,
15354 +
15355 +       /* number of level one above leaf level of the tree.
15356 +
15357 +          It is supposed that internal tree used by reiser4 to store file
15358 +          system data and meta data will have height 2 initially (when
15359 +          created by mkfs).
15360 +        */
15361 +       TWIG_LEVEL = 2,
15362 +} tree_level;
15363 +
15364 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15365 +   array, since the zero'th level is not used. */
15366 +#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15367 +
15368 +/* enumeration of possible mutual position of item and coord.  This enum is
15369 +    return type of ->is_in_item() item plugin method which see. */
15370 +typedef enum {
15371 +       /* coord is on the left of an item */
15372 +       IP_ON_THE_LEFT,
15373 +       /* coord is inside item */
15374 +       IP_INSIDE,
15375 +       /* coord is inside item, but to the right of the rightmost unit of
15376 +          this item */
15377 +       IP_RIGHT_EDGE,
15378 +       /* coord is on the right of an item */
15379 +       IP_ON_THE_RIGHT
15380 +} interposition;
15381 +
15382 +/* type of lock to acquire on znode before returning it to caller */
15383 +typedef enum {
15384 +       ZNODE_NO_LOCK = 0,
15385 +       ZNODE_READ_LOCK = 1,
15386 +       ZNODE_WRITE_LOCK = 2,
15387 +} znode_lock_mode;
15388 +
15389 +/* type of lock request */
15390 +typedef enum {
15391 +       ZNODE_LOCK_LOPRI = 0,
15392 +       ZNODE_LOCK_HIPRI = (1 << 0),
15393 +
15394 +       /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to
15395 +          longterm_lock_znode will not sleep waiting for the lock to become
15396 +          available.  If the lock is unavailable, reiser4_znode_lock will
15397 +          immediately return the value -E_REPEAT. */
15398 +       ZNODE_LOCK_NONBLOCK = (1 << 1),
15399 +       /* An option for longterm_lock_znode which prevents atom fusion */
15400 +       ZNODE_LOCK_DONT_FUSE = (1 << 2)
15401 +} znode_lock_request;
15402 +
15403 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15404 +
15405 +/* used to specify direction of shift. These must be -1 and 1 */
15406 +typedef enum {
15407 +       SHIFT_LEFT = 1,
15408 +       SHIFT_RIGHT = -1
15409 +} shift_direction;
15410 +
15411 +typedef enum {
15412 +       LEFT_SIDE,
15413 +       RIGHT_SIDE
15414 +} sideof;
15415 +
15416 +#define reiser4_round_up(value, order)                         \
15417 +       ((typeof(value))(((long) (value) + (order) - 1U) &      \
15418 +                        ~((order) - 1)))
15419 +
15420 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15421 +typedef enum {
15422 +       /* unit of internal item is moved */
15423 +       SUBTREE_MOVED = 0,
15424 +       /* nothing else can be squeezed into left neighbor */
15425 +       SQUEEZE_TARGET_FULL = 1,
15426 +       /* all content of node is squeezed into its left neighbor */
15427 +       SQUEEZE_SOURCE_EMPTY = 2,
15428 +       /* one more item is copied (this is only returned by
15429 +          allocate_and_copy_extent to squalloc_twig) */
15430 +       SQUEEZE_CONTINUE = 3
15431 +} squeeze_result;
15432 +
15433 +/* Do not change item ids. If you do, there will be a format change */
15434 +typedef enum {
15435 +       STATIC_STAT_DATA_ID = 0x0,
15436 +       SIMPLE_DIR_ENTRY_ID = 0x1,
15437 +       COMPOUND_DIR_ID = 0x2,
15438 +       NODE_POINTER_ID = 0x3,
15439 +       EXTENT_POINTER_ID = 0x5,
15440 +       FORMATTING_ID = 0x6,
15441 +       CTAIL_ID = 0x7,
15442 +       BLACK_BOX_ID = 0x8,
15443 +       LAST_ITEM_ID = 0x9
15444 +} item_id;
15445 +
15446 +/* Flags passed to jnode_flush() to allow it to distinguish default settings
15447 +   based on whether commit() was called or VM memory pressure was applied. */
15448 +typedef enum {
15449 +       /* submit flush queue to disk at jnode_flush completion */
15450 +       JNODE_FLUSH_WRITE_BLOCKS = 1,
15451 +
15452 +       /* flush is called for commit */
15453 +       JNODE_FLUSH_COMMIT = 2,
15454 +       /* not implemented */
15455 +       JNODE_FLUSH_MEMORY_FORMATTED = 4,
15456 +
15457 +       /* not implemented */
15458 +       JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15459 +} jnode_flush_flags;
15460 +
15461 +/* Flags to insert/paste carry operations. Currently they are only used in
15462 +   flushing code, but in the future, they can be used to optimize for repetitive
15463 +   accesses.  */
15464 +typedef enum {
15465 +       /* carry is not allowed to shift data to the left when trying to find
15466 +          free space  */
15467 +       COPI_DONT_SHIFT_LEFT = (1 << 0),
15468 +       /* carry is not allowed to shift data to the right when trying to find
15469 +          free space  */
15470 +       COPI_DONT_SHIFT_RIGHT = (1 << 1),
15471 +       /* carry is not allowed to allocate new node(s) when trying to find
15472 +          free space */
15473 +       COPI_DONT_ALLOCATE = (1 << 2),
15474 +       /* try to load left neighbor if it's not in a cache */
15475 +       COPI_LOAD_LEFT = (1 << 3),
15476 +       /* try to load right neighbor if it's not in a cache */
15477 +       COPI_LOAD_RIGHT = (1 << 4),
15478 +       /* shift insertion point to the left neighbor */
15479 +       COPI_GO_LEFT = (1 << 5),
15480 +       /* shift insertion point to the right neighbor */
15481 +       COPI_GO_RIGHT = (1 << 6),
15482 +       /* try to step back into original node if insertion into new node
15483 +          fails after shifting data there. */
15484 +       COPI_STEP_BACK = (1 << 7)
15485 +} cop_insert_flag;
15486 +
15487 +typedef enum {
15488 +       SAFE_UNLINK,            /* safe-link for unlink */
15489 +       SAFE_TRUNCATE           /* safe-link for truncate */
15490 +} reiser4_safe_link_t;
15491 +
15492 +/* this is to show on which list of atom jnode is */
15493 +typedef enum {
15494 +       NOT_CAPTURED,
15495 +       DIRTY_LIST,
15496 +       CLEAN_LIST,
15497 +       FQ_LIST,
15498 +       WB_LIST,
15499 +       OVRWR_LIST
15500 +} atom_list;
15501 +
15502 +/* __REISER4_FORWARD_H__ */
15503 +#endif
15504 +
15505 +/* Make Linus happy.
15506 +   Local variables:
15507 +   c-indentation-style: "K&R"
15508 +   mode-name: "LC"
15509 +   c-basic-offset: 8
15510 +   tab-width: 8
15511 +   fill-column: 120
15512 +   End:
15513 +*/
15514 diff -urN linux-2.6.35.orig/fs/reiser4/fsdata.c linux-2.6.35/fs/reiser4/fsdata.c
15515 --- linux-2.6.35.orig/fs/reiser4/fsdata.c       1970-01-01 01:00:00.000000000 +0100
15516 +++ linux-2.6.35/fs/reiser4/fsdata.c    2010-08-04 20:21:08.000000000 +0200
15517 @@ -0,0 +1,804 @@
15518 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15519 + * reiser4/README */
15520 +
15521 +#include "fsdata.h"
15522 +#include "inode.h"
15523 +
15524 +
15525 +/* cache of dir_cursors */
15526 +static struct kmem_cache *d_cursor_cache;
15527 +
15528 +/* list of unused cursors */
15529 +static LIST_HEAD(cursor_cache);
15530 +
15531 +/* number of cursors in list of unused cursors */
15532 +static unsigned long d_cursor_unused = 0;
15533 +
15534 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15535 +DEFINE_SPINLOCK(d_lock);
15536 +
15537 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15538 +static int file_is_stateless(struct file *file);
15539 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15540 +static void kill_cursor(dir_cursor *);
15541 +
15542 +/**
15543 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15544 + * @nr: number of objects to free
15545 + * @mask: GFP mask
15546 + *
15547 + * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
15548 + * number. Return number of still freeable cursors.
15549 + */
15550 +static int d_cursor_shrink(struct shrinker *shrink, int nr, gfp_t mask)
15551 +{
15552 +       if (nr != 0) {
15553 +               dir_cursor *scan;
15554 +               int killed;
15555 +
15556 +               killed = 0;
15557 +               spin_lock(&d_lock);
15558 +               while (!list_empty(&cursor_cache)) {
15559 +                       scan = list_entry(cursor_cache.next, dir_cursor, alist);
15560 +                       assert("nikita-3567", scan->ref == 0);
15561 +                       kill_cursor(scan);
15562 +                       ++killed;
15563 +                       --nr;
15564 +                       if (nr == 0)
15565 +                               break;
15566 +               }
15567 +               spin_unlock(&d_lock);
15568 +       }
15569 +       return d_cursor_unused;
15570 +}
15571 +
15572 +/*
15573 + * actually, d_cursors are "priceless", because there is no way to
15574 + * recover information stored in them. On the other hand, we don't
15575 + * want to consume all kernel memory by them. As a compromise, just
15576 + * assign higher "seeks" value to d_cursor cache, so that it will be
15577 + * shrunk only if system is really tight on memory.
15578 + */
15579 +static struct shrinker d_cursor_shrinker = {
15580 +       .shrink = d_cursor_shrink,
15581 +       .seeks = DEFAULT_SEEKS << 3,
15582 +};
15583 +
15584 +/**
15585 + * reiser4_init_d_cursor - create d_cursor cache
15586 + *
15587 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15588 + * initialization.
15589 + */
15590 +int reiser4_init_d_cursor(void)
15591 +{
15592 +       d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15593 +                                          SLAB_HWCACHE_ALIGN, NULL);
15594 +       if (d_cursor_cache == NULL)
15595 +               return RETERR(-ENOMEM);
15596 +
15597 +       register_shrinker(&d_cursor_shrinker);
15598 +       return 0;
15599 +}
15600 +
15601 +/**
15602 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15603 + *
15604 + * This is called on reiser4 module unloading or system shutdown.
15605 + */
15606 +void reiser4_done_d_cursor(void)
15607 +{
15608 +       unregister_shrinker(&d_cursor_shrinker);
15609 +
15610 +       destroy_reiser4_cache(&d_cursor_cache);
15611 +}
15612 +
15613 +#define D_CURSOR_TABLE_SIZE (256)
15614 +
15615 +static inline unsigned long
15616 +d_cursor_hash(d_cursor_hash_table * table, const struct d_cursor_key *key)
15617 +{
15618 +       assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15619 +       return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15620 +}
15621 +
15622 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
15623 +                             const struct d_cursor_key *k2)
15624 +{
15625 +       return k1->cid == k2->cid && k1->oid == k2->oid;
15626 +}
15627 +
15628 +/*
15629 + * define functions to manipulate reiser4 super block's hash table of
15630 + * dir_cursors
15631 + */
15632 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15633 +#define KFREE(ptr, size) kfree(ptr)
15634 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15635 +                     dir_cursor,
15636 +                     struct d_cursor_key,
15637 +                     key, hash, d_cursor_hash, d_cursor_eq);
15638 +#undef KFREE
15639 +#undef KMALLOC
15640 +
15641 +/**
15642 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15643 + * @super: super block to initialize
15644 + *
15645 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15646 + * of mount.
15647 + */
15648 +int reiser4_init_super_d_info(struct super_block *super)
15649 +{
15650 +       struct d_cursor_info *p;
15651 +
15652 +       p = &get_super_private(super)->d_info;
15653 +
15654 +       INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15655 +       return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15656 +}
15657 +
15658 +/**
15659 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15660 + * @super: super block being umounted
15661 + *
15662 + * It is called on umount. Kills all directory cursors attached to super block.
15663 + */
15664 +void reiser4_done_super_d_info(struct super_block *super)
15665 +{
15666 +       struct d_cursor_info *d_info;
15667 +       dir_cursor *cursor, *next;
15668 +
15669 +       d_info = &get_super_private(super)->d_info;
15670 +       for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15671 +               kill_cursor(cursor);
15672 +
15673 +       BUG_ON(d_info->tree.rnode != NULL);
15674 +       d_cursor_hash_done(&d_info->table);
15675 +}
15676 +
15677 +/**
15678 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15679 + * @cursor: cursor to free
15680 + *
15681 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15682 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15683 + * indices, hash table, list of unused cursors and frees it.
15684 + */
15685 +static void kill_cursor(dir_cursor *cursor)
15686 +{
15687 +       unsigned long index;
15688 +
15689 +       assert("nikita-3566", cursor->ref == 0);
15690 +       assert("nikita-3572", cursor->fsdata != NULL);
15691 +
15692 +       index = (unsigned long)cursor->key.oid;
15693 +       list_del_init(&cursor->fsdata->dir.linkage);
15694 +       free_fsdata(cursor->fsdata);
15695 +       cursor->fsdata = NULL;
15696 +
15697 +       if (list_empty_careful(&cursor->list))
15698 +               /* this is last cursor for a file. Kill radix-tree entry */
15699 +               radix_tree_delete(&cursor->info->tree, index);
15700 +       else {
15701 +               void **slot;
15702 +
15703 +               /*
15704 +                * there are other cursors for the same oid.
15705 +                */
15706 +
15707 +               /*
15708 +                * if radix tree points to the cursor being removed, re-target
15709 +                * radix tree slot to the next cursor in the (non-empty as was
15710 +                * checked above) element of the circular list of all cursors
15711 +                * for this oid.
15712 +                */
15713 +               slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15714 +               assert("nikita-3571", *slot != NULL);
15715 +               if (*slot == cursor)
15716 +                       *slot = list_entry(cursor->list.next, dir_cursor, list);
15717 +               /* remove cursor from circular list */
15718 +               list_del_init(&cursor->list);
15719 +       }
15720 +       /* remove cursor from the list of unused cursors */
15721 +       list_del_init(&cursor->alist);
15722 +       /* remove cursor from the hash table */
15723 +       d_cursor_hash_remove(&cursor->info->table, cursor);
15724 +       /* and free it */
15725 +       kmem_cache_free(d_cursor_cache, cursor);
15726 +       --d_cursor_unused;
15727 +}
15728 +
15729 +/* possible actions that can be performed on all cursors for the given file */
15730 +enum cursor_action {
15731 +       /*
15732 +        * load all detached state: this is called when stat-data is loaded
15733 +        * from the disk to recover information about all pending readdirs
15734 +        */
15735 +       CURSOR_LOAD,
15736 +       /*
15737 +        * detach all state from inode, leaving it in the cache. This is called
15738 +        * when inode is removed from the memory by memory pressure
15739 +        */
15740 +       CURSOR_DISPOSE,
15741 +       /*
15742 +        * detach cursors from the inode, and free them. This is called when
15743 +        * inode is destroyed
15744 +        */
15745 +       CURSOR_KILL
15746 +};
15747 +
15748 +/*
15749 + * return d_cursor data for the file system @inode is in.
15750 + */
15751 +static inline struct d_cursor_info *d_info(struct inode *inode)
15752 +{
15753 +       return &get_super_private(inode->i_sb)->d_info;
15754 +}
15755 +
15756 +/*
15757 + * lookup d_cursor in the per-super-block radix tree.
15758 + */
15759 +static inline dir_cursor *lookup(struct d_cursor_info *info,
15760 +                                unsigned long index)
15761 +{
15762 +       return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15763 +}
15764 +
15765 +/*
15766 + * attach @cursor to the radix tree. There may be multiple cursors for the
15767 + * same oid, they are chained into circular list.
15768 + */
15769 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15770 +{
15771 +       dir_cursor *head;
15772 +
15773 +       head = lookup(cursor->info, index);
15774 +       if (head == NULL) {
15775 +               /* this is the first cursor for this index */
15776 +               INIT_LIST_HEAD(&cursor->list);
15777 +               radix_tree_insert(&cursor->info->tree, index, cursor);
15778 +       } else {
15779 +               /* some cursor already exists. Chain ours */
15780 +               list_add(&cursor->list, &head->list);
15781 +       }
15782 +}
15783 +
15784 +/*
15785 + * detach fsdata (if detachable) from file descriptor, and put cursor on the
15786 + * "unused" list. Called when file descriptor is no longer in active use.
15787 + */
15788 +static void clean_fsdata(struct file *file)
15789 +{
15790 +       dir_cursor *cursor;
15791 +       reiser4_file_fsdata *fsdata;
15792 +
15793 +       assert("nikita-3570", file_is_stateless(file));
15794 +
15795 +       fsdata = (reiser4_file_fsdata *) file->private_data;
15796 +       if (fsdata != NULL) {
15797 +               cursor = fsdata->cursor;
15798 +               if (cursor != NULL) {
15799 +                       spin_lock(&d_lock);
15800 +                       --cursor->ref;
15801 +                       if (cursor->ref == 0) {
15802 +                               list_add_tail(&cursor->alist, &cursor_cache);
15803 +                               ++d_cursor_unused;
15804 +                       }
15805 +                       spin_unlock(&d_lock);
15806 +                       file->private_data = NULL;
15807 +               }
15808 +       }
15809 +}
15810 +
15811 +/*
15812 + * global counter used to generate "client ids". These ids are encoded into
15813 + * high bits of fpos.
15814 + */
15815 +static __u32 cid_counter = 0;
15816 +#define CID_SHIFT (20)
15817 +#define CID_MASK  (0xfffffull)
15818 +
15819 +static void free_file_fsdata_nolock(struct file *);
15820 +
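15821 +/**
15822 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15823 + * @cursor: caller-allocated cursor; zeroed and initialized here
15824 + * @file: file whose ->private_data is replaced with the new fsdata
15825 + * @inode: inode supplying the oid and the per-super-block d_cursor info
15826 + *
15827 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15828 + * reiser4 super block's hash table and radix tree, i.e. adds detachable
15829 + * readdir state to @file. NOTE(review): on failure neither @cursor nor the
15830 + * new fsdata is freed here - looks like a leak, confirm against the caller.
15831 + */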
15832 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15833 +                        struct inode *inode)
15834 +{
15835 +       int result;
15836 +       reiser4_file_fsdata *fsdata;
15837 +
15838 +       memset(cursor, 0, sizeof *cursor);
15839 +
15840 +       /* this is either first call to readdir, or rewind. Anyway, create new
15841 +        * cursor. */
15842 +       fsdata = create_fsdata(NULL);
15843 +       if (fsdata != NULL) {
15844 +               result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15845 +               if (result == 0) {
15846 +                       struct d_cursor_info *info;
15847 +                       oid_t oid;
15848 +
15849 +                       info = d_info(inode);
15850 +                       oid = get_inode_oid(inode);
15851 +                       /* cid occupies higher 12 bits of f->f_pos. Don't
15852 +                        * allow it to become negative: this confuses
15853 +                        * nfsd_readdir() */
15854 +                       cursor->key.cid = (++cid_counter) & 0x7ff;
15855 +                       cursor->key.oid = oid;
15856 +                       cursor->fsdata = fsdata;
15857 +                       cursor->info = info;
15858 +                       cursor->ref = 1;
15859 +
15860 +                       spin_lock_inode(inode);
15861 +                       /* install cursor as @f's private_data, discarding old
15862 +                        * one if necessary */
15863 +#if REISER4_DEBUG
15864 +                       if (file->private_data)
15865 +                               warning("", "file has fsdata already");
15866 +#endif
15867 +                       clean_fsdata(file);
15868 +                       free_file_fsdata_nolock(file);
15869 +                       file->private_data = fsdata;
15870 +                       fsdata->cursor = cursor;
15871 +                       spin_unlock_inode(inode);
15872 +                       spin_lock(&d_lock);
15873 +                       /* insert cursor into hash table */
15874 +                       d_cursor_hash_insert(&info->table, cursor);
15875 +                       /* and chain it into radix-tree */
15876 +                       bind_cursor(cursor, (unsigned long)oid);
15877 +                       spin_unlock(&d_lock);
15878 +                       radix_tree_preload_end();
15879 +                       file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15880 +               }
15881 +       } else
15882 +               result = RETERR(-ENOMEM);
15883 +       return result;
15884 +}
15885 +
15886 +/**
15887 + * process_cursors - do action on each cursor attached to inode
15888 + * @inode: inode whose cursors are processed
15889 + * @act: action to do
15890 + *
15891 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15892 + * and performs action specified by @act on each of cursors.
15893 + */
15894 +static void process_cursors(struct inode *inode, enum cursor_action act)
15895 +{
15896 +       oid_t oid;
15897 +       dir_cursor *start;
15898 +       struct list_head *head;
15899 +       reiser4_context *ctx;
15900 +       struct d_cursor_info *info;
15901 +
15902 +       /* this can be called by
15903 +        *
15904 +        * kswapd->...->prune_icache->..reiser4_destroy_inode
15905 +        *
15906 +        * without reiser4_context
15907 +        */
15908 +       ctx = reiser4_init_context(inode->i_sb);
15909 +       if (IS_ERR(ctx)) {
15910 +               warning("vs-23", "failed to init context");
15911 +               return;
15912 +       }
15913 +
15914 +       assert("nikita-3558", inode != NULL);
15915 +
15916 +       info = d_info(inode);
15917 +       oid = get_inode_oid(inode);
15918 +       spin_lock_inode(inode);
15919 +       head = get_readdir_list(inode);
15920 +       spin_lock(&d_lock);
15921 +       /* find any cursor for this oid: reference to it is hanging off radix
15922 +        * tree */
15923 +       start = lookup(info, (unsigned long)oid);
15924 +       if (start != NULL) {
15925 +               dir_cursor *scan;
15926 +               reiser4_file_fsdata *fsdata;
15927 +
15928 +               /* process circular list of cursors for this oid */
15929 +               scan = start;
15930 +               do {
15931 +                       dir_cursor *next;
15932 +
15933 +                       next = list_entry(scan->list.next, dir_cursor, list);
15934 +                       fsdata = scan->fsdata;
15935 +                       assert("nikita-3557", fsdata != NULL);
15936 +                       if (scan->key.oid == oid) {
15937 +                               switch (act) {
15938 +                               case CURSOR_DISPOSE:
15939 +                                       list_del_init(&fsdata->dir.linkage);
15940 +                                       break;
15941 +                               case CURSOR_LOAD:
15942 +                                       list_add(&fsdata->dir.linkage, head);
15943 +                                       break;
15944 +                               case CURSOR_KILL:
15945 +                                       kill_cursor(scan);
15946 +                                       break;
15947 +                               }
15948 +                       }
15949 +                       if (scan == next)
15950 +                               /* last cursor was just killed */
15951 +                               break;
15952 +                       scan = next;
15953 +               } while (scan != start);
15954 +       }
15955 +       spin_unlock(&d_lock);
15956 +       /* check that we killed 'em all */
15957 +       assert("nikita-3568",
15958 +              ergo(act == CURSOR_KILL,
15959 +                   list_empty_careful(get_readdir_list(inode))));
15960 +       assert("nikita-3569",
15961 +              ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15962 +       spin_unlock_inode(inode);
15963 +       reiser4_exit_context(ctx);
15964 +}
15965 +
15966 +/**
15967 + * reiser4_dispose_cursors - removes cursors from inode's list
15968 + * @inode: inode to dispose cursors of
15969 + *
15970 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15971 + * attached to cursor from inode's readdir list. This is called when inode is
15972 + * removed from the memory by memory pressure.
15973 + */
15974 +void reiser4_dispose_cursors(struct inode *inode)
15975 +{
15976 +       process_cursors(inode, CURSOR_DISPOSE);
15977 +}
15978 +
15979 +/**
15980 + * reiser4_load_cursors - attach cursors to inode
15981 + * @inode: inode to load cursors to
15982 + *
15983 + * For each of cursors corresponding to @inode - attaches reiser4_file_fsdata
15984 + * attached to cursor to inode's readdir list. This is done when inode is
15985 + * loaded into memory.
15986 + */
15987 +void reiser4_load_cursors(struct inode *inode)
15988 +{
15989 +       process_cursors(inode, CURSOR_LOAD);
15990 +}
15991 +
15992 +/**
15993 + * reiser4_kill_cursors - kill all inode cursors
15994 + * @inode: inode to kill cursors of
15995 + *
15996 + * Frees all cursors for this inode. This is called when inode is destroyed.
15997 + */
15998 +void reiser4_kill_cursors(struct inode *inode)
15999 +{
16000 +       process_cursors(inode, CURSOR_KILL);
16001 +}
16002 +
16003 +/**
16004 + * file_is_stateless - test the ->stateless flag of @file's dentry fsdata
16005 + * @file: file to check
16006 + *
16007 + * True if @file was created by the NFS server "on demand" to serve a single
16008 + * operation, so "detached state" may exist for the underlying inode.
16009 + * NOTE(review): an ERR_PTR from reiser4_get_dentry_fsdata() is not checked.
16010 + */
16011 +static int file_is_stateless(struct file *file)
16012 +{
16013 +       return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16014 +}
16015 +
16016 +/**
16017 + * reiser4_get_dir_fpos - get directory position with the client id stripped
16018 + * @dir: directory file whose ->f_pos is decoded
16019 + *
16020 + * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
16021 + * in the case of stateless directory operation (readdir-over-nfs), client id
16022 + * was encoded in the high bits of cookie and is masked off with CID_MASK.
16023 + */
16024 +loff_t reiser4_get_dir_fpos(struct file *dir)
16025 +{
16026 +       if (file_is_stateless(dir))
16027 +               return dir->f_pos & CID_MASK;
16028 +       else
16029 +               return dir->f_pos;
16030 +}
16031 +
16032 +/**
16033 + * reiser4_attach_fsdata - try to attach fsdata
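16034 + * @file: file to attach cursor's fsdata to; only stateless files are handled
16035 + * @inode: inode whose oid, together with the cid from f_pos, keys the cursor
16036 + *
16037 + * Finds or creates cursor for readdir-over-nfs. Returns 0 or negative errno.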
16038 + */
16039 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
16040 +{
16041 +       loff_t pos;
16042 +       int result;
16043 +       dir_cursor *cursor;
16044 +
16045 +       /*
16046 +        * we are serialized by inode->i_mutex
16047 +        */
16048 +       if (!file_is_stateless(file))
16049 +               return 0;
16050 +
16051 +       pos = file->f_pos;
16052 +       result = 0;
16053 +       if (pos == 0) {
16054 +               /*
16055 +                * first call to readdir (or rewind to the beginning of
16056 +                * directory)
16057 +                */
16058 +               cursor = kmem_cache_alloc(d_cursor_cache,
16059 +                                         reiser4_ctx_gfp_mask_get());
16060 +               if (cursor != NULL)
16061 +                       result = insert_cursor(cursor, file, inode);
16062 +               else
16063 +                       result = RETERR(-ENOMEM);
16064 +       } else {
16065 +               /* try to find existing cursor */
16066 +               struct d_cursor_key key;
16067 +
16068 +               key.cid = pos >> CID_SHIFT;
16069 +               key.oid = get_inode_oid(inode);
16070 +               spin_lock(&d_lock);
16071 +               cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16072 +               if (cursor != NULL) {
16073 +                       /* cursor was found */
16074 +                       if (cursor->ref == 0) {
16075 +                               /* move it from unused list */
16076 +                               list_del_init(&cursor->alist);
16077 +                               --d_cursor_unused;
16078 +                       }
16079 +                       ++cursor->ref;
16080 +               }
16081 +               spin_unlock(&d_lock);
16082 +               if (cursor != NULL) {
16083 +                       spin_lock_inode(inode);
16084 +                       assert("nikita-3556", cursor->fsdata->back == NULL);
16085 +                       clean_fsdata(file);
16086 +                       free_file_fsdata_nolock(file);
16087 +                       file->private_data = cursor->fsdata;
16088 +                       spin_unlock_inode(inode);
16089 +               }
16090 +       }
16091 +       return result;
16092 +}
16093 +
16094 +/**
16095 + * reiser4_detach_fsdata - detach cursor fsdata from a stateless file
16096 + * @file: file to detach fsdata from
16097 + *
16098 + * For a stateless @file, calls clean_fsdata() under the inode spin lock.
16099 + */
16100 +void reiser4_detach_fsdata(struct file *file)
16101 +{
16102 +       struct inode *inode;
16103 +
16104 +       if (!file_is_stateless(file))
16105 +               return;
16106 +
16107 +       inode = file->f_dentry->d_inode;
16108 +       spin_lock_inode(inode);
16109 +       clean_fsdata(file);
16110 +       spin_unlock_inode(inode);
16111 +}
16112 +
16113 +/* slab for reiser4_dentry_fsdata */
16114 +static struct kmem_cache *dentry_fsdata_cache;
16115 +
16116 +/**
16117 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
16118 + *
16119 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16120 + * part of reiser4 module initialization.
16121 + */
16122 +int reiser4_init_dentry_fsdata(void)
16123 +{
16124 +       dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16125 +                                          sizeof(struct reiser4_dentry_fsdata),
16126 +                                          0,
16127 +                                          SLAB_HWCACHE_ALIGN |
16128 +                                          SLAB_RECLAIM_ACCOUNT,
16129 +                                          NULL);
16130 +       if (dentry_fsdata_cache == NULL)
16131 +               return RETERR(-ENOMEM);
16132 +       return 0;
16133 +}
16134 +
16135 +/**
16136 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
16137 + *
16138 + * This is called on reiser4 module unloading or system shutdown.
16139 + */
16140 +void reiser4_done_dentry_fsdata(void)
16141 +{
16142 +       destroy_reiser4_cache(&dentry_fsdata_cache);
16143 +}
16144 +
16145 +/**
16146 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
16147 + * @dentry: queried dentry
16148 + *
16149 + * Allocates (zeroed) if necessary and returns per-dentry data attached to
16150 + * @dentry, or ERR_PTR(-ENOMEM) if the allocation fails.
16151 + */
16152 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16153 +{
16154 +       assert("nikita-1365", dentry != NULL);
16155 +
16156 +       if (dentry->d_fsdata == NULL) {
16157 +               dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16158 +                                                   reiser4_ctx_gfp_mask_get());
16159 +               if (dentry->d_fsdata == NULL)
16160 +                       return ERR_PTR(RETERR(-ENOMEM));
16161 +               memset(dentry->d_fsdata, 0,
16162 +                      sizeof(struct reiser4_dentry_fsdata));
16163 +       }
16164 +       return dentry->d_fsdata;
16165 +}
16166 +
16167 +/**
16168 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16169 + * @dentry: dentry to free fsdata of
16170 + *
16171 + * Detaches and frees fs-specific dentry data
16172 + */
16173 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16174 +{
16175 +       if (dentry->d_fsdata != NULL) {
16176 +               kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16177 +               dentry->d_fsdata = NULL;
16178 +       }
16179 +}
16180 +
16181 +/* slab for reiser4_file_fsdata */
16182 +static struct kmem_cache *file_fsdata_cache;
16183 +
16184 +/**
16185 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16186 + *
16187 + * Initializes slab cache of structures attached to file->private_data. It is
16188 + * part of reiser4 module initialization.
16189 + */
16190 +int reiser4_init_file_fsdata(void)
16191 +{
16192 +       file_fsdata_cache = kmem_cache_create("file_fsdata",
16193 +                                             sizeof(reiser4_file_fsdata),
16194 +                                             0,
16195 +                                             SLAB_HWCACHE_ALIGN |
16196 +                                             SLAB_RECLAIM_ACCOUNT, NULL);
16197 +       if (file_fsdata_cache == NULL)
16198 +               return RETERR(-ENOMEM);
16199 +       return 0;
16200 +}
16201 +
16202 +/**
16203 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16204 + *
16205 + * This is called on reiser4 module unloading or system shutdown.
16206 + */
16207 +void reiser4_done_file_fsdata(void)
16208 +{
16209 +       destroy_reiser4_cache(&file_fsdata_cache);
16210 +}
16211 +
16212 +/**
16213 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16214 + * @file: what to create file_fsdata for, may be NULL
16215 + *
16216 + * Allocates and initializes reiser4_file_fsdata structure.
16217 + */
16218 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16219 +{
16220 +       reiser4_file_fsdata *fsdata;
16221 +
16222 +       fsdata = kmem_cache_alloc(file_fsdata_cache,
16223 +                                 reiser4_ctx_gfp_mask_get());
16224 +       if (fsdata != NULL) {
16225 +               memset(fsdata, 0, sizeof *fsdata);
16226 +               fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16227 +               fsdata->back = file;
16228 +               INIT_LIST_HEAD(&fsdata->dir.linkage);
16229 +       }
16230 +       return fsdata;
16231 +}
16232 +
16233 +/**
16234 + * free_fsdata - free reiser4_file_fsdata
16235 + * @fsdata: object to free
16236 + *
16237 + * Dual to create_fsdata(). Free reiser4_file_fsdata.
16238 + */
16239 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16240 +{
16241 +       BUG_ON(fsdata == NULL);
16242 +       kmem_cache_free(file_fsdata_cache, fsdata);
16243 +}
16244 +
16245 +/**
16246 + * reiser4_get_file_fsdata - get fs-specific file data
16247 + * @file: queried file
16248 + *
16249 + * Returns fs-specific data of @file, allocating and attaching it first if it
16250 + * is NULL. Returns ERR_PTR(-ENOMEM) if that allocation fails.
16251 + */
16252 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16253 +{
16254 +       assert("nikita-1603", file != NULL);
16255 +
16256 +       if (file->private_data == NULL) {
16257 +               reiser4_file_fsdata *fsdata;
16258 +               struct inode *inode;
16259 +
16260 +               fsdata = create_fsdata(file);
16261 +               if (fsdata == NULL)
16262 +                       return ERR_PTR(RETERR(-ENOMEM));
16263 +
16264 +               inode = file->f_dentry->d_inode;
16265 +               spin_lock_inode(inode);
16266 +               if (file->private_data == NULL) {
16267 +                       file->private_data = fsdata;
16268 +                       fsdata = NULL;
16269 +               }
16270 +               spin_unlock_inode(inode);
16271 +               if (fsdata != NULL)
16272 +                       /* other thread initialized ->fsdata */
16273 +                       kmem_cache_free(file_fsdata_cache, fsdata);
16274 +       }
16275 +       assert("nikita-2665", file->private_data != NULL);
16276 +       return file->private_data;
16277 +}
16278 +
16279 +/**
16280 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16281 + * @file: file to detach fsdata from; its inode spin lock must be held
16282 + *
16283 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16284 + * readdir list, frees it if it is not linked to a d_cursor object.
16285 + */
16286 +static void free_file_fsdata_nolock(struct file *file)
16287 +{
16288 +       reiser4_file_fsdata *fsdata;
16289 +
16290 +       assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16291 +       fsdata = file->private_data;
16292 +       if (fsdata != NULL) {
16293 +               list_del_init(&fsdata->dir.linkage);
16294 +               if (fsdata->cursor == NULL)
16295 +                       free_fsdata(fsdata);
16296 +       }
16297 +       file->private_data = NULL;
16298 +}
16299 +
16300 +/**
16301 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16302 + * @file: file whose ->private_data is detached and freed
16303 + *
16304 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16305 + */
16306 +void reiser4_free_file_fsdata(struct file *file)
16307 +{
16308 +       spin_lock_inode(file->f_dentry->d_inode);
16309 +       free_file_fsdata_nolock(file);
16310 +       spin_unlock_inode(file->f_dentry->d_inode);
16311 +}
16312 +
16313 +/*
16314 + * Local variables:
16315 + * c-indentation-style: "K&R"
16316 + * mode-name: "LC"
16317 + * c-basic-offset: 8
16318 + * tab-width: 8
16319 + * fill-column: 79
16320 + * End:
16321 + */
16322 diff -urN linux-2.6.35.orig/fs/reiser4/fsdata.h linux-2.6.35/fs/reiser4/fsdata.h
16323 --- linux-2.6.35.orig/fs/reiser4/fsdata.h       1970-01-01 01:00:00.000000000 +0100
16324 +++ linux-2.6.35/fs/reiser4/fsdata.h    2010-08-04 15:44:57.000000000 +0200
16325 @@ -0,0 +1,205 @@
16326 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16327 + * reiser4/README */
16328 +
16329 +#if !defined(__REISER4_FSDATA_H__)
16330 +#define __REISER4_FSDATA_H__
16331 +
16332 +#include "debug.h"
16333 +#include "kassign.h"
16334 +#include "seal.h"
16335 +#include "type_safe_hash.h"
16336 +#include "plugin/file/file.h"
16337 +#include "readahead.h"
16338 +
16339 +/*
16340 + * comment about reiser4_dentry_fsdata
16341 + *
16342 + *
16343 + */
16344 +
16345 +/*
16346 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
16347 + * protected by ->i_mutex on inode. Under this lock following invariant
16348 + * holds:
16349 + *
16350 + *     file descriptor is "looking" at the entry_no-th directory entry from
16351 + *     the beginning of directory. This entry has key dir_entry_key and is
16352 + *     pos-th entry within the duplicate-key sequence.
16353 + *
16354 + */
16355 +
16356 +/* logical position within directory */
16357 +struct dir_pos {
16358 +       /* key of directory entry (actually, part of a key sufficient to
16359 +          identify directory entry)  */
16360 +       de_id dir_entry_key;
16361 +       /* ordinal number of directory entry among all entries with the same
16362 +          key. (Starting from 0.) */
16363 +       unsigned pos;
16364 +};
16365 +
16366 +struct readdir_pos {
16367 +       /* f_pos corresponding to this readdir position */
16368 +       __u64 fpos;
16369 +       /* logical position within directory */
16370 +       struct dir_pos position;
16371 +       /* logical number of directory entry within
16372 +          directory  */
16373 +       __u64 entry_no;
16374 +};
16375 +
16376 +/*
16377 + * this is used to speed up lookups for directory entry: on initial call to
16378 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16379 + * in struct dentry and reused later to avoid tree traversals.
16380 + */
16381 +struct de_location {
16382 +       /* seal covering directory entry */
16383 +       seal_t entry_seal;
16384 +       /* coord of directory entry */
16385 +       coord_t entry_coord;
16386 +       /* ordinal number of directory entry among all entries with the same
16387 +          key. (Starting from 0.) */
16388 +       int pos;
16389 +};
16390 +
16391 +/**
16392 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16393 + *
16394 + * This is allocated dynamically and released in d_op->d_release()
16395 + *
16396 + * Currently it only contains cached location (hint) of directory entry, but
16397 + * it is expected that other information will be accumulated here.
16398 + */
16399 +struct reiser4_dentry_fsdata {
16400 +       /*
16401 +        * here will go fields filled by ->lookup() to speedup next
16402 +        * create/unlink, like blocknr of znode with stat-data, or key of
16403 +        * stat-data.
16404 +        */
16405 +       struct de_location dec;
16406 +       int stateless;          /* created through reiser4_decode_fh, needs
16407 +                                * special treatment in readdir. */
16408 +};
16409 +
16410 +extern int reiser4_init_dentry_fsdata(void);
16411 +extern void reiser4_done_dentry_fsdata(void);
16412 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16413 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16414 +
16415 +/**
16416 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16417 + *
16418 + * This is allocated dynamically and released in inode->i_fop->release
16419 + */
16420 +typedef struct reiser4_file_fsdata {
16421 +       /*
16422 +        * pointer back to the struct file which this reiser4_file_fsdata is
16423 +        * part of
16424 +        */
16425 +       struct file *back;
16426 +       /* detached cursor for stateless readdir. */
16427 +       struct dir_cursor *cursor;
16428 +       /*
16429 +        * We need both directory and regular file parts here, because there
16430 +        * are file system objects that are files and directories.
16431 +        */
16432 +       struct {
16433 +               /*
16434 +                * position in directory. It is updated each time directory is
16435 +                * modified
16436 +                */
16437 +               struct readdir_pos readdir;
16438 +               /* head of this list is reiser4_inode->lists.readdir_list */
16439 +               struct list_head linkage;
16440 +       } dir;
16441 +       /* hints to speed up operations with regular files: read and write. */
16442 +       struct {
16443 +               hint_t hint;
16444 +       } reg;
16445 +       struct reiser4_file_ra_state ra1;
16446 +
16447 +} reiser4_file_fsdata;
16448 +
16449 +extern int reiser4_init_file_fsdata(void);
16450 +extern void reiser4_done_file_fsdata(void);
16451 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16452 +extern void reiser4_free_file_fsdata(struct file *);
16453 +
16454 +/*
16455 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16456 + * used to address problem reiser4 has with readdir accesses via NFS. See
16457 + * plugin/file_ops_readdir.c for more details.
16458 + */
16459 +struct d_cursor_key{
16460 +       __u16 cid;
16461 +       __u64 oid;
16462 +};
16463 +
16464 +/*
16465 + * define structures d_cursor_hash_table d_cursor_hash_link which are used to
16466 + * maintain hash table of dir_cursor-s in reiser4's super block
16467 + */
16468 +typedef struct dir_cursor dir_cursor;
16469 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16470 +
16471 +struct dir_cursor {
16472 +       int ref;
16473 +       reiser4_file_fsdata *fsdata;
16474 +
16475 +       /* link to reiser4 super block hash table of cursors */
16476 +       d_cursor_hash_link hash;
16477 +
16478 +       /*
16479 +        * this is to link cursors to reiser4 super block's radix tree of
16480 +        * cursors if there are more than one cursor of the same objectid
16481 +        */
16482 +       struct list_head list;
16483 +       struct d_cursor_key key;
16484 +       struct d_cursor_info *info;
16485 +       /* list of unused cursors */
16486 +       struct list_head alist;
16487 +};
16488 +
16489 +extern int reiser4_init_d_cursor(void);
16490 +extern void reiser4_done_d_cursor(void);
16491 +
16492 +extern int reiser4_init_super_d_info(struct super_block *);
16493 +extern void reiser4_done_super_d_info(struct super_block *);
16494 +
16495 +extern loff_t reiser4_get_dir_fpos(struct file *);
16496 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16497 +extern void reiser4_detach_fsdata(struct file *);
16498 +
16499 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16500 +   more details */
16501 +void reiser4_dispose_cursors(struct inode *inode);
16502 +void reiser4_load_cursors(struct inode *inode);
16503 +void reiser4_kill_cursors(struct inode *inode);
16504 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16505 +                            int offset, int adj);
16506 +
16507 +/*
16508 + * this structure is embedded in reiser4_super_info_data. It maintains d_cursors
16509 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16510 + */
16511 +struct d_cursor_info {
16512 +       d_cursor_hash_table table;
16513 +       struct radix_tree_root tree;
16514 +};
16515 +
16516 +/* spinlock protecting readdir cursors */
16517 +extern spinlock_t d_lock;
16518 +
16519 +/* __REISER4_FSDATA_H__ */
16520 +#endif
16521 +
16522 +/*
16523 + * Local variables:
16524 + * c-indentation-style: "K&R"
16525 + * mode-name: "LC"
16526 + * c-basic-offset: 8
16527 + * tab-width: 8
16528 + * fill-column: 120
16529 + * End:
16530 + */
16531 diff -urN linux-2.6.35.orig/fs/reiser4/init_super.c linux-2.6.35/fs/reiser4/init_super.c
16532 --- linux-2.6.35.orig/fs/reiser4/init_super.c   1970-01-01 01:00:00.000000000 +0100
16533 +++ linux-2.6.35/fs/reiser4/init_super.c        2010-08-04 15:44:57.000000000 +0200
16534 @@ -0,0 +1,761 @@
16535 +/* Copyright by Hans Reiser, 2003 */
16536 +
16537 +#include "super.h"
16538 +#include "inode.h"
16539 +#include "plugin/plugin_set.h"
16540 +
16541 +#include <linux/swap.h>
16542 +
16543 +/**
16544 + * reiser4_init_fs_info - allocate reiser4 specific super block
16545 + * @super: super block of filesystem
16546 + *
16547 + * Allocates and initializes reiser4_super_info_data, attaches it to
16548 + * super->s_fs_info, initializes structures maintaining d_cursor-s. Returns 0
16549 + * on success; on failure nothing stays attached to @super.
16550 + */
16551 +int reiser4_init_fs_info(struct super_block *super)
16552 +{
16553 +       reiser4_super_info_data *sbinfo;
16554 +       int result;
16555 +
16556 +       sbinfo = kzalloc(sizeof(*sbinfo), reiser4_ctx_gfp_mask_get());
16557 +       if (!sbinfo)
16558 +               return RETERR(-ENOMEM);
16559 +       super->s_fs_info = sbinfo;
16560 +       super->s_op = NULL;
16561 +       ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16562 +       ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16563 +       mutex_init(&sbinfo->delete_mutex);
16564 +       spin_lock_init(&(sbinfo->guard));
16565 +       /* per-super-block d_cursor resources; undo the attach on failure */
16566 +       result = reiser4_init_super_d_info(super);
16567 +       if (result != 0) {
16568 +               super->s_fs_info = NULL;
16569 +               kfree(sbinfo);
16570 +       }
16571 +       return result;
16572 +}
16573 +
16574 +/**
16575 + * reiser4_done_fs_info - free reiser4 specific super block
16576 + * @super: super block of filesystem
16577 + *
16578 + * Releases structures maintaining d_cursor-s, performs some sanity checks,
16579 + * frees reiser4_super_info_data and resets super->s_fs_info to NULL.
16580 + */
16581 +void reiser4_done_fs_info(struct super_block *super)
16582 +{
16583 +       assert("zam-990", super->s_fs_info != NULL);
16584 +
16585 +       /* release per-super-block d_cursor resources */
16586 +       reiser4_done_super_d_info(super);
16587 +
16588 +       /* no jnodes may be left and the current context must hold no atom */
16589 +       assert("", list_empty(&get_super_private(super)->all_jnodes));
16590 +       assert("", get_current_context()->trans->atom == NULL);
16591 +       reiser4_check_block_counters(super);
16592 +       kfree(super->s_fs_info);
16593 +       super->s_fs_info = NULL;
16594 +}
16595 +
16596 +/* type of option parseable by parse_option() */
16597 +typedef enum {
16598 +       /* value of option is arbitrary string */
16599 +       OPT_STRING,
16600 +
16601 +       /*
16602 +        * option specifies bit in a bitmask. When option is set - bit in
16603 +        * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes,
16604 +        * dont_load_bitmap, atomic_write, no_write_barrier.
16605 +        */
16606 +       OPT_BIT,
16607 +
16608 +       /*
16609 +        * value of option is parsed with a sscanf() format. Examples are
16610 +        * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16611 +        */
16612 +       OPT_FORMAT,
16613 +
16614 +       /*
16615 +        * option can take one of predefined values. Example is onerror=panic or
16616 +        * onerror=remount-ro
16617 +        */
16618 +       OPT_ONEOF,
16619 +} opt_type_t;
16620 +
16621 +#if 0
16622 +struct opt_bitmask_bit {
16623 +       const char *bit_name;
16624 +       int bit_nr;
16625 +};
16626 +#endif
16627 +
16628 +/* description of option parseable by parse_option() */
16629 +struct opt_desc {
16630 +       /* option name.
16631 +
16632 +          parsed portion of string has a form "name=value".
16633 +        */
16634 +       const char *name;
16635 +       /* type of option */
16636 +       opt_type_t type;
16637 +       union {
16638 +               /* where to store value of string option (type == OPT_STRING) */
16639 +               char **string;
16640 +               /* description of bits for bit option (type == OPT_BIT) */
16641 +               struct {
16642 +                       int nr;
16643 +                       void *addr;
16644 +               } bit;
16645 +               /* description of format and targets for format option (type
16646 +                  == OPT_FORMAT) */
16647 +               struct {
16648 +                       const char *format;
16649 +                       int nr_args;
16650 +                       void *arg1;
16651 +                       void *arg2;
16652 +                       void *arg3;
16653 +                       void *arg4;
16654 +               } f;
16655 +               struct {
16656 +                       int *result;
16657 +                       const char *list[10];
16658 +               } oneof;
16659 +               struct {
16660 +                       void *addr;
16661 +                       int nr_bits;
16662 +                       /* struct opt_bitmask_bit *bits; */
16663 +               } bitmask;
16664 +       } u;
16665 +};
16666 +
16667 +/**
16668 + * parse_option - parse one option
16669 + * @opt_string: starting point of parsing
16670 + * @opt: option description
16671 + *
16672 + * foo=bar,
16673 + * ^   ^  ^
16674 + * |   |  +-- replaced to '\0'
16675 + * |   +-- val_start
16676 + * +-- opt_string
16677 + * Figures out option type and handles option correspondingly.
16678 + */
16679 +static int parse_option(char *opt_string, struct opt_desc *opt)
16680 +{
16681 +       char *val_start;
16682 +       int result;
16683 +       const char *err_msg;
16684 +
16685 +       /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16686 +
16687 +       val_start = strchr(opt_string, '=');
16688 +       if (val_start != NULL) {
16689 +               *val_start = '\0';
16690 +               ++val_start;
16691 +       }
16692 +
16693 +       err_msg = NULL;
16694 +       result = 0;
16695 +       switch (opt->type) {
16696 +       case OPT_STRING:
16697 +               if (val_start == NULL) {
16698 +                       err_msg = "String arg missing";
16699 +                       result = RETERR(-EINVAL);
16700 +               } else
16701 +                       *opt->u.string = val_start;
16702 +               break;
16703 +       case OPT_BIT:
16704 +               if (val_start != NULL)
16705 +                       err_msg = "Value ignored";
16706 +               else
16707 +                       set_bit(opt->u.bit.nr, opt->u.bit.addr);
16708 +               break;
16709 +       case OPT_FORMAT:
16710 +               if (val_start == NULL) {
16711 +                       err_msg = "Formatted arg missing";
16712 +                       result = RETERR(-EINVAL);
16713 +                       break;
16714 +               }
16715 +               if (sscanf(val_start, opt->u.f.format,
16716 +                          opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16717 +                          opt->u.f.arg4) != opt->u.f.nr_args) {
16718 +                       err_msg = "Wrong conversion";
16719 +                       result = RETERR(-EINVAL);
16720 +               }
16721 +               break;
16722 +       case OPT_ONEOF:
16723 +               {
16724 +                       int i = 0;
16725 +
16726 +                       if (val_start == NULL) {
16727 +                               err_msg = "Value is missing";
16728 +                               result = RETERR(-EINVAL);
16729 +                               break;
16730 +                       }
16731 +                       err_msg = "Wrong option value";
16732 +                       result = RETERR(-EINVAL);
16733 +                       while (opt->u.oneof.list[i]) {
16734 +                               if (!strcmp(opt->u.oneof.list[i], val_start)) {
16735 +                                       result = 0;
16736 +                                       err_msg = NULL;
16737 +                                       *opt->u.oneof.result = i;
16738 +                                       break;
16739 +                               }
16740 +                               i++;
16741 +                       }
16742 +                       break;
16743 +               }
16744 +       default:
16745 +               wrong_return_value("nikita-2100", "opt -> type");
16746 +               break;
16747 +       }
16748 +       if (err_msg != NULL) {
16749 +               warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16750 +                       err_msg, opt->name, val_start ? "=" : "",
16751 +                       val_start ? : "");
16752 +       }
16753 +       return result;
16754 +}
16755 +
16756 +/**
16757 + * parse_options - parse reiser4 mount options
16758 + * @opt_string: starting point
16759 + * @opts: array of option description
16760 + * @nr_opts: number of elements in @opts
16761 + *
16762 + * Parses comma separated list of reiser4 mount options.
16763 + */
16764 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16765 +{
16766 +       int result;
16767 +
16768 +       result = 0;
16769 +       while ((result == 0) && opt_string && *opt_string) {
16770 +               int j;
16771 +               char *next;
16772 +
16773 +               next = strchr(opt_string, ',');
16774 +               if (next != NULL) {
16775 +                       *next = '\0';
16776 +                       ++next;
16777 +               }
16778 +               for (j = 0; j < nr_opts; ++j) {
16779 +                       if (!strncmp(opt_string, opts[j].name,
16780 +                                    strlen(opts[j].name))) {
16781 +                               result = parse_option(opt_string, &opts[j]);
16782 +                               break;
16783 +                       }
16784 +               }
16785 +               if (j == nr_opts) {
16786 +                       warning("nikita-2307", "Unrecognized option: \"%s\"",
16787 +                               opt_string);
16788 +                       /* traditionally, -EINVAL is returned on wrong mount
16789 +                          option */
16790 +                       result = RETERR(-EINVAL);
16791 +               }
16792 +               opt_string = next;
16793 +       }
16794 +       return result;
16795 +}
16796 +
16797 +#define NUM_OPT(label, fmt, addr)                              \
16798 +               {                                               \
16799 +                       .name = (label),                        \
16800 +                       .type = OPT_FORMAT,                     \
16801 +                       .u = {                                  \
16802 +                               .f = {                          \
16803 +                                       .format  = (fmt),       \
16804 +                                       .nr_args = 1,           \
16805 +                                       .arg1 = (addr),         \
16806 +                                       .arg2 = NULL,           \
16807 +                                       .arg3 = NULL,           \
16808 +                                       .arg4 = NULL            \
16809 +                               }                               \
16810 +                       }                                       \
16811 +               }
16812 +
16813 +#define SB_FIELD_OPT(field, fmt) NUM_OPT(#field, fmt, &sbinfo->field)
16814 +
16815 +#define BIT_OPT(label, bitnr)                                  \
16816 +       {                                                       \
16817 +               .name = label,                                  \
16818 +               .type = OPT_BIT,                                \
16819 +               .u = {                                          \
16820 +                       .bit = {                                \
16821 +                               .nr = bitnr,                    \
16822 +                               .addr = &sbinfo->fs_flags       \
16823 +                       }                                       \
16824 +               }                                               \
16825 +       }
16826 +
16827 +#define MAX_NR_OPTIONS (30)
16828 +
16829 +#if REISER4_DEBUG /* refuse to store past the end of the option array */
16830 +#  define OPT_ARRAY_CHECK(opt, array)                                  \
16831 +       if ((opt) >= (array) + MAX_NR_OPTIONS) {                        \
16832 +               warning("zam-1046", "opt array is overloaded"); break;  \
16833 +       }
16834 +#else /* !REISER4_DEBUG: no bounds check is performed */
16835 +#   define OPT_ARRAY_CHECK(opt, array) noop
16836 +#endif
16837 +
16838 +#define PUSH_OPT(opt, array, ...)              \
16839 +do {                                           \
16840 +       struct opt_desc o = __VA_ARGS__;        \
16841 +       OPT_ARRAY_CHECK(opt, array);            \
16842 +       *(opt) ++ = o;                          \
16843 +} while (0)
16844 +
16845 +static noinline void push_sb_field_opts(struct opt_desc **p,
16846 +                                       struct opt_desc *opts,
16847 +                                       reiser4_super_info_data *sbinfo)
16848 +{
16849 +#define PUSH_SB_FIELD_OPT(field, format)               \
16850 +       PUSH_OPT(*p, opts, SB_FIELD_OPT(field, format))
16851 +       /*
16852 +        * tmgr.atom_max_size=N
16853 +        * Atoms containing more than N blocks will be forced to commit. N is
16854 +        * decimal.
16855 +        */
16856 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16857 +       /*
16858 +        * tmgr.atom_max_age=N
16859 +        * Atoms older than N seconds will be forced to commit. N is decimal.
16860 +        */
16861 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16862 +       /*
16863 +        * tmgr.atom_min_size=N
16864 +        * In committing an atom to free dirty pages, force the atom less than
16865 +        * N in size to fuse with another one.
16866 +        */
16867 +       PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16868 +       /*
16869 +        * tmgr.atom_max_flushers=N
16870 +        * limit of concurrent flushers for one atom. 0 means no limit.
16871 +        */
16872 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16873 +       /*
16874 +        * tree.cbk_cache.nr_slots=N
16875 +        * Number of slots in the cbk cache.
16876 +        */
16877 +       PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16878 +       /*
16879 +        * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16880 +        * leaf-level blocks it will force them to be relocated.
16881 +        */
16882 +       PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16883 +       /*
16884 +        * If flush can find a block allocation closer than at most
16885 +        * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
16886 +        * position.
16887 +        */
16888 +       PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16889 +       /*
16890 +        * If we have written this much or more blocks before encountering busy
16891 +        * jnode in flush list - abort flushing hoping that next time we get
16892 +        * called this jnode will be clean already, and we will save some
16893 +        * seeks.
16894 +        */
16895 +       PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16896 +       /* The maximum number of nodes to scan left on a level during flush. */
16897 +       PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16898 +       /* preferred IO size */
16899 +       PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16900 +       /* carry flags used for insertion of new nodes */
16901 +       PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16902 +       /* carry flags used for insertion of new extents */
16903 +       PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16904 +       /* carry flags used for paste operations */
16905 +       PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16906 +       /* carry flags used for insert operations */
16907 +       PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16908 +
16909 +#ifdef CONFIG_REISER4_BADBLOCKS
16910 +       /*
16911 +        * Alternative master superblock location in case its original
16912 +        * location is not writeable/accessible. This is offset in BYTES.
16913 +        */
16914 +       PUSH_SB_FIELD_OPT(altsuper, "%lu");
16915 +#endif
16916 +}
16917 +
16918 +/**
16919 + * reiser4_init_super_data - initialize reiser4 private super block
16920 + * @super: super block to initialize
16921 + * @opt_string: list of reiser4 mount options
16922 + *
16923 + * Sets reiser4 parameters to defaults, then lets mount options override them.
16924 + * Returns 0, or an error (-ENOMEM, -EINVAL via RETERR()) on failure.
16925 + */
16926 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
16927 +{
16928 +       int result;
16929 +       struct opt_desc *opts, *p;
16930 +       reiser4_super_info_data *sbinfo = get_super_private(super);
16931 +
16932 +       /* initialize super, export, dentry operations */
16933 +       sbinfo->ops.super = reiser4_super_operations;
16934 +       sbinfo->ops.export = reiser4_export_operations;
16935 +       sbinfo->ops.dentry = reiser4_dentry_operations;
16936 +       super->s_op = &sbinfo->ops.super;
16937 +       super->s_export_op = &sbinfo->ops.export;
16938 +
16939 +       /* initialize transaction manager parameters to default values */
16940 +       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16941 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16942 +       sbinfo->tmgr.atom_min_size = 256;
16943 +       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16944 +
16945 +       /* initialize cbk cache parameter */
16946 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16947 +
16948 +       /* initialize flush parameters */
16949 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16950 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16951 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16952 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16953 +
16954 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16955 +
16956 +       /* preliminary tree initializations */
16957 +       sbinfo->tree.super = super;
16958 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16959 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16960 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16961 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16962 +       rwlock_init(&(sbinfo->tree.tree_lock));
16963 +       spin_lock_init(&(sbinfo->tree.epoch_lock));
16964 +
16965 +       /* initialize default readahead params */
16966 +       sbinfo->ra_params.max = num_physpages / 4;
16967 +       sbinfo->ra_params.flags = 0;
16968 +
16969 +       /* allocate memory for structure describing reiser4 mount options */
16970 +       opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16971 +                      reiser4_ctx_gfp_mask_get());
16972 +       if (opts == NULL)
16973 +               return RETERR(-ENOMEM);
16974 +
16975 +       /* initialize structure describing reiser4 mount options */
16976 +       p = opts;
16977 +
16978 +       push_sb_field_opts(&p, opts, sbinfo);
16979 +       /* flag options: each one sets a bit in sbinfo->fs_flags */
16980 +
16981 +#define PUSH_BIT_OPT(name, bit)                        \
16982 +       PUSH_OPT(p, opts, BIT_OPT(name, bit))
16983 +       /* turn on BSD-style gid assignment */
16984 +       PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16985 +       /* turn on 32 bit times */
16986 +       PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16987 +       /*
16988 +        * Don't load all bitmap blocks at mount time, it is useful for
16989 +        * machines with tiny RAM and large disks.
16990 +        */
16991 +       PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16992 +       /* disable transaction commits during write() */
16993 +       PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16994 +       /* disable use of write barriers in the reiser4 log writer. */
16995 +       PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16996 +
16997 +       PUSH_OPT(p, opts,
16998 +       {
16999 +               /*
17000 +                * tree traversal readahead parameters:
17001 +                * -o readahead=MAXNUM:FLAGS
17002 +                * MAXNUM - max number of nodes to request readahead for: -1UL
17003 +                * will set it to max_sane_readahead()
17004 +                * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17005 +                * CONTINUE_ON_PRESENT
17006 +                */
17007 +               .name = "readahead",
17008 +               .type = OPT_FORMAT,
17009 +               .u = {
17010 +                       .f = {
17011 +                               .format = "%u:%u",
17012 +                               .nr_args = 2,
17013 +                               .arg1 = &sbinfo->ra_params.max,
17014 +                               .arg2 = &sbinfo->ra_params.flags,
17015 +                               .arg3 = NULL,
17016 +                               .arg4 = NULL
17017 +                       }
17018 +               }
17019 +       }
17020 +       );
17021 +
17022 +       /* What to do in case of fs error */
17023 +       PUSH_OPT(p, opts,
17024 +       {
17025 +               .name = "onerror",
17026 +               .type = OPT_ONEOF,
17027 +               .u = {
17028 +                       .oneof = {
17029 +                               .result = &sbinfo->onerror,
17030 +                               .list = {
17031 +                                       "panic", "remount-ro", NULL
17032 +                               },
17033 +                       }
17034 +               }
17035 +       }
17036 +       );
17037 +
17038 +       /* modify default settings to values set by mount options */
17039 +       result = parse_options(opt_string, opts, p - opts);
17040 +       kfree(opts);
17041 +       if (result != 0)
17042 +               return result;
17043 +
17044 +       /* scale atom_max_age (given in seconds) by HZ and sanity-check it */
17045 +       sbinfo->tmgr.atom_max_age *= HZ;
17046 +       if (sbinfo->tmgr.atom_max_age <= 0)
17047 +               /* overflow or zero: fall back to the default */
17048 +               sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17049 +
17050 +       /* round optimal io size down to a multiple of 1 << VFS_BLKSIZE_BITS */
17051 +       sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17052 +       sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17053 +       if (sbinfo->optimal_io_size == 0) {
17054 +               warning("nikita-2497", "optimal_io_size is too small");
17055 +               return RETERR(-EINVAL);
17056 +       }
17057 +       return result;
17058 +}
17059 +
17060 +/**
17061 + * reiser4_init_read_super - read reiser4 master super block
17062 + * @super: super block to fill
17063 + * @silent: if 0 - print warnings
17064 + *
17065 + * Reads reiser4 master super block either from predefined location or from
17066 + * location specified by altsuper mount option, initializes disk format plugin.
17067 + */
17068 +int reiser4_init_read_super(struct super_block *super, int silent)
17069 +{
17070 +       struct buffer_head *super_bh;
17071 +       struct reiser4_master_sb *master_sb;
17072 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17073 +       unsigned long blocksize;
17074 +
17075 + read_super_block:
17076 +#ifdef CONFIG_REISER4_BADBLOCKS
17077 +       if (sbinfo->altsuper)
17078 +               /*
17079 +                * read reiser4 master super block at position specified by
17080 +                * mount option
17081 +                */
17082 +               super_bh = sb_bread(super,
17083 +                                   (sector_t)(sbinfo->altsuper / super->s_blocksize));
17084 +       else
17085 +#endif
17086 +               /* read reiser4 master super block at 16-th 4096 block */
17087 +               super_bh = sb_bread(super,
17088 +                                   (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17089 +       if (!super_bh)
17090 +               return RETERR(-EIO);
17091 +
17092 +       master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17093 +       /* check reiser4 magic string */
17094 +       if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17095 +                    sizeof(REISER4_SUPER_MAGIC_STRING))) {
17096 +               /* reiser4 master super block contains filesystem blocksize */
17097 +               blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17098 +
17099 +               if (blocksize != PAGE_CACHE_SIZE) {
17100 +                       /*
17101 +                        * currently reiser4's blocksize must be equal to
17102 +                        * pagesize
17103 +                        */
17104 +                       if (!silent)
17105 +                               warning("nikita-2609",
17106 +                                       "%s: wrong block size %lu\n", super->s_id,
17107 +                                       blocksize);
17108 +                       brelse(super_bh);
17109 +                       return RETERR(-EINVAL);
17110 +               }
17111 +               if (blocksize != super->s_blocksize) {
17112 +                       /*
17113 +                        * filesystem uses different blocksize. Reread master
17114 +                        * super block with correct blocksize
17115 +                        */
17116 +                       brelse(super_bh);
17117 +                       if (!sb_set_blocksize(super, (int)blocksize))
17118 +                               return RETERR(-EINVAL);
17119 +                       goto read_super_block;
17120 +               }
17121 +
17122 +               sbinfo->df_plug =
17123 +                       disk_format_plugin_by_id(
17124 +                               le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17125 +               if (sbinfo->df_plug == NULL) {
17126 +                       if (!silent)
17127 +                               warning("nikita-26091",
17128 +                                       "%s: unknown disk format plugin %d\n",
17129 +                                       super->s_id,
17130 +                                       le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17131 +                       brelse(super_bh);
17132 +                       return RETERR(-EINVAL);
17133 +               }
17134 +               sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17135 +               brelse(super_bh);
17136 +               return 0;
17137 +       }
17138 +
17139 +       /* there is no reiser4 on the device */
17140 +       if (!silent)
17141 +               warning("nikita-2608",
17142 +                       "%s: wrong master super block magic", super->s_id);
17143 +       brelse(super_bh);
17144 +       return RETERR(-EINVAL);
17145 +}
17146 +
17147 +static struct {
17148 +       reiser4_plugin_type type; /* plugin type of the default */
17149 +       reiser4_plugin_id id; /* id of the default plugin within @type */
17150 +} default_plugins[PSET_LAST] = { /* indexed by plugin set member */
17151 +       [PSET_FILE] = {
17152 +               .type = REISER4_FILE_PLUGIN_TYPE,
17153 +               .id = UNIX_FILE_PLUGIN_ID
17154 +       },
17155 +       [PSET_DIR] = {
17156 +               .type = REISER4_DIR_PLUGIN_TYPE,
17157 +               .id = HASHED_DIR_PLUGIN_ID
17158 +       },
17159 +       [PSET_HASH] = {
17160 +               .type = REISER4_HASH_PLUGIN_TYPE,
17161 +               .id = R5_HASH_ID
17162 +       },
17163 +       [PSET_FIBRATION] = {
17164 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
17165 +               .id = FIBRATION_DOT_O
17166 +       },
17167 +       [PSET_PERM] = {
17168 +               .type = REISER4_PERM_PLUGIN_TYPE,
17169 +               .id = NULL_PERM_ID
17170 +       },
17171 +       [PSET_FORMATTING] = {
17172 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
17173 +               .id = SMALL_FILE_FORMATTING_ID
17174 +       },
17175 +       [PSET_SD] = {
17176 +               .type = REISER4_ITEM_PLUGIN_TYPE,
17177 +               .id = STATIC_STAT_DATA_ID
17178 +       },
17179 +       [PSET_DIR_ITEM] = {
17180 +               .type = REISER4_ITEM_PLUGIN_TYPE,
17181 +               .id = COMPOUND_DIR_ID
17182 +       },
17183 +       [PSET_CIPHER] = {
17184 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
17185 +               .id = NONE_CIPHER_ID
17186 +       },
17187 +       [PSET_DIGEST] = {
17188 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
17189 +               .id = SHA256_32_DIGEST_ID
17190 +       },
17191 +       [PSET_COMPRESSION] = {
17192 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17193 +               .id = LZO1_COMPRESSION_ID
17194 +       },
17195 +       [PSET_COMPRESSION_MODE] = {
17196 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17197 +               .id = CONVX_COMPRESSION_MODE_ID
17198 +       },
17199 +       [PSET_CLUSTER] = {
17200 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
17201 +               .id = CLUSTER_64K_ID
17202 +       },
17203 +       [PSET_CREATE] = { /* same (type, id) pair as PSET_FILE above */
17204 +               .type = REISER4_FILE_PLUGIN_TYPE,
17205 +               .id = UNIX_FILE_PLUGIN_ID
17206 +       }
17207 +};
17208 +
17209 +/* return default plugin of pset member @memb, as listed in default_plugins[] */
17210 +reiser4_plugin *get_default_plugin(pset_member memb)
17211 +{
17212 +       return plugin_by_id(default_plugins[memb].type,
17213 +                           default_plugins[memb].id);
17214 +}
17215 +
17216 +/**
17217 + * reiser4_init_root_inode - obtain inode of root directory
17218 + * @super: super block of filesystem
17219 + *
17220 + * Obtains inode of root directory (reading it from disk) and initializes its
17221 + * plugin set if it was not initialized. Returns 0 on success, error otherwise.
17222 + */
17223 +int reiser4_init_root_inode(struct super_block *super)
17224 +{
17225 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17226 +       struct inode *inode;
17227 +       int result = 0;
17228 +
17229 +       inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17230 +       if (IS_ERR(inode))
17231 +               return RETERR(PTR_ERR(inode));
17232 +
17233 +       super->s_root = d_alloc_root(inode);
17234 +       if (!super->s_root) {
17235 +               iput(inode);
17236 +               return RETERR(-ENOMEM);
17237 +       }
17238 +
17239 +       super->s_root->d_op = &sbinfo->ops.dentry;
17240 +
17241 +       if (!is_inode_loaded(inode)) {
17242 +               pset_member memb;
17243 +               plugin_set *pset;
17244 +
17245 +               pset = reiser4_inode_data(inode)->pset;
17246 +               for (memb = 0; memb < PSET_LAST; ++memb) {
17247 +                       /* fill in only the members that are not set yet */
17248 +                       if (aset_get(pset, memb) != NULL)
17249 +                               continue;
17250 +
17251 +                       result = grab_plugin_pset(inode, NULL, memb);
17252 +                       if (result != 0)
17253 +                               break;
17254 +
17255 +                       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17256 +               }
17257 +
17258 +               if (result == 0) {
17259 +                       if (REISER4_DEBUG) {
17260 +                               for (memb = 0; memb < PSET_LAST; ++memb)
17261 +                                       assert("nikita-3500",
17262 +                                              aset_get(pset, memb) != NULL);
17263 +                       }
17264 +               } else
17265 +                       warning("nikita-3448", "Cannot set plugins of root: %i",
17266 +                               result);
17267 +               reiser4_iget_complete(inode);
17268 +
17269 +               /* As the default pset kept in the root dir may have been
17270 +                  changed (length is unknown), call update_sd. Skip it if
17271 +                  setting the plugins failed, so that error is not lost. */
17272 +               if (result == 0 &&
17273 +                   !reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17274 +                       result = reiser4_grab_space(
17275 +                               inode_file_plugin(inode)->estimate.update(inode),
17276 +                               BA_CAN_COMMIT);
17277 +                       if (result == 0)
17278 +                               result = reiser4_update_sd(inode);
17279 +                       all_grabbed2free();
17280 +               }
17281 +       }
17282 +
17283 +       super->s_maxbytes = MAX_LFS_FILESIZE;
17284 +       return result;
17285 +}
17286 +
17287 +/*
17288 + * Local variables:
17289 + * c-indentation-style: "K&R"
17290 + * mode-name: "LC"
17291 + * c-basic-offset: 8
17292 + * tab-width: 8
17293 + * fill-column: 79
17294 + * End:
17295 + */
17296 diff -urN linux-2.6.35.orig/fs/reiser4/inode.c linux-2.6.35/fs/reiser4/inode.c
17297 --- linux-2.6.35.orig/fs/reiser4/inode.c        1970-01-01 01:00:00.000000000 +0100
17298 +++ linux-2.6.35/fs/reiser4/inode.c     2010-08-04 15:44:57.000000000 +0200
17299 @@ -0,0 +1,711 @@
17300 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
17301 +   reiser4/README */
17302 +
17303 +/* Inode specific operations. */
17304 +
17305 +#include "forward.h"
17306 +#include "debug.h"
17307 +#include "key.h"
17308 +#include "kassign.h"
17309 +#include "coord.h"
17310 +#include "seal.h"
17311 +#include "dscale.h"
17312 +#include "plugin/item/item.h"
17313 +#include "plugin/security/perm.h"
17314 +#include "plugin/plugin.h"
17315 +#include "plugin/object.h"
17316 +#include "znode.h"
17317 +#include "vfs_ops.h"
17318 +#include "inode.h"
17319 +#include "super.h"
17320 +#include "reiser4.h"
17321 +
17322 +#include <linux/fs.h>          /* for struct super_block,  address_space */
17323 +
17324 +/* return the reiser4 internal tree of the super block @inode belongs to */
17325 +/* Audited by: green(2002.06.17) */
17326 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode/* inode queried*/)
17327 +{
17328 +       assert("nikita-256", inode != NULL);
17329 +       assert("nikita-257", inode->i_sb != NULL);
17330 +       return reiser4_get_tree(inode->i_sb);
17331 +}
17332 +
17333 +/* return pointer to the reiser4-specific flags word of @inode */
17334 +static inline unsigned long *inode_flags(const struct inode *const inode)
17335 +{
17336 +       assert("nikita-2842", inode != NULL);
17337 +       return &reiser4_inode_data(inode)->flags;
17338 +}
17339 +
17340 +/* set reiser4-specific flag @f in @inode; @f is used as a bit number */
17341 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17342 +{
17343 +       assert("nikita-2248", inode != NULL);
17344 +       set_bit((int)f, inode_flags(inode));
17345 +}
17346 +
17347 +/* clear reiser4-specific flag @f in @inode; @f is used as a bit number */
17348 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17349 +{
17350 +       assert("nikita-2250", inode != NULL);
17351 +       clear_bit((int)f, inode_flags(inode));
17352 +}
17353 +
17354 +/* non-zero if reiser4-specific flag (bit number) @f is set in @inode */
17355 +int reiser4_inode_get_flag(const struct inode *inode,
17356 +                          reiser4_file_plugin_flags f)
17357 +{
17358 +       assert("nikita-2251", inode != NULL);
17359 +       return test_bit((int)f, inode_flags(inode));
17360 +}
17361 +
17362 +/* convert oid to inode number: a plain cast of @oid to ino_t */
17363 +ino_t oid_to_ino(oid_t oid)
17364 +{
17365 +       return (ino_t) oid;
17366 +}
17367 +
17368 +/* convert oid to user visible inode number */
17369 +ino_t oid_to_uino(oid_t oid)
17370 +{
17371 +       /* A reiser4 object is uniquely identified by its oid, which is a 64 bit
17372 +          quantity. Kernel in-memory inode is indexed (in the hash table) by
17373 +          32 bit i_ino field, but this is not a problem, because there is a
17374 +          way to further distinguish inodes with identical inode numbers
17375 +          (reiser4_inode_find_actor() supplied to iget5_locked()).
17376 +
17377 +          But user space expects a unique 32 bit inode number. Obviously this
17378 +          is impossible. The work-around is to somehow hash the oid into a
17379 +          user visible inode number.
17380 +        */
17381 +       oid_t max_ino = (ino_t) ~0;
17382 +
17383 +       if (REISER4_INO_IS_OID || (oid <= max_ino))
17384 +               return oid;
17385 +       else
17386 +               /* this is remotely similar to the algorithm used to find the
17387 +                  next pid for a process: after wrap-around start from some
17388 +                  offset rather than from 0. Idea is that there are some long
17389 +                  living objects with which we don't want to collide.
17390 +                */
17391 +               return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17392 +}
17393 +
17394 +/* true if @inode is not NULL and its super block is a reiser4 super block */
17395 +int is_reiser4_inode(const struct inode *inode/* inode queried */)
17396 +{
17397 +       return inode != NULL && is_reiser4_super(inode->i_sb);
17398 +}
17399 +
17400 +/* Maximal length of a name that can be stored in directory @inode.
17401 +   Asks ->max_name_len() of the directory item plugin if set, else returns 255.
17402 +   This is used in checks during file creation and lookup. */
17403 +int reiser4_max_filename_len(const struct inode *inode/* inode queried */)
17404 +{
17405 +       assert("nikita-287", is_reiser4_inode(inode));
17406 +       assert("nikita-1710", inode_dir_item_plugin(inode));
17407 +       if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17408 +               return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17409 +       else
17410 +               return 255;
17411 +}
17412 +
17413 +#if REISER4_USE_COLLISION_LIMIT
17414 +/* Maximal number of hash collisions for directory @dir, as recorded in the
17415 +   ->plugin.max_collisions field of its reiser4 inode data. */
17416 +int max_hash_collisions(const struct inode *dir/* inode queried */)
17417 +{
17418 +       assert("nikita-1711", dir != NULL);
17419 +       return reiser4_inode_data(dir)->plugin.max_collisions;
17420 +}
17421 +
17422 +/* Install file, inode, and address_space operations on @inode, depending on
17423 +   its mode. Returns 0, or -EINVAL (after making @inode bad) on unknown mode. */
17424 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17425 +                   reiser4_object_create_data * data   /* parameters to create
17426 +                                                        * object, may be NULL */ )
17427 +{
17428 +       reiser4_super_info_data *sinfo;
17429 +       file_plugin *fplug;
17430 +       dir_plugin *dplug;
17431 +
17432 +       fplug = inode_file_plugin(inode);
17433 +       dplug = inode_dir_plugin(inode);
17434 +
17435 +       sinfo = get_super_private(inode->i_sb);
17436 +
17437 +       switch (inode->i_mode & S_IFMT) {
17438 +       case S_IFSOCK:
17439 +       case S_IFBLK:
17440 +       case S_IFCHR:
17441 +       case S_IFIFO:
17442 +               {
17443 +                       dev_t rdev;     /* to keep gcc happy */
17444 +
17445 +                       assert("vs-46", fplug != NULL);
17446 +                       /* ugly hack with rdev: without @data take it from inode */
17447 +                       if (data == NULL) {
17448 +                               rdev = inode->i_rdev;
17449 +                               inode->i_rdev = 0;
17450 +                       } else
17451 +                               rdev = data->rdev;
17452 +                       inode->i_blocks = 0;
17453 +                       assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17454 +                       inode->i_op = file_plugins[fplug->h.id].inode_ops;
17455 +                       /* initialize inode->i_fop and inode->i_rdev for block
17456 +                          and char devices */
17457 +                       init_special_inode(inode, inode->i_mode, rdev);
17458 +                       /* all address space operations are null */
17459 +                       inode->i_mapping->a_ops =
17460 +                           file_plugins[fplug->h.id].as_ops;
17461 +                       break;
17462 +               }
17463 +       case S_IFLNK:
17464 +               assert("vs-46", fplug != NULL);
17465 +               assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17466 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
17467 +               inode->i_fop = NULL;
17468 +               /* all address space operations are null */
17469 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17470 +               break;
17471 +       case S_IFDIR:
17472 +               assert("vs-46", dplug != NULL);
17473 +               assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17474 +                                dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17475 +               inode->i_op = dir_plugins[dplug->h.id].inode_ops;
17476 +               inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17477 +               inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17478 +               break;
17479 +       case S_IFREG:
17480 +               assert("vs-46", fplug != NULL);
17481 +               assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17482 +                                fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17483 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
17484 +               inode->i_fop = file_plugins[fplug->h.id].file_ops;
17485 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17486 +               break;
17487 +       default:
17488 +               warning("nikita-291", "wrong file mode: %o for %llu",
17489 +                       inode->i_mode,
17490 +                       (unsigned long long)get_inode_oid(inode));
17491 +               reiser4_make_bad_inode(inode);
17492 +               return RETERR(-EINVAL);
17493 +       }
17494 +       return 0;
17495 +}
17496 +
17497 +/* Initialize inode from the stat-data item at @coord. Called with inode
17498 +   locked. Return inode locked. Returns 0 or the first error encountered. */
17499 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17500 +                     coord_t *coord/* coord of stat data */)
17501 +{
17502 +       int result;
17503 +       item_plugin *iplug;
17504 +       void *body;
17505 +       int length;
17506 +       reiser4_inode *state;
17507 +
17508 +       assert("nikita-292", coord != NULL);
17509 +       assert("nikita-293", inode != NULL);
17510 +
17511 +       coord_clear_iplug(coord);
17512 +       result = zload(coord->node);
17513 +       if (result)
17514 +               return result;
17515 +       iplug = item_plugin_by_coord(coord);
17516 +       body = item_body_by_coord(coord);
17517 +       length = item_length_by_coord(coord);
17518 +
17519 +       assert("nikita-295", iplug != NULL);
17520 +       assert("nikita-296", body != NULL);
17521 +       assert("nikita-297", length > 0);
17522 +
17523 +       /* inode is under I_LOCK now */
17524 +
17525 +       state = reiser4_inode_data(inode);
17526 +       /* call stat-data plugin method to load sd content into inode */
17527 +       result = iplug->s.sd.init_inode(inode, body, length);
17528 +       set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug)); /* NOTE(review): result is not checked */
17529 +       if (result == 0) {
17530 +               result = setup_inode_ops(inode, NULL);
17531 +               if (result == 0 && inode->i_sb->s_root &&
17532 +                   inode->i_sb->s_root->d_inode)
17533 +                       result = finish_pset(inode); /* only once root dentry and inode exist */
17534 +       }
17535 +       zrelse(coord->node);
17536 +       return result;
17537 +}
17538 +
17539 +/* read `inode' from the disk. This is what was previously in
17540 +   reiserfs_read_inode2().
17541 +   On any failure the inode is marked bad via reiser4_make_bad_inode().
17542 +   Must be called with inode locked. Return inode still locked.
17543 +*/
17544 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17545 +                     const reiser4_key * key /* key of stat data */ ,
17546 +                     int silent)
17547 +{
17548 +       int result;
17549 +       lock_handle lh;
17550 +       reiser4_inode *info;
17551 +       coord_t coord;
17552 +
17553 +       assert("nikita-298", inode != NULL);
17554 +       assert("nikita-1945", !is_inode_loaded(inode));
17555 +
17556 +       info = reiser4_inode_data(inode);
17557 +       assert("nikita-300", info->locality_id != 0);
17558 +
17559 +       coord_init_zero(&coord);
17560 +       init_lh(&lh);
17561 +       /* locate stat-data in a tree and return znode locked */
17562 +       result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17563 +       assert("nikita-301", !is_inode_loaded(inode));
17564 +       if (result == 0) {
17565 +               /* use stat-data plugin to load sd into inode. */
17566 +               result = init_inode(inode, &coord);
17567 +               if (result == 0) {
17568 +                       /* initialize stat-data seal and remember sd coord */
17569 +                       spin_lock_inode(inode);
17570 +                       reiser4_seal_init(&info->sd_seal, &coord, key);
17571 +                       info->sd_coord = coord;
17572 +                       spin_unlock_inode(inode);
17573 +
17574 +                       /* call file plugin's method (if any) to initialize
17575 +                        * plugin specific part of inode */
17576 +                       if (inode_file_plugin(inode)->init_inode_data)
17577 +                               inode_file_plugin(inode)->init_inode_data(inode,
17578 +                                                                         NULL,
17579 +                                                                         0);
17580 +                       /* load detached directory cursors for stateless
17581 +                        * directory readers (NFS). */
17582 +                       reiser4_load_cursors(inode);
17583 +
17584 +                       /* Check the opened inode for consistency. */
17585 +                       result =
17586 +                           get_super_private(inode->i_sb)->df_plug->
17587 +                           check_open(inode);
17588 +               }
17589 +       }
17590 +       /* lookup_sd() doesn't release coord because we want znode
17591 +          stay read-locked while stat-data fields are accessed in
17592 +          init_inode() */
17593 +       done_lh(&lh);
17594 +
17595 +       if (result != 0)
17596 +               reiser4_make_bad_inode(inode);
17597 +       return result;
17598 +}
17599 +
17600 +/* initialise new reiser4 inode being inserted into hash table: store objectid and locality of the key; returns 0. */
17601 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17602 +                            void *opaque       /* key of stat data passed to
17603 +                                               * the iget5_locked as cookie */)
17604 +{
17605 +       reiser4_key *key;
17606 +
17607 +       assert("nikita-1995", inode != NULL);
17608 +       assert("nikita-1996", opaque != NULL);
17609 +       key = opaque;
17610 +       set_inode_oid(inode, get_key_objectid(key));
17611 +       reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17612 +       return 0;
17613 +}
17614 +
17615 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to
17616 +   iget5_locked().
17617 +
17618 +   This function is called by iget5_locked() to distinguish reiser4 inodes
17619 +   having the same inode numbers. Such inodes can only exist due to some error
17620 +   condition. One of them should be bad. Inodes with identical inode numbers
17621 +   (objectids) are distinguished by their packing locality.
17622 +   Returns non-zero when both objectid and locality of @inode match the key.
17623 +*/
17624 +static int reiser4_inode_find_actor(struct inode *inode        /* inode from hash table
17625 +                                                        * to check */ ,
17626 +                                   void *opaque        /* "cookie" passed to
17627 +                                                        * iget5_locked(). This
17628 +                                                        * is stat-data key */)
17629 +{
17630 +       reiser4_key *key;
17631 +
17632 +       key = opaque;
17633 +       return
17634 +           /* oid is unique, so first term is enough, actually. */
17635 +           get_inode_oid(inode) == get_key_objectid(key) &&
17636 +           /*
17637 +            * also, locality should be checked, but locality is stored in
17638 +            * the reiser4-specific part of the inode, and actor can be
17639 +            * called against arbitrary inode that happened to be in this
17640 +            * hash chain. Hence we first have to check that this is
17641 +            * reiser4 inode at least. is_reiser4_inode() is probably too
17642 +            * early to call, as inode may have ->i_op not yet
17643 +            * initialised.
17644 +            */
17645 +           is_reiser4_super(inode->i_sb) &&
17646 +           /*
17647 +            * usually objectid is unique, but pseudo files use counter to
17648 +            * generate objectid. All pseudo files are placed into special
17649 +            * (otherwise unused) locality.
17650 +            */
17651 +           reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17652 +}
17653 +
17654 +/* hook for kmem_cache_create: initialise the ->loading mutex of @info */
17655 +void loading_init_once(reiser4_inode * info)
17656 +{
17657 +       mutex_init(&info->loading);
17658 +}
17659 +
17660 +/* for reiser4_alloc_inode: only asserts that ->loading mutex is not locked */
17661 +void loading_alloc(reiser4_inode * info)
17662 +{
17663 +       assert("vs-1717", !mutex_is_locked(&info->loading));
17664 +}
17665 +
17666 +/* for reiser4_destroy: only asserts that ->loading mutex is not locked */
17667 +void loading_destroy(reiser4_inode * info)
17668 +{
17669 +       assert("vs-1717a", !mutex_is_locked(&info->loading));
17670 +}
17671 +
17672 +static void loading_begin(reiser4_inode * info)
17673 +{
17674 +       mutex_lock(&info->loading); /* taken around inode loading; dropped by loading_end() */
17675 +}
17676 +
17677 +static void loading_end(reiser4_inode * info)
17678 +{
17679 +       mutex_unlock(&info->loading); /* pairs with mutex_lock() in loading_begin() */
17680 +}
17681 +
17682 +/**
17683 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17684 + * @super: super block of filesystem
17685 + * @key: key of inode's stat-data
17686 + * @silent: passed down to read_inode() and, from there, to lookup_sd()
17687 + *
17688 + * This is our helper function a la iget(). This is called by
17689 + * lookup_common() and reiser4_read_super(). Returns the inode or an ERR_PTR();
17690 + * a not yet loaded inode needs reiser4_iget_complete() to drop ->loading.
17691 + */
17692 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17693 +                          int silent)
17694 +{
17695 +       struct inode *inode;
17696 +       int result;
17697 +       reiser4_inode *info;
17698 +
17699 +       assert("nikita-302", super != NULL);
17700 +       assert("nikita-303", key != NULL);
17701 +
17702 +       result = 0;
17703 +
17704 +       /* call iget5_locked(). This will either find inode in cache or
17705 +          return a new inode set up only by init_locked_inode() */
17706 +       inode = iget5_locked(super,
17707 +                            (unsigned long)get_key_objectid(key),
17708 +                            reiser4_inode_find_actor,
17709 +                            init_locked_inode, (reiser4_key *) key);
17710 +       if (inode == NULL)
17711 +               return ERR_PTR(RETERR(-ENOMEM));
17712 +       if (is_bad_inode(inode)) {
17713 +               warning("nikita-304", "Bad inode found");
17714 +               reiser4_print_key("key", key);
17715 +               iput(inode);
17716 +               return ERR_PTR(RETERR(-EIO));
17717 +       }
17718 +
17719 +       info = reiser4_inode_data(inode);
17720 +
17721 +       /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17722 +          loaded and initialized inode from just allocated inode. If
17723 +          REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17724 +          info->loading.  The place in reiser4 which uses not initialized inode
17725 +          is the reiser4 repacker, see repacker-related functions in
17726 +          plugin/item/extent.c */
17727 +       if (!is_inode_loaded(inode)) {
17728 +               loading_begin(info);
17729 +               if (!is_inode_loaded(inode)) {
17730 +                       /* locking: iget5_locked returns locked inode */
17731 +                       assert("nikita-1941", !is_inode_loaded(inode));
17732 +                       assert("nikita-1949",
17733 +                              reiser4_inode_find_actor(inode,
17734 +                                                       (reiser4_key *) key));
17735 +                       /* now, inode has objectid as ->i_ino and locality in
17736 +                          reiser4-specific part. This is enough for
17737 +                          read_inode() to read stat data from the disk */
17738 +                       result = read_inode(inode, key, silent);
17739 +               } else
17740 +                       loading_end(info); /* loaded by someone else meanwhile */
17741 +       }
17742 +
17743 +       if (inode->i_state & I_NEW)
17744 +               unlock_new_inode(inode);
17745 +
17746 +       if (is_bad_inode(inode)) {
17747 +               assert("vs-1717", result != 0);
17748 +               loading_end(info);
17749 +               iput(inode);
17750 +               inode = ERR_PTR(result);
17751 +       } else if (REISER4_DEBUG) {
17752 +               reiser4_key found_key;
17753 +
17754 +               assert("vs-1717", result == 0);
17755 +               build_sd_key(inode, &found_key);
17756 +               if (!keyeq(&found_key, key)) {
17757 +                       warning("nikita-305", "Wrong key in sd");
17758 +                       reiser4_print_key("sought for", key);
17759 +                       reiser4_print_key("found", &found_key);
17760 +               }
17761 +               if (inode->i_nlink == 0) {
17762 +                       warning("nikita-3559", "Unlinked inode found: %llu\n",
17763 +                               (unsigned long long)get_inode_oid(inode));
17764 +               }
17765 +       }
17766 +       return inode;
17767 +}
17768 +
17769 +/* reiser4_iget() may return not fully initialized inode, this function should
17770 + * be called after one completes reiser4 inode initializing. If @inode is not loaded yet, it sets REISER4_LOADED and unlocks ->loading. */
17771 +void reiser4_iget_complete(struct inode *inode)
17772 +{
17773 +       assert("zam-988", is_reiser4_inode(inode));
17774 +
17775 +       if (!is_inode_loaded(inode)) {
17776 +               reiser4_inode_set_flag(inode, REISER4_LOADED);
17777 +               loading_end(reiser4_inode_data(inode));
17778 +       }
17779 +}
17780 +
17781 +void reiser4_make_bad_inode(struct inode *inode)
17782 +{
17783 +       assert("nikita-1934", inode != NULL);
17784 +
17785 +       /* clear LOADED bit, then mark the VFS inode bad */
17786 +       reiser4_inode_clr_flag(inode, REISER4_LOADED);
17787 +       make_bad_inode(inode);
17788 +       return;
17789 +}
17790 +
17791 +file_plugin *inode_file_plugin(const struct inode *inode)
17792 +{
17793 +       assert("nikita-1997", inode != NULL);
17794 +       return reiser4_inode_data(inode)->pset->file; /* file member of the plugin set */
17795 +}
17796 +
17797 +dir_plugin *inode_dir_plugin(const struct inode *inode)
17798 +{
17799 +       assert("nikita-1998", inode != NULL);
17800 +       return reiser4_inode_data(inode)->pset->dir; /* dir member of the plugin set */
17801 +}
17802 +
17803 +formatting_plugin *inode_formatting_plugin(const struct inode *inode)
17804 +{
17805 +       assert("nikita-2000", inode != NULL);
17806 +       return reiser4_inode_data(inode)->pset->formatting; /* formatting member of the plugin set */
17807 +}
17808 +
17809 +hash_plugin *inode_hash_plugin(const struct inode *inode)
17810 +{
17811 +       assert("nikita-2001", inode != NULL);
17812 +       return reiser4_inode_data(inode)->pset->hash; /* hash member of the plugin set */
17813 +}
17814 +
17815 +fibration_plugin *inode_fibration_plugin(const struct inode *inode)
17816 +{
17817 +       assert("nikita-2001", inode != NULL); /* NOTE(review): same label as in inode_hash_plugin() */
17818 +       return reiser4_inode_data(inode)->pset->fibration; /* fibration member of the plugin set */
17819 +}
17820 +
17821 +cipher_plugin *inode_cipher_plugin(const struct inode *inode)
17822 +{
17823 +       assert("edward-36", inode != NULL);
17824 +       return reiser4_inode_data(inode)->pset->cipher; /* cipher member of the plugin set */
17825 +}
17826 +
17827 +compression_plugin *inode_compression_plugin(const struct inode *inode)
17828 +{
17829 +       assert("edward-37", inode != NULL);
17830 +       return reiser4_inode_data(inode)->pset->compression; /* compression member of the plugin set */
17831 +}
17832 +
17833 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17834 +                                                      inode)
17835 +{
17836 +       assert("edward-1330", inode != NULL);
17837 +       return reiser4_inode_data(inode)->pset->compression_mode; /* compression_mode member of the plugin set */
17838 +}
17839 +
17840 +cluster_plugin *inode_cluster_plugin(const struct inode *inode)
17841 +{
17842 +       assert("edward-1328", inode != NULL);
17843 +       return reiser4_inode_data(inode)->pset->cluster; /* cluster member of the plugin set */
17844 +}
17845 +
17846 +file_plugin *inode_create_plugin(const struct inode *inode)
17847 +{
17848 +       assert("edward-1329", inode != NULL);
17849 +       return reiser4_inode_data(inode)->pset->create; /* create member of the plugin set */
17850 +}
17851 +
17852 +digest_plugin *inode_digest_plugin(const struct inode *inode)
17853 +{
17854 +       assert("edward-86", inode != NULL);
17855 +       return reiser4_inode_data(inode)->pset->digest; /* digest member of the plugin set */
17856 +}
17857 +
17858 +item_plugin *inode_sd_plugin(const struct inode *inode)
17859 +{
17860 +       assert("vs-534", inode != NULL);
17861 +       return reiser4_inode_data(inode)->pset->sd; /* sd (stat-data item) member of the plugin set */
17862 +}
17863 +
17864 +item_plugin *inode_dir_item_plugin(const struct inode *inode)
17865 +{
17866 +       assert("vs-534", inode != NULL); /* NOTE(review): same label as in inode_sd_plugin() */
17867 +       return reiser4_inode_data(inode)->pset->dir_item; /* dir_item member of the plugin set */
17868 +}
17869 +
17870 +file_plugin *child_create_plugin(const struct inode *inode)
17871 +{
17872 +       assert("edward-1329", inode != NULL); /* NOTE(review): same label as in inode_create_plugin() */
17873 +       return reiser4_inode_data(inode)->hset->create; /* note: read from ->hset, not ->pset */
17874 +}
17875 +
17876 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
17877 +{
17878 +       reiser4_inode *state;
17879 +
17880 +       assert("nikita-2716", inode != NULL);
17881 +       assert("nikita-2717", ext < LAST_SD_EXTENSION);
17882 +       assert("nikita-3491", spin_inode_is_locked(inode)); /* caller must hold the inode spin lock */
17883 +
17884 +       state = reiser4_inode_data(inode);
17885 +       state->extmask |= 1 << ext; /* set bit @ext in the extension mask */
17886 +       /* force re-calculation of stat-data length on next call to
17887 +          update_sd(). */
17888 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17889 +}
17890 +
17891 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
17892 +{
17893 +       reiser4_inode *state;
17894 +
17895 +       assert("vpf-1926", inode != NULL);
17896 +       assert("vpf-1927", ext < LAST_SD_EXTENSION);
17897 +       assert("vpf-1928", spin_inode_is_locked(inode)); /* caller must hold the inode spin lock */
17898 +
17899 +       state = reiser4_inode_data(inode);
17900 +       state->extmask &= ~(1 << ext); /* clear bit @ext in the extension mask */
17901 +       /* force re-calculation of stat-data length on next call to
17902 +          update_sd(). */
17903 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17904 +}
17905 +
17906 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
17907 +{
17908 +       assert("edward-1287", inode != NULL);
17909 +       if (!dscale_fit(old, new)) /* @old and @new do not "fit" per dscale_fit() */
17910 +               reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17911 +       return;
17912 +}
17913 +
17914 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
17915 +{
17916 +       assert("nikita-2875", inode != NULL);
17917 +       spin_lock_inode(inode); /* locked wrapper around inode_check_scale_nolock() */
17918 +       inode_check_scale_nolock(inode, old, new);
17919 +       spin_unlock_inode(inode);
17920 +}
17921 +
17922 +/*
17923 + * initialize ->ordering field of inode. This field defines how file stat-data
17924 + * and body are ordered within a tree with respect to other objects within the
17925 + * same parent directory.
17926 + */
17927 +void
17928 +init_inode_ordering(struct inode *inode,
17929 +                   reiser4_object_create_data * crd, int create)
17930 +{
17931 +       reiser4_key key;
17932 +
17933 +       if (create) {
17934 +               struct inode *parent;
17935 +
17936 +               parent = crd->parent; /* on create, take ordering from the entry key built by the parent's dir plugin */
17937 +               assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17938 +               inode_dir_plugin(parent)->build_entry_key(parent,
17939 +                                                         &crd->dentry->d_name,
17940 +                                                         &key);
17941 +       } else {
17942 +               coord_t *coord;
17943 +
17944 +               coord = &reiser4_inode_data(inode)->sd_coord; /* otherwise use the key of the item at ->sd_coord */
17945 +               coord_clear_iplug(coord);
17946 +               /* safe to use ->sd_coord, because node is under long term
17947 +                * lock */
17948 +               WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17949 +       }
17950 +
17951 +       set_inode_ordering(inode, get_key_ordering(&key));
17952 +}
17953 +
17954 +znode *inode_get_vroot(struct inode *inode)
17955 +{
17956 +       reiser4_block_nr blk;
17957 +       znode *result;
17958 +
17959 +       spin_lock_inode(inode);
17960 +       blk = reiser4_inode_data(inode)->vroot;
17961 +       spin_unlock_inode(inode);
17962 +       if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17963 +               result = zlook(reiser4_tree_by_inode(inode), &blk);
17964 +       else
17965 +               result = NULL;
17966 +       return result;
17967 +}
17968 +
17969 +void inode_set_vroot(struct inode *inode, znode *vroot)
17970 +{
17971 +       spin_lock_inode(inode);
17972 +       reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17973 +       spin_unlock_inode(inode);
17974 +}
17975 +
17976 +#if REISER4_DEBUG
17977 +
17978 +void reiser4_inode_invariant(const struct inode *inode)
17979 +{
17980 +       assert("nikita-3077", spin_inode_is_locked(inode));
17981 +}
17982 +
17983 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
17984 +{
17985 +       return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17986 +               r4_inode->nr_jnodes == 0;
17987 +}
17988 +
17989 +#endif
17990 +
17991 +/* returns 0 if directory is empty (only dot and dotdot), -ENOTEMPTY otherwise */
17992 +/* FIXME: shouldn't it be dir plugin method? */
17993 +int is_dir_empty(const struct inode *dir)
17994 +{
17995 +       assert("nikita-1976", dir != NULL);
17996 +
17997 +       /* rely on our method to maintain directory i_size being equal to the
17998 +          number of entries. */
17999 +       return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18000 +}
18001 +
18002 +/* Make Linus happy.
18003 +   Local variables:
18004 +   c-indentation-style: "K&R"
18005 +   mode-name: "LC"
18006 +   c-basic-offset: 8
18007 +   tab-width: 8
18008 +   fill-column: 120
18009 +   End:
18010 +*/
18011 diff -urN linux-2.6.35.orig/fs/reiser4/inode.h linux-2.6.35/fs/reiser4/inode.h
18012 --- linux-2.6.35.orig/fs/reiser4/inode.h        1970-01-01 01:00:00.000000000 +0100
18013 +++ linux-2.6.35/fs/reiser4/inode.h     2010-08-04 15:44:57.000000000 +0200
18014 @@ -0,0 +1,453 @@
18015 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18016 +   reiser4/README */
18017 +
18018 +/* Inode functions. */
18019 +
18020 +#if !defined(__REISER4_INODE_H__)
18021 +#define __REISER4_INODE_H__
18022 +
18023 +#include "forward.h"
18024 +#include "debug.h"
18025 +#include "key.h"
18026 +#include "seal.h"
18027 +#include "plugin/plugin.h"
18028 +#include "plugin/file/cryptcompress.h"
18029 +#include "plugin/file/file.h"
18030 +#include "plugin/dir/dir.h"
18031 +#include "plugin/plugin_set.h"
18032 +#include "plugin/security/perm.h"
18033 +#include "vfs_ops.h"
18034 +#include "jnode.h"
18035 +#include "fsdata.h"
18036 +
18037 +#include <linux/types.h>       /* for __u?? , ino_t */
18038 +#include <linux/fs.h>          /* for struct super_block, struct
18039 +                                * rw_semaphore, etc  */
18040 +#include <linux/spinlock.h>
18041 +#include <asm/types.h>
18042 +
18043 +/* reiser4-specific inode flags. They are "transient" and are not
18044 +   supposed to be stored on disk. Used to trace "state" of
18045 +   inode
18046 +*/
18047 +typedef enum {
18048 +       /* this is light-weight inode, inheriting some state from its
18049 +          parent  */
18050 +       REISER4_LIGHT_WEIGHT = 0,
18051 +       /* stat data wasn't yet created */
18052 +       REISER4_NO_SD = 1,
18053 +       /* internal immutable flag. Currently is only used
18054 +          to avoid race condition during file creation.
18055 +          See comment in create_object(). */
18056 +       REISER4_IMMUTABLE = 2,
18057 +       /* inode was read from storage */
18058 +       REISER4_LOADED = 3,
18059 +       /* this bit is set for symlinks. inode->i_private points to target
18060 +          name of symlink. */
18061 +       REISER4_GENERIC_PTR_USED = 4,
18062 +       /* set if size of stat-data item for this inode is known. If this is
18063 +        * set we can avoid recalculating size of stat-data on each update. */
18064 +       REISER4_SDLEN_KNOWN = 5,
18065 +       /* reiser4_inode->crypt points to the crypto stat */
18066 +       REISER4_CRYPTO_STAT_LOADED = 6,
18067 +       /* cryptcompress_inode_data points to the secret key */
18068 +       REISER4_SECRET_KEY_INSTALLED = 7,
18069 +       /* File (possibly) has pages corresponding to the tail items, that
18070 +        * were created by ->readpage. It is set by mmap_unix_file() and
18071 +        * sendfile_unix_file(). This bit is inspected by write_unix_file and
18072 +        * kill-hook of tail items. It is never cleared once set. This bit is
18073 +        * modified and inspected under i_mutex. */
18074 +       REISER4_HAS_MMAP = 8,
18075 +       REISER4_PART_MIXED = 9,
18076 +       REISER4_PART_IN_CONV = 10,
18077 +       /* This flag indicates that file plugin conversion is in progress */
18078 +       REISER4_FILE_CONV_IN_PROGRESS = 11
18079 +} reiser4_file_plugin_flags;
18080 +
18081 +/* state associated with each inode.
18082 +   reiser4 inode.
18083 +
18084 +   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18085 +   be of the same size. File-system allocates inodes by itself through
18086 +   s_op->alloc_inode() method. So, it is possible to adjust size of inode
18087 +   at the time of its creation.
18088 +
18089 +   Invariants involving parts of this data-type:
18090 +
18091 +      [inode->eflushed]
18092 +
18093 +*/
18094 +
18095 +typedef struct reiser4_inode reiser4_inode;
18096 +/* return pointer to reiser4-specific part of inode */
18097 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18098 +                                               /* inode queried */ );
18099 +
18100 +#if BITS_PER_LONG == 64
18101 +
18102 +#define REISER4_INO_IS_OID (1)
18103 +typedef struct {       /* empty: on 64-bit the whole oid fits in ->i_ino */
18104 +} oid_hi_t;
18105 +
18106 +/* BITS_PER_LONG == 64 */
18107 +#else
18108 +
18109 +#define REISER4_INO_IS_OID (0)
18110 +typedef __u32 oid_hi_t;
18111 +
18112 +/* BITS_PER_LONG == 64 */
18113 +#endif
18114 +
18115 +struct reiser4_inode {
18116 +       /* spin lock protecting fields of this structure. */
18117 +       spinlock_t guard;
18118 +       /* main plugin set that control the file
18119 +          (see comments in plugin/plugin_set.c) */
18120 +       plugin_set *pset;
18121 +       /* plugin set for inheritance
18122 +          (see comments in plugin/plugin_set.c) */
18123 +       plugin_set *hset;
18124 +       /* high 32 bits of object id */
18125 +       oid_hi_t oid_hi;
18126 +       /* seal for stat-data */
18127 +       seal_t sd_seal;
18128 +       /* locality id for this file */
18129 +       oid_t locality_id;
18130 +#if REISER4_LARGE_KEY
18131 +       __u64 ordering;
18132 +#endif
18133 +       /* coord of stat-data in sealed node */
18134 +       coord_t sd_coord;
18135 +       /* bit-mask of stat-data extensions used by this file */
18136 +       __u64 extmask;
18137 +       /* bitmask of non-default plugins for this inode */
18138 +       __u16 plugin_mask;
18139 +       /* bitmask of set heir plugins for this inode. */
18140 +       __u16 heir_mask;
18141 +       union {
18142 +               struct list_head readdir_list;
18143 +               struct list_head not_used;
18144 +       } lists;
18145 +       /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18146 +       unsigned long flags;
18147 +       union {
18148 +               /* fields specific to unix_file plugin */
18149 +               struct unix_file_info unix_file_info;
18150 +               /* fields specific to cryptcompress file plugin */
18151 +               struct cryptcompress_info cryptcompress_info;
18152 +       } file_plugin_data;
18153 +
18154 +       /* this semaphore is to serialize readers and writers of @pset->file
18155 +        * when file plugin conversion is enabled
18156 +        */
18157 +       struct rw_semaphore conv_sem;
18158 +
18159 +       /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18160 +          tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18161 +       struct radix_tree_root jnodes_tree;
18162 +#if REISER4_DEBUG
18163 +       /* number of unformatted node jnodes of this file in jnode hash table */
18164 +       unsigned long nr_jnodes;
18165 +#endif
18166 +
18167 +       /* block number of virtual root for this object. See comment above
18168 +        * fs/reiser4/search.c:handle_vroot() */
18169 +       reiser4_block_nr vroot;
18170 +       struct mutex loading;
18171 +};
18172 +
18173 +void loading_init_once(reiser4_inode *);
18174 +void loading_alloc(reiser4_inode *);
18175 +void loading_destroy(reiser4_inode *);
18176 +
18177 +struct reiser4_inode_object {
18178 +       /* private part */
18179 +       reiser4_inode p;
18180 +       /* generic fields not specific to reiser4, but used by VFS */
18181 +       struct inode vfs_inode;
18182 +};
18183 +
18184 +/* return pointer to the reiser4 specific portion of @inode */
18185 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18186 +                                               /* inode queried */ )
18187 +{
18188 +       assert("nikita-254", inode != NULL);
18189 +       return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
18190 +}
18191 +
18192 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18193 +                                                  r4_inode /* inode queried */
18194 +                                                  )
18195 +{
18196 +       return &container_of(r4_inode, struct reiser4_inode_object,
18197 +                            p)->vfs_inode;
18198 +}
18199 +
18200 +/*
18201 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18202 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18203 + * bits.
18204 + *
18205 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18206 + * of inode, otherwise whole oid is stored in i_ino.
18207 + *
18208 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18209 + */
18210 +
18211 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18212 +
18213 +#if REISER4_INO_IS_OID
18214 +
18215 +static inline oid_t get_inode_oid(const struct inode *inode)
18216 +{
18217 +       return inode->i_ino;
18218 +}
18219 +
18220 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18221 +{
18222 +       inode->i_ino = oid;
18223 +}
18224 +
18225 +/* REISER4_INO_IS_OID */
18226 +#else
18227 +
18228 +static inline oid_t get_inode_oid(const struct inode *inode)
18229 +{
18230 +       return
18231 +           ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18232 +           inode->i_ino;
18233 +}
18234 +
18235 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18236 +{
18237 +       assert("nikita-2519", inode != NULL);
18238 +       inode->i_ino = (ino_t) (oid);
18239 +       reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18240 +       assert("nikita-2521", get_inode_oid(inode) == (oid));
18241 +}
18242 +
18243 +/* REISER4_INO_IS_OID */
18244 +#endif
18245 +
18246 +static inline oid_t get_inode_locality(const struct inode *inode)
18247 +{
18248 +       return reiser4_inode_data(inode)->locality_id;
18249 +}
18250 +
18251 +#if REISER4_LARGE_KEY
18252 +static inline __u64 get_inode_ordering(const struct inode *inode)
18253 +{
18254 +       return reiser4_inode_data(inode)->ordering;
18255 +}
18256 +
18257 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18258 +{
18259 +       reiser4_inode_data(inode)->ordering = ordering;
18260 +}
18261 +
18262 +#else
18263 +
18264 +#define get_inode_ordering(inode) (0)
18265 +#define set_inode_ordering(inode, val) noop
18266 +
18267 +#endif
18268 +
18269 +/* return inode in which @uf_info is embedded */
18270 +static inline struct inode *
18271 +unix_file_info_to_inode(const struct unix_file_info *uf_info)
18272 +{
18273 +       return &container_of(uf_info, struct reiser4_inode_object,
18274 +                            p.file_plugin_data.unix_file_info)->vfs_inode;
18275 +}
18276 +
18277 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18278 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18279 +
18280 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18281 +
18282 +#if REISER4_DEBUG
18283 +extern void reiser4_inode_invariant(const struct inode *inode);
18284 +extern int inode_has_no_jnodes(reiser4_inode *);
18285 +#else
18286 +#define reiser4_inode_invariant(inode) noop
18287 +#endif
18288 +
18289 +static inline int spin_inode_is_locked(const struct inode *inode)
18290 +{
18291 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18292 +       return 1;
18293 +}
18294 +
18295 +/**
18296 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18297 + * @inode: inode to lock
18298 + *
18299 + * In debug mode it checks that lower priority locks are not held and
18300 + * increments reiser4_context's lock counters on which lock ordering checking
18301 + * is based.
18302 + */
18303 +static inline void spin_lock_inode(struct inode *inode)
18304 +{
18305 +       assert("", LOCK_CNT_NIL(spin_locked));
18306 +       /* check lock ordering */
18307 +       assert_spin_not_locked(&d_lock);
18308 +
18309 +       spin_lock(&reiser4_inode_data(inode)->guard);
18310 +
18311 +       LOCK_CNT_INC(spin_locked_inode);
18312 +       LOCK_CNT_INC(spin_locked);
18313 +
18314 +       reiser4_inode_invariant(inode);
18315 +}
18316 +
18317 +/**
18318 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18319 + * @inode: inode to unlock
18320 + *
18321 + * In debug mode it checks that spinlock is held and decrements
18322 + * reiser4_context's lock counters on which lock ordering checking is based.
18323 + */
18324 +static inline void spin_unlock_inode(struct inode *inode)
18325 +{
18326 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18327 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18328 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18329 +
18330 +       reiser4_inode_invariant(inode);
18331 +
18332 +       LOCK_CNT_DEC(spin_locked_inode);
18333 +       LOCK_CNT_DEC(spin_locked);
18334 +
18335 +       spin_unlock(&reiser4_inode_data(inode)->guard);
18336 +}
18337 +
18338 +extern znode *inode_get_vroot(struct inode *inode);
18339 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18340 +
18341 +extern int reiser4_max_filename_len(const struct inode *inode);
18342 +extern int max_hash_collisions(const struct inode *dir);
18343 +extern void reiser4_unlock_inode(struct inode *inode);
18344 +extern int is_reiser4_inode(const struct inode *inode);
18345 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18346 +extern struct inode *reiser4_iget(struct super_block *super,
18347 +                                 const reiser4_key * key, int silent);
18348 +extern void reiser4_iget_complete(struct inode *inode);
18349 +extern void reiser4_inode_set_flag(struct inode *inode,
18350 +                                  reiser4_file_plugin_flags f);
18351 +extern void reiser4_inode_clr_flag(struct inode *inode,
18352 +                                  reiser4_file_plugin_flags f);
18353 +extern int reiser4_inode_get_flag(const struct inode *inode,
18354 +                                 reiser4_file_plugin_flags f);
18355 +
18356 +/*  has inode been initialized? */
18357 +static inline int
18358 +is_inode_loaded(const struct inode *inode/* inode queried */)
18359 +{
18360 +       assert("nikita-1120", inode != NULL);
18361 +       return reiser4_inode_get_flag(inode, REISER4_LOADED);
18362 +}
18363 +
18364 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18365 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18366 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18367 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18368 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18369 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18370 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18371 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18372 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18373 +                                                             *inode);
18374 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18375 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18376 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18377 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18378 +extern file_plugin *child_create_plugin(const struct inode *inode);
18379 +
18380 +extern void reiser4_make_bad_inode(struct inode *inode);
18381 +
18382 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18383 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18384 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18385 +extern void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new);
18386 +
18387 +#define INODE_SET_SIZE(i, value)                       \
18388 +({                                                     \
18389 +       struct inode *__i;                              \
18390 +       typeof(value) __v;                              \
18391 +                                                       \
18392 +       __i = (i);                                      \
18393 +       __v = (value);                                  \
18394 +       inode_check_scale(__i, __i->i_size, __v);       \
18395 +       i_size_write(__i, __v);                         \
18396 +})
18397 +
18398 +/*
18399 + * update field @field in inode @i to contain value @value.
18400 + */
18401 +#define INODE_SET_FIELD(i, field, value)               \
18402 +({                                                     \
18403 +       struct inode *__i;                              \
18404 +       typeof(value) __v;                              \
18405 +                                                       \
18406 +       __i = (i);                                      \
18407 +       __v = (value);                                  \
18408 +       inode_check_scale(__i, __i->field, __v);        \
18409 +       __i->field = __v;                               \
18410 +})
18411 +
18412 +#define INODE_INC_FIELD(i, field)                              \
18413 +({                                                             \
18414 +       struct inode *__i;                                      \
18415 +                                                               \
18416 +       __i = (i);                                              \
18417 +       inode_check_scale(__i, __i->field, __i->field + 1);     \
18418 +       ++ __i->field;                                          \
18419 +})
18420 +
18421 +#define INODE_DEC_FIELD(i, field)                              \
18422 +({                                                             \
18423 +       struct inode *__i;                                      \
18424 +                                                               \
18425 +       __i = (i);                                              \
18426 +       inode_check_scale(__i, __i->field, __i->field - 1);     \
18427 +       -- __i->field;                                          \
18428 +})
18429 +
18430 +/* See comment before reiser4_readdir_common() for description. */
18431 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18432 +{
18433 +       return &reiser4_inode_data(inode)->lists.readdir_list;
18434 +}
18435 +
18436 +extern void init_inode_ordering(struct inode *inode,
18437 +                               reiser4_object_create_data * crd, int create);
18438 +
18439 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18440 +{
18441 +       return &reiser4_inode_data(inode)->jnodes_tree;
18442 +}
18443 +
18444 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18445 +                                                                 *r4_inode)
18446 +{
18447 +       return &r4_inode->jnodes_tree;
18448 +}
18449 +
18450 +#if REISER4_DEBUG
18451 +extern void print_inode(const char *prefix, const struct inode *i);
18452 +#endif
18453 +
18454 +int is_dir_empty(const struct inode *);
18455 +
18456 +/* __REISER4_INODE_H__ */
18457 +#endif
18458 +
18459 +/* Make Linus happy.
18460 +   Local variables:
18461 +   c-indentation-style: "K&R"
18462 +   mode-name: "LC"
18463 +   c-basic-offset: 8
18464 +   tab-width: 8
18465 +   fill-column: 120
18466 +   End:
18467 +*/
18468 diff -urN linux-2.6.35.orig/fs/reiser4/ioctl.h linux-2.6.35/fs/reiser4/ioctl.h
18469 --- linux-2.6.35.orig/fs/reiser4/ioctl.h        1970-01-01 01:00:00.000000000 +0100
18470 +++ linux-2.6.35/fs/reiser4/ioctl.h     2010-08-04 15:44:57.000000000 +0200
18471 @@ -0,0 +1,41 @@
18472 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18473 + * reiser4/README */
18474 +
18475 +#if !defined(__REISER4_IOCTL_H__)
18476 +#define __REISER4_IOCTL_H__
18477 +
18478 +#include <linux/fs.h>
18479 +
18480 +/*
18481 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18482 + * extents and fix in this state. This is used by applications that rely on
18483 + *
18484 + *     . files being block aligned, and
18485 + *
18486 + *     . files never migrating on disk
18487 + *
18488 + * for example, boot loaders (LILO) need this.
18489 + *
18490 + * This ioctl should be used as
18491 + *
18492 + *     result = ioctl(fd, REISER4_IOC_UNPACK);
18493 + *
18494 + * File behind fd descriptor will be converted to the extents (if necessary),
18495 + * and its stat-data will be updated so that it will never be converted back
18496 + * into tails again.
18497 + */
18498 +#define REISER4_IOC_UNPACK _IOW(0xCD, 1, long)
18499 +
18500 +/* __REISER4_IOCTL_H__ */
18501 +#endif
18502 +
18503 +/* Make Linus happy.
18504 +   Local variables:
18505 +   c-indentation-style: "K&R"
18506 +   mode-name: "LC"
18507 +   c-basic-offset: 8
18508 +   tab-width: 8
18509 +   fill-column: 120
18510 +   scroll-step: 1
18511 +   End:
18512 +*/
18513 diff -urN linux-2.6.35.orig/fs/reiser4/jnode.c linux-2.6.35/fs/reiser4/jnode.c
18514 --- linux-2.6.35.orig/fs/reiser4/jnode.c        1970-01-01 01:00:00.000000000 +0100
18515 +++ linux-2.6.35/fs/reiser4/jnode.c     2010-08-04 15:44:57.000000000 +0200
18516 @@ -0,0 +1,1923 @@
18517 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18518 + * reiser4/README */
18519 +/* Jnode manipulation functions. */
18520 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18521 +
18522 +   In particular, jnodes are used to track transactional information
18523 +   associated with each block. Each znode contains jnode as ->zjnode field.
18524 +
18525 +   Jnode stands for either Josh or Journal node.
18526 +*/
18527 +
18528 +/*
18529 + * Taxonomy.
18530 + *
18531 + *     Jnode represents block containing data or meta-data. There are jnodes
18532 + *     for:
18533 + *
18534 + *         unformatted blocks (jnodes proper). There are plans, however to
18535 + *         have a handle per extent unit rather than per each unformatted
18536 + *         block, because there are so many of them.
18537 + *
18538 + *         For bitmaps. Each bitmap is actually represented by two jnodes--one
18539 + *         for working and another for "commit" data, together forming bnode.
18540 + *
18541 + *         For io-heads. These are used by log writer.
18542 + *
18543 + *         For formatted nodes (znode). See comment at the top of znode.c for
18544 + *         details specific to the formatted nodes (znodes).
18545 + *
18546 + * Node data.
18547 + *
18548 + *     Jnode provides access to the data of node it represents. Data are
18549 + *     stored in a page. Page is kept in a page cache. This means, that jnodes
18550 + *     are highly interconnected with page cache and VM internals.
18551 + *
18552 + *     jnode has a pointer to page (->pg) containing its data. Pointer to data
18553 + *     themselves is cached in ->data field to avoid frequent calls to
18554 + *     page_address().
18555 + *
18556 + *     jnode and page are attached to each other by jnode_attach_page(). This
18557 + *     function places pointer to jnode in set_page_private(), sets PG_private
18558 + *     flag and increments page counter.
18559 + *
18560 + *     Opposite operation is performed by page_clear_jnode().
18561 + *
18562 + *     jnode->pg is protected by jnode spin lock, and page->private is
18563 + *     protected by page lock. See comment at the top of page_cache.c for
18564 + *     more.
18565 + *
18566 + *     page can be detached from jnode for two reasons:
18567 + *
18568 + *         . jnode is removed from a tree (file is truncated, or formatted
18569 + *         node is removed by balancing).
18570 + *
18571 + *         . during memory pressure, VM calls ->releasepage() method
18572 + *         (reiser4_releasepage()) to evict page from memory.
18573 + *
18574 + *    (there, of course, is also umount, but this is special case we are not
18575 + *    concerned with here).
18576 + *
18577 + *    To protect jnode page from eviction, one calls jload() function that
18578 + *    "pins" page in memory (loading it if necessary), increments
18579 + *    jnode->d_count, and kmap()s page. Page is unpinned through call to
18580 + *    jrelse().
18581 + *
18582 + * Jnode life cycle.
18583 + *
18584 + *    jnode is created, placed in hash table, and, optionally, in per-inode
18585 + *    radix tree. Page can be attached to jnode, pinned, released, etc.
18586 + *
18587 + *    When jnode is captured into atom its reference counter is
18588 + *    increased. While being part of an atom, jnode can be "early
18589 + *    flushed". This means that as part of flush procedure, jnode is placed
18590 + *    into "relocate set", and its page is submitted to the disk. After io
18591 + *    completes, page can be detached, then loaded again, re-dirtied, etc.
18592 + *
18593 + *    Thread acquires reference to jnode by calling jref() and releases it by
18594 + *    jput(). When last reference is removed, jnode is still retained in
18595 + *    memory (cached) if it has page attached, _unless_ it is scheduled for
18596 + *    destruction (has JNODE_HEARD_BANSHEE bit set).
18597 + *
18598 + *    Tree read-write lock was used as "existential" lock for jnodes. That is,
18599 + *    jnode->x_count could be changed from 0 to 1 only under tree write lock,
18600 + *    that is, tree lock protected unreferenced jnodes stored in the hash
18601 + *    table, from recycling.
18602 + *
18603 + *    This resulted in high contention on tree lock, because jref()/jput() is
18604 + *    frequent operation. To ameliorate this problem, RCU is used: when jput()
18605 + *    is just about to release last reference on jnode it sets JNODE_RIP bit
18606 + *    on it, and then proceeds with jnode destruction (removing jnode from hash
18607 + *    table, cbk_cache, detaching page, etc.). All places that change jnode
18608 + *    reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18609 + *    cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18610 + *    jnode_rip_check() function), and pretend that nothing was found in hash
18611 + *    table if bit is set.
18612 + *
18613 + *    jput defers actual return of jnode into slab cache to some later time
18614 + *    (by call_rcu()), this guarantees that other threads can safely continue
18615 + *    working with JNODE_RIP-ped jnode.
18616 + *
18617 + */
18618 +
18619 +#include "reiser4.h"
18620 +#include "debug.h"
18621 +#include "dformat.h"
18622 +#include "jnode.h"
18623 +#include "plugin/plugin_header.h"
18624 +#include "plugin/plugin.h"
18625 +#include "txnmgr.h"
18626 +/*#include "jnode.h"*/
18627 +#include "znode.h"
18628 +#include "tree.h"
18629 +#include "tree_walk.h"
18630 +#include "super.h"
18631 +#include "inode.h"
18632 +#include "page_cache.h"
18633 +
18634 +#include <asm/uaccess.h>       /* UML needs this for PAGE_OFFSET */
18635 +#include <linux/types.h>
18636 +#include <linux/slab.h>
18637 +#include <linux/pagemap.h>
18638 +#include <linux/swap.h>
18639 +#include <linux/fs.h>          /* for struct address_space  */
18640 +#include <linux/writeback.h>   /* for inode_lock */
18641 +
18642 +static struct kmem_cache *_jnode_slab = NULL;
18643 +
18644 +static void jnode_set_type(jnode * node, jnode_type type);
18645 +static int jdelete(jnode * node);
18646 +static int jnode_try_drop(jnode * node);
18647 +
18648 +#if REISER4_DEBUG
18649 +static int jnode_invariant(jnode * node, int tlocked, int jlocked);
18650 +#endif
18651 +
18652 +/* true if valid page is attached to jnode */
18653 +static inline int jnode_is_parsed(jnode * node)
18654 +{
18655 +       return JF_ISSET(node, JNODE_PARSED);
18656 +}
18657 +
18658 +/* hash table support */
18659 +
18660 +/* compare two jnode keys for equality. Used by hash-table macros */
18661 +static inline int jnode_key_eq(const struct jnode_key *k1,
18662 +                              const struct jnode_key *k2)
18663 +{
18664 +       assert("nikita-2350", k1 != NULL);
18665 +       assert("nikita-2351", k2 != NULL);
18666 +
18667 +       return (k1->index == k2->index && k1->objectid == k2->objectid);
18668 +}
18669 +
18670 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18671 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
18672 +                                    const struct jnode_key *key)
18673 +{
18674 +       assert("nikita-2352", key != NULL);
18675 +       assert("nikita-3346", IS_POW(table->_buckets));
18676 +
18677 +       /* yes, this is a remarkably simple (if not stupid) hash function. */
18678 +       return (key->objectid + key->index) & (table->_buckets - 1);
18679 +}
18680 +
18681 +/* The hash table definition */
18682 +#define KMALLOC(size) reiser4_vmalloc(size)
18683 +#define KFREE(ptr, size) vfree(ptr)
18684 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18685 +                     jnode_key_hashfn, jnode_key_eq);
18686 +#undef KFREE
18687 +#undef KMALLOC
18688 +
18689 +/* call this to initialise jnode hash table */
18690 +int jnodes_tree_init(reiser4_tree * tree/* tree to initialise jnodes for */)
18691 +{
18692 +       assert("nikita-2359", tree != NULL);
18693 +       return j_hash_init(&tree->jhash_table, 16384);
18694 +}
18695 +
18696 +/* call this to destroy jnode hash table. This is called during umount. */
18697 +int jnodes_tree_done(reiser4_tree * tree/* tree to destroy jnodes for */)
18698 +{
18699 +       j_hash_table *jtable;
18700 +       jnode *node;
18701 +       jnode *next;
18702 +
18703 +       assert("nikita-2360", tree != NULL);
18704 +
18705 +       /*
18706 +        * Scan hash table and free all jnodes.
18707 +        */
18708 +       jtable = &tree->jhash_table;
18709 +       if (jtable->_table) {
18710 +               for_all_in_htable(jtable, j, node, next) {
18711 +                       assert("nikita-2361", !atomic_read(&node->x_count));
18712 +                       jdrop(node);
18713 +               }
18714 +
18715 +               j_hash_done(&tree->jhash_table);
18716 +       }
18717 +       return 0;
18718 +}
18719 +
18720 +/**
18721 + * init_jnodes - create jnode cache
18722 + *
18723 + * Creates the "jnode" slab cache on module init. Returns 0 or -ENOMEM.
18724 + */
18725 +int init_jnodes(void)
18726 +{
18727 +       assert("umka-168", _jnode_slab == NULL);
18728 +
18729 +       _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18730 +                                       SLAB_HWCACHE_ALIGN |
18731 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
18732 +       if (_jnode_slab == NULL)
18733 +               return RETERR(-ENOMEM);
18734 +
18735 +       return 0;
18736 +}
18737 +
18738 +/**
18739 + * done_jnodes - delete jnode cache
18740 + *
18741 + * This is called on reiser4 module unloading or system shutdown.
18742 + */
18743 +void done_jnodes(void)
18744 +{
18745 +       destroy_reiser4_cache(&_jnode_slab);
18746 +}
18747 +
18748 +/* Initialize a jnode: zero it, set @type and @tree, reset refs and locks. */
18749 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18750 +{
18751 +       assert("umka-175", node != NULL);
18752 +
18753 +       memset(node, 0, sizeof(jnode));
18754 +       ON_DEBUG(node->magic = JMAGIC);
18755 +       jnode_set_type(node, type);
18756 +       atomic_set(&node->d_count, 0);
18757 +       atomic_set(&node->x_count, 0);
18758 +       spin_lock_init(&node->guard);
18759 +       spin_lock_init(&node->load);
18760 +       node->atom = NULL;
18761 +       node->tree = tree;
18762 +       INIT_LIST_HEAD(&node->capture_link);
18763 +
18764 +       ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18765 +
18766 +       INIT_RCU_HEAD(&node->rcu);
18767 +
18768 +#if REISER4_DEBUG
18769 +       {
18770 +               reiser4_super_info_data *sbinfo;
18771 +
18772 +               sbinfo = get_super_private(tree->super);
18773 +               spin_lock_irq(&sbinfo->all_guard);
18774 +               list_add(&node->jnodes, &sbinfo->all_jnodes);
18775 +               spin_unlock_irq(&sbinfo->all_guard);
18776 +       }
18777 +#endif
18778 +}
18779 +
18780 +#if REISER4_DEBUG
18781 +/*
18782 + * Remove jnode from ->all_jnodes list of @tree's super block (debug only).
18783 + */
18784 +static void jnode_done(jnode * node, reiser4_tree * tree)
18785 +{
18786 +       reiser4_super_info_data *sbinfo;
18787 +
18788 +       sbinfo = get_super_private(tree->super);
18789 +
18790 +       spin_lock_irq(&sbinfo->all_guard);
18791 +       assert("nikita-2422", !list_empty(&node->jnodes));
18792 +       list_del_init(&node->jnodes);
18793 +       spin_unlock_irq(&sbinfo->all_guard);
18794 +}
18795 +#endif
18796 +
18797 +/* return jnode already attached to @pg; the page must be locked */
18798 +jnode *jnode_by_page(struct page *pg)
18799 +{
18800 +       assert("nikita-2066", pg != NULL);
18801 +       assert("nikita-2400", PageLocked(pg));
18802 +       assert("nikita-2068", PagePrivate(pg));
18803 +       assert("nikita-2067", jprivate(pg) != NULL);
18804 +       return jprivate(pg);
18805 +}
18806 +
18807 +/* exported functions to allocate/free jnode objects outside this file */
18808 +jnode *jalloc(void)
18809 +{
18810 +       /* may return NULL; gfp mask comes from reiser4_ctx_gfp_mask_get() */
18811 +       return kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18812 +}
18813 +
18814 +/* return jnode to the slab; it must be uncaptured and have no page */
18815 +inline void jfree(jnode * node)
18816 +{
18817 +       assert("zam-449", node != NULL);
18818 +
18819 +       assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18820 +                              NODE_LIST(node) == NOT_CAPTURED));
18821 +       assert("nikita-3222", list_empty(&node->jnodes));
18822 +       assert("nikita-3221", jnode_page(node) == NULL);
18823 +
18824 +       /* not yet phash_jnode_destroy(node); */
18825 +
18826 +       kmem_cache_free(_jnode_slab, node);
18827 +}
18828 +
18829 +/*
18830 + * RCU callback posted by jnode_free(). Frees @head's jnode with jfree(), or
18831 + * with zfree() for formatted nodes; JNODE_INODE and unknown types are errors.
18832 + */
18833 +static void jnode_free_actor(struct rcu_head *head)
18834 +{
18835 +       jnode *node;
18836 +       jnode_type jtype;
18837 +
18838 +       node = container_of(head, jnode, rcu);
18839 +       jtype = jnode_get_type(node);
18840 +
18841 +       ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18842 +
18843 +       switch (jtype) {
18844 +       case JNODE_IO_HEAD:
18845 +       case JNODE_BITMAP:
18846 +       case JNODE_UNFORMATTED_BLOCK:
18847 +               jfree(node);
18848 +               break;
18849 +       case JNODE_FORMATTED_BLOCK:
18850 +               zfree(JZNODE(node));
18851 +               break;
18852 +       case JNODE_INODE:
18853 +       default:
18854 +               wrong_return_value("nikita-3197", "Wrong jnode type");
18855 +       }
18856 +}
18857 +
18858 +/*
18859 + * Free a jnode: post jnode_free_actor() to be run later through RCU. Jnodes
18860 + * of JNODE_INODE type are only taken off their list via jnode_list_remove().
18861 + */
18862 +static inline void jnode_free(jnode * node, jnode_type jtype)
18863 +{
18864 +       if (jtype != JNODE_INODE) {
18865 +               /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18866 +               call_rcu(&node->rcu, jnode_free_actor);
18867 +       } else
18868 +               jnode_list_remove(node);
18869 +}
18870 +
18871 +/* allocate new unformatted, not yet hashed jnode; NULL if jalloc() fails */
18872 +static jnode *jnew_unformatted(void)
18873 +{
18874 +       jnode *jal;
18875 +
18876 +       jal = jalloc();
18877 +       if (jal == NULL)
18878 +               return NULL;
18879 +
18880 +       jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18881 +       jal->key.j.mapping = NULL;
18882 +       jal->key.j.index = (unsigned long)-1;
18883 +       jal->key.j.objectid = 0;
18884 +       return jal;
18885 +}
18885 +}
18886 +
18887 +/* look for jnode with given objectid and index within hash table */
18888 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18889 +{
18890 +       struct jnode_key jkey;
18891 +       jnode *node;
18892 +
18893 +       assert("nikita-2353", tree != NULL);
18894 +
18895 +       jkey.objectid = objectid;
18896 +       jkey.index = index;
18897 +
18898 +       /*
18899 +        * hash table is _not_ protected by any lock during lookups. All we
18900 +        * have to do is to disable preemption to keep RCU happy.
18901 +        */
18902 +
18903 +       rcu_read_lock();
18904 +       node = j_hash_find(&tree->jhash_table, &jkey);
18905 +       if (node != NULL) {
18906 +               /* protect @node from recycling */
18907 +               jref(node);
18908 +               assert("nikita-2955", jnode_invariant(node, 0, 0));
18909 +               node = jnode_rip_check(tree, node);
18910 +       }
18911 +       rcu_read_unlock();
18912 +       return node;
18913 +}
18914 +
18915 +/* look up per-inode radix tree of jnodes; callers hold the tree rw lock */
18916 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18917 +{
18918 +       assert("vs-1694", mapping->host != NULL);
18919 +
18920 +       return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18921 +}
18922 +
18923 +jnode *jfind(struct address_space *mapping, unsigned long index)
18924 +{
18925 +       reiser4_tree *tree;
18926 +       jnode *node;
18927 +
18928 +       assert("vs-1694", mapping->host != NULL);
18929 +       tree = reiser4_tree_by_inode(mapping->host);
18930 +
18931 +       read_lock_tree(tree);
18932 +       node = jfind_nolock(mapping, index);
18933 +       if (node != NULL)
18934 +               jref(node);     /* reference is taken under the tree lock */
18935 +       read_unlock_tree(tree);
18936 +       return node;
18937 +}
18938 +
18939 +static void inode_attach_jnode(jnode * node)
18940 +{
18941 +       struct inode *inode;
18942 +       reiser4_inode *info;
18943 +       struct radix_tree_root *rtree;
18944 +
18945 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18946 +       assert("zam-1043", node->key.j.mapping != NULL);
18947 +       inode = node->key.j.mapping->host;
18948 +       info = reiser4_inode_data(inode);
18949 +       rtree = jnode_tree_by_reiser4_inode(info);
18950 +       if (rtree->rnode == NULL) {
18951 +               /* first jnode of this inode: bump nrpages to prevent inode
18952 +                  from being pruned while it has jnodes attached to it */
18953 +               spin_lock_irq(&inode->i_data.tree_lock);
18954 +               inode->i_data.nrpages++;
18955 +               spin_unlock_irq(&inode->i_data.tree_lock);
18956 +       }
18957 +       assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18958 +       check_me("zam-1045",
18959 +                !radix_tree_insert(rtree, node->key.j.index, node));
18960 +       ON_DEBUG(info->nr_jnodes++);
18961 +}
18962 +
18963 +static void inode_detach_jnode(jnode * node)
18964 +{
18965 +       struct inode *inode;
18966 +       reiser4_inode *info;
18967 +       struct radix_tree_root *rtree;
18968 +
18969 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18970 +       assert("zam-1044", node->key.j.mapping != NULL);
18971 +       inode = node->key.j.mapping->host;
18972 +       info = reiser4_inode_data(inode);
18973 +       rtree = jnode_tree_by_reiser4_inode(info);
18974 +
18975 +       assert("zam-1051", info->nr_jnodes != 0);
18976 +       assert("zam-1052", rtree->rnode != NULL);
18977 +       ON_DEBUG(info->nr_jnodes--);
18978 +
18979 +       /* delete jnode from inode's radix tree of jnodes */
18980 +       check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18981 +       if (rtree->rnode == NULL) {
18982 +               /* last jnode is gone, undo nrpages bump: inode can be pruned */
18983 +               spin_lock_irq(&inode->i_data.tree_lock);
18984 +               inode->i_data.nrpages--;
18985 +               spin_unlock_irq(&inode->i_data.tree_lock);
18986 +       }
18987 +}
18988 +
18989 +/* put jnode into hash table (where they can be found by flush who does not know
18990 +   mapping) and to inode's tree of jnodes (where they can be found (hopefully
18991 +   faster) in places where mapping is known). Called with the tree write lock
18992 +   held (asserted below), e.g. by find_get_jnode() when new jnode is
18993 +   created */
18994 +static void
18995 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18996 +                      unsigned long index)
18997 +{
18998 +       j_hash_table *jtable;
18999 +
19000 +       assert("vs-1446", jnode_is_unformatted(node));
19001 +       assert("vs-1442", node->key.j.mapping == 0);
19002 +       assert("vs-1443", node->key.j.objectid == 0);
19003 +       assert("vs-1444", node->key.j.index == (unsigned long)-1);
19004 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19005 +
19006 +       node->key.j.mapping = mapping;
19007 +       node->key.j.objectid = get_inode_oid(mapping->host);
19008 +       node->key.j.index = index;
19009 +
19010 +       jtable = &jnode_get_tree(node)->jhash_table;
19011 +
19012 +       /* race with some other thread inserting jnode into the hash table is
19013 +        * impossible, because we keep the page lock. */
19014 +       /*
19015 +        * following assertion no longer holds because of RCU: it is possible
19016 +        * jnode is in the hash table, but with JNODE_RIP bit set.
19017 +        */
19018 +       /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19019 +       j_hash_insert_rcu(jtable, node);
19020 +       inode_attach_jnode(node);
19021 +}
19022 +
19023 +static void unhash_unformatted_node_nolock(jnode * node)
19024 +{
19025 +       assert("vs-1683", node->key.j.mapping != NULL);
19026 +       assert("vs-1684",
19027 +              node->key.j.objectid ==
19028 +              get_inode_oid(node->key.j.mapping->host));
19029 +
19030 +       /* remove jnode from hash-table and inode's radix tree, reset its key */
19031 +       j_hash_remove_rcu(&node->tree->jhash_table, node);
19032 +       inode_detach_jnode(node);
19033 +       node->key.j.mapping = NULL;
19034 +       node->key.j.index = (unsigned long)-1;
19035 +       node->key.j.objectid = 0;
19036 +
19037 +}
19038 +
19039 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19040 +   reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19041 +   reiser4_uncapture_jnode. Takes the tree write lock itself. */
19042 +void unhash_unformatted_jnode(jnode * node)
19043 +{
19044 +       assert("vs-1445", jnode_is_unformatted(node));
19045 +
19046 +       write_lock_tree(node->tree);
19047 +       unhash_unformatted_node_nolock(node);
19048 +       write_unlock_tree(node->tree);
19049 +}
19050 +
19051 +/*
19052 + * look up @mapping's radix tree for a jnode at @index. If not found, insert a
19053 + * newly allocated jnode into the hash table and the radix tree. Returns the
19054 + * referenced jnode, or ERR_PTR() (the preallocated jnode is freed then).
19055 + */
19056 +static jnode *find_get_jnode(reiser4_tree * tree,
19057 +                            struct address_space *mapping,
19058 +                            oid_t oid, unsigned long index)
19059 +{
19060 +       jnode *result;
19061 +       jnode *shadow;
19062 +       int preload;
19063 +
19064 +       result = jnew_unformatted();
19065 +       if (unlikely(result == NULL))
19066 +               return ERR_PTR(RETERR(-ENOMEM));
19067 +
19068 +       preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
19069 +       if (preload != 0) {
19070 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19071 +               return ERR_PTR(preload);
19072 +       }
19073 +       write_lock_tree(tree);
19074 +       shadow = jfind_nolock(mapping, index);
19075 +       if (likely(shadow == NULL)) {
19076 +               /* add new jnode to hash table and inode's radix tree of
19077 +                * jnodes */
19078 +               jref(result);
19079 +               hash_unformatted_jnode(result, mapping, index);
19080 +       } else {
19081 +               /* jnode is found in inode's radix tree of jnodes */
19082 +               jref(shadow);
19083 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19084 +               assert("vs-1498", shadow->key.j.mapping == mapping);
19085 +               result = shadow;
19086 +       }
19087 +       write_unlock_tree(tree);
19088 +
19089 +       assert("nikita-2955",
19090 +              ergo(result != NULL, jnode_invariant(result, 0, 0)));
19091 +       radix_tree_preload_end();
19092 +       return result;
19093 +}
19094 +
19095 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19096 +   creates) jnode corresponding to page @pg. jnode is attached to page and
19097 +   inserted into jnode hash-table. May return ERR_PTR() on failure. */
19098 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19099 +{
19100 +       /*
19101 +        * There are two ways to create jnode: starting with pre-existing page
19102 +        * and without page.
19103 +        *
19104 +        * When page already exists, jnode is created
19105 +        * (jnode_of_page()->do_jget()) under page lock. This is done in
19106 +        * ->writepage(), or when capturing anonymous page dirtied through
19107 +        * mmap.
19108 +        *
19109 +        * Jnode without page is created by index_extent_jnode().
19110 +        *
19111 +        */
19112 +
19113 +       jnode *result;
19114 +       oid_t oid = get_inode_oid(pg->mapping->host);
19115 +
19116 +       assert("umka-176", pg != NULL);
19117 +       assert("nikita-2394", PageLocked(pg));
19118 +
19119 +       result = jprivate(pg);
19120 +       if (likely(result != NULL))
19121 +               return jref(result);
19122 +
19123 +       tree = reiser4_tree_by_page(pg);        /* NB: overrides @tree argument */
19124 +
19125 +       /* check inode's radix tree of jnodes first */
19126 +       result = jfind(pg->mapping, pg->index);
19127 +       if (unlikely(result != NULL)) {
19128 +               spin_lock_jnode(result);
19129 +               jnode_attach_page(result, pg);
19130 +               spin_unlock_jnode(result);
19131 +               result->key.j.mapping = pg->mapping;
19132 +               return result;
19133 +       }
19134 +
19135 +       /* since page is locked, jnode should be allocated with GFP_NOFS flag */
19136 +       reiser4_ctx_gfp_mask_force(GFP_NOFS);
19137 +       result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19138 +       if (unlikely(IS_ERR(result)))
19139 +               return result;
19140 +       /* attach jnode to page */
19141 +       spin_lock_jnode(result);
19142 +       jnode_attach_page(result, pg);
19143 +       spin_unlock_jnode(result);
19144 +       return result;
19145 +}
19146 +
19147 +/*
19148 + * return jnode for locked page @pg, creating it if necessary, or ERR_PTR().
19149 + */
19150 +jnode *jnode_of_page(struct page *pg)
19151 +{
19152 +       jnode *result;
19153 +
19154 +       assert("umka-176", pg != NULL);
19155 +       assert("nikita-2394", PageLocked(pg));
19156 +
19157 +       result = do_jget(reiser4_tree_by_page(pg), pg);
19158 +
19159 +       if (REISER4_DEBUG && !IS_ERR(result)) {
19160 +               assert("nikita-3210", result == jprivate(pg));
19161 +               assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19162 +               if (jnode_is_unformatted(jprivate(pg))) {
19163 +                       assert("nikita-2364",
19164 +                              jprivate(pg)->key.j.index == pg->index);
19165 +                       assert("nikita-2367",
19166 +                              jprivate(pg)->key.j.mapping == pg->mapping);
19167 +                       assert("nikita-2365",
19168 +                              jprivate(pg)->key.j.objectid ==
19169 +                              get_inode_oid(pg->mapping->host));
19170 +                       assert("vs-1200",
19171 +                              jprivate(pg)->key.j.objectid ==
19172 +                              pg->mapping->host->i_ino);
19173 +                       assert("nikita-2356",
19174 +                              jnode_is_unformatted(jnode_by_page(pg)));
19175 +               }
19176 +               assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19177 +       }
19178 +       return result;
19179 +}
19180 +
19181 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19182 + * page. Takes a page reference; page and jnode must be locked by caller. */
19183 +void jnode_attach_page(jnode * node, struct page *pg)
19184 +{
19185 +       assert("nikita-2060", node != NULL);
19186 +       assert("nikita-2061", pg != NULL);
19187 +
19188 +       assert("nikita-2050", jprivate(pg) == 0ul);
19189 +       assert("nikita-2393", !PagePrivate(pg));
19190 +       assert("vs-1741", node->pg == NULL);
19191 +
19192 +       assert("nikita-2396", PageLocked(pg));
19193 +       assert_spin_locked(&(node->guard));
19194 +
19195 +       page_cache_get(pg);
19196 +       set_page_private(pg, (unsigned long)node);
19197 +       node->pg = pg;
19198 +       SetPagePrivate(pg);
19199 +}
19200 +
19201 +/* Dual to jnode_attach_page: unbind page and jnode, drop page reference */
19202 +void page_clear_jnode(struct page *page, jnode * node)
19203 +{
19204 +       assert("nikita-2424", page != NULL);
19205 +       assert("nikita-2425", PageLocked(page));
19206 +       assert("nikita-2426", node != NULL);
19207 +       assert_spin_locked(&(node->guard));
19208 +       assert("nikita-2428", PagePrivate(page));
19209 +
19210 +       assert("nikita-3551", !PageWriteback(page));
19211 +
19212 +       JF_CLR(node, JNODE_PARSED);
19213 +       set_page_private(page, 0ul);
19214 +       ClearPagePrivate(page);
19215 +       node->pg = NULL;
19216 +       page_cache_release(page);
19217 +}
19218 +
19219 +#if 0
19220 +/* compiled out: it was only used in one place to handle error */
19221 +void
19222 +page_detach_jnode(struct page *page, struct address_space *mapping,
19223 +                 unsigned long index)
19224 +{
19225 +       assert("nikita-2395", page != NULL);
19226 +
19227 +       lock_page(page);
19228 +       if ((page->mapping == mapping) && (page->index == index)
19229 +           && PagePrivate(page)) {
19230 +               jnode *node;
19231 +
19232 +               node = jprivate(page);
19233 +               spin_lock_jnode(node);
19234 +               page_clear_jnode(page, node);
19235 +               spin_unlock_jnode(node);
19236 +       }
19237 +       unlock_page(page);
19238 +}
19239 +#endif  /*  0  */
19240 +
19241 +/* return @node page locked, or NULL; jnode spin lock is held on return.
19242 +
19243 +   Locking ordering requires that one first takes page lock and afterwards
19244 +   spin lock on node attached to this page. Sometimes it is necessary to go in
19245 +   the opposite direction. This is done through standard trylock-and-release
19246 +   loop.
19247 +*/
19248 +static struct page *jnode_lock_page(jnode * node)
19249 +{
19250 +       struct page *page;
19251 +
19252 +       assert("nikita-2052", node != NULL);
19253 +       assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19254 +
19255 +       while (1) {
19256 +
19257 +               spin_lock_jnode(node);
19258 +               page = jnode_page(node);
19259 +               if (page == NULL)
19260 +                       break;
19261 +
19262 +               /* no need to page_cache_get( page ) here, because page cannot
19263 +                  be evicted from memory without detaching it from jnode and
19264 +                  this requires spin lock on jnode that we already hold.
19265 +                */
19266 +               if (trylock_page(page)) {
19267 +                       /* We won a lock on jnode page, proceed. */
19268 +                       break;
19269 +               }
19270 +
19271 +               /* Page is locked by someone else. */
19272 +               page_cache_get(page);
19273 +               spin_unlock_jnode(node);
19274 +               wait_on_page_locked(page);
19275 +               /* it is possible that page was detached from jnode and
19276 +                  returned to the free pool, or re-assigned while we were
19277 +                  waiting on locked bit. This will be rechecked on the next
19278 +                  loop iteration.
19279 +                */
19280 +               page_cache_release(page);
19281 +
19282 +               /* try again */
19283 +       }
19284 +       return page;
19285 +}
19286 +
19287 +/*
19288 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19289 + * validity of jnode content. Returns 0 or the error returned by ->parse().
19290 + */
19291 +static inline int jparse(jnode * node)
19292 +{
19293 +       int result;
19294 +
19295 +       assert("nikita-2466", node != NULL);
19296 +
19297 +       spin_lock_jnode(node);
19298 +       if (likely(!jnode_is_parsed(node))) {
19299 +               result = jnode_ops(node)->parse(node);
19300 +               if (likely(result == 0))
19301 +                       JF_SET(node, JNODE_PARSED);
19302 +       } else
19303 +               result = 0;
19304 +       spin_unlock_jnode(node);
19305 +       return result;
19306 +}
19307 +
19308 +/* Lock a page attached to jnode, create and attach page to jnode if it had no
19309 + * one. Returns the locked page, or ERR_PTR() if no page could be created. */
19310 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19311 +{
19312 +       struct page *page;
19313 +
19314 +       spin_lock_jnode(node);
19315 +       page = jnode_page(node);
19316 +
19317 +       if (page == NULL) {
19318 +               spin_unlock_jnode(node);
19319 +               page = find_or_create_page(jnode_get_mapping(node),
19320 +                                          jnode_get_index(node), gfp_flags);
19321 +               if (page == NULL)
19322 +                       return ERR_PTR(RETERR(-ENOMEM));
19323 +       } else {
19324 +               if (trylock_page(page)) {
19325 +                       spin_unlock_jnode(node);
19326 +                       return page;
19327 +               }
19328 +               page_cache_get(page);
19329 +               spin_unlock_jnode(node);
19330 +               lock_page(page);
19331 +               assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19332 +       }
19333 +
19334 +       spin_lock_jnode(node);
19335 +       if (!jnode_page(node))
19336 +               jnode_attach_page(node, page);
19337 +       spin_unlock_jnode(node);
19338 +
19339 +       page_cache_release(page);
19340 +       assert("zam-894", jnode_page(node) == page);
19341 +       return page;
19342 +}
19343 +
19344 +/* Start read of jnode's locked page; up-to-date page is just unlocked. */
19345 +static int jnode_start_read(jnode * node, struct page *page)
19346 +{
19347 +       assert("zam-893", PageLocked(page));
19348 +
19349 +       if (PageUptodate(page)) {
19350 +               unlock_page(page);
19351 +               return 0;
19352 +       }
19353 +       return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19354 +}
19355 +
19356 +#if REISER4_DEBUG
19357 +static void check_jload(jnode * node, struct page *page)
19358 +{
19359 +       if (jnode_is_znode(node)) {
19360 +               node40_header *nh;
19361 +               znode *z;
19362 +
19363 +               z = JZNODE(node);
19364 +               if (znode_is_any_locked(z)) {
19365 +                       nh = (node40_header *) kmap(page);
19366 +                       /* this only works for node40-only file systems. For
19367 +                        * debugging. */
19368 +                       assert("nikita-3253",
19369 +                              z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19370 +                       kunmap(page);
19371 +               }
19372 +               assert("nikita-3565", znode_invariant(z));
19373 +       }
19374 +}
19375 +#else /* !REISER4_DEBUG */
19376 +#define check_jload(node, page) noop
19377 +#endif /* REISER4_DEBUG */
19378 +
19379 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19380 + * to call jload() shortly. This will bring appropriate portion of jnode into
19381 + * CPU cache (prefetch-for-write of ->x_count). */
19382 +void jload_prefetch(jnode * node)
19383 +{
19384 +       prefetchw(&node->x_count);
19385 +}
19386 +
19387 +/* load jnode's data into memory. On success (0) x- and d-refs are held */
19388 +int jload_gfp(jnode * node /* node to load */ ,
19389 +             gfp_t gfp_flags /* allocation flags */ ,
19390 +             int do_kmap/* true if page should be kmapped */)
19391 +{
19392 +       struct page *page;
19393 +       int result = 0;
19394 +       int parsed;
19395 +
19396 +       assert("nikita-3010", reiser4_schedulable());
19397 +
19398 +       prefetchw(&node->pg);
19399 +
19400 +       /* taking d-reference implies taking x-reference. */
19401 +       jref(node);
19402 +
19403 +       /*
19404 +        * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19405 +        * should be atomic, otherwise there is a race against
19406 +        * reiser4_releasepage().
19407 +        */
19408 +       spin_lock(&(node->load));
19409 +       add_d_ref(node);
19410 +       parsed = jnode_is_parsed(node);
19411 +       spin_unlock(&(node->load));
19412 +
19413 +       if (unlikely(!parsed)) {
19414 +               page = jnode_get_page_locked(node, gfp_flags);
19415 +               if (unlikely(IS_ERR(page))) {
19416 +                       result = PTR_ERR(page);
19417 +                       goto failed;
19418 +               }
19419 +
19420 +               result = jnode_start_read(node, page);
19421 +               if (unlikely(result != 0))
19422 +                       goto failed;
19423 +
19424 +               wait_on_page_locked(page);
19425 +               if (unlikely(!PageUptodate(page))) {
19426 +                       result = RETERR(-EIO);
19427 +                       goto failed;
19428 +               }
19429 +
19430 +               if (do_kmap)
19431 +                       node->data = kmap(page);
19432 +
19433 +               result = jparse(node);
19434 +               if (unlikely(result != 0)) {
19435 +                       if (do_kmap)
19436 +                               kunmap(page);
19437 +                       goto failed;
19438 +               }
19439 +               check_jload(node, page);
19440 +       } else {
19441 +               page = jnode_page(node);
19442 +               check_jload(node, page);
19443 +               if (do_kmap)
19444 +                       node->data = kmap(page);
19445 +       }
19446 +
19447 +       if (!is_writeout_mode())
19448 +               /* We do not mark pages active if jload is called as a part of
19449 +                * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
19450 +                * and write_logs() add no value to cached data, there is no
19451 +                * sense to mark pages as active when they go to disk, it just
19452 +                * confuses vm scanning routines because clean page could be
19453 +                * moved out from inactive list as a result of this
19454 +                * mark_page_accessed() call. */
19455 +               mark_page_accessed(page);
19456 +
19457 +       return 0;
19458 +
19459 +failed:
19460 +       jrelse_tail(node);      /* drops d- and x-refs; page is not kmapped here */
19461 +       return result;
19462 +
19463 +}
19464 +
19465 +/* start asynchronous read of given jnode's page; returns 0 or error code */
19466 +int jstartio(jnode * node)
19467 +{
19468 +       struct page *page;
19469 +
19470 +       page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19471 +       if (IS_ERR(page))
19472 +               return PTR_ERR(page);
19473 +
19474 +       return jnode_start_read(node, page);
19475 +}
19476 +
19477 +/* Initialize a node by calling appropriate plugin instead of reading
19478 + * node from disk as in jload(). Returns 0 with x- and d-refs held, or error. */
19479 +int jinit_new(jnode * node, gfp_t gfp_flags)
19480 +{
19481 +       struct page *page;
19482 +       int result;
19483 +
19484 +       jref(node);
19485 +       add_d_ref(node);
19486 +
19487 +       page = jnode_get_page_locked(node, gfp_flags);
19488 +       if (IS_ERR(page)) {
19489 +               result = PTR_ERR(page);
19490 +               goto failed;
19491 +       }
19492 +
19493 +       SetPageUptodate(page);
19494 +       unlock_page(page);
19495 +
19496 +       node->data = kmap(page);
19497 +
19498 +       if (!jnode_is_parsed(node)) {
19499 +               jnode_plugin *jplug = jnode_ops(node);
19500 +               spin_lock_jnode(node);
19501 +               result = jplug->init(node);
19502 +               spin_unlock_jnode(node);
19503 +               if (result) {
19504 +                       kunmap(page);
19505 +                       goto failed;
19506 +               }
19507 +               JF_SET(node, JNODE_PARSED);
19508 +       }
19509 +
19510 +       return 0;
19511 +
19512 +failed:
19513 +       jrelse_tail(node);      /* not jrelse(): page is not kmapped here */
19514 +       return result;
19515 +}
19516 +
19517 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19518 +void jrelse_tail(jnode * node/* jnode to release references to */)
19519 +{
19520 +       assert("nikita-489", atomic_read(&node->d_count) > 0);
19521 +       atomic_dec(&node->d_count);
19522 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
19523 +               LOCK_CNT_DEC(d_refs);
19524 +       /* drop x-ref of jload_gfp()/jinit_new() last: no @node access after */
19525 +       jput(node);
19526 +}
19527 +
19528 +/* drop reference to node data. When last reference is dropped, data are
19529 +   unloaded. kunmap()s attached page (if any), then calls jrelse_tail(). */
19530 +void jrelse(jnode * node/* jnode to release references to */)
19531 +{
19532 +       struct page *page;
19533 +
19534 +       assert("nikita-487", node != NULL);
19535 +       assert_spin_not_locked(&(node->guard));
19536 +
19537 +       page = jnode_page(node);
19538 +       if (likely(page != NULL)) {
19539 +               /*
19540 +                * it is safe not to lock jnode here, because at this point
19541 +                * @node->d_count is greater than zero (if jrelse() is used
19542 +                * correctly, that is). JNODE_PARSED may be not set yet, if,
19543 +                * for example, we got here as a result of error handling path
19544 +                * in jload(). Anyway, page cannot be detached by
19545 +                * reiser4_releasepage(). truncate will invalidate page
19546 +                * regardless, but this should not be a problem.
19547 +                */
19548 +               kunmap(page);
19549 +       }
19550 +       jrelse_tail(node);
19551 +}
19552 +
19553 +/* called from jput_final() to wait for writeback of jnode's page */
19554 +static void jnode_finish_io(jnode * node)
19555 +{
19556 +       struct page *page;
19557 +
19558 +       assert("nikita-2922", node != NULL);
19559 +
19560 +       spin_lock_jnode(node);
19561 +       page = jnode_page(node);
19562 +       if (page != NULL) {
19563 +               page_cache_get(page);
19564 +               spin_unlock_jnode(node);
19565 +               wait_on_page_writeback(page);
19566 +               page_cache_release(page);
19567 +       } else
19568 +               spin_unlock_jnode(node);
19569 +}
19570 +
19571 +/*
19572 + * This is called by jput() when last reference to jnode is released. This is
19573 + * separate function, because we want fast path of jput() to be inline and,
19574 + * therefore, small.
19575 + */
19576 +void jput_final(jnode * node)
19577 +{
19578 +       int r_i_p;
19579 +
19580 +       /* A fast check for keeping node in cache. We always keep node in cache
19581 +        * if its page is present and node was not marked for deletion */
19582 +       if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19583 +               rcu_read_unlock();
19584 +               return;
19585 +       }
19586 +       r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19587 +       /*
19588 +        * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19589 +        * this case it is safe to access node after unlock.
19590 +        */
19591 +       rcu_read_unlock();
19592 +       if (r_i_p) {
19593 +               jnode_finish_io(node);
19594 +               if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19595 +                       /* node is removed from the tree. */
19596 +                       jdelete(node);
19597 +               else
19598 +                       jnode_try_drop(node);
19599 +       }
19600 +       /* if !r_i_p some other thread is already killing it */
19601 +}
19602 +
19603 +int jwait_io(jnode * node, int rw /* READ or WRITE: io to wait for */)
19604 +{
19605 +       struct page *page;
19606 +       int result; /* 0, or -EIO if the page ends up with its error flag set */
19607 +
19608 +       assert("zam-447", node != NULL);
19609 +       assert("zam-448", jnode_page(node) != NULL);
19610 +
19611 +       page = jnode_page(node);
19612 +
19613 +       result = 0;
19614 +       if (rw == READ) {
19615 +               wait_on_page_locked(page);
19616 +       } else {
19617 +               assert("nikita-2227", rw == WRITE);
19618 +               wait_on_page_writeback(page);
19619 +       }
19620 +       if (PageError(page))
19621 +               result = RETERR(-EIO);
19622 +
19623 +       return result;
19624 +}
19625 +
19626 +/*
19627 + * jnode types and plugins.
19628 + *
19629 + * jnode by itself is a "base type". There are several different jnode
19630 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19631 + * has to do different things based on jnode type. In the standard reiser4 way
19632 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19633 + *
19634 + * Functions below deal with jnode types and define methods of jnode plugin.
19635 + *
19636 + */
19637 +
19638 +/* set jnode type (encoded into ->state); done during jnode initialization */
19639 +static void jnode_set_type(jnode * node, jnode_type type)
19640 +{
19641 +       static const unsigned long type_to_mask[] = {
19642 +               [JNODE_UNFORMATTED_BLOCK] = 1,
19643 +               [JNODE_FORMATTED_BLOCK] = 0,
19644 +               [JNODE_BITMAP] = 2,
19645 +               [JNODE_IO_HEAD] = 6,
19646 +               [JNODE_INODE] = 4
19647 +       };
19648 +
19649 +       assert("zam-647", type < LAST_JNODE_TYPE);
19650 +       assert("nikita-2815", !jnode_is_loaded(node));
19651 +       assert("nikita-3386", node->state == 0);
19652 +       /* ->state is still zero (asserted above): store the type bits */
19653 +       node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19654 +}
19655 +
19656 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19657 + * specific initialization. */
19658 +static int init_noinit(jnode * node UNUSED_ARG)
19659 +{
19660 +       return 0;
19661 +}
19662 +
19663 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19664 + * specific parsing. Does nothing and always returns 0. */
19665 +static int parse_noparse(jnode * node UNUSED_ARG)
19666 +{
19667 +       return 0;
19668 +}
19669 +
19670 +/* ->mapping() method for unformatted jnode */
19671 +struct address_space *mapping_jnode(const jnode * node)
19672 +{
19673 +       struct address_space *map;
19674 +
19675 +       assert("nikita-2713", node != NULL);
19676 +
19677 +       /* mapping is stored in jnode */
19678 +
19679 +       map = node->key.j.mapping;
19680 +       assert("nikita-2714", map != NULL);
19681 +       assert("nikita-2897", is_reiser4_inode(map->host));
19682 +       assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19683 +       return map;
19684 +}
19685 +
19686 +/* ->index() method for unformatted jnodes */
19687 +unsigned long index_jnode(const jnode * node)
19688 +{
19689 +       /* index is stored in jnode */
19690 +       return node->key.j.index;
19691 +}
19692 +
19693 +/* ->remove() method for unformatted jnodes */
19694 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19695 +{
19696 +       /* remove jnode from hash table and radix tree */
19697 +       if (node->key.j.mapping)
19698 +               unhash_unformatted_node_nolock(node);
19699 +}
19700 +
19701 +/* ->mapping() method for znodes */
19702 +static struct address_space *mapping_znode(const jnode * node)
19703 +{
19704 +       /* all znodes belong to fake inode */
19705 +       return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19706 +}
19707 +
19708 +/* ->index() method for znodes */
19709 +static unsigned long index_znode(const jnode * node)
19710 +{
19711 +       unsigned long addr;
19712 +       assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19713 +
19714 +       /* index of znode is just its address (shifted) */
19715 +       addr = (unsigned long)node;
19716 +       return (addr - PAGE_OFFSET) >> znode_shift_order;
19717 +}
19718 +
19719 +/* ->mapping() method for bitmap jnode */
19720 +static struct address_space *mapping_bitmap(const jnode * node)
19721 +{
19722 +       /* all bitmap blocks belong to special bitmap inode */
19723 +       return get_super_private(jnode_get_tree(node)->super)->bitmap->
19724 +           i_mapping;
19725 +}
19726 +
19727 +/* ->index() method for jnodes that are indexed by address */
19728 +static unsigned long index_is_address(const jnode * node)
19729 +{
19730 +       unsigned long ind;
19731 +
19732 +       ind = (unsigned long)node;
19733 +       return ind - PAGE_OFFSET;
19734 +}
19735 +
19736 +/* resolve race with jput: returns @node, or NULL if JNODE_RIP is set on it */
19737 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19738 +{
19739 +       /*
19740 +        * This is used as part of RCU-based jnode handling.
19741 +        *
19742 +        * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19743 +        * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19744 +        * not protected during this, so concurrent thread may execute
19745 +        * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19746 +        * freed in jput_final(). To avoid such races, jput_final() sets
19747 +        * JNODE_RIP on jnode (under tree lock). All places that work with
19748 +        * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19749 +        * (first without taking tree lock), and if this bit is set, releases
19750 +        * the reference acquired by the current thread and returns NULL.
19751 +        *
19752 +        * As a result, if jnode is being concurrently freed, NULL is returned
19753 +        * and caller should pretend that jnode wasn't found in the first
19754 +        * place.
19755 +        *
19756 +        * Otherwise it's safe to release "rcu-read-lock" and continue with
19757 +        * jnode.
19758 +        */
19759 +       if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19760 +               read_lock_tree(tree);
19761 +               if (JF_ISSET(node, JNODE_RIP)) {
19762 +                       dec_x_ref(node);
19763 +                       node = NULL;
19764 +               }
19765 +               read_unlock_tree(tree);
19766 +       }
19767 +       return node;
19768 +}
19769 +
19770 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19771 +{
19772 +       struct inode *inode;
19773 +       item_plugin *iplug;
19774 +       loff_t off;
19775 +
19776 +       assert("nikita-3092", node != NULL);
19777 +       assert("nikita-3093", key != NULL);
19778 +       assert("nikita-3094", jnode_is_unformatted(node));
19779 +
19780 +       off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19781 +       inode = mapping_jnode(node)->host;
19782 +
19783 +       if (node->parent_item_id != 0)
19784 +               iplug = item_plugin_by_id(node->parent_item_id);
19785 +       else
19786 +               iplug = NULL;
19787 +
19788 +       if (iplug != NULL && iplug->f.key_by_offset)
19789 +               iplug->f.key_by_offset(inode, off, key);
19790 +       else {
19791 +               file_plugin *fplug;
19792 +
19793 +               fplug = inode_file_plugin(inode);
19794 +               assert("zam-1007", fplug != NULL);
19795 +               assert("zam-1008", fplug->key_by_inode != NULL);
19796 +
19797 +               fplug->key_by_inode(inode, off, key);
19798 +       }
19799 +
19800 +       return key;
19801 +}
19802 +
19803 +/* ->parse() method for formatted nodes */
19804 +static int parse_znode(jnode * node)
19805 +{
19806 +       return zparse(JZNODE(node));
19807 +}
19808 +
19809 +/* ->delete() method for formatted nodes */
19810 +static void delete_znode(jnode * node, reiser4_tree * tree)
19811 +{
19812 +       znode *z;
19813 +
19814 +       assert_rw_write_locked(&(tree->tree_lock));
19815 +       assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19816 +
19817 +       z = JZNODE(node);
19818 +       assert("vs-899", z->c_count == 0);
19819 +
19820 +       /* delete znode from sibling list. */
19821 +       sibling_list_remove(z);
19822 +
19823 +       znode_remove(z, tree);
19824 +}
19825 +
19826 +/* ->remove() method for formatted nodes */
19827 +static int remove_znode(jnode * node, reiser4_tree * tree)
19828 +{
19829 +       znode *z;
19830 +
19831 +       assert_rw_write_locked(&(tree->tree_lock));
19832 +       z = JZNODE(node);
19833 +
19834 +       if (z->c_count == 0) {
19835 +               /* detach znode from sibling list. */
19836 +               sibling_list_drop(z);
19837 +               /* this is called with tree spin-lock held, so call
19838 +                  znode_remove() directly (rather than znode_lock_remove()). */
19839 +               znode_remove(z, tree);
19840 +               return 0;
19841 +       }
19842 +       return RETERR(-EBUSY);
19843 +}
19844 +
19845 +/* ->init() method for formatted nodes */
19846 +static int init_znode(jnode * node)
19847 +{
19848 +       znode *z;
19849 +
19850 +       z = JZNODE(node);
19851 +       /* call node plugin to do actual initialization */
19852 +       return z->nplug->init(z);
19853 +}
19854 +
19855 +/* ->clone() method for formatted nodes */
19856 +static jnode *clone_formatted(jnode * node)
19857 +{
19858 +       znode *clone;
19859 +
19860 +       assert("vs-1430", jnode_is_znode(node));
19861 +       clone = zalloc(reiser4_ctx_gfp_mask_get());
19862 +       if (clone == NULL)
19863 +               return ERR_PTR(RETERR(-ENOMEM));
19864 +       zinit(clone, NULL, current_tree);
19865 +       jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19866 +       /* ZJNODE(clone)->key.z is not initialized */
19867 +       clone->level = JZNODE(node)->level;
19868 +
19869 +       return ZJNODE(clone);
19870 +}
19871 +
19872 +/* jplug->clone for unformatted nodes */
19873 +static jnode *clone_unformatted(jnode * node)
19874 +{
19875 +       jnode *clone;
19876 +
19877 +       assert("vs-1431", jnode_is_unformatted(node));
19878 +       clone = jalloc();
19879 +       if (clone == NULL)
19880 +               return ERR_PTR(RETERR(-ENOMEM));
19881 +
19882 +       jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19883 +       jnode_set_block(clone, jnode_get_block(node));
19884 +
19885 +       return clone;
19886 +
19887 +}
19888 +
19889 +/*
19890 + * Setup jnode plugin methods for various jnode types.
19891 + */
19892 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19893 +       [JNODE_UNFORMATTED_BLOCK] = {
19894 +               .h = {
19895 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19896 +                       .id = JNODE_UNFORMATTED_BLOCK,
19897 +                       .pops = NULL,
19898 +                       .label = "unformatted",
19899 +                       .desc = "unformatted node",
19900 +                       .linkage = {NULL, NULL}
19901 +               },
19902 +               .init = init_noinit,
19903 +               .parse = parse_noparse,
19904 +               .mapping = mapping_jnode,
19905 +               .index = index_jnode,
19906 +               .clone = clone_unformatted
19907 +       },
19908 +       [JNODE_FORMATTED_BLOCK] = {
19909 +               .h = {
19910 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19911 +                       .id = JNODE_FORMATTED_BLOCK,
19912 +                       .pops = NULL,
19913 +                       .label = "formatted",
19914 +                       .desc = "formatted tree node",
19915 +                       .linkage = {NULL, NULL}
19916 +               },
19917 +               .init = init_znode,
19918 +               .parse = parse_znode,
19919 +               .mapping = mapping_znode,
19920 +               .index = index_znode,
19921 +               .clone = clone_formatted
19922 +       },
19923 +       [JNODE_BITMAP] = {
19924 +               .h = {
19925 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19926 +                       .id = JNODE_BITMAP,
19927 +                       .pops = NULL,
19928 +                       .label = "bitmap",
19929 +                       .desc = "bitmap node",
19930 +                       .linkage = {NULL, NULL}
19931 +               },
19932 +               .init = init_noinit,
19933 +               .parse = parse_noparse,
19934 +               .mapping = mapping_bitmap,
19935 +               .index = index_is_address,
19936 +               .clone = NULL
19937 +       },
19938 +       [JNODE_IO_HEAD] = {
19939 +               .h = {
19940 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19941 +                       .id = JNODE_IO_HEAD,
19942 +                       .pops = NULL,
19943 +                       .label = "io head",
19944 +                       .desc = "io head",
19945 +                       .linkage = {NULL, NULL}
19946 +               },
19947 +               .init = init_noinit,
19948 +               .parse = parse_noparse,
19949 +               .mapping = mapping_bitmap,
19950 +               .index = index_is_address,
19951 +               .clone = NULL
19952 +       },
19953 +       [JNODE_INODE] = {
19954 +               .h = {
19955 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19956 +                       .id = JNODE_INODE,
19957 +                       .pops = NULL,
19958 +                       .label = "inode",
19959 +                       .desc = "inode's builtin jnode",
19960 +                       .linkage = {NULL, NULL}
19961 +               },
19962 +               .init = NULL,
19963 +               .parse = NULL,
19964 +               .mapping = NULL,
19965 +               .index = NULL,
19966 +               .clone = NULL
19967 +       }
19968 +};
19969 +
19970 +/*
19971 + * jnode destruction.
19972 + *
19973 + * Thread may use a jnode after it acquired a reference to it. References are
19974 + * counted in ->x_count field. Reference protects jnode from being
19975 + * recycled. This is different from protecting jnode data (that are stored in
19976 + * jnode page) from being evicted from memory. Data are protected by jload()
19977 + * and released by jrelse().
19978 + *
19979 + * If thread already possesses a reference to the jnode it can acquire another
19980 + * one through jref(). Initial reference is obtained (usually) by locating
19981 + * jnode in some indexing structure that depends on jnode type: formatted
19982 + * nodes are kept in global hash table, where they are indexed by block
19983 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19984 + * table, which is indexed by oid and offset within file, and in per-inode
19985 + * radix tree.
19986 + *
19987 + * Reference to jnode is released by jput(). If last reference is released,
19988 + * jput_final() is called. This function determines whether jnode has to be
19989 + * deleted (this happens when corresponding node is removed from the file
19990 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19991 + * should be just "removed" (deleted from memory).
19992 + *
19993 + * Jnode destruction is signally delicate dance because of locking and RCU.
19994 + */
19995 +
19996 +/*
19997 + * Returns true if jnode cannot be removed right now. This check is called
19998 + * under tree lock. If it returns true, jnode is irrevocably committed to be
19999 + * deleted/removed.
20000 + */
20001 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20002 +{
20003 +       /* if other thread managed to acquire a reference to this jnode, don't
20004 +        * free it. */
20005 +       if (atomic_read(&node->x_count) > 0)
20006 +               return 1;
20007 +       /* also, don't free znode that has children in memory */
20008 +       if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20009 +               return 1;
20010 +       return 0;
20011 +}
20012 +
20013 +/*
20014 + * this is called as part of removing jnode. Based on jnode type, call
20015 + * corresponding function that removes jnode from indices and returns it back
20016 + * to the appropriate slab (through RCU).
20017 + */
20018 +static inline void
20019 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20020 +{
20021 +       switch (jtype) {
20022 +       case JNODE_UNFORMATTED_BLOCK:
20023 +               remove_jnode(node, tree);
20024 +               break;
20025 +       case JNODE_IO_HEAD:
20026 +       case JNODE_BITMAP:
20027 +               break;
20028 +       case JNODE_INODE:
20029 +               break;
20030 +       case JNODE_FORMATTED_BLOCK:
20031 +               remove_znode(node, tree);
20032 +               break;
20033 +       default:
20034 +               wrong_return_value("nikita-3196", "Wrong jnode type");
20035 +       }
20036 +}
20037 +
20038 +/*
20039 + * this is called as part of deleting jnode. Based on jnode type, call
20040 + * corresponding function that removes jnode from indices and returns it back
20041 + * to the appropriate slab (through RCU).
20042 + *
20043 + * This differs from jnode_remove() only for formatted nodes---for them
20044 + * sibling list handling is different for removal and deletion.
20045 + */
20046 +static inline void
20047 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree)
20048 +{
20049 +       switch (jtype) {
20050 +       case JNODE_UNFORMATTED_BLOCK:
20051 +               remove_jnode(node, tree);
20052 +               break;
20053 +       case JNODE_IO_HEAD:
20054 +       case JNODE_BITMAP:
20055 +               break; /* nothing to do for these types */
20056 +       case JNODE_FORMATTED_BLOCK:
20057 +               delete_znode(node, tree);
20058 +               break;
20059 +       case JNODE_INODE: /* reported as a wrong type, like unknown values */
20060 +       default:
20061 +               wrong_return_value("nikita-3195", "Wrong jnode type");
20062 +       }
20063 +}
20064 +
20065 +#if REISER4_DEBUG
20066 +/*
20067 + * remove jnode from the debugging list of all jnodes hanging off super-block.
20068 + */
20069 +void jnode_list_remove(jnode * node)
20070 +{
20071 +       reiser4_super_info_data *sbinfo;
20072 +
20073 +       sbinfo = get_super_private(jnode_get_tree(node)->super);
20074 +
20075 +       spin_lock_irq(&sbinfo->all_guard);
20076 +       assert("nikita-2422", !list_empty(&node->jnodes));
20077 +       list_del_init(&node->jnodes);
20078 +       spin_unlock_irq(&sbinfo->all_guard);
20079 +}
20080 +#endif
20081 +
20082 +/*
20083 + * this is called by jput_final() to remove jnode when last reference to it is
20084 + * released.
20085 + */
20086 +static int jnode_try_drop(jnode * node)
20087 +{
20088 +       int result;
20089 +       reiser4_tree *tree;
20090 +       jnode_type jtype;
20091 +
20092 +       assert("nikita-2491", node != NULL);
20093 +       assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20094 +
20095 +       tree = jnode_get_tree(node);
20096 +       jtype = jnode_get_type(node);
20097 +
20098 +       spin_lock_jnode(node);
20099 +       write_lock_tree(tree);
20100 +       /*
20101 +        * if jnode has a page---leave it alone. Memory pressure will
20102 +        * eventually kill page and jnode.
20103 +        */
20104 +       if (jnode_page(node) != NULL) {
20105 +               write_unlock_tree(tree);
20106 +               spin_unlock_jnode(node);
20107 +               JF_CLR(node, JNODE_RIP);
20108 +               return RETERR(-EBUSY);
20109 +       }
20110 +
20111 +       /* re-check ->x_count under tree lock. */
20112 +       result = jnode_is_busy(node, jtype);
20113 +       if (result == 0) {
20114 +               assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20115 +               assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20116 +
20117 +               spin_unlock_jnode(node);
20118 +               /* no page and no references---despatch him. */
20119 +               jnode_remove(node, jtype, tree);
20120 +               write_unlock_tree(tree);
20121 +               jnode_free(node, jtype);
20122 +       } else {
20123 +               /* busy check failed: reference was acquired by concurrent
20124 +                * thread. */
20125 +               write_unlock_tree(tree);
20126 +               spin_unlock_jnode(node);
20127 +               JF_CLR(node, JNODE_RIP);
20128 +       }
20129 +       return result;
20130 +}
20131 +
20132 +/* jdelete() -- Delete jnode from the tree and file system */
20133 +static int jdelete(jnode * node/* jnode to finish with */)
20134 +{
20135 +       struct page *page;
20136 +       int result;
20137 +       reiser4_tree *tree;
20138 +       jnode_type jtype;
20139 +
20140 +       assert("nikita-467", node != NULL);
20141 +       assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20142 +
20143 +       jtype = jnode_get_type(node);
20144 +
20145 +       page = jnode_lock_page(node);
20146 +       assert_spin_locked(&(node->guard));
20147 +
20148 +       tree = jnode_get_tree(node);
20149 +
20150 +       write_lock_tree(tree);
20151 +       /* re-check ->x_count under tree lock. */
20152 +       result = jnode_is_busy(node, jtype);
20153 +       if (likely(!result)) {
20154 +               assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20155 +               assert("jmacd-511", atomic_read(&node->d_count) == 0);
20156 +
20157 +               /* detach page */
20158 +               if (page != NULL) {
20159 +                       /*
20160 +                        * FIXME this is racy against jnode_extent_write().
20161 +                        */
20162 +                       page_clear_jnode(page, node);
20163 +               }
20164 +               spin_unlock_jnode(node);
20165 +               /* goodbye */
20166 +               jnode_delete(node, jtype, tree);
20167 +               write_unlock_tree(tree);
20168 +               jnode_free(node, jtype);
20169 +               /* @node is no longer valid pointer */
20170 +               if (page != NULL)
20171 +                       reiser4_drop_page(page);
20172 +       } else {
20173 +               /* busy check failed: reference was acquired by concurrent
20174 +                * thread. */
20175 +               JF_CLR(node, JNODE_RIP);
20176 +               write_unlock_tree(tree);
20177 +               spin_unlock_jnode(node);
20178 +               if (page != NULL)
20179 +                       unlock_page(page);
20180 +       }
20181 +       return result;
20182 +}
20183 +
20184 +/* drop jnode on the floor.
20185 +
20186 +   Return value:
20187 +
20188 +    -EBUSY:  failed to drop jnode, because there are still references to it
20189 +
20190 +    0:       successfully dropped jnode
20191 +
20192 +*/
20193 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20194 +{
20195 +       struct page *page;
20196 +       jnode_type jtype;
20197 +       int result;
20198 +
20199 +       assert("zam-602", node != NULL);
20200 +       assert_rw_not_read_locked(&(tree->tree_lock));
20201 +       assert_rw_not_write_locked(&(tree->tree_lock));
20202 +       assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20203 +
20204 +       jtype = jnode_get_type(node);
20205 +
20206 +       page = jnode_lock_page(node);
20207 +       assert_spin_locked(&(node->guard));
20208 +
20209 +       write_lock_tree(tree);
20210 +
20211 +       /* re-check ->x_count under tree lock. */
20212 +       result = jnode_is_busy(node, jtype);
20213 +       if (!result) {
20214 +               assert("nikita-2488", page == jnode_page(node));
20215 +               assert("nikita-2533", atomic_read(&node->d_count) == 0);
20216 +               if (page != NULL) {
20217 +                       assert("nikita-2126", !PageDirty(page));
20218 +                       assert("nikita-2127", PageUptodate(page));
20219 +                       assert("nikita-2181", PageLocked(page));
20220 +                       page_clear_jnode(page, node);
20221 +               }
20222 +               spin_unlock_jnode(node);
20223 +               jnode_remove(node, jtype, tree);
20224 +               write_unlock_tree(tree);
20225 +               jnode_free(node, jtype);
20226 +               if (page != NULL)
20227 +                       reiser4_drop_page(page);
20228 +       } else {
20229 +               /* busy check failed: reference was acquired by concurrent
20230 +                * thread. */
20231 +               JF_CLR(node, JNODE_RIP);
20232 +               write_unlock_tree(tree);
20233 +               spin_unlock_jnode(node);
20234 +               if (page != NULL)
20235 +                       unlock_page(page);
20236 +       }
20237 +       return result;
20238 +}
20239 +
20240 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20241 +   be 0 (where applicable).  */
20242 +void jdrop(jnode * node)
20243 +{
20244 +       jdrop_in_tree(node, jnode_get_tree(node));
20245 +}
20246 +
20247 +/* IO head jnode implementation; The io heads are simple j-nodes with limited
20248 +   functionality (these j-nodes are not in any hash table) just for reading
20249 +   from and writing to disk. */
20250 +
20251 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20252 +{
20253 +       jnode *jal = jalloc();
20254 +
20255 +       if (jal != NULL) {
20256 +               jnode_init(jal, current_tree, JNODE_IO_HEAD);
20257 +               jnode_set_block(jal, block);
20258 +               /* reference only a real jnode: jalloc() may return NULL */
20259 +               jref(jal);
20260 +       }
20261 +
20262 +       return jal; /* NULL if jalloc() failed; caller must check */
20263 +}
20264 +
20265 +void reiser4_drop_io_head(jnode * node)
20266 +{
20267 +       assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20268 +
20269 +       jput(node);
20270 +       jdrop(node);
20271 +}
20272 +
20273 +/* protect jnode data from reiser4_releasepage() by taking a page reference */
20274 +void pin_jnode_data(jnode * node)
20275 +{
20276 +       assert("zam-671", jnode_page(node) != NULL);
20277 +       page_cache_get(jnode_page(node));
20278 +}
20279 +
20280 +/* make jnode data free-able again */
20281 +void unpin_jnode_data(jnode * node)
20282 +{
20283 +       assert("zam-672", jnode_page(node) != NULL);
20284 +       page_cache_release(jnode_page(node));
20285 +}
20286 +
20287 +struct address_space *jnode_get_mapping(const jnode * node)
20288 +{
20289 +       assert("nikita-3162", node != NULL);
20290 +       return jnode_ops(node)->mapping(node);
20291 +}
20292 +
20293 +#if REISER4_DEBUG
20294 +/* debugging aid: jnode invariant */
20295 +int jnode_invariant_f(const jnode * node, char const **msg)
20296 +{
20297 +#define _ergo(ant, con)                                                \
20298 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20299 +#define _check(exp) ((*msg) = #exp, (exp))
20300 +
20301 +       return _check(node != NULL) &&
20302 +           /* [jnode-queued] */
20303 +           /* only relocated node can be queued, except that when znode
20304 +            * is being deleted, its JNODE_RELOC bit is cleared */
20305 +           _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20306 +                 JF_ISSET(node, JNODE_RELOC) ||
20307 +                 JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20308 +           _check(node->jnodes.prev != NULL) &&
20309 +           _check(node->jnodes.next != NULL) &&
20310 +           /* [jnode-dirty] invariant */
20311 +           /* dirty jnode is part of atom */
20312 +           _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20313 +           /* [jnode-oid] invariant */
20314 +           /* for unformatted node ->objectid and ->mapping fields are
20315 +            * consistent */
20316 +           _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20317 +                 node->key.j.objectid ==
20318 +                 get_inode_oid(node->key.j.mapping->host)) &&
20319 +           /* [jnode-atom-valid] invariant */
20320 +           /* node atom has valid state */
20321 +           _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20322 +           /* [jnode-page-binding] invariant */
20323 +           /* if node points to page, it points back to node */
20324 +           _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20325 +           /* [jnode-refs] invariant */
20326 +           /* only referenced jnode can be loaded */
20327 +           _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20328 +
20329 +}
20330 +
20331 +static const char *jnode_type_name(jnode_type type)
20332 +{
20333 +       switch (type) {
20334 +       case JNODE_UNFORMATTED_BLOCK:
20335 +               return "unformatted";
20336 +       case JNODE_FORMATTED_BLOCK:
20337 +               return "formatted";
20338 +       case JNODE_BITMAP:
20339 +               return "bitmap";
20340 +       case JNODE_IO_HEAD:
20341 +               return "io head";
20342 +       case JNODE_INODE:
20343 +               return "inode";
20344 +       case LAST_JNODE_TYPE:
20345 +               return "last";
20346 +       default:{
20347 +                       static char unknown[30];
20348 +
20349 +                       sprintf(unknown, "unknown %i", type);
20350 +                       return unknown;
20351 +               }
20352 +       }
20353 +}
20354 +
20355 +#define jnode_state_name(node, flag)                   \
20356 +       (JF_ISSET((node), (flag)) ? ((#flag "|")+6) : "")
20357 +
20358 +/* debugging aid: printk human readable state of @node (no trailing newline) */
20359 +static void info_jnode(const char *prefix /* prefix to print */ ,
20360 +                      const jnode * node/* node to print */)
20361 +{
20362 +       assert("umka-068", prefix != NULL);
20363 +
20364 +       if (node == NULL) {
20365 +               printk("%s: null\n", prefix);
20366 +               return;
20367 +       }
20368 +       /* flag names are printed '|'-terminated, without the JNODE_ prefix */
20369 +       printk
20370 +           ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20371 +            " block: %s, d_count: %d, x_count: %d, "
20372 +            "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20373 +            node->state,
20374 +            jnode_state_name(node, JNODE_PARSED),
20375 +            jnode_state_name(node, JNODE_HEARD_BANSHEE),
20376 +            jnode_state_name(node, JNODE_LEFT_CONNECTED),
20377 +            jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20378 +            jnode_state_name(node, JNODE_ORPHAN),
20379 +            jnode_state_name(node, JNODE_CREATED),
20380 +            jnode_state_name(node, JNODE_RELOC),
20381 +            jnode_state_name(node, JNODE_OVRWR),
20382 +            jnode_state_name(node, JNODE_DIRTY),
20383 +            jnode_state_name(node, JNODE_IS_DYING),
20384 +            jnode_state_name(node, JNODE_RIP),
20385 +            jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20386 +            jnode_state_name(node, JNODE_WRITEBACK),
20387 +            jnode_state_name(node, JNODE_NEW),
20388 +            jnode_state_name(node, JNODE_DKSET),
20389 +            jnode_state_name(node, JNODE_REPACK),
20390 +            jnode_state_name(node, JNODE_CLUSTER_PAGE),
20391 +            jnode_get_level(node), sprint_address(jnode_get_block(node)),
20392 +            atomic_read(&node->d_count), atomic_read(&node->x_count),
20393 +            jnode_page(node), node->atom, 0, 0,        /* "lock" is always 0:0 */
20394 +            jnode_type_name(jnode_get_type(node)));
20395 +       if (jnode_is_unformatted(node)) {
20396 +               printk("inode: %llu, index: %lu, ",
20397 +                      node->key.j.objectid, node->key.j.index);
20398 +       }
20399 +}
20400 +
20401 +/* debugging aid: check jnode invariant; warns and returns 0 if it doesn't hold */
20402 +static int jnode_invariant(jnode * node, int tlocked, int jlocked)
20403 +{
20404 +       char const *failed_msg;
20405 +       int result;
20406 +       reiser4_tree *tree;
20407 +
20408 +       assert("umka-063312", node != NULL);    /* check before first use */
20409 +
20410 +       tree = jnode_get_tree(node);
20411 +       assert("umka-064321", tree != NULL);
20412 +
20413 +       if (!jlocked && !tlocked)       /* jnode lock only if neither is set */
20414 +               spin_lock_jnode(node);
20415 +       if (!tlocked)                   /* tree read lock unless @tlocked */
20416 +               read_lock_tree(tree);
20417 +       result = jnode_invariant_f(node, &failed_msg);
20418 +       if (!result) {
20419 +               info_jnode("corrupted node", node);
20420 +               warning("jmacd-555", "Condition %s failed", failed_msg);
20421 +       }
20422 +       if (!tlocked)
20423 +               read_unlock_tree(tree);
20424 +       if (!jlocked && !tlocked)
20425 +               spin_unlock_jnode(node);
20426 +       return result;
20427 +}
20428 +
20429 +#endif                         /* REISER4_DEBUG */
20430 +
20431 +/* Make Linus happy.
20432 +   Local variables:
20433 +   c-indentation-style: "K&R"
20434 +   mode-name: "LC"
20435 +   c-basic-offset: 8
20436 +   tab-width: 8
20437 +   fill-column: 80
20438 +   End:
20439 +*/
20440 diff -urN linux-2.6.35.orig/fs/reiser4/jnode.h linux-2.6.35/fs/reiser4/jnode.h
20441 --- linux-2.6.35.orig/fs/reiser4/jnode.h        1970-01-01 01:00:00.000000000 +0100
20442 +++ linux-2.6.35/fs/reiser4/jnode.h     2010-08-04 15:44:57.000000000 +0200
20443 @@ -0,0 +1,704 @@
20444 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20445 + * reiser4/README */
20446 +
20447 +/* Declaration of jnode. See jnode.c for details. */
20448 +
20449 +#ifndef __JNODE_H__
20450 +#define __JNODE_H__
20451 +
20452 +#include "forward.h"
20453 +#include "type_safe_hash.h"
20454 +#include "txnmgr.h"
20455 +#include "key.h"
20456 +#include "debug.h"
20457 +#include "dformat.h"
20458 +#include "page_cache.h"
20459 +#include "context.h"
20460 +
20461 +#include "plugin/plugin.h"
20462 +
20463 +#include <linux/fs.h>
20464 +#include <linux/mm.h>
20465 +#include <linux/spinlock.h>
20466 +#include <asm/atomic.h>
20467 +#include <linux/bitops.h>
20468 +#include <linux/list.h>
20469 +#include <linux/rcupdate.h>
20470 +
20471 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20472 +   nodes)  */
20473 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20474 +
20475 +/* declare hash table of znodes */
20476 +TYPE_SAFE_HASH_DECLARE(z, znode);
20477 +
20478 +struct jnode_key {
20479 +       __u64 objectid;
20480 +       unsigned long index;
20481 +       struct address_space *mapping;
20482 +};
20483 +
20484 +/*
20485 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20486 +   be exactly the node we use for unformatted tree nodes.
20487 +
20488 +   Jnode provides following basic functionality:
20489 +
20490 +   . reference counting and indexing.
20491 +
20492 +   . integration with page cache. Jnode has ->pg reference to which page can
20493 +   be attached.
20494 +
20495 +   . interface to transaction manager. It is jnode that is kept in transaction
20496 +   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20497 +   means, there should be special type of jnode for inode.)
20498 +
20499 +   Locking:
20500 +
20501 +   Spin lock: the following fields are protected by the per-jnode spin lock:
20502 +
20503 +    ->state
20504 +    ->atom
20505 +    ->capture_link
20506 +
20507 +   Following fields are protected by the global tree lock:
20508 +
20509 +    ->link
20510 +    ->key.z (content of ->key.z is only changed in znode_rehash())
20511 +    ->key.j
20512 +
20513 +   Atomic counters
20514 +
20515 +    ->x_count
20516 +    ->d_count
20517 +
20518 +    ->pg, and ->data are protected by spin lock for unused jnode and are
20519 +    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20520 +    is false).
20521 +
20522 +    ->tree is immutable after creation
20523 +
20524 +   Unclear
20525 +
20526 +    ->blocknr: should be under jnode spin-lock, but current interface is based
20527 +    on passing of block address.
20528 +
20529 +   If you ever need to spin lock two nodes at once, do this in "natural"
20530 +   memory order: lock znode with lower address first. (See lock_two_nodes().)
20531 +
20532 +   Invariants involving this data-type:
20533 +
20534 +      [jnode-dirty]
20535 +      [jnode-refs]
20536 +      [jnode-oid]
20537 +      [jnode-queued]
20538 +      [jnode-atom-valid]
20539 +      [jnode-page-binding]
20540 +*/
20541 +
20542 +struct jnode {
20543 +#if REISER4_DEBUG
20544 +#define JMAGIC 0x52654973      /* "ReIs" */
20545 +       int magic;
20546 +#endif
20547 +       /* FIRST CACHE LINE (16 bytes): data used by jload */
20548 +       /* NOTE(review): offsets below look 32-bit, without ->magic - confirm */
20549 +       /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20550 +       /*   0 */ unsigned long state;
20551 +
20552 +       /* lock, protecting jnode's fields. */
20553 +       /*   4 */ spinlock_t load;
20554 +
20555 +       /* counter of references to jnode itself. Increased on jref().
20556 +          Decreased on jput().
20557 +        */
20558 +       /*   8 */ atomic_t x_count;
20559 +
20560 +       /* counter of references to jnode's data. Pin data page(s) in
20561 +          memory while this is greater than 0. Increased on jload().
20562 +          Decreased on jrelse().
20563 +        */
20564 +       /*   12 */ atomic_t d_count;
20565 +
20566 +       /* SECOND CACHE LINE: data used by hash table lookups */
20567 +
20568 +       /*   16 */ union {
20569 +               /* znodes are hashed by block number */
20570 +               reiser4_block_nr z;
20571 +               /* unformatted nodes are hashed by mapping plus offset */
20572 +               struct jnode_key j;
20573 +       } key;
20574 +
20575 +       /* THIRD CACHE LINE */
20576 +
20577 +       /*   32 */ union {
20578 +               /* pointers to maintain hash-table */
20579 +               z_hash_link z;
20580 +               j_hash_link j;
20581 +       } link;
20582 +
20583 +       /* pointer to jnode page.  */
20584 +       /*   36 */ struct page *pg;
20585 +       /* pointer to node itself. This is page_address(node->pg) when page is
20586 +          attached to the jnode
20587 +        */
20588 +       /*   40 */ void *data;
20589 +
20590 +       /*   44 */ reiser4_tree *tree;
20591 +
20592 +       /* FOURTH CACHE LINE: atom related fields */
20593 +       /* spin lock taken by spin_lock_jnode()/spin_unlock_jnode() */
20594 +       /*   48 */ spinlock_t guard;
20595 +
20596 +       /* atom the block is in, if any */
20597 +       /*   52 */ txn_atom *atom;
20598 +
20599 +       /* capture list */
20600 +       /*   56 */ struct list_head capture_link;
20601 +
20602 +       /* FIFTH CACHE LINE */
20603 +
20604 +       /*   64 */ struct rcu_head rcu;
20605 +       /* crosses cache line */
20606 +
20607 +       /* SIXTH CACHE LINE */
20608 +
20609 +       /* the real blocknr (where io is going to/from) */
20610 +       /*   80 */ reiser4_block_nr blocknr;
20611 +       /* Parent item type, unformatted and CRC need it for
20612 +        * offset => key conversion.  */
20613 +       /* NOTE: this parent_item_id looks like jnode type. */
20614 +       /*   88 */ reiser4_plugin_id parent_item_id;
20615 +       /*   92 */
20616 +#if REISER4_DEBUG
20617 +       /* list of all jnodes for debugging purposes. */
20618 +       struct list_head jnodes;
20619 +       /* how many times this jnode was written in one transaction */
20620 +       int written;
20621 +       /* this indicates which atom's list the jnode is on */
20622 +       atom_list list;
20623 +#endif
20624 +} __attribute__ ((aligned(16)));
20625 +
20626 +/*
20627 + * jnode types. Enumeration of existing jnode types.
20628 + */
20629 +typedef enum {
20630 +       JNODE_UNFORMATTED_BLOCK,        /* unformatted block */
20631 +       JNODE_FORMATTED_BLOCK,  /* formatted block, znode */
20632 +       JNODE_BITMAP,           /* bitmap */
20633 +       JNODE_IO_HEAD,          /* jnode representing a block in the
20634 +                                * wandering log */
20635 +       JNODE_INODE,            /* jnode embedded into inode */
20636 +       LAST_JNODE_TYPE
20637 +} jnode_type;
20638 +
20639 +/* jnode states: bit numbers within jnode ->state, used via the JF_*() helpers */
20640 +typedef enum {
20641 +       /* jnode's page is loaded and data checked */
20642 +       JNODE_PARSED = 0,
20643 +       /* node was deleted, not all locks on it were released. This
20644 +          node is empty and is going to be removed from the tree
20645 +          shortly. */
20646 +       JNODE_HEARD_BANSHEE = 1,
20647 +       /* left sibling pointer is valid */
20648 +       JNODE_LEFT_CONNECTED = 2,
20649 +       /* right sibling pointer is valid */
20650 +       JNODE_RIGHT_CONNECTED = 3,
20651 +
20652 +       /* znode was just created and doesn't yet have a pointer from
20653 +          its parent */
20654 +       JNODE_ORPHAN = 4,
20655 +
20656 +       /* this node was created by its transaction and has not been assigned
20657 +          a block address. */
20658 +       JNODE_CREATED = 5,
20659 +
20660 +       /* this node is currently relocated */
20661 +       JNODE_RELOC = 6,
20662 +       /* this node is currently wandered */
20663 +       JNODE_OVRWR = 7,
20664 +
20665 +       /* this znode has been modified */
20666 +       JNODE_DIRTY = 8,
20667 +
20668 +       /* znode lock is being invalidated */
20669 +       JNODE_IS_DYING = 9,
20670 +
20671 +       /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20672 +
20673 +       /* jnode is queued for flushing. */
20674 +       JNODE_FLUSH_QUEUED = 12,
20675 +
20676 +       /* In the following bits jnode type is encoded. */
20677 +       JNODE_TYPE_1 = 13,
20678 +       JNODE_TYPE_2 = 14,
20679 +       JNODE_TYPE_3 = 15,
20680 +
20681 +       /* jnode is being destroyed */
20682 +       JNODE_RIP = 16,
20683 +
20684 +       /* znode was not captured during locking (it might so be because
20685 +          ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20686 +       JNODE_MISSED_IN_CAPTURE = 17,
20687 +
20688 +       /* write is in progress */
20689 +       JNODE_WRITEBACK = 18,
20690 +
20691 +       /* FIXME: now it is used by crypto-compress plugin only */
20692 +       JNODE_NEW = 19,
20693 +
20694 +       /* delimiting keys are already set for this znode. */
20695 +       JNODE_DKSET = 20,
20696 +
20697 +       /* when this bit is set page and jnode can not be disconnected */
20698 +       JNODE_WRITE_PREPARED = 21,
20699 +       /* jnode represents a cluster cache page, see jnode_is_cluster_page() */
20700 +       JNODE_CLUSTER_PAGE = 22,
20701 +       /* Jnode is marked for repacking, that means the reiser4 flush and the
20702 +        * block allocator should process this node in a special way */
20703 +       JNODE_REPACK = 23,
20704 +       /* node should be converted by flush in squalloc phase */
20705 +       JNODE_CONVERTIBLE = 24,
20706 +       /*
20707 +        * When jnode is dirtied for the first time in given transaction,
20708 +        * do_jnode_make_dirty() checks whether this jnode can possibly become a
20709 +        * member of the overwrite set. If so, this bit is set, and one block is
20710 +        * reserved in the ->flush_reserved space of atom.
20711 +        *
20712 +        * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20713 +        *
20714 +        *     (1) flush decides that we want this block to go into relocate
20715 +        *     set after all.
20716 +        *
20717 +        *     (2) wandering log is allocated (by log writer)
20718 +        *
20719 +        *     (3) extent is allocated
20720 +        *
20721 +        */
20722 +       JNODE_FLUSH_RESERVED = 29
20723 +} reiser4_jnode_state;
20724 +
20725 +/* Inline helpers for accessing the bits of jnode ->state. */
20726 +
20727 +static inline void JF_CLR(jnode * j, int f)
20728 +{
20729 +       assert("unknown-1", j->magic == JMAGIC);
20730 +       clear_bit(f, &j->state);
20731 +}
20732 +static inline int JF_ISSET(const jnode * j, int f)
20733 +{
20734 +       assert("unknown-2", j->magic == JMAGIC);
20735 +       return test_bit(f, &((jnode *) j)->state);
20736 +}
20737 +static inline void JF_SET(jnode * j, int f)
20738 +{
20739 +       assert("unknown-3", j->magic == JMAGIC);
20740 +       set_bit(f, &j->state);
20741 +}
20742 +
20743 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20744 +{
20745 +       assert("unknown-4", j->magic == JMAGIC);
20746 +       return test_and_set_bit(f, &j->state);
20747 +}
20748 +
20749 +static inline void spin_lock_jnode(jnode *node)
20750 +{
20751 +       /* check that spinlocks of lower priorities are not held */
20752 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20753 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
20754 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
20755 +                   LOCK_CNT_NIL(rw_locked_dk) &&
20756 +                   LOCK_CNT_LT(spin_locked_jnode, 2)));
20757 +
20758 +       spin_lock(&(node->guard));
20759 +
20760 +       LOCK_CNT_INC(spin_locked_jnode);
20761 +       LOCK_CNT_INC(spin_locked);
20762 +}
20763 +
20764 +static inline void spin_unlock_jnode(jnode *node)
20765 +{
20766 +       assert_spin_locked(&(node->guard));
20767 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20768 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20769 +
20770 +       LOCK_CNT_DEC(spin_locked_jnode);
20771 +       LOCK_CNT_DEC(spin_locked);
20772 +
20773 +       spin_unlock(&(node->guard));
20774 +}
20775 +
20776 +static inline int jnode_is_in_deleteset(const jnode * node)
20777 +{
20778 +       return JF_ISSET(node, JNODE_RELOC);
20779 +}
20780 +
20781 +extern int init_jnodes(void);
20782 +extern void done_jnodes(void);
20783 +
20784 +/* Jnode routines */
20785 +extern jnode *jalloc(void);
20786 +extern void jfree(jnode * node) NONNULL;
20787 +extern jnode *jclone(jnode *);
20788 +extern jnode *jlookup(reiser4_tree * tree,
20789 +                     oid_t objectid, unsigned long ind) NONNULL;
20790 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20791 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20792 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20793 +void jnode_attach_page(jnode * node, struct page *pg);
20794 +
20795 +void unhash_unformatted_jnode(jnode *);
20796 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20797 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20798 +extern void jnode_make_dirty(jnode * node) NONNULL;
20799 +extern void jnode_make_clean(jnode * node) NONNULL;
20800 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20801 +extern void jnode_make_wander(jnode *) NONNULL;
20802 +extern void znode_make_reloc(znode * , flush_queue_t *) NONNULL;
20803 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20804 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20805 +
20806 +/**
20807 + * jnode_get_block - address of the block number stored in @node
20808 + * @node: jnode to query, must not be NULL
20809 + * Return: pointer to @node->blocknr (points into @node, not a copy)
20810 + */
20811 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20812 +{
20813 +       assert("nikita-528", node != NULL);
20814 +
20815 +       return &node->blocknr;
20816 +}
20817 +
20818 +/**
20819 + * jnode_set_block - store a new block number in @node
20820 + * @node: jnode to update, must not be NULL
20821 + * @blocknr: new block nr, copied by value; must not be NULL
20822 + */
20823 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20824 +{
20825 +       assert("nikita-2020", node != NULL);
20826 +       assert("umka-055", blocknr != NULL);
20827 +       node->blocknr = *blocknr;
20828 +}
20829 +
20830 +
20831 +/* block number for IO. In this version it is always jnode_get_block(); no
20832 + * emergency-flush (eflush) block is consulted here. Caller must hold the
20833 + * jnode spin lock (->guard), which is asserted. */
20834 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20835 +{
20836 +       assert("nikita-2768", node != NULL);
20837 +       assert_spin_locked(&(node->guard));
20838 +
20839 +       return jnode_get_block(node);
20840 +}
20841 +
20842 +/* Jnode flush interface. */
20843 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t *pos);
20844 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t *pos);
20845 +
20846 +/* FIXME-VS: these are used in plugin/item/extent.c */
20847 +/* NOTE(review): JNODE_MAPPED is not in reiser4_jnode_state above - confirm */
20848 +/* does extent_get_block have to be called */
20849 +#define jnode_mapped(node)     JF_ISSET (node, JNODE_MAPPED)
20850 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20851 +
20852 +/* the node should be converted during flush squalloc phase */
20853 +#define jnode_convertible(node)        JF_ISSET (node, JNODE_CONVERTIBLE)
20854 +#define jnode_set_convertible(node)    JF_SET (node, JNODE_CONVERTIBLE)
20855 +
20856 +/* Macros to convert from jnode to znode, znode to jnode.  These are macros
20857 +   because C doesn't allow overloading of const prototypes. */
20858 +#define ZJNODE(x) (&(x)->zjnode)
20859 +#define JZNODE(x)                                              \
20860 +({                                                             \
20861 +       typeof(x) __tmp_x;                                      \
20862 +                                                               \
20863 +       __tmp_x = (x);                                          \
20864 +       assert("jmacd-1300", jnode_is_znode(__tmp_x));          \
20865 +       (znode*) __tmp_x;                                       \
20866 +})
20867 +
20868 +extern int jnodes_tree_init(reiser4_tree * tree);
20869 +extern int jnodes_tree_done(reiser4_tree * tree);
20870 +
20871 +#if REISER4_DEBUG
20872 +
20873 +extern int znode_is_any_locked(const znode * node);
20874 +extern void jnode_list_remove(jnode * node);
20875 +
20876 +#else
20877 +
20878 +#define jnode_list_remove(node) noop
20879 +
20880 +#endif
20881 +
20882 +int znode_is_root(const znode * node) NONNULL;
20883 +
20884 +/* bump reference counter on @node */
20885 +static inline void add_x_ref(jnode * node/* node to increase x_count of */)
20886 +{
20887 +       assert("nikita-1911", node != NULL);
20888 +
20889 +       atomic_inc(&node->x_count);
20890 +       LOCK_CNT_INC(x_refs);
20891 +}
20892 +
20893 +static inline void dec_x_ref(jnode * node)
20894 +{
20895 +       assert("nikita-3215", node != NULL);
20896 +       assert("nikita-3216", atomic_read(&node->x_count) > 0);
20897 +
20898 +       atomic_dec(&node->x_count);
20899 +       assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20900 +       LOCK_CNT_DEC(x_refs);
20901 +}
20902 +
20903 +/* jref() - increase counter of references to jnode/znode (x_count) */
20904 +static inline jnode *jref(jnode * node)
20905 +{
20906 +       assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20907 +       add_x_ref(node);
20908 +       return node;
20909 +}
20910 +
20911 +/* get the page of jnode */
20912 +static inline struct page *jnode_page(const jnode * node)
20913 +{
20914 +       return node->pg;
20915 +}
20916 +
20917 +/* return pointer to jnode data */
20918 +static inline char *jdata(const jnode * node)
20919 +{
20920 +       assert("nikita-1415", node != NULL);
20921 +       assert("nikita-3198", jnode_page(node) != NULL);
20922 +       return node->data;
20923 +}
20924 +
20925 +static inline int jnode_is_loaded(const jnode * node)
20926 +{
20927 +       assert("zam-506", node != NULL);
20928 +       return atomic_read(&node->d_count) > 0;
20929 +}
20930 +
20931 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20932 +
20933 +static inline void jnode_set_reloc(jnode * node)
20934 +{
20935 +       assert("nikita-2431", node != NULL);
20936 +       assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20937 +       JF_SET(node, JNODE_RELOC);
20938 +}
20939 +
20940 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20941 +
20942 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20943 +
20944 +static inline int jload(jnode *node)
20945 +{
20946 +       return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20947 +}
20948 +
20949 +extern int jinit_new(jnode *, gfp_t) NONNULL;
20950 +extern int jstartio(jnode *) NONNULL;
20951 +
20952 +extern void jdrop(jnode *) NONNULL;
20953 +extern int jwait_io(jnode *, int rw) NONNULL;
20954 +
20955 +void jload_prefetch(jnode *);
20956 +
20957 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20958 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
20959 +
20960 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
20961 +{
20962 +       assert("nikita-2691", node != NULL);
20963 +       return node->tree;
20964 +}
20965 +
20966 +extern void pin_jnode_data(jnode *);
20967 +extern void unpin_jnode_data(jnode *);
20968 +/* decode the jnode type from the JNODE_TYPE_* bits of @node's ->state */
20969 +static inline jnode_type jnode_get_type(const jnode * node)
20970 +{
20971 +       static const unsigned long state_mask =
20972 +           (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20973 +       /* read-only table: unused encodings map to LAST_JNODE_TYPE */
20974 +       static const jnode_type mask_to_type[] = {
20975 +               /*  JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
20976 +
20977 +               /* 000 */
20978 +               [0] = JNODE_FORMATTED_BLOCK,
20979 +               /* 001 */
20980 +               [1] = JNODE_UNFORMATTED_BLOCK,
20981 +               /* 010 */
20982 +               [2] = JNODE_BITMAP,
20983 +               /* 011 */
20984 +               [3] = LAST_JNODE_TYPE,  /*invalid */
20985 +               /* 100 */
20986 +               [4] = JNODE_INODE,
20987 +               /* 101 */
20988 +               [5] = LAST_JNODE_TYPE,  /* invalid */
20989 +               /* 110 */
20990 +               [6] = JNODE_IO_HEAD,
20991 +               /* 111 */
20992 +               [7] = LAST_JNODE_TYPE,  /* invalid */
20993 +       };
20994 +       /* isolate the three type bits and use them as the table index */
20995 +       return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20996 +}
20997 +
20998 +/* returns true if node is a znode */
20999 +static inline int jnode_is_znode(const jnode * node)
21000 +{
21001 +       return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21002 +}
21003 +
21004 +static inline int jnode_is_flushprepped(jnode * node)
21005 +{
21006 +       assert("jmacd-78212", node != NULL);
21007 +       assert_spin_locked(&node->guard);       /* caller holds jnode lock */
21008 +       return !(JF_ISSET(node, JNODE_DIRTY) && !JF_ISSET(node, JNODE_RELOC) &&
21009 +                !JF_ISSET(node, JNODE_OVRWR));
21010 +}
21011 +
21012 +/* Locked variant of jnode_is_flushprepped().  Returns true if @node has
21013 +   already been processed by the squeeze and allocate process, i.e. its block
21014 +   address is final for the duration of this atom (or it is clean and will
21015 +   remain in place).  On true the block number may be used as a hint. */
21016 +static inline int jnode_check_flushprepped(jnode * node)
21017 +{
21018 +       int prepped;
21019 +
21020 +       /* Clean, relocated or wandered nodes qualify; new allocations are
21021 +        * set to relocate.  The state is sampled under the jnode lock. */
21022 +       spin_lock_jnode(node);
21023 +       prepped = jnode_is_flushprepped(node);
21024 +       spin_unlock_jnode(node);
21025 +       return prepped;
21026 +}
21027 +
21028 +/* returns true if node is unformatted */
21029 +static inline int jnode_is_unformatted(const jnode * node)
21030 +{
21031 +       assert("jmacd-0123", node != NULL);
21032 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21033 +}
21034 +
21035 +/* returns true if node represents a cluster cache page */
21036 +static inline int jnode_is_cluster_page(const jnode * node)
21037 +{
21038 +       assert("edward-50", node != NULL);
21039 +       return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21040 +}
21041 +
21042 +/* returns true if node is builtin inode's jnode */
21043 +static inline int jnode_is_inode(const jnode * node)
21044 +{
21045 +       assert("vs-1240", node != NULL);
21046 +       return jnode_get_type(node) == JNODE_INODE;
21047 +}
21048 +
21049 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21050 +{
21051 +       assert("nikita-2367", type < LAST_JNODE_TYPE);
21052 +       return jnode_plugin_by_id((reiser4_plugin_id) type);
21053 +}
21054 +
21055 +static inline jnode_plugin *jnode_ops(const jnode * node)
21056 +{
21057 +       assert("nikita-2366", node != NULL);
21058 +
21059 +       return jnode_ops_of(jnode_get_type(node));
21060 +}
21061 +
21062 +/* Get the index of a block. */
21063 +static inline unsigned long jnode_get_index(jnode * node)
21064 +{
21065 +       return jnode_ops(node)->index(node);
21066 +}
21067 +
21068 +/* return true if "node" is the root */
21069 +static inline int jnode_is_root(const jnode * node)
21070 +{
21071 +       return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21072 +}
21073 +
21074 +extern struct address_space *mapping_jnode(const jnode * node);
21075 +extern unsigned long index_jnode(const jnode * node);
21076 +
21077 +static inline void jput(jnode * node);
21078 +extern void jput_final(jnode * node);
21079 +
21080 +/* bump data counter on @node */
21081 +static inline void add_d_ref(jnode * node/* node to increase d_count of */)
21082 +{
21083 +       assert("nikita-1962", node != NULL);
21084 +
21085 +       atomic_inc(&node->d_count);
21086 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
21087 +               LOCK_CNT_INC(d_refs);
21088 +}
21089 +
21090 +/* jput() - decrement x_count reference counter of @node (jnode or znode).
21091 +
21092 +   Count may drop to 0, jnode stays in cache until memory pressure causes the
21093 +   eviction of its page. The c_count variable also ensures that children are
21094 +   pressured out of memory before the parent. The jnode remains hashed as
21095 +   long as the VM allows its page to stay in memory.
21096 +*/
21097 +static inline void jput(jnode * node)
21098 +{
21099 +       assert("jmacd-509", node != NULL);
21100 +       assert("jmacd-510", atomic_read(&node->x_count) > 0);
21101 +       assert("zam-926", reiser4_schedulable());
21102 +       LOCK_CNT_DEC(x_refs);
21103 +
21104 +       rcu_read_lock();
21105 +       /* no other lock is taken here--jput_final() uses RCU. NOTE(review):
21106 +        * rcu_read_unlock() for the final-put path is not visible in this
21107 +        * function; presumably jput_final() issues it - confirm. */
21108 +       if (unlikely(atomic_dec_and_test(&node->x_count)))
21109 +               jput_final(node);
21110 +       else
21111 +               rcu_read_unlock();
21112 +       assert("nikita-3473", reiser4_schedulable());
21113 +}
21114 +
21115 +extern void jrelse(jnode * node);
21116 +extern void jrelse_tail(jnode * node);
21117 +
21118 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21119 +
21120 +/* if @node is marked JNODE_RIP, let jnode_rip_sync() resolve the race with jput */
21121 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21122 +{
21123 +       if (!unlikely(JF_ISSET(node, JNODE_RIP)))
21124 +               return node;
21125 +       return jnode_rip_sync(tree, node);
21126 +}
21127 +
21128 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21129 +
21130 +#if REISER4_DEBUG
21131 +extern int jnode_invariant_f(const jnode *node, char const **msg);
21132 +#endif
21133 +
21134 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21135 +
21136 +/* __JNODE_H__ */
21137 +#endif
21138 +
21139 +/* Make Linus happy.
21140 +   Local variables:
21141 +   c-indentation-style: "K&R"
21142 +   mode-name: "LC"
21143 +   c-basic-offset: 8
21144 +   tab-width: 8
21145 +   fill-column: 120
21146 +   End:
21147 +*/
21148 diff -urN linux-2.6.35.orig/fs/reiser4/kassign.c linux-2.6.35/fs/reiser4/kassign.c
21149 --- linux-2.6.35.orig/fs/reiser4/kassign.c      1970-01-01 01:00:00.000000000 +0100
21150 +++ linux-2.6.35/fs/reiser4/kassign.c   2010-08-04 15:44:57.000000000 +0200
21151 @@ -0,0 +1,677 @@
21152 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21153 + * reiser4/README */
21154 +
21155 +/* Key assignment policy implementation */
21156 +
21157 +/*
21158 + * In reiser4 every piece of file system data and meta-data has a key. Keys
21159 + * are used to store information in and retrieve it from reiser4 internal
21160 + * tree. In addition to this, keys define _ordering_ of all file system
21161 + * information: things having close keys are placed into the same or
21162 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21163 + * tries to respect tree order (see flush.c), keys also define order in which
21164 + * things are laid out on the disk, and hence, affect performance directly.
21165 + *
21166 + * Obviously, assignment of keys to data and meta-data should be consistent
21167 + * across whole file system. Algorithm that calculates a key for a given piece
21168 + * of data or meta-data is referred to as "key assignment".
21169 + *
21170 + * Key assignment is too expensive to be implemented as a plugin (that is,
21171 + * with an ability to support different key assignment schemas in the same
21172 + * compiled kernel image). As a compromise, all key-assignment functions and
21173 + * data-structures are collected in this single file, so that modifications to
21174 + * key assignment algorithm can be localized. Additional changes may be
21175 + * required in key.[ch].
21176 + *
21177 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21178 + * may guess, there is "Plan B" too.
21179 + *
21180 + */
21181 +
21182 +/*
21183 + * An additional complication of the key assignment implementation is the
21184 + * requirement to support different key lengths.
21185 + */
21186 +
21187 +/*
21188 + *                   KEY ASSIGNMENT: PLAN A, LONG KEYS.
21189 + *
21190 + * DIRECTORY ITEMS
21191 + *
21192 + * |       60     | 4 | 7 |1|   56        |        64        |        64       |
21193 + * +--------------+---+---+-+-------------+------------------+-----------------+
21194 + * |    dirid     | 0 | F |H|  prefix-1   |    prefix-2      |  prefix-3/hash  |
21195 + * +--------------+---+---+-+-------------+------------------+-----------------+
21196 + * |                  |                   |                  |                 |
21197 + * |    8 bytes       |      8 bytes      |     8 bytes      |     8 bytes     |
21198 + *
21199 + * dirid         objectid of directory this item is for
21200 + *
21201 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
21202 + *
21203 + * H             1 if last 8 bytes of the key contain hash,
21204 + *               0 if last 8 bytes of the key contain prefix-3
21205 + *
21206 + * prefix-1      first 7 characters of file name.
21207 + *               Padded by zeroes if name is not long enough.
21208 + *
21209 + * prefix-2      next 8 characters of the file name.
21210 + *
21211 + * prefix-3      next 8 characters of the file name.
21212 + *
21213 + * hash          hash of the rest of file name (i.e., portion of file
21214 + *               name not included into prefix-1 and prefix-2).
21215 + *
21216 + * File names of at most 23 (== 7 + 8 + 8) characters are completely encoded
21217 + * in the key. Such file names are called "short". They are distinguished by H
21218 + * bit set to 0 in the key.
21219 + *
21220 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21221 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21222 + * key. Last 8 bytes of the key are occupied by hash of the remaining
21223 + * characters of the name.
21224 + *
21225 + * This key assignment reaches following important goals:
21226 + *
21227 + *     (1) directory entries are sorted in approximately lexicographical
21228 + *     order.
21229 + *
21230 + *     (2) collisions (when multiple directory items have the same key), while
21231 + *     principally unavoidable in a tree with fixed length keys, are rare.
21232 + *
21233 + * STAT DATA
21234 + *
21235 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21236 + *  +--------------+---+-----------------+---+--------------+-----------------+
21237 + *  |  locality id | 1 |    ordering     | 0 |  objectid    |        0        |
21238 + *  +--------------+---+-----------------+---+--------------+-----------------+
21239 + *  |                  |                 |                  |                 |
21240 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21241 + *
21242 + * locality id     object id of a directory where first name was created for
21243 + *                 the object
21244 + *
21245 + * ordering        copy of second 8-byte portion of the key of directory
21246 + *                 entry for the first name of this object. Ordering has a form
21247 + *                         {
21248 + *                                 fibration :7;
21249 + *                                 h         :1;
21250 + *                                 prefix1   :56;
21251 + *                         }
21252 + *                 see description of key for directory entry above.
21253 + *
21254 + * objectid        object id for this object
21255 + *
21256 + * This key assignment policy is designed to keep stat-data in the same order
21257 + * as corresponding directory items, thus speeding up readdir/stat types of
21258 + * workload.
21259 + *
21260 + * FILE BODY
21261 + *
21262 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21263 + *  +--------------+---+-----------------+---+--------------+-----------------+
21264 + *  |  locality id | 4 |    ordering     | 0 |  objectid    |      offset     |
21265 + *  +--------------+---+-----------------+---+--------------+-----------------+
21266 + *  |                  |                 |                  |                 |
21267 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21268 + *
21269 + * locality id     object id of a directory where first name was created for
21270 + *                 the object
21271 + *
21272 + * ordering        the same as in the key of stat-data for this object
21273 + *
21274 + * objectid        object id for this object
21275 + *
21276 + * offset          logical offset from the beginning of this file.
21277 + *                 Measured in bytes.
21278 + *
21279 + *
21280 + *                   KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21281 + *
21282 + * DIRECTORY ITEMS
21283 + *
21284 + *  |       60     | 4 | 7 |1|   56        |        64       |
21285 + *  +--------------+---+---+-+-------------+-----------------+
21286 + *  |    dirid     | 0 | F |H|  prefix-1   |  prefix-2/hash  |
21287 + *  +--------------+---+---+-+-------------+-----------------+
21288 + *  |                  |                   |                 |
21289 + *  |    8 bytes       |      8 bytes      |     8 bytes     |
21290 + *
21291 + * dirid         objectid of directory this item is for
21292 + *
21293 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
21294 + *
21295 + * H             1 if last 8 bytes of the key contain hash,
21296 + *               0 if last 8 bytes of the key contain prefix-2
21297 + *
21298 + * prefix-1      first 7 characters of file name.
21299 + *               Padded by zeroes if name is not long enough.
21300 + *
21301 + * prefix-2      next 8 characters of the file name.
21302 + *
21303 + * hash          hash of the rest of file name (i.e., portion of file
21304 + *               name not included into prefix-1).
21305 + *
21306 + * File names of at most 15 (== 7 + 8) characters are completely encoded in
21307 + * the key. Such file names are called "short". They are distinguished by H
21308 + * bit set to 0 in the key.
21309 + *
21310 + * Other file names are "long". For long name, H bit is 1, and first 7
21311 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21312 + * key are occupied by hash of the remaining characters of the name.
21313 + *
21314 + * STAT DATA
21315 + *
21316 + *  |       60     | 4 | 4 |     60       |        64       |
21317 + *  +--------------+---+---+--------------+-----------------+
21318 + *  |  locality id | 1 | 0 |  objectid    |        0        |
21319 + *  +--------------+---+---+--------------+-----------------+
21320 + *  |                  |                  |                 |
21321 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21322 + *
21323 + * locality id     object id of a directory where first name was created for
21324 + *                 the object
21325 + *
21326 + * objectid        object id for this object
21327 + *
21328 + * FILE BODY
21329 + *
21330 + *  |       60     | 4 | 4 |     60       |        64       |
21331 + *  +--------------+---+---+--------------+-----------------+
21332 + *  |  locality id | 4 | 0 |  objectid    |      offset     |
21333 + *  +--------------+---+---+--------------+-----------------+
21334 + *  |                  |                  |                 |
21335 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21336 + *
21337 + * locality id     object id of a directory where first name was created for
21338 + *                 the object
21339 + *
21340 + * objectid        object id for this object
21341 + *
21342 + * offset          logical offset from the beginning of this file.
21343 + *                 Measured in bytes.
21344 + *
21345 + *
21346 + */
21347 +
21348 +#include "debug.h"
21349 +#include "key.h"
21350 +#include "kassign.h"
21351 +#include "vfs_ops.h"
21352 +#include "inode.h"
21353 +#include "super.h"
21354 +#include "dscale.h"
21355 +
21356 +#include <linux/types.h>       /* for __u??  */
21357 +#include <linux/fs.h>          /* for struct super_block, etc  */
21358 +
21359 +/* bitmask for H bit (see comment at the beginning of this file) */
21360 +static const __u64 longname_mark = 0x0100000000000000ull;
21361 +/* bitmask for F and H portions of the key. */
21362 +static const __u64 fibration_mask = 0xff00000000000000ull;
21363 +
21364 +/* return true if name is not completely encoded in @key */
21365 +int is_longname_key(const reiser4_key * key)
21366 +{
21367 +       __u64 highpart;
21368 +
21369 +       assert("nikita-2863", key != NULL);
21370 +       if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21371 +               reiser4_print_key("oops", key);
21372 +       assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21373 +
21374 +       if (REISER4_LARGE_KEY)
21375 +               highpart = get_key_ordering(key);
21376 +       else
21377 +               highpart = get_key_objectid(key);
21378 +
21379 +       return (highpart & longname_mark) ? 1 : 0;
21380 +}
21381 +
21382 +/* return true if @name is too long to be completely encoded in the key */
21383 +int is_longname(const char *name UNUSED_ARG, int len)
21384 +{
21385 +       if (REISER4_LARGE_KEY)
21386 +               return len > 23;
21387 +       else
21388 +               return len > 15;
21389 +}
21390 +
21391 +/* code ascii string into __u64.
21392 +
21393 +   Put characters of @name into result (@str) one after another starting
21394 +   from @start_idx-th highest (arithmetically) byte. This produces
21395 +   endian-safe encoding. memcpy(2) will not do.
21396 +
21397 +*/
21398 +static __u64 pack_string(const char *name /* string to encode */ ,
21399 +                        int start_idx  /* highest byte in result from
21400 +                                        * which to start encoding */ )
21401 +{
21402 +       unsigned i;
21403 +       __u64 str;
21404 +
21405 +       str = 0;
21406 +       for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21407 +               str <<= 8;
21408 +               str |= (unsigned char)name[i];
21409 +       }
21410 +       str <<= (sizeof str - i - start_idx) << 3;
21411 +       return str;
21412 +}
21413 +
21414 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21415 + * string encoded in it and stores result in @buf */
21416 +char *reiser4_unpack_string(__u64 value, char *buf)
21417 +{
21418 +       do {
21419 +               *buf = value >> (64 - 8);
21420 +               if (*buf)
21421 +                       ++buf;
21422 +               value <<= 8;
21423 +       } while (value != 0);
21424 +       *buf = 0;
21425 +       return buf;
21426 +}
21427 +
21428 +/* obtain name encoded in @key and store it in @buf */
21429 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21430 +{
21431 +       char *c;
21432 +
21433 +       assert("nikita-2868", !is_longname_key(key));
21434 +
21435 +       c = buf;
21436 +       if (REISER4_LARGE_KEY) {
21437 +               c = reiser4_unpack_string(get_key_ordering(key) &
21438 +                                         ~fibration_mask, c);
21439 +               c = reiser4_unpack_string(get_key_fulloid(key), c);
21440 +       } else
21441 +               c = reiser4_unpack_string(get_key_fulloid(key) &
21442 +                                         ~fibration_mask, c);
21443 +       reiser4_unpack_string(get_key_offset(key), c);
21444 +       return buf;
21445 +}
21446 +
21447 +/**
21448 + * complete_entry_key - calculate entry key by name
21449 + * @dir: directory where entry is (or will be) in
21450 + * @name: name to calculate key of
21451 + * @len: length of name
21452 + * @result: place to store result in
21453 + *
21454 + * Sets fields of entry key @result which depend on file name.
21455 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21456 + * objectid and offset. Otherwise, objectid and offset are set.
21457 + */
21458 +void complete_entry_key(const struct inode *dir, const char *name,
21459 +                       int len, reiser4_key *result)
21460 +{
21461 +#if REISER4_LARGE_KEY
21462 +       __u64 ordering;
21463 +       __u64 objectid;
21464 +       __u64 offset;
21465 +
21466 +       assert("nikita-1139", dir != NULL);
21467 +       assert("nikita-1142", result != NULL);
21468 +       assert("nikita-2867", strlen(name) == len);
21469 +
21470 +       /*
21471 +        * key allocation algorithm for directory entries in case of large
21472 +        * keys:
21473 +        *
21474 +        * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21475 +        * characters into ordering field of key, next 8 characters (if any)
21476 +        * into objectid field of key and next 8 ones (if any) into offset
21477 +        * field of key
21478 +        *
21479 +        * If file name is longer than 23 characters, put first 7 characters
21480 +        * into key's ordering, next 8 to objectid and hash of remaining
21481 +        * characters into offset field.
21482 +        *
21483 +        * To distinguish above cases, in latter set up unused high bit in
21484 +        * ordering field.
21485 +        */
21486 +
21487 +       /* [0-6] characters to ordering */
21488 +       ordering = pack_string(name, 1);
21489 +       if (len > 7) {
21490 +               /* [7-14] characters to objectid */
21491 +               objectid = pack_string(name + 7, 0);
21492 +               if (len > 15) {
21493 +                       if (len <= 23) {
21494 +                               /* [15-22] characters to offset */
21495 +                               offset = pack_string(name + 15, 0);
21496 +                       } else {
21497 +                               /* note in a key the fact that offset contains
21498 +                                * hash */
21499 +                               ordering |= longname_mark;
21500 +
21501 +                               /* offset is the hash of the file name's tail */
21502 +                               offset = inode_hash_plugin(dir)->hash(name + 15,
21503 +                                                                     len - 15);
21504 +                       }
21505 +               } else {
21506 +                       offset = 0ull;
21507 +               }
21508 +       } else {
21509 +               objectid = 0ull;
21510 +               offset = 0ull;
21511 +       }
21512 +
21513 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21514 +       ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21515 +
21516 +       set_key_ordering(result, ordering);
21517 +       set_key_fulloid(result, objectid);
21518 +       set_key_offset(result, offset);
21519 +       return;
21520 +
21521 +#else
21522 +       __u64 objectid;
21523 +       __u64 offset;
21524 +
21525 +       assert("nikita-1139", dir != NULL);
21526 +       assert("nikita-1142", result != NULL);
21527 +       assert("nikita-2867", strlen(name) == len);
21528 +
21529 +       /*
21530 +        * key allocation algorithm for directory entries in case of not large
21531 +        * keys:
21532 +        *
21533 +        * If name is not longer than 7 + 8 = 15 characters, put first 7
21534 +        * characters into objectid field of key, next 8 characters (if any)
21535 +        * into offset field of key
21536 +        *
21537 +        * If file name is longer than 15 characters, put first 7 characters
21538 +        * into key's objectid, and hash of remaining characters into offset
21539 +        * field.
21540 +        *
21541 +        * To distinguish above cases, in latter set up unused high bit in
21542 +        * objectid field.
21543 +        */
21544 +
21545 +       /* [0-6] characters to objectid */
21546 +       objectid = pack_string(name, 1);
21547 +       if (len > 7) {
21548 +               if (len <= 15) {
21549 +                       /* [7-14] characters to offset */
21550 +                       offset = pack_string(name + 7, 0);
21551 +               } else {
21552 +                       /* note in a key the fact that offset contains hash. */
21553 +                       objectid |= longname_mark;
21554 +
21555 +                       /* offset is the hash of the file name's tail. */
21556 +                       offset = inode_hash_plugin(dir)->hash(name + 7,
21557 +                                                             len - 7);
21558 +               }
21559 +       } else
21560 +               offset = 0ull;
21561 +
21562 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21563 +       objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21564 +
21565 +       set_key_fulloid(result, objectid);
21566 +       set_key_offset(result, offset);
21567 +       return;
21568 +#endif                         /* ! REISER4_LARGE_KEY */
21569 +}
21570 +
21571 +/* true, if @key is the key of "." */
21572 +int is_dot_key(const reiser4_key * key/* key to check */)
21573 +{
21574 +       assert("nikita-1717", key != NULL);
21575 +       assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21576 +       return
21577 +           (get_key_ordering(key) == 0ull) &&
21578 +           (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21579 +}
21580 +
21581 +/* build key for stat-data.
21582 +
21583 +   return key of stat-data of this object. This should become sd plugin
21584 +   method in the future. For now, let it be here.
21585 +
21586 +*/
21587 +reiser4_key *build_sd_key(const struct inode *target /* inode of an object */ ,
21588 +                         reiser4_key * result  /* resulting key of @target
21589 +                                                  stat-data */ )
21590 +{
21591 +       assert("nikita-261", result != NULL);
21592 +
21593 +       reiser4_key_init(result);
21594 +       set_key_locality(result, reiser4_inode_data(target)->locality_id);
21595 +       set_key_ordering(result, get_inode_ordering(target));
21596 +       set_key_objectid(result, get_inode_oid(target));
21597 +       set_key_type(result, KEY_SD_MINOR);
21598 +       set_key_offset(result, (__u64) 0);
21599 +       return result;
21600 +}
21601 +
21602 +/* encode part of key into &obj_key_id
21603 +
21604 +   This encodes into @id part of @key sufficient to restore @key later,
21605 +   given that latter is key of object (key of stat-data).
21606 +
21607 +   See &obj_key_id
21608 +*/
21609 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21610 +                    obj_key_id * id/* id where key is encoded in */)
21611 +{
21612 +       assert("nikita-1151", key != NULL);
21613 +       assert("nikita-1152", id != NULL);
21614 +
21615 +       memcpy(id, key, sizeof *id);
21616 +       return 0;
21617 +}
21618 +
21619 +/* encode reference to @obj in @id.
21620 +
21621 +   This is like build_obj_key_id() above, but takes inode as parameter. */
21622 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21623 +                      obj_key_id * id/* result */)
21624 +{
21625 +       reiser4_key sdkey;
21626 +
21627 +       assert("nikita-1166", obj != NULL);
21628 +       assert("nikita-1167", id != NULL);
21629 +
21630 +       build_sd_key(obj, &sdkey);
21631 +       build_obj_key_id(&sdkey, id);
21632 +       return 0;
21633 +}
21634 +
21635 +/* decode @id back into @key
21636 +
21637 +   Restore key of object stat-data from @id. This is dual to
21638 +   build_obj_key_id() above.
21639 +*/
21640 +int extract_key_from_id(const obj_key_id * id  /* object key id to extract key
21641 +                                                * from */ ,
21642 +                       reiser4_key * key/* result */)
21643 +{
21644 +       assert("nikita-1153", id != NULL);
21645 +       assert("nikita-1154", key != NULL);
21646 +
21647 +       reiser4_key_init(key);
21648 +       memcpy(key, id, sizeof *id);
21649 +       return 0;
21650 +}
21651 +
21652 +/* extract objectid of directory from key of directory entry within said
21653 +   directory.
21654 +   */
21655 +oid_t extract_dir_id_from_key(const reiser4_key * de_key       /* key of
21656 +                                                                * directory
21657 +                                                                * entry */ )
21658 +{
21659 +       assert("nikita-1314", de_key != NULL);
21660 +       return get_key_locality(de_key);
21661 +}
21662 +
21663 +/* encode into @id key of directory entry.
21664 +
21665 +   Encode into @id information sufficient to later distinguish directory
21666 +   entries within the same directory. This is not whole key, because all
21667 +   directory entries within directory item share locality which is equal
21668 +   to objectid of their directory.
21669 +
21670 +*/
21671 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21672 +               const struct qstr *name /* name to be given to @obj by
21673 +                                        * directory entry being
21674 +                                        * constructed */ ,
21675 +               de_id * id/* short key of directory entry */)
21676 +{
21677 +       reiser4_key key;
21678 +
21679 +       assert("nikita-1290", dir != NULL);
21680 +       assert("nikita-1292", id != NULL);
21681 +
21682 +       /* NOTE-NIKITA this is suboptimal. */
21683 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21684 +       return build_de_id_by_key(&key, id);
21685 +}
21686 +
21687 +/* encode into @id key of directory entry.
21688 +
21689 +   Encode into @id information sufficient to later distinguish directory
21690 +   entries within the same directory. This is not whole key, because all
21691 +   directory entries within directory item share locality which is equal
21692 +   to objectid of their directory.
21693 +
21694 +*/
21695 +int build_de_id_by_key(const reiser4_key * entry_key   /* full key of directory
21696 +                                                        * entry */ ,
21697 +                      de_id * id/* short key of directory entry */)
21698 +{
21699 +       memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21700 +       return 0;
21701 +}
21702 +
21703 +/* restore from @id key of directory entry.
21704 +
21705 +   Function dual to build_de_id(): given @id and locality, build full
21706 +   key of directory entry within directory item.
21707 +
21708 +*/
21709 +int extract_key_from_de_id(const oid_t locality        /* locality of directory
21710 +                                                * entry */ ,
21711 +                          const de_id * id /* directory entry id */ ,
21712 +                          reiser4_key * key/* result */)
21713 +{
21714 +       /* no need to initialise key here: all fields are overwritten */
21715 +       memcpy(((__u64 *) key) + 1, id, sizeof *id);
21716 +       set_key_locality(key, locality);
21717 +       set_key_type(key, KEY_FILE_NAME_MINOR);
21718 +       return 0;
21719 +}
21720 +
21721 +/* compare two &de_id's */
21722 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21723 +               const de_id * id2/* second &de_id to compare */)
21724 +{
21725 +       /* NOTE-NIKITA ugly implementation */
21726 +       reiser4_key k1;
21727 +       reiser4_key k2;
21728 +
21729 +       extract_key_from_de_id((oid_t) 0, id1, &k1);
21730 +       extract_key_from_de_id((oid_t) 0, id2, &k2);
21731 +       return keycmp(&k1, &k2);
21732 +}
21733 +
21734 +/* compare &de_id with key */
21735 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21736 +                   const reiser4_key * key/* key to compare */)
21737 +{
21738 +       cmp_t result;
21739 +       reiser4_key *k1;
21740 +
21741 +       k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21742 +       result = KEY_DIFF_EL(k1, key, 1);
21743 +       if (result == EQUAL_TO) {
21744 +               result = KEY_DIFF_EL(k1, key, 2);
21745 +               if (REISER4_LARGE_KEY && result == EQUAL_TO)
21746 +                       result = KEY_DIFF_EL(k1, key, 3);
21747 +       }
21748 +       return result;
21749 +}
21750 +
21751 +/*
21752 + * return number of bytes necessary to encode @inode identity.
21753 + */
21754 +int inode_onwire_size(const struct inode *inode)
21755 +{
21756 +       int result;
21757 +
21758 +       result = dscale_bytes_to_write(get_inode_oid(inode));
21759 +       result += dscale_bytes_to_write(get_inode_locality(inode));
21760 +
21761 +       /*
21762 +        * ordering is large (it usually has highest bits set), so it makes
21763 +        * little sense to dscale it.
21764 +        */
21765 +       if (REISER4_LARGE_KEY)
21766 +               result += sizeof(get_inode_ordering(inode));
21767 +       return result;
21768 +}
21769 +
21770 +/*
21771 + * encode @inode identity at @start
21772 + */
21773 +char *build_inode_onwire(const struct inode *inode, char *start)
21774 +{
21775 +       start += dscale_write(start, get_inode_locality(inode));
21776 +       start += dscale_write(start, get_inode_oid(inode));
21777 +
21778 +       if (REISER4_LARGE_KEY) {
21779 +               put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21780 +               start += sizeof(get_inode_ordering(inode));
21781 +       }
21782 +       return start;
21783 +}
21784 +
21785 +/*
21786 + * extract key that was previously encoded by build_inode_onwire() at @addr
21787 + */
21788 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21789 +{
21790 +       __u64 val;
21791 +
21792 +       addr += dscale_read(addr, &val);
21793 +       val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
21794 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21795 +       addr += dscale_read(addr, &val);
21796 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21797 +#if REISER4_LARGE_KEY
21798 +       memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21799 +       addr += sizeof key_id->ordering;
21800 +#endif
21801 +       return addr;
21802 +}
21803 +
21804 +/*
21805 + * skip a key that was previously encoded by build_inode_onwire() at @addr
21806 + * FIXME: handle IO errors.
21807 + */
21808 +char * locate_obj_key_id_onwire(char * addr)
21809 +{
21810 +       /* locality */
21811 +       addr += dscale_bytes_to_read(addr);
21812 +       /* objectid */
21813 +       addr += dscale_bytes_to_read(addr);
21814 +#if REISER4_LARGE_KEY
21815 +       addr += sizeof ((obj_key_id *)0)->ordering;
21816 +#endif
21817 +       return addr;
21818 +}
21819 +
21820 +/* Make Linus happy.
21821 +   Local variables:
21822 +   c-indentation-style: "K&R"
21823 +   mode-name: "LC"
21824 +   c-basic-offset: 8
21825 +   tab-width: 8
21826 +   fill-column: 120
21827 +   End:
21828 +*/
21829 diff -urN linux-2.6.35.orig/fs/reiser4/kassign.h linux-2.6.35/fs/reiser4/kassign.h
21830 --- linux-2.6.35.orig/fs/reiser4/kassign.h      1970-01-01 01:00:00.000000000 +0100
21831 +++ linux-2.6.35/fs/reiser4/kassign.h   2010-08-04 15:44:57.000000000 +0200
21832 @@ -0,0 +1,111 @@
21833 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21834 + * reiser4/README */
21835 +
21836 +/* Key assignment policy interface. See kassign.c for details. */
21837 +
21838 +#if !defined(__KASSIGN_H__)
21839 +#define __KASSIGN_H__
21840 +
21841 +#include "forward.h"
21842 +#include "key.h"
21843 +#include "dformat.h"
21844 +
21845 +#include <linux/types.h>       /* for __u??  */
21846 +#include <linux/fs.h>          /* for struct super_block, etc  */
21847 +#include <linux/dcache.h>      /* for struct qstr */
21848 +
21849 +/* key assignment functions */
21850 +
21851 +/* Information from which key of file stat-data can be uniquely
21852 +   restored. This depends on key assignment policy for
21853 +   stat-data. Currently it's enough to store object id and locality id
21854 +   (60+60==120) bits, because minor packing locality and offset of
21855 +   stat-data key are always known constants: KEY_SD_MINOR and 0
21856 +   respectively. For simplicity 4 bits are wasted in each id, and just
21857 +   two 64 bit integers are stored.
21858 +
21859 +   This field has to be byte-aligned, because we don't want to waste
21860 +   space in directory entries. There is another side of a coin of
21861 +   course: we waste CPU and bus bandwidth instead, by copying data back
21862 +   and forth.
21863 +
21864 +   Next optimization: &obj_key_id is mainly used to address stat data from
21865 +   directory entries. Under the assumption that majority of files have only
21866 +   one name (one hard link) from *the* parent directory it seems reasonable
21867 +   to only store objectid of stat data and take its locality from key of
21868 +   directory item.
21869 +
21870 +   This requires some flag to be added to the &obj_key_id to distinguish
21871 +   between these two cases. Remaining bits in flag byte are then asking to be
21872 +   used to store file type.
21873 +
21874 +   This optimization requires changes in directory item handling code.
21875 +
21876 +*/
21877 +typedef struct obj_key_id {
21878 +       d8 locality[sizeof(__u64)];
21879 +        ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21880 +           )
21881 +       d8 objectid[sizeof(__u64)];
21882 +}
21883 +obj_key_id;
21884 +
21885 +/* Information sufficient to uniquely identify directory entry within
21886 +   compressed directory item.
21887 +
21888 +   For alignment issues see &obj_key_id above.
21889 +*/
21890 +typedef struct de_id {
21891 +       ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21892 +       d8 objectid[sizeof(__u64)];
21893 +       d8 offset[sizeof(__u64)];
21894 +}
21895 +de_id;
21896 +
21897 +extern int inode_onwire_size(const struct inode *obj);
21898 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21899 +extern char *locate_obj_key_id_onwire(char *area);
21900 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21901 +
21902 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21903 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21904 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21905 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21906 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21907 +                      de_id * id);
21908 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21909 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21910 +                                 reiser4_key * key);
21911 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21912 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21913 +
21914 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21915 +extern void build_entry_key_common(const struct inode *dir,
21916 +                                  const struct qstr *name,
21917 +                                  reiser4_key * result);
21918 +extern void build_entry_key_stable_entry(const struct inode *dir,
21919 +                                        const struct qstr *name,
21920 +                                        reiser4_key * result);
21921 +extern int is_dot_key(const reiser4_key * key);
21922 +extern reiser4_key *build_sd_key(const struct inode *target,
21923 +                                reiser4_key * result);
21924 +
21925 +extern int is_longname_key(const reiser4_key * key);
21926 +extern int is_longname(const char *name, int len);
21927 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21928 +extern char *reiser4_unpack_string(__u64 value, char *buf);
21929 +extern void complete_entry_key(const struct inode *dir, const char *name,
21930 +                              int len, reiser4_key *result);
21931 +
21932 +/* __KASSIGN_H__ */
21933 +#endif
21934 +
21935 +/* Make Linus happy.
21936 +   Local variables:
21937 +   c-indentation-style: "K&R"
21938 +   mode-name: "LC"
21939 +   c-basic-offset: 8
21940 +   tab-width: 8
21941 +   fill-column: 120
21942 +   End:
21943 +*/
21944 diff -urN linux-2.6.35.orig/fs/reiser4/Kconfig linux-2.6.35/fs/reiser4/Kconfig
21945 --- linux-2.6.35.orig/fs/reiser4/Kconfig        1970-01-01 01:00:00.000000000 +0100
21946 +++ linux-2.6.35/fs/reiser4/Kconfig     2010-08-04 15:44:57.000000000 +0200
21947 @@ -0,0 +1,34 @@
21948 +config REISER4_FS
21949 +       tristate "Reiser4 (EXPERIMENTAL)"
21950 +       depends on EXPERIMENTAL
21951 +       select ZLIB_INFLATE
21952 +       select ZLIB_DEFLATE
21953 +       select LZO_COMPRESS
21954 +       select LZO_DECOMPRESS
21955 +       select CRYPTO
21956 +       help
21957 +         Reiser4 is a filesystem that performs all filesystem operations
21958 +         as atomic transactions, which means that it either performs a
21959 +         write, or it does not, and in the event of a crash it does not
21960 +         partially perform it or corrupt it.
21961 +
21962 +         It stores files in dancing trees, which are like balanced trees but
21963 +         faster.  It packs small files together so that they share blocks
21964 +         without wasting space.  This means you can use it to store really
21965 +         small files.  It also means that it saves you disk space.  It avoids
21966 +         hassling you with anachronisms like having a maximum number of
21967 +         inodes, and wasting space if you use less than that number.
21968 +
21969 +         Reiser4 is a distinct filesystem type from reiserfs (V3).
21970 +         It's therefore not possible to use reiserfs file systems
21971 +         with reiser4.
21972 +
21973 +         To learn more about reiser4, go to http://www.namesys.com
21974 +
21975 +config REISER4_DEBUG
21976 +       bool "Enable reiser4 debug mode"
21977 +       depends on REISER4_FS
21978 +       help
21979 +         Don't use this unless you are debugging reiser4.
21980 +
21981 +         If unsure, say N.
21982 diff -urN linux-2.6.35.orig/fs/reiser4/key.c linux-2.6.35/fs/reiser4/key.c
21983 --- linux-2.6.35.orig/fs/reiser4/key.c  1970-01-01 01:00:00.000000000 +0100
21984 +++ linux-2.6.35/fs/reiser4/key.c       2010-08-04 15:44:57.000000000 +0200
21985 @@ -0,0 +1,138 @@
21986 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
21987 + * reiser4/README */
21988 +
21989 +/* Key manipulations. */
21990 +
21991 +#include "debug.h"
21992 +#include "key.h"
21993 +#include "super.h"
21994 +#include "reiser4.h"
21995 +
21996 +#include <linux/types.h>       /* for __u??  */
21997 +
21998 +/* Minimal possible key: all components are zero. It is presumed that this is
21999 +   independent of key scheme. */
22000 +static const reiser4_key MINIMAL_KEY = {
22001 +       .el = {
22002 +               0ull,
22003 +               ON_LARGE_KEY(0ull,)
22004 +               0ull,
22005 +               0ull
22006 +       }
22007 +};
22008 +
22009 +/* Maximal possible key: all components are ~0. It is presumed that this is
22010 +   independent of key scheme. */
22011 +static const reiser4_key MAXIMAL_KEY = {
22012 +       .el = {
22013 +               __constant_cpu_to_le64(~0ull),
22014 +               ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22015 +               __constant_cpu_to_le64(~0ull),
22016 +               __constant_cpu_to_le64(~0ull)
22017 +       }
22018 +};
22019 +
22020 +/* Initialize key. */
22021 +void reiser4_key_init(reiser4_key * key/* key to init */)
22022 +{
22023 +       assert("nikita-1169", key != NULL);
22024 +       memset(key, 0, sizeof *key);
22025 +}
22026 +
22027 +/* minimal possible key in the tree. Return pointer to the static storage. */
22028 +const reiser4_key * reiser4_min_key(void)
22029 +{
22030 +       return &MINIMAL_KEY;
22031 +}
22032 +
22033 +/* maximum possible key in the tree. Return pointer to the static storage. */
22034 +const reiser4_key * reiser4_max_key(void)
22035 +{
22036 +       return &MAXIMAL_KEY;
22037 +}
22038 +
22039 +#if REISER4_DEBUG
22040 +/* debugging aid: print symbolic name of key type */
22041 +static const char *type_name(unsigned int key_type/* key type */)
22042 +{
22043 +       switch (key_type) {
22044 +       case KEY_FILE_NAME_MINOR:
22045 +               return "file name";
22046 +       case KEY_SD_MINOR:
22047 +               return "stat data";
22048 +       case KEY_ATTR_NAME_MINOR:
22049 +               return "attr name";
22050 +       case KEY_ATTR_BODY_MINOR:
22051 +               return "attr body";
22052 +       case KEY_BODY_MINOR:
22053 +               return "file body";
22054 +       default:
22055 +               return "unknown";
22056 +       }
22057 +}
22058 +
22059 +/* debugging aid: print human readable information about key */
22060 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
22061 +              const reiser4_key * key/* key to print */)
22062 +{
22063 +       /* turn bold on */
22064 +       /* printf ("\033[1m"); */
22065 +       if (key == NULL)
22066 +               printk("%s: null key\n", prefix);
22067 +       else {
22068 +               if (REISER4_LARGE_KEY)
22069 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22070 +                              get_key_locality(key),
22071 +                              get_key_type(key),
22072 +                              get_key_ordering(key),
22073 +                              get_key_band(key),
22074 +                              get_key_objectid(key), get_key_offset(key));
22075 +               else
22076 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22077 +                              get_key_locality(key),
22078 +                              get_key_type(key),
22079 +                              get_key_band(key),
22080 +                              get_key_objectid(key), get_key_offset(key));
22081 +               /*
22082 +                * if this is a key of directory entry, try to decode part of
22083 +                * a name stored in the key, and output it.
22084 +                */
22085 +               if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22086 +                       char buf[DE_NAME_BUF_LEN];
22087 +                       char *c;
22088 +
22089 +                       c = buf;
22090 +                       c = reiser4_unpack_string(get_key_ordering(key), c);
22091 +                       reiser4_unpack_string(get_key_fulloid(key), c);
22092 +                       printk("[%s", buf);
22093 +                       if (is_longname_key(key))
22094 +                               /*
22095 +                                * only part of the name is stored in the key.
22096 +                                */
22097 +                               printk("...]\n");
22098 +                       else {
22099 +                               /*
22100 +                                * whole name is stored in the key.
22101 +                                */
22102 +                               reiser4_unpack_string(get_key_offset(key), buf);
22103 +                               printk("%s]\n", buf);
22104 +                       }
22105 +               } else {
22106 +                       printk("[%s]\n", type_name(get_key_type(key)));
22107 +               }
22108 +       }
22109 +       /* turn bold off */
22110 +       /* printf ("\033[m\017"); */
22111 +}
22112 +
22113 +#endif
22114 +
22115 +/* Make Linus happy.
22116 +   Local variables:
22117 +   c-indentation-style: "K&R"
22118 +   mode-name: "LC"
22119 +   c-basic-offset: 8
22120 +   tab-width: 8
22121 +   fill-column: 120
22122 +   End:
22123 +*/
22124 diff -urN linux-2.6.35.orig/fs/reiser4/key.h linux-2.6.35/fs/reiser4/key.h
22125 --- linux-2.6.35.orig/fs/reiser4/key.h  1970-01-01 01:00:00.000000000 +0100
22126 +++ linux-2.6.35/fs/reiser4/key.h       2010-08-04 15:44:57.000000000 +0200
22127 @@ -0,0 +1,392 @@
22128 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by
22129 + * reiser4/README */
22130 +
22131 +/* Declarations of key-related data-structures and operations on keys. */
22132 +
22133 +#if !defined(__REISER4_KEY_H__)
22134 +#define __REISER4_KEY_H__
22135 +
22136 +#include "dformat.h"
22137 +#include "forward.h"
22138 +#include "debug.h"
22139 +
22140 +#include <linux/types.h>       /* for __u??  */
22141 +
22142 +/* Operations on keys in reiser4 tree */
22143 +
22144 +/* No access to any of these fields shall be done except via a
22145 +   wrapping macro/function, and that wrapping macro/function shall
22146 +   convert to little endian order. Compare keys will consider cpu byte order. */
22147 +
22148 +/* A storage layer implementation difference between a regular unix file body
22149 +   and its attributes is in the typedef below which causes all of the attributes
22150 +   of a file to be near in key to all of the other attributes for all of the
22151 +   files within that directory, and not near to the file itself. It is
22152 +   interesting to consider whether this is the wrong approach, and whether there
22153 +   should be no difference at all. For current usage patterns this choice is
22154 +   probably the right one.  */
22155 +
22156 +/* possible values for minor packing locality (4 bits required) */
22157 +typedef enum {
22158 +       /* file name */
22159 +       KEY_FILE_NAME_MINOR = 0,
22160 +       /* stat-data */
22161 +       KEY_SD_MINOR = 1,
22162 +       /* file attribute name */
22163 +       KEY_ATTR_NAME_MINOR = 2,
22164 +       /* file attribute value */
22165 +       KEY_ATTR_BODY_MINOR = 3,
22166 +       /* file body (tail or extent) */
22167 +       KEY_BODY_MINOR = 4,
22168 +} key_minor_locality;
22169 +
22170 +/* Everything stored in the tree has a unique key, which means that the tree is
22171 +   (logically) fully ordered by key. Physical order is determined by dynamic
22172 +   heuristics that attempt to reflect key order when allocating available space,
22173 +   and by the repacker. It is stylistically better to put aggregation
22174 +   information into the key. Thus, if you want to segregate extents from tails,
22175 +   it is better to give them distinct minor packing localities rather than
22176 +   changing block_alloc.c to check the node type when deciding where to allocate
22177 +   the node.
22178 +
22179 +   The need to randomly displace new directories and large files disturbs this
22180 +   symmetry unfortunately. However, it should be noted that this is a need that
22181 +   is not clearly established given the existence of a repacker. Also, in our
22182 +   current implementation tails have a different minor packing locality from
22183 +   extents, and no files have both extents and tails, so maybe symmetry can be
22184 +   had without performance cost after all. Symmetry is what we ship for now....
22185 +*/
22186 +
22187 +/* Arbitrary major packing localities can be assigned to objects using
22188 +   the reiser4(filenameA/..packing<=some_number) system call.
22189 +
22190 +   In reiser4, the creat() syscall creates a directory
22191 +
22192 +   whose default flow (that which is referred to if the directory is
22193 +   read as a file) is the traditional unix file body.
22194 +
22195 +   whose directory plugin is the 'filedir'
22196 +
22197 +   whose major packing locality is that of the parent of the object created.
22198 +
22199 +   The static_stat item is a particular commonly used directory
22200 +   compression (the one for normal unix files).
22201 +
22202 +   The filedir plugin checks to see if the static_stat item exists.
22203 +   There is a unique key for static_stat.  If yes, then it uses the
22204 +   static_stat item for all of the values that it contains.  The
22205 +   static_stat item contains a flag for each stat it contains which
22206 +   indicates whether one should look outside the static_stat item for its
22207 +   contents.
22208 +*/
22209 +
22210 +/* offset of fields in reiser4_key. Value of each enumerator is the index,
22211 +    within the key (thought of as an array of __u64's), of the element holding
22212 +    that field. Ordinals in parentheses apply to large keys (ordering is 2nd). */
22213 +typedef enum {
22214 +       /* major "locale", aka dirid. Sits in 1st element */
22215 +       KEY_LOCALITY_INDEX = 0,
22216 +       /* minor "locale", aka item type. Sits in 1st element */
22217 +       KEY_TYPE_INDEX = 0,
22218 +       ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22219 +           /* "object band". Sits in 2nd (3rd) element */
22220 +           KEY_BAND_INDEX,
22221 +       /* objectid. Shares the element with band: 2nd (3rd) */
22222 +       KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22223 +       /* full objectid. Whole element shared with band: 2nd (3rd) */
22224 +       KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22225 +       /* Offset. Sits in 3rd (4th) element */
22226 +       KEY_OFFSET_INDEX,
22227 +       /* Name hash. Shares the element with offset: 3rd (4th) */
22228 +       KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22229 +       KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22230 +       KEY_LAST_INDEX
22231 +} reiser4_key_field_index;
22232 +
22233 +/* key in reiser4 internal "balanced" tree. It is just array of three
22234 +    (four with large keys) 64bit integers in disk byte order (little-endian).
22235 +    This array is indexed by reiser4_key_field_index.  Each __u64 within
22236 +    this array is called "element". Logical key components encoded within
22237 +    elements are called "fields".
22238 +
22239 +    We declare this as union with second component dummy to suppress
22240 +    inconvenient array<->pointer casts implied in C. */
22241 +union reiser4_key {
22242 +       __le64 el[KEY_LAST_INDEX];
22243 +       int pad;
22244 +};
22245 +
22246 +/* bitmasks showing where within reiser4_key particular key is stored. */
22247 +/* major locality occupies higher 60 bits of the first element */
22248 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22249 +
22250 +/* minor locality occupies lower 4 bits of the first element */
22251 +#define KEY_TYPE_MASK 0xfull
22252 +
22253 +/* controversial band occupies higher 4 bits of the 2nd element */
22254 +#define KEY_BAND_MASK 0xf000000000000000ull
22255 +
22256 +/* objectid occupies lower 60 bits of the 2nd element */
22257 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22258 +
22259 +/* full 64bit objectid*/
22260 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22261 +
22262 +/* offset is a whole element by itself (3rd, or 4th with large keys) */
22263 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22264 +
22265 +/* ordering is whole second element */
22266 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22267 +
22268 +/* how many bits a field's value is shifted to the left inside its key element
22269 + */
22270 +typedef enum {
22271 +       KEY_LOCALITY_SHIFT = 4,
22272 +       KEY_TYPE_SHIFT = 0,
22273 +       KEY_BAND_SHIFT = 60,
22274 +       KEY_OBJECTID_SHIFT = 0,
22275 +       KEY_FULLOID_SHIFT = 0,
22276 +       KEY_OFFSET_SHIFT = 0,
22277 +       KEY_ORDERING_SHIFT = 0,
22278 +} reiser4_key_field_shift;
22279 +
22280 +static inline __u64
22281 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22282 +{
22283 +       assert("nikita-753", key != NULL);
22284 +       assert("nikita-754", off < KEY_LAST_INDEX);
22285 +       return le64_to_cpu(get_unaligned(&key->el[off]));
22286 +}
22287 +
22288 +static inline void
22289 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22290 +{
22291 +       assert("nikita-755", key != NULL);
22292 +       assert("nikita-756", off < KEY_LAST_INDEX);
22293 +       put_unaligned(cpu_to_le64(value), &key->el[off]);
22294 +}
22295 +
22296 +/* define get_key_<L>() and set_key_<L>() of type T for field KEY_<U>_* */
22297 +#define DEFINE_KEY_FIELD(L, U, T)                                      \
22298 +static inline T get_key_ ## L(const reiser4_key *key)                  \
22299 +{                                                                      \
22300 +       assert("nikita-750", key != NULL);                              \
22301 +       return (T) (get_key_el(key, KEY_ ## U ## _INDEX) &              \
22302 +                KEY_ ## U ## _MASK) >> KEY_ ## U ## _SHIFT;            \
22303 +}                                                                      \
22304 +                                                                       \
22305 +static inline void set_key_ ## L(reiser4_key * key, T loc)             \
22306 +{                                                                      \
22307 +       __u64 el;                                                       \
22308 +                                                                       \
22309 +       assert("nikita-752", key != NULL);                              \
22310 +                                                                       \
22311 +       el = get_key_el(key, KEY_ ## U ## _INDEX);                      \
22312 +       /* clear field bits in the key */                               \
22313 +       el &= ~KEY_ ## U ## _MASK;                                      \
22314 +       /* actually it should be                                        \
22315 +                                                                       \
22316 +          el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK;   \
22317 +                                                                       \
22318 +          but we trust user to never pass values that wouldn't fit     \
22319 +          into field. Clearing extra bits is one operation, but this   \
22320 +          function is time-critical.                                   \
22321 +          But check this in assertion. */                              \
22322 +       assert("nikita-759", ((loc << KEY_ ## U ## _SHIFT) &            \
22323 +               ~KEY_ ## U ## _MASK) == 0);                             \
22324 +       el |= (loc << KEY_ ## U ## _SHIFT);                             \
22325 +       set_key_el(key, KEY_ ## U ## _INDEX, el);                       \
22326 +}
22327 +
22328 +typedef __u64 oid_t;
22329 +
22330 +/* define get_key_locality(), set_key_locality() */
22331 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22332 +/* define get_key_type(), set_key_type() */
22333 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22334 +/* define get_key_band(), set_key_band() */
22335 +DEFINE_KEY_FIELD(band, BAND, __u64);
22336 +/* define get_key_objectid(), set_key_objectid() */
22337 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22338 +/* define get_key_fulloid(), set_key_fulloid() */
22339 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22340 +/* define get_key_offset(), set_key_offset() */
22341 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22342 +#if (REISER4_LARGE_KEY)
22343 +/* define get_key_ordering(), set_key_ordering() */
22344 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22345 +#else
22346 +static inline __u64 get_key_ordering(const reiser4_key * key)
22347 +{
22348 +       return 0;
22349 +}
22350 +
22351 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22352 +{
22353 +}
22354 +#endif
22355 +
22356 +/* key comparison result */
22357 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22358 +       EQUAL_TO = 0,           /* if keys are equal */
22359 +       GREATER_THAN = +1       /* if first key is greater than second */
22360 +} cmp_t;
22361 +
22362 +void reiser4_key_init(reiser4_key * key);
22363 +
22364 +/* minimal possible key in the tree. Return pointer to the static storage. */
22365 +extern const reiser4_key *reiser4_min_key(void);
22366 +extern const reiser4_key *reiser4_max_key(void);
22367 +
22368 +/* helper macro for keycmp() */
22369 +#define KEY_DIFF(k1, k2, field)                                                \
22370 +({                                                                     \
22371 +       typeof(get_key_ ## field(k1)) f1;                               \
22372 +       typeof(get_key_ ## field(k2)) f2;                               \
22373 +                                                                       \
22374 +       f1 = get_key_ ## field(k1);                                     \
22375 +       f2 = get_key_ ## field(k2);                                     \
22376 +                                                                       \
22377 +       (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22378 +})
22379 +
22380 +/* helper macro for keycmp() */
22381 +#define KEY_DIFF_EL(k1, k2, off)                                       \
22382 +({                                                                     \
22383 +       __u64 e1;                                                       \
22384 +       __u64 e2;                                                       \
22385 +                                                                       \
22386 +       e1 = get_key_el(k1, off);                                       \
22387 +       e2 = get_key_el(k2, off);                                       \
22388 +                                                                       \
22389 +       (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22390 +})
22391 +
22392 +/* compare `k1' and `k2'.  This function is a heart of "key allocation
22393 +    policy". All you need to implement new policy is to add yet another
22394 +    clause here. */
22395 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22396 +                          const reiser4_key * k2/* second key to compare */)
22397 +{
22398 +       cmp_t result;
22399 +
22400 +       /*
22401 +        * This function is the heart of reiser4 tree-routines. Key comparison
22402 +        * is among most heavily used operations in the file system.
22403 +        */
22404 +
22405 +       assert("nikita-439", k1 != NULL);
22406 +       assert("nikita-440", k2 != NULL);
22407 +
22408 +       /* there is no actual branch here: condition is compile time constant
22409 +        * and constant folding and propagation ensures that only one branch
22410 +        * is actually compiled in. */
22411 +
22412 +       if (REISER4_PLANA_KEY_ALLOCATION) {
22413 +               /* if physical order of fields in a key is identical
22414 +                  with logical order, we can implement key comparison
22415 +                  as three 64bit comparisons. */
22416 +               /* logical order of fields in plan-a:
22417 +                  locality->type->objectid->offset. */
22418 +               /* compare locality and type at once */
22419 +               result = KEY_DIFF_EL(k1, k2, 0);
22420 +               if (result == EQUAL_TO) {
22421 +                       /* compare objectid (and band if it's there) */
22422 +                       result = KEY_DIFF_EL(k1, k2, 1);
22423 +                       /* compare offset */
22424 +                       if (result == EQUAL_TO) {
22425 +                               result = KEY_DIFF_EL(k1, k2, 2);
22426 +                               if (REISER4_LARGE_KEY && result == EQUAL_TO)
22427 +                                       result = KEY_DIFF_EL(k1, k2, 3);
22428 +                       }
22429 +               }
22430 +       } else if (REISER4_3_5_KEY_ALLOCATION) {
22431 +               result = KEY_DIFF(k1, k2, locality);
22432 +               if (result == EQUAL_TO) {
22433 +                       result = KEY_DIFF(k1, k2, objectid);
22434 +                       if (result == EQUAL_TO) {
22435 +                               result = KEY_DIFF(k1, k2, type);
22436 +                               if (result == EQUAL_TO)
22437 +                                       result = KEY_DIFF(k1, k2, offset);
22438 +                       }
22439 +               }
22440 +       } else
22441 +               impossible("nikita-441", "Unknown key allocation scheme!");
22442 +       return result;
22443 +}
22444 +
22445 +/* true if @k1 equals @k2 */
22446 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22447 +                       const reiser4_key * k2/* second key to compare */)
22448 +{
22449 +       assert("nikita-1879", k1 != NULL);
22450 +       assert("nikita-1880", k2 != NULL);
22451 +       return !memcmp(k1, k2, sizeof *k1);
22452 +}
22453 +
22454 +/* true if @k1 is less than @k2 */
22455 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22456 +                       const reiser4_key * k2/* second key to compare */)
22457 +{
22458 +       assert("nikita-1952", k1 != NULL);
22459 +       assert("nikita-1953", k2 != NULL);
22460 +       return keycmp(k1, k2) == LESS_THAN;
22461 +}
22462 +
22463 +/* true if @k1 is less than or equal to @k2 */
22464 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22465 +                       const reiser4_key * k2/* second key to compare */)
22466 +{
22467 +       assert("nikita-1954", k1 != NULL);
22468 +       assert("nikita-1955", k2 != NULL);
22469 +       return keycmp(k1, k2) != GREATER_THAN;
22470 +}
22471 +
22472 +/* true if @k1 is greater than @k2 */
22473 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22474 +                       const reiser4_key * k2/* second key to compare */)
22475 +{
22476 +       assert("nikita-1959", k1 != NULL);
22477 +       assert("nikita-1960", k2 != NULL);
22478 +       return keycmp(k1, k2) == GREATER_THAN;
22479 +}
22480 +
22481 +/* true if @k1 is greater than or equal to @k2 */
22482 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22483 +                       const reiser4_key * k2/* second key to compare */)
22484 +{
22485 +       assert("nikita-1956", k1 != NULL);
22486 +       assert("nikita-1957", k2 != NULL);      /* October  4: sputnik launched
22487 +                                                * November 3: Laika */
22488 +       return keycmp(k1, k2) != LESS_THAN;
22489 +}
22490 +
22491 +static inline void prefetchkey(reiser4_key * key)
22492 +{
22493 +       prefetch(key);
22494 +       prefetch(&key->el[KEY_CACHELINE_END]);
22495 +}
22496 +
22497 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22498 +       1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22499 +/* size of a buffer suitable to hold human readable key representation */
22500 +#define KEY_BUF_LEN (80)
22501 +
22502 +#if REISER4_DEBUG
22503 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22504 +#else
22505 +#define reiser4_print_key(p, k) noop
22506 +#endif
22507 +
22508 +/* __REISER4_KEY_H__ */
22509 +#endif
22510 +
22511 +/* Make Linus happy.
22512 +   Local variables:
22513 +   c-indentation-style: "K&R"
22514 +   mode-name: "LC"
22515 +   c-basic-offset: 8
22516 +   tab-width: 8
22517 +   fill-column: 120
22518 +   End:
22519 +*/
22520 diff -urN linux-2.6.35.orig/fs/reiser4/ktxnmgrd.c linux-2.6.35/fs/reiser4/ktxnmgrd.c
22521 --- linux-2.6.35.orig/fs/reiser4/ktxnmgrd.c     1970-01-01 01:00:00.000000000 +0100
22522 +++ linux-2.6.35/fs/reiser4/ktxnmgrd.c  2010-08-04 15:44:57.000000000 +0200
22523 @@ -0,0 +1,215 @@
22524 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22525 +/* Transaction manager daemon. */
22526 +
22527 +/*
22528 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22529 + * needed/important for the following reasons:
22530 + *
22531 + *     1. in reiser4 atom is not committed immediately when last transaction
22532 + *     handle closes, unless atom is either too old or too large (see
22533 + *     atom_should_commit()). This is done to avoid committing too
22534 + *     frequently.
22535 + *
22536 + *     2. sometimes we don't want to commit atom when closing last transaction
22537 + *     handle even if it is old and fat enough. For example, because we are at
22538 + *     this point under directory semaphore, and committing would stall all
22539 + *     accesses to this directory.
22540 + *
22541 + * ktxnmgrd bides its time sleeping on a wait queue. When it awakes
22542 + * either due to (tunable) timeout or because it was explicitly woken up by
22543 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22544 + * eligible.
22545 + *
22546 + */
22547 +
22548 +#include "debug.h"
22549 +#include "txnmgr.h"
22550 +#include "tree.h"
22551 +#include "ktxnmgrd.h"
22552 +#include "super.h"
22553 +#include "reiser4.h"
22554 +
22555 +#include <linux/sched.h>       /* for struct task_struct */
22556 +#include <linux/wait.h>
22557 +#include <linux/suspend.h>
22558 +#include <linux/kernel.h>
22559 +#include <linux/writeback.h>
22560 +#include <linux/kthread.h>
22561 +#include <linux/freezer.h>
22562 +
22563 +static int scan_mgr(struct super_block *);
22564 +
22565 +/*
22566 + * change current->comm so that ps, top, and friends will see changed
22567 + * state. This serves no useful purpose whatsoever, but also costs nothing. May
22568 + * be it will make lonely system administrator feeling less alone at 3 A.M.
22569 + */
22570 +#define set_comm(state)                                                \
22571 +       snprintf(current->comm, sizeof(current->comm),                  \
22572 +                 "%s:%s:%s", __FUNCTION__, (super)->s_id, (state))
22573 +
22574 +/**
22575 + * ktxnmgrd - kernel txnmgr daemon
22576 + * @arg: pointer to super block
22577 + *
22578 + * The background transaction manager daemon, run as a kernel thread started by
22579 + * reiser4_init_ktxnmgrd(). Returns 0 after kthread_should_stop() reports true.
22580 + */
22581 +static int ktxnmgrd(void *arg)
22582 +{
22583 +       struct super_block *super;
22584 +       ktxnmgrd_context *ctx;
22585 +       txn_mgr *mgr;
22586 +       int done = 0;
22587 +
22588 +       super = arg;
22589 +       mgr = &get_super_private(super)->tmgr;
22590 +
22591 +       /*
22592 +        * do_fork() just copies task_struct into the new thread. The inherited
22593 +        * ->journal_info shouldn't be kept of course, so it is cleared here.
22594 +        * This shouldn't be a problem for the rest of the code though.
22595 +        */
22596 +       current->journal_info = NULL;
22597 +       ctx = mgr->daemon;
22598 +       while (1) {
22599 +               try_to_freeze();
22600 +               set_comm("wait");
22601 +               {
22602 +                       DEFINE_WAIT(__wait);
22603 +
22604 +                       prepare_to_wait(&ctx->wait, &__wait,
22605 +                                       TASK_INTERRUPTIBLE);
22606 +                       if (kthread_should_stop())
22607 +                               done = 1;
22608 +                       else
22609 +                               schedule_timeout(ctx->timeout);
22610 +                       finish_wait(&ctx->wait, &__wait);
22611 +               }
22612 +               if (done)
22613 +                       break;
22614 +               set_comm("run");
22615 +               spin_lock(&ctx->guard);
22616 +               /*
22617 +                * wait timed out or ktxnmgrd was woken up by explicit request
22618 +                * to commit something. Scan list of atoms in txnmgr and look
22619 +                * for too old atoms.
22620 +                */
22621 +               do {
22622 +                       ctx->rescan = 0;
22623 +                       scan_mgr(super);
22624 +                       spin_lock(&ctx->guard);
22625 +                       if (ctx->rescan) {
22626 +                               /*
22627 +                                * the list could be modified while ctx
22628 +                                * spinlock was released, we have to repeat
22629 +                                * scanning from the beginning
22630 +                                */
22631 +                               break;
22632 +                       }
22633 +               } while (ctx->rescan);
22634 +               spin_unlock(&ctx->guard);
22635 +       }
22636 +       return 0;
22637 +}
22638 +
22639 +#undef set_comm
22640 +
22641 +/**
22642 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22643 + * @super: pointer to super block
22644 + *
22645 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22646 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22647 + */
22648 +int reiser4_init_ktxnmgrd(struct super_block *super)
22649 +{
22650 +       txn_mgr *mgr;
22651 +       ktxnmgrd_context *ctx;
22652 +
22653 +       mgr = &get_super_private(super)->tmgr;
22654 +
22655 +       assert("zam-1014", mgr->daemon == NULL);
22656 +
22657 +       ctx = kzalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22658 +       if (!ctx)
22659 +               return RETERR(-ENOMEM);
22660 +
22661 +       assert("nikita-2442", ctx != NULL);
22662 +
22663 +       init_waitqueue_head(&ctx->wait);
22664 +
22665 +       /*kcond_init(&ctx->startup);*/
22666 +       spin_lock_init(&ctx->guard);
22667 +       ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22668 +       ctx->rescan = 1;
22669 +       mgr->daemon = ctx;
22670 +
22671 +       ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22672 +       if (IS_ERR(ctx->tsk)) {
22673 +               int ret = PTR_ERR(ctx->tsk);
22674 +               mgr->daemon = NULL;
22675 +               kfree(ctx);
22676 +               return RETERR(ret);
22677 +       }
22678 +       return 0;
22679 +}
22680 +
22681 +void ktxnmgrd_kick(txn_mgr *mgr)
22682 +{
22683 +       assert("nikita-3234", mgr != NULL);
22684 +       assert("nikita-3235", mgr->daemon != NULL);
22685 +       wake_up(&mgr->daemon->wait);     /* wake whoever sleeps on the daemon's ->wait queue */
22686 +}
22687 +
22688 +int is_current_ktxnmgrd(void)
22689 +{
22690 +       return current == get_current_super_private()->tmgr.daemon->tsk; /* is the caller the daemon task? */
22691 +}
22692 +
22693 +/**
22694 + * scan_mgr - commit the atoms of one file system that are due for commit
22695 + * @super: super block whose transaction manager is scanned
22696 + *
22697 + * Runs commit_some_atoms() inside a temporary on-stack reiser4 context.
22698 + */
22699 +static int scan_mgr(struct super_block *super)
22700 +{
22701 +       reiser4_context stack_ctx;
22702 +       txn_mgr *tmgr;
22703 +       int result;
22704 +
22705 +       init_stack_context(&stack_ctx, super);
22706 +       tmgr = &get_super_private(super)->tmgr;
22707 +       result = commit_some_atoms(tmgr);
22708 +       reiser4_exit_context(&stack_ctx);
22709 +       return result;
22710 +}
22711 +
22712 +/**
22713 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22714 + * @super: super block whose transaction manager daemon is shut down
22715 + *
22716 + * This is called on umount. Stops ktxnmgrd and frees its context.
22717 + */
22718 +void reiser4_done_ktxnmgrd(struct super_block *super)
22719 +{
22720 +       txn_mgr *mgr;
22721 +
22722 +       mgr = &get_super_private(super)->tmgr;
22723 +       assert("zam-1012", mgr->daemon != NULL);
22724 +
22725 +       kthread_stop(mgr->daemon->tsk);
22726 +       kfree(mgr->daemon);
22727 +       mgr->daemon = NULL;
22728 +}
22729 +
22730 +/*
22731 + * Local variables:
22732 + * c-indentation-style: "K&R"
22733 + * mode-name: "LC"
22734 + * c-basic-offset: 8
22735 + * tab-width: 8
22736 + * fill-column: 120
22737 + * End:
22738 + */
22739 diff -urN linux-2.6.35.orig/fs/reiser4/ktxnmgrd.h linux-2.6.35/fs/reiser4/ktxnmgrd.h
22740 --- linux-2.6.35.orig/fs/reiser4/ktxnmgrd.h     1970-01-01 01:00:00.000000000 +0100
22741 +++ linux-2.6.35/fs/reiser4/ktxnmgrd.h  2010-08-04 15:44:57.000000000 +0200
22742 @@ -0,0 +1,52 @@
22743 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22744 + * reiser4/README */
22745 +
22746 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22747 +
22748 +#ifndef __KTXNMGRD_H__
22749 +#define __KTXNMGRD_H__
22750 +
22751 +#include "txnmgr.h"
22752 +
22753 +#include <linux/fs.h>
22754 +#include <linux/wait.h>
22755 +#include <linux/completion.h>
22756 +#include <linux/spinlock.h>
22757 +#include <asm/atomic.h>
22758 +#include <linux/sched.h>       /* for struct task_struct */
22759 +
22760 +/* All data necessary to start up, shut down and communicate with ktxnmgrd
22761 + * are kept in this structure. */
22762 +struct ktxnmgrd_context {
22763 +       /* wait queue head on which ktxnmgrd sleeps; ktxnmgrd_kick() wakes it */
22764 +       wait_queue_head_t wait;
22765 +       /* spin lock protecting all fields of this structure */
22766 +       spinlock_t guard;
22767 +       /* timeout of sleeping on ->wait (REISER4_TXNMGR_TIMEOUT on init) */
22768 +       signed long timeout;
22769 +       /* kernel thread running ktxnmgrd, as returned by kthread_run() */
22770 +       struct task_struct *tsk;
22771 +       /* list of all file systems served by this ktxnmgrd. NOTE(review): not set up by reiser4_init_ktxnmgrd() */
22772 +       struct list_head queue;
22773 +       /* should ktxnmgrd repeat scanning of atoms? (set to 1 on init) */
22774 +       unsigned int rescan:1;
22775 +};
22776 +
22777 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22778 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22779 +
22780 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22781 +extern int is_current_ktxnmgrd(void);
22782 +
22783 +/* __KTXNMGRD_H__ */
22784 +#endif
22785 +
22786 +/* Make Linus happy.
22787 +   Local variables:
22788 +   c-indentation-style: "K&R"
22789 +   mode-name: "LC"
22790 +   c-basic-offset: 8
22791 +   tab-width: 8
22792 +   fill-column: 120
22793 +   End:
22794 +*/
22795 diff -urN linux-2.6.35.orig/fs/reiser4/lock.c linux-2.6.35/fs/reiser4/lock.c
22796 --- linux-2.6.35.orig/fs/reiser4/lock.c 1970-01-01 01:00:00.000000000 +0100
22797 +++ linux-2.6.35/fs/reiser4/lock.c      2010-08-04 15:44:57.000000000 +0200
22798 @@ -0,0 +1,1237 @@
22799 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22800 + * reiser4/README */
22801 +
22802 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22803 +   order.  V4 balances the tree from the bottom up, and searches the tree from
22804 +   the top down, and that is really the way we want it, so tradition won't work
22805 +   for us.
22806 +
22807 +   Instead we have two lock orderings, a high priority lock ordering, and a low
22808 +   priority lock ordering.  Each node in the tree has a lock in its znode.
22809 +
22810 +   Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22811 +   has a set (maybe empty) of already locked nodes ("process locked set"). Each
22812 +   process may have a pending lock request to a node locked by another process.
22813 +   Note: we lock and unlock, but do not transfer locks: it is possible
22814 +   transferring locks instead would save some bus locking....
22815 +
22816 +   Deadlock occurs when we have a loop constructed from process locked sets and
22817 +   lock request vectors.
22818 +
22819 +   NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22820 +   memory is extended with "znodes" with which we connect nodes with their left
22821 +   and right neighbors using sibling pointers stored in the znodes.  When we
22822 +   perform balancing operations we often go from left to right and from right to
22823 +   left.
22824 +
22825 +   +-P1-+          +-P3-+
22826 +   |+--+|   V1     |+--+|
22827 +   ||N1|| -------> ||N3||
22828 +   |+--+|          |+--+|
22829 +   +----+          +----+
22830 +     ^               |
22831 +     |V2             |V3
22832 +     |               v
22833 +   +---------P2---------+
22834 +   |+--+            +--+|
22835 +   ||N2|  --------  |N4||
22836 +   |+--+            +--+|
22837 +   +--------------------+
22838 +
22839 +   We solve this by ensuring that only low priority processes lock in top to
22840 +   bottom order and from right to left, and high priority processes lock from
22841 +   bottom to top and left to right.
22842 +
22843 +   ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22844 +   kill those damn busy loops.
22845 +   ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22846 +   stage) cannot be ordered that way. There are no rules what nodes can belong
22847 +   to the atom and what nodes cannot.  We cannot define what is right or left
22848 +   direction, what is top or bottom.  We can take immediate parent or side
22849 +   neighbor of one node, but nobody guarantees that, say, left neighbor node is
22850 +   not a far right neighbor for other nodes from the same atom.  It breaks
22851 +   deadlock avoidance rules and hi-low priority locking cannot be applied for
22852 +   atom locks.
22853 +
22854 +   How does it help to avoid deadlocks ?
22855 +
22856 +   Suppose we have a deadlock with n processes. Processes from one priority
22857 +   class never deadlock because they take locks in one consistent
22858 +   order.
22859 +
22860 +   So, any possible deadlock loop must have low priority as well as high
22861 +   priority processes.  There are no other lock priority levels except low and
22862 +   high. We know that any deadlock loop contains at least one node locked by a
22863 +   low priority process and requested by a high priority process. If this
22864 +   situation is caught and resolved it is sufficient to avoid deadlocks.
22865 +
22866 +   V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22867 +
22868 +   The deadlock prevention algorithm is based on comparing
22869 +   priorities of node owners (processes which keep znode locked) and
22870 +   requesters (processes which want to acquire a lock on znode).  We
22871 +   implement a scheme where low-priority owners yield locks to
22872 +   high-priority requesters. We created a signal passing system that
22873 +   is used to ask low-priority processes to yield one or more locked
22874 +   znodes.
22875 +
22876 +   The condition when a znode needs to change its owners is described by the
22877 +   following formula:
22878 +
22879 +   #############################################
22880 +   #                                           #
22881 +   # (number of high-priority requesters) >  0 #
22882 +   #                AND                        #
22883 +   # (numbers of high-priority owners)    == 0 #
22884 +   #                                           #
22885 +   #############################################
22886 +
22887 +   Note that a low-priority process delays node releasing if another
22888 +   high-priority process owns this node.  So, slightly more strictly speaking,
22889 +   to have a deadlock capable cycle you must have a loop in which a high
22890 +   priority process is waiting on a low priority process to yield a node, which
22891 +   is slightly different from saying a high priority process is waiting on a
22892 +   node owned by a low priority process.
22893 +
22894 +   It is enough to avoid deadlocks if we prevent any low-priority process from
22895 +   falling asleep if its locked set contains a node which satisfies the
22896 +   deadlock condition.
22897 +
22898 +   That condition is implicitly or explicitly checked in all places where new
22899 +   high-priority requests may be added or removed from node request queue or
22900 +   high-priority process takes or releases a lock on node. The main
22901 +   goal of these checks is to never lose the moment when node becomes "has
22902 +   wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22903 +   at that time.
22904 +
22905 +   The information about received signals is stored in the per-process
22906 +   structure (lock stack) and analyzed before a low-priority process goes to
22907 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22908 +   sleeping process up and forces him to re-check lock status and received
22909 +   signal info. If "must-yield-this-lock" signals were received the locking
22910 +   primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22911 +
22912 +   V4 LOCKING DRAWBACKS
22913 +
22914 +   If we have already balanced on one level, and we are propagating our changes
22915 +   upward to a higher level, it could be very messy to surrender all locks on
22916 +   the lower level because we put so much computational work into it, and
22917 +   reverting them to their state before they were locked might be very complex.
22918 +   We also don't want to acquire all locks before performing balancing because
22919 +   that would either be almost as much work as the balancing, or it would be
22920 +   too conservative and lock too much.  We want balancing to be done only at
22921 +   high priority.  Yet, we might want to go to the left one node and use some
22922 +   of its empty space... So we make one attempt at getting the node to the left
22923 +   using try_lock, and if it fails we do without it, because we didn't really
22924 +   need it, it was only a nice to have.
22925 +
22926 +   LOCK STRUCTURES DESCRIPTION
22927 +
22928 +   The following data structures are used in the reiser4 locking
22929 +   implementation:
22930 +
22931 +   All fields related to long-term locking are stored in znode->lock.
22932 +
22933 +   The lock stack is a per thread object.  It owns all znodes locked by the
22934 +   thread. One znode may be locked by several threads in case of read lock or
22935 +   one znode may be write locked by one thread several times. The special link
22936 +   objects (lock handles) support n<->m relation between znodes and lock
22937 +   owners.
22938 +
22939 +   <Thread 1>                       <Thread 2>
22940 +
22941 +   +---------+                     +---------+
22942 +   |  LS1    |                     |  LS2    |
22943 +   +---------+                     +---------+
22944 +       ^                                ^
22945 +       |---------------+                +----------+
22946 +       v               v                v          v
22947 +   +---------+     +---------+     +---------+   +---------+
22948 +   |  LH1    |     |   LH2   |     |  LH3    |   |   LH4   |
22949 +   +---------+     +---------+     +---------+   +---------+
22950 +       ^                   ^            ^           ^
22951 +       |                   +------------+           |
22952 +       v                   v                        v
22953 +   +---------+     +---------+                  +---------+
22954 +   |  Z1     |     |   Z2    |                  |  Z3     |
22955 +   +---------+     +---------+                  +---------+
22956 +
22957 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22958 +   picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22959 +   LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it.  Znode
22960 +   Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22961 +   list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22962 +   is locked (for read) twice by different threads and two lock handles are on
22963 +   its list. Each lock handle represents a single relation of a locking of a
22964 +   znode by a thread. Locking of a znode is an establishing of a locking
22965 +   relation between the lock stack and the znode by adding of a new lock handle
22966 +   to a list of lock handles, the lock stack.  The lock stack links all lock
22967 +   handles for all znodes locked by the lock stack.  The znode list groups all
22968 +   lock handles for all locks stacks which locked the znode.
22969 +
22970 +   Yet another relation may exist between znode and lock owners.  If the lock
22971 +   procedure cannot immediately take a lock on an object it adds the lock owner
22972 +   to a special `requestors' list belonging to the znode.  That list is a
22973 +   queue of pending lock requests.  Because one lock owner may request
22974 +   only one lock object at a time, it is a 1->n relation between lock objects
22975 +   and lock owners, implemented as described above. Full information
22976 +   (priority, pointers to lock and link objects) about each lock request is
22977 +   stored in the lock owner structure in its `request' field.
22978 +
22979 +   SHORT_TERM LOCKING
22980 +
22981 +   This is a list of primitive operations over lock stacks / lock handles /
22982 +   znodes and locking descriptions for them.
22983 +
22984 +   1. locking / unlocking which is done by two list insertion/deletion, one
22985 +      to/from znode's list of lock handles, another one is to/from lock stack's
22986 +      list of lock handles.  The first insertion is protected by
22987 +      znode->lock.guard spinlock.  The list owned by the lock stack can be
22988 +      modified only by thread who owns the lock stack and nobody else can
22989 +      modify/read it. There is nothing to be protected by a spinlock or
22990 +      something else.
22991 +
22992 +   2. adding/removing a lock request to/from znode requesters list. The rule is
22993 +      that znode->lock.guard spinlock should be taken for this.
22994 +
22995 +   3. we can traverse list of lock handles and use references to lock stacks who
22996 +      locked given znode if znode->lock.guard spinlock is taken.
22997 +
22998 +   4. If a lock stack is associated with a znode as a lock requestor or lock
22999 +      owner its existence is guaranteed by znode->lock.guard spinlock.  Some of
23000 +      its (lock stack's) fields should be protected from being accessed in
23001 +      parallel by two or more threads. Please look at the lock_stack structure
23002 +      definition for information on how those fields are protected. */
23003 +
23004 +/* Znode lock and capturing intertwining. */
23005 +/* In the current implementation we capture formatted nodes before locking
23006 +   them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request
23007 +   precedes locking requests.  The longterm_lock_znode function unconditionally
23008 +   captures the znode before even checking the locking conditions.
23009 +
23010 +   Another variant is to capture znode after locking it.  It was not tested, but
23011 +   at least one deadlock condition is supposed to be there.  One thread has
23012 +   locked a znode (Node-1) and calls reiser4_try_capture() for it.
23013 +   reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
23014 +   Second thread is a flushing thread, its current atom is the atom Node-1
23015 +   belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
23016 +   is locked by the first thread.  The described situation is a deadlock. */
23017 +
23018 +#include "debug.h"
23019 +#include "txnmgr.h"
23020 +#include "znode.h"
23021 +#include "jnode.h"
23022 +#include "tree.h"
23023 +#include "plugin/node/node.h"
23024 +#include "super.h"
23025 +
23026 +#include <linux/spinlock.h>
23027 +
23028 +#if REISER4_DEBUG
23029 +static int request_is_deadlock_safe(znode * , znode_lock_mode,
23030 +                                   znode_lock_request);
23031 +#endif
23032 +
23033 +/* Returns the lock owner (lock stack) of the current thread; it is embedded in the current reiser4 context */
23034 +lock_stack *get_current_lock_stack(void)
23035 +{
23036 +       return &get_current_context()->stack;
23037 +}
23038 +
23039 +/* Wakes up all owners of @node informing them about possible deadlock; owners are not filtered by priority here */
23040 +static void wake_up_all_lopri_owners(znode * node)
23041 +{
23042 +       lock_handle *handle;
23043 +
23044 +       assert_spin_locked(&(node->lock.guard));
23045 +       list_for_each_entry(handle, &node->lock.owners, owners_link) {
23046 +               assert("nikita-1832", handle->node == node);
23047 +               /* signal each handle only once, counting it in owner->nr_signaled */
23048 +               if (!handle->signaled) {
23049 +                       handle->signaled = 1;
23050 +                       atomic_inc(&handle->owner->nr_signaled);
23051 +                       /* Wake up the process owning this lock stack */
23052 +                       reiser4_wake_up(handle->owner);
23053 +               }
23054 +       }
23055 +}
23056 +
23057 +/* Adds a lock to a lock owner: fills in the link (@handle) between @owner and
23058 +   @node and puts it on the two lists every link is on (the doubly linked list
23059 +   that forms the lock_stack, and the doubly linked list of links attached to
23060 +   the lock).  The caller must hold node->lock.guard; @handle must be unused.
23061 +*/
23062 +static inline void
23063 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
23064 +{
23065 +       assert("jmacd-810", handle->owner == NULL);
23066 +       assert_spin_locked(&(node->lock.guard));
23067 +
23068 +       handle->owner = owner;
23069 +       handle->node = node;
23070 +
23071 +       assert("reiser4-4",
23072 +              ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23073 +
23074 +       /* add lock handle to the end of lock_stack's list of locks */
23075 +       list_add_tail(&handle->locks_link, &owner->locks);
23076 +       ON_DEBUG(owner->nr_locks++);
23077 +       reiser4_ctx_gfp_mask_set();
23078 +
23079 +       /* add lock handle to the head of znode's list of owners; a fresh link starts out unsignaled */
23080 +       list_add(&handle->owners_link, &node->lock.owners);
23081 +       handle->signaled = 0;
23082 +}
23083 +
23084 +/* Breaks a relation between a lock and its owner: takes @handle off both lists; needs handle->node->lock.guard held */
23085 +static inline void unlink_object(lock_handle * handle)
23086 +{
23087 +       assert("zam-354", handle->owner != NULL);
23088 +       assert("nikita-1608", handle->node != NULL);
23089 +       assert_spin_locked(&(handle->node->lock.guard));
23090 +       assert("nikita-1829", handle->owner == get_current_lock_stack());
23091 +       assert("reiser4-5", handle->owner->nr_locks > 0);
23092 +
23093 +       /* remove lock handle from lock_stack's list of locks */
23094 +       list_del(&handle->locks_link);
23095 +       ON_DEBUG(handle->owner->nr_locks--);
23096 +       reiser4_ctx_gfp_mask_set();
23097 +       assert("reiser4-6",
23098 +              ergo(list_empty_careful(&handle->owner->locks),
23099 +                   handle->owner->nr_locks == 0));
23100 +       /* remove lock handle from znode's list of owners */
23101 +       list_del(&handle->owners_link);
23102 +       /* a NULL ->node indicates that the lock handle is free now */
23103 +       handle->node = NULL;
23104 +#if REISER4_DEBUG
23105 +       INIT_LIST_HEAD(&handle->locks_link);
23106 +       INIT_LIST_HEAD(&handle->owners_link);
23107 +       handle->owner = NULL;
23108 +#endif
23109 +}
23110 +
23111 +/* Actually locks an object knowing that we are able to do this; owner->request describes the lock being taken */
23112 +static void lock_object(lock_stack * owner)
23113 +{
23114 +       struct lock_request *request;
23115 +       znode *node;
23116 +
23117 +       request = &owner->request;
23118 +       node = request->node;
23119 +       assert_spin_locked(&(node->lock.guard));
23120 +       if (request->mode == ZNODE_READ_LOCK) {
23121 +               node->lock.nr_readers++;
23122 +       } else {
23123 +               /* check that we do not switch from read to write lock */
23124 +               assert("nikita-1840", node->lock.nr_readers <= 0);
23125 +               /* We allow recursive locking; a node can be locked several
23126 +                  times for write by the same process (nr_readers goes more negative) */
23127 +               node->lock.nr_readers--;
23128 +       }
23129 +
23130 +       link_object(request->handle, owner, node);
23131 +
23132 +       if (owner->curpri)
23133 +               node->lock.nr_hipri_owners++;
23134 +}
23135 +
23136 +/* Check for recursive write locking: true if @owner already owns the first handle on the node it requests */
23137 +static int recursive(lock_stack * owner)
23138 +{
23139 +       int ret;
23140 +       znode *node;
23141 +       lock_handle *lh;
23142 +
23143 +       node = owner->request.node;
23144 +
23145 +       /* Owners list is not empty for a locked node */
23146 +       assert("zam-314", !list_empty_careful(&node->lock.owners));
23147 +       assert("nikita-1841", owner == get_current_lock_stack());
23148 +       assert_spin_locked(&(node->lock.guard));
23149 +
23150 +       lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23151 +       ret = (lh->owner == owner);
23152 +
23153 +       /* Recursive read locking should be done the usual way */
23154 +       assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23155 +       /* mixing of read/write locks is not allowed */
23156 +       assert("zam-341", !ret || znode_is_wlocked(node));
23157 +
23158 +       return ret;
23159 +}
23160 +
23161 +#if REISER4_DEBUG
23162 +/* Returns true if the calling thread holds a lock on @node, i.e. a handle for @node is on its lock stack's list. */
23163 +int znode_is_any_locked(const znode * node)
23164 +{
23165 +       lock_handle *handle;
23166 +       lock_stack *stack;
23167 +       int ret;
23168 +
23169 +       if (!znode_is_locked(node))
23170 +               return 0;
23171 +
23172 +       stack = get_current_lock_stack();
23173 +
23174 +       spin_lock_stack(stack);
23175 +
23176 +       ret = 0;
23177 +
23178 +       list_for_each_entry(handle, &stack->locks, locks_link) {
23179 +               if (handle->node == node) {
23180 +                       ret = 1;
23181 +                       break;
23182 +               }
23183 +       }
23184 +
23185 +       spin_unlock_stack(stack);
23186 +
23187 +       return ret;
23188 +}
23189 +
23190 +#endif
23191 +
23192 +/* Tells whether the calling thread is the one holding a write lock on @node. */
23193 +int znode_is_write_locked(const znode * node)
23194 +{
23195 +       lock_handle *first;
23196 +
23197 +       assert("jmacd-8765", node != NULL);
23198 +
23199 +       /* nobody holds a write lock, so the calling thread does not either */
23200 +       if (!znode_is_wlocked(node))
23201 +               return 0;
23202 +
23203 +       /*
23204 +        * Every owner handle of a write locked znode refers to one and the
23205 +        * same lock stack, therefore looking at the handle at the head of the
23206 +        * znode's owner list is enough to identify the write lock holder.
23207 +        */
23208 +       first = list_entry(node->lock.owners.next, lock_handle, owners_link);
23209 +
23210 +       if (first->owner != get_current_lock_stack())
23211 +               return 0;
23212 +       return 1;
23213 +}
23214 +
23215 +/* The "deadlock" condition is the heart of the reiser4 locking
23216 +   implementation: a znode satisfies it when it has at least one pending
23217 +   high-priority request and no high-priority owner.  It is evaluated
23218 +   explicitly through check_deadlock_condition() or implicitly wherever the
23219 +   znode lock state (set of owners and request queue) changes, and it triggers
23220 +   the procedure of passing the object from low priority owner(s) to high
23221 +   priority one(s).
23222 +
23223 +   That procedure passes an event (sets the lock_handle->signaled flag),
23224 +   counts the event in the nr_signaled field of the owner's lock stack and
23225 +   wakes the owner's process up.  Must be called with node->lock.guard held. */
23226 +static inline int check_deadlock_condition(znode * node)
23227 +{
23228 +       assert_spin_locked(&node->lock.guard);
23229 +       return node->lock.nr_hipri_owners == 0 &&
23230 +              node->lock.nr_hipri_requests > 0;
23231 +}
23232 +
23233 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23234 +{
23235 +       /* the condition concerns read lock requests only */
23236 +       if (mode != ZNODE_READ_LOCK)
23237 +               return 0;
23238 +       return node->lock.nr_readers >= 0 && node->lock.nr_hipri_write_requests > 0;
23239 +}
23240 +
23241 +/* checks lock/request compatibility: 0 if the request can be granted, -EINVAL if the node is dying, else -E_REPEAT */
23242 +static int can_lock_object(lock_stack * owner)
23243 +{
23244 +       znode *node = owner->request.node;
23245 +
23246 +       assert_spin_locked(&(node->lock.guard));
23247 +
23248 +       /* See if the node is dying (JNODE_IS_DYING is set). */
23249 +       if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23250 +               return RETERR(-EINVAL);
23251 +
23252 +       /* Do not ever try to take a lock if we are going in low priority
23253 +          direction and the node has a high priority request without high
23254 +          priority owners. */
23255 +       if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23256 +               return RETERR(-E_REPEAT);
23257 +       if (unlikely(owner->curpri &&
23258 +                    check_livelock_condition(node, owner->request.mode)))
23259 +               return RETERR(-E_REPEAT);
23260 +       if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23261 +               return RETERR(-E_REPEAT);
23262 +       return 0;
23263 +}
23264 +
23265 +/* Sets high priority for the process. It clears the "signaled" flags because
23266 +   a znode locked by a high-priority process can't satisfy our "deadlock
23267 +   condition". */
23268 +static void set_high_priority(lock_stack * owner)
23269 +{
23270 +       assert("nikita-1846", owner == get_current_lock_stack());
23271 +       /* Do nothing if current priority is already high */
23272 +       if (!owner->curpri) {
23273 +               /* We don't need locking for the owner->locks list, because this
23274 +                * function is only called with the lock stack of the current
23275 +                * thread, and no other thread can play with owner->locks list
23276 +                * and/or change ->node pointers of lock handles in this list.
23277 +                *
23278 +                * (Interrupts also are not involved.)
23279 +                */
23280 +               lock_handle *item = list_entry(owner->locks.next, lock_handle,
23281 +                                              locks_link);
23282 +               while (&owner->locks != &item->locks_link) {
23283 +                       znode *node = item->node;
23284 +
23285 +                       spin_lock_zlock(&node->lock);
23286 +
23287 +                       node->lock.nr_hipri_owners++;
23288 +
23289 +                       /* we can safely set signaled to zero, because the
23290 +                          previous statement (nr_hipri_owners ++) guarantees
23291 +                          that signaled will never be set again. */
23292 +                       item->signaled = 0;
23293 +                       spin_unlock_zlock(&node->lock);
23294 +
23295 +                       item = list_entry(item->locks_link.next, lock_handle,
23296 +                                         locks_link);
23297 +               }
23298 +               owner->curpri = 1;
23299 +               atomic_set(&owner->nr_signaled, 0);
23300 +       }
23301 +}
23302 +
23303 +/* Sets a low priority to the process and re-checks the deadlock condition on every znode it holds locked. */
23304 +static void set_low_priority(lock_stack * owner)
23305 +{
23306 +       assert("nikita-3075", owner == get_current_lock_stack());
23307 +       /* Do nothing if current priority is already low */
23308 +       if (owner->curpri) {
23309 +               /* scan all locks (lock handles) held by @owner, which is
23310 +                  actually the current thread, and check whether we are
23311 +                  reaching a deadlock possibility anywhere.
23312 +                */
23313 +               lock_handle *handle = list_entry(owner->locks.next, lock_handle,
23314 +                                                locks_link);
23315 +               while (&owner->locks != &handle->locks_link) {
23316 +                       znode *node = handle->node;
23317 +                       spin_lock_zlock(&node->lock);
23318 +                       /* this thread was a hipri owner of @node until now, so
23319 +                          nr_hipri_owners has to be greater than zero. */
23320 +                       assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23321 +                       node->lock.nr_hipri_owners--;
23322 +                       /* If we have a deadlock condition, adjust the nr_signaled
23323 +                          field. It is enough to set the "signaled" flag only for
23324 +                          the current process; other low-pri owners will be
23325 +                          signaled and woken up after the current process unlocks
23326 +                          this object and any high-priority requestor takes
23327 +                          control. */
23328 +                       if (check_deadlock_condition(node)
23329 +                           && !handle->signaled) {
23330 +                               handle->signaled = 1;
23331 +                               atomic_inc(&owner->nr_signaled);
23332 +                       }
23333 +                       spin_unlock_zlock(&node->lock);
23334 +                       handle = list_entry(handle->locks_link.next,
23335 +                                           lock_handle, locks_link);
23336 +               }
23337 +               owner->curpri = 0;
23338 +       }
23339 +}
23340 +
23341 +static void remove_lock_request(lock_stack * requestor)
23342 +{
23343 +       zlock * lock = &requestor->request.node->lock;
23344 +
23345 +       if (requestor->curpri) {        /* high-priority request: undo its accounting on the zlock */
23346 +               assert("nikita-1838", lock->nr_hipri_requests > 0);
23347 +               lock->nr_hipri_requests--;
23348 +               if (requestor->request.mode == ZNODE_WRITE_LOCK)
23349 +                       lock->nr_hipri_write_requests--;
23350 +       }
23351 +       list_del(&requestor->requestors_link);  /* take @requestor off the requestors list */
23352 +}
23353 +
23354 +static void invalidate_all_lock_requests(znode * node)
23355 +{
23356 +       lock_stack *requestor, *tmp;
23357 +
23358 +       assert_spin_locked(&(node->lock.guard));
23359 +       /* fail every pending lock request on @node with -EINVAL and wake its requestor up */
23360 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
23361 +                                requestors_link) {
23362 +               remove_lock_request(requestor);
23363 +               requestor->request.ret_code = -EINVAL;
23364 +               reiser4_wake_up(requestor);
23365 +               requestor->request.mode = ZNODE_NO_LOCK;
23366 +       }
23367 +}
23368 +
23369 +static void dispatch_lock_requests(znode * node)
23370 +{
23371 +       lock_stack *requestor, *tmp;
23372 +
23373 +       assert_spin_locked(&(node->lock.guard));
23374 +       /* walk the request queue granting each request that can be locked now; stop once @node is write locked */
23375 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors,
23376 +                                requestors_link) {
23377 +               if (znode_is_write_locked(node))
23378 +                       break;
23379 +               if (!can_lock_object(requestor)) {      /* 0 means the request can be granted */
23380 +                       lock_object(requestor);
23381 +                       remove_lock_request(requestor);
23382 +                       requestor->request.ret_code = 0;
23383 +                       reiser4_wake_up(requestor);
23384 +                       requestor->request.mode = ZNODE_NO_LOCK;
23385 +               }
23386 +       }
23387 +}
23388 +
23389 +/* release long-term lock, acquired by longterm_lock_znode(), and drop the znode reference held through @handle */
23390 +void longterm_unlock_znode(lock_handle * handle)
23391 +{
23392 +       znode *node = handle->node;
23393 +       lock_stack *oldowner = handle->owner;
23394 +       int hipri;
23395 +       int readers;
23396 +       int rdelta;
23397 +       int youdie;
23398 +
23399 +       /*
23400 +        * this is time-critical and highly optimized code. Modify carefully.
23401 +        */
23402 +
23403 +       assert("jmacd-1021", handle != NULL);
23404 +       assert("jmacd-1022", handle->owner != NULL);
23405 +       assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23406 +       /* NOTE(review): @handle is already dereferenced by the initializers above, so jmacd-1021 cannot catch a NULL handle */
23407 +       assert("zam-130", oldowner == get_current_lock_stack());
23408 +
23409 +       LOCK_CNT_DEC(long_term_locked_znode);
23410 +
23411 +       /*
23412 +        * to minimize amount of operations performed under lock, pre-compute
23413 +        * all variables used within critical section. This makes code
23414 +        * obscure.
23415 +        */
23416 +
23417 +       /* was this lock of hi or lo priority */
23418 +       hipri = oldowner->curpri ? 1 : 0;
23419 +       /* number of readers */
23420 +       readers = node->lock.nr_readers;        /* only its sign is used below */
23421 +       /* +1 if write lock, -1 if read lock */
23422 +       rdelta = (readers > 0) ? -1 : +1;
23423 +       /* true if node is to die and write lock is released */
23424 +       youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23425 +
23426 +       spin_lock_zlock(&node->lock);
23427 +
23428 +       assert("zam-101", znode_is_locked(node));
23429 +
23430 +       /* Adjust a number of high priority owners of this lock */
23431 +       assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23432 +       node->lock.nr_hipri_owners -= hipri;
23433 +
23434 +       /* Handle znode deallocation on last write-lock release. */
23435 +       if (znode_is_wlocked_once(node)) {
23436 +               if (youdie) {
23437 +                       forget_znode(handle);   /* presumably releases the zlock (cf. reiser4_invalidate_lock()) - TODO confirm */
23438 +                       assert("nikita-2191", znode_invariant(node));
23439 +                       zput(node);
23440 +                       return;
23441 +               }
23442 +       }
23443 +
23444 +       if (handle->signaled)
23445 +               atomic_dec(&oldowner->nr_signaled);
23446 +
23447 +       /* Unlocking means owner<->object link deletion */
23448 +       unlink_object(handle);
23449 +
23450 +       /* This is enough to be sure whether an object is completely
23451 +          unlocked. */
23452 +       node->lock.nr_readers += rdelta;
23453 +
23454 +       /* If the node is locked it must have an owners list.  Likewise, if
23455 +          the node is unlocked it must have an empty owners list. */
23456 +       assert("zam-319", equi(znode_is_locked(node),
23457 +                              !list_empty_careful(&node->lock.owners)));
23458 +
23459 +#if REISER4_DEBUG
23460 +       if (!znode_is_locked(node))
23461 +               ++node->times_locked;
23462 +#endif
23463 +
23464 +       /* If there are pending lock requests we wake up a requestor */
23465 +       if (!znode_is_wlocked(node))
23466 +               dispatch_lock_requests(node);
23467 +       if (check_deadlock_condition(node))
23468 +               wake_up_all_lopri_owners(node);
23469 +       spin_unlock_zlock(&node->lock);
23470 +
23471 +       /* minus one reference from handle->node */
23472 +       assert("nikita-2190", znode_invariant(node));
23473 +       ON_DEBUG(check_lock_data());
23474 +       ON_DEBUG(check_lock_node_data(node));
23475 +       zput(node);
23476 +}
23477 +
23478 +/* final portion of longterm-lock: take the lock if @ok == 0, release the zlock, return @ok */
23479 +static int
23480 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23481 +{
23482 +       znode *node = owner->request.node;
23483 +       /* NOTE(review): @mode is not referenced in this function */
23484 +       assert_spin_locked(&(node->lock.guard));
23485 +
23486 +       /* If we broke with (ok == 0) it means we can_lock, now do it. */
23487 +       if (ok == 0) {
23488 +               lock_object(owner);
23489 +               owner->request.mode = 0;
23490 +               /* count a reference from lockhandle->node
23491 +
23492 +                  znode was already referenced at the entry to this function,
23493 +                  hence taking spin-lock here is not necessary (see comment
23494 +                  in the zref()).
23495 +                */
23496 +               zref(node);
23497 +
23498 +               LOCK_CNT_INC(long_term_locked_znode);
23499 +       }
23500 +       spin_unlock_zlock(&node->lock); /* entered with the zlock held, leaves without it */
23501 +       ON_DEBUG(check_lock_data());
23502 +       ON_DEBUG(check_lock_node_data(node));
23503 +       return ok;
23504 +}
23505 +
23506 +/*
23507 + * version of longterm_lock_znode() optimized for the most common case: read
23508 + * lock without any special flags. This is the kind of lock that any tree
23509 + * traversal takes on the root node of the tree, which is very frequent.
23510 + */
23511 +static int longterm_lock_tryfast(lock_stack * owner)
23512 +{
23513 +       int result;
23514 +       znode *node;
23515 +       zlock *lock;
23516 +       /* returns 1 when the caller has to fall back to the slow path, otherwise the final result (<= 0 expected by the caller) */
23517 +       node = owner->request.node;
23518 +       lock = &node->lock;
23519 +
23520 +       assert("nikita-3340", reiser4_schedulable());
23521 +       assert("nikita-3341", request_is_deadlock_safe(node,
23522 +                                                      ZNODE_READ_LOCK,
23523 +                                                      ZNODE_LOCK_LOPRI));
23524 +       spin_lock_zlock(lock);
23525 +       result = can_lock_object(owner);
23526 +       spin_unlock_zlock(lock);
23527 +
23528 +       if (likely(result != -EINVAL)) {
23529 +               spin_lock_znode(node);
23530 +               result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23531 +               spin_unlock_znode(node);
23532 +               spin_lock_zlock(lock);
23533 +               if (unlikely(result != 0)) {
23534 +                       owner->request.mode = 0;        /* capture failed: lock_tail() only unlocks and returns the error */
23535 +               } else {
23536 +                       result = can_lock_object(owner);        /* re-check: the zlock was dropped around the capture */
23537 +                       if (unlikely(result == -E_REPEAT)) {
23538 +                               /* fall back to longterm_lock_znode() */
23539 +                               spin_unlock_zlock(lock);
23540 +                               return 1;
23541 +                       }
23542 +               }
23543 +               return lock_tail(owner, result, ZNODE_READ_LOCK);
23544 +       } else
23545 +               return 1;
23546 +}
23547 +
23548 +/* locks given lock object */
23549 +int longterm_lock_znode(
23550 +                              /* local link object (allocated by lock owner
23551 +                               * thread, usually on its own stack) */
23552 +                              lock_handle * handle,
23553 +                              /* znode we want to lock. */
23554 +                              znode * node,
23555 +                              /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23556 +                              znode_lock_mode mode,
23557 +                              /* ZNODE_LOCK_{HIPRI,NONBLOCK,DONT_FUSE} flags, or 0. Result is
23558 +                                 one of {0, -EINVAL, -E_DEADLOCK, ...}, see return codes description. */
23559 +                              znode_lock_request request) {
23560 +       int ret;
23561 +       int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23562 +       int non_blocking = 0;
23563 +       int has_atom;
23564 +       txn_capture cap_flags;
23565 +       zlock *lock;
23566 +       txn_handle *txnh;
23567 +       tree_level level;
23568 +
23569 +       /* Get current process context */
23570 +       lock_stack *owner = get_current_lock_stack();
23571 +
23572 +       /* Check that the lock handle is initialized and isn't already being
23573 +        * used. */
23574 +       assert("jmacd-808", handle->owner == NULL);
23575 +       assert("nikita-3026", reiser4_schedulable());
23576 +       assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23577 +       assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23578 +       /* long term locks are not allowed in the VM contexts (->writepage(),
23579 +        * prune_{d,i}cache()).
23580 +        *
23581 +        * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23582 +        * bug caused by d_splice_alias() only working for directories.
23583 +        */
23584 +       assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23585 +       assert("zam-1055", mode != ZNODE_NO_LOCK);
23586 +
23587 +       cap_flags = 0;
23588 +       if (request & ZNODE_LOCK_NONBLOCK) {
23589 +               cap_flags |= TXN_CAPTURE_NONBLOCKING;
23590 +               non_blocking = 1;
23591 +       }
23592 +
23593 +       if (request & ZNODE_LOCK_DONT_FUSE)
23594 +               cap_flags |= TXN_CAPTURE_DONT_FUSE;
23595 +
23596 +       /* If we are changing our process priority we must adjust a number
23597 +          of high priority owners for each znode that we already lock */
23598 +       if (hipri) {
23599 +               set_high_priority(owner);
23600 +       } else {
23601 +               set_low_priority(owner);
23602 +       }
23603 +
23604 +       level = znode_get_level(node);  /* NOTE(review): @level is not used below */
23605 +
23606 +       /* Fill request structure with our values. */
23607 +       owner->request.mode = mode;
23608 +       owner->request.handle = handle;
23609 +       owner->request.node = node;
23610 +
23611 +       txnh = get_current_context()->trans;
23612 +       lock = &node->lock;
23613 +
23614 +       if (mode == ZNODE_READ_LOCK && request == 0) {
23615 +               ret = longterm_lock_tryfast(owner);
23616 +               if (ret <= 0)
23617 +                       return ret;
23618 +       }
23619 +
23620 +       has_atom = (txnh->atom != NULL);
23621 +
23622 +       /* Synchronize on node's zlock guard lock. */
23623 +       spin_lock_zlock(lock);
23624 +
23625 +       if (znode_is_locked(node) &&
23626 +           mode == ZNODE_WRITE_LOCK && recursive(owner))
23627 +               return lock_tail(owner, 0, mode);
23628 +
23629 +       for (;;) {
23630 +               /* Check the lock's availability: if it is unavailable we get
23631 +                  E_REPEAT, 0 indicates "can_lock", otherwise the node is
23632 +                  invalid.  */
23633 +               ret = can_lock_object(owner);
23634 +
23635 +               if (unlikely(ret == -EINVAL)) {
23636 +                       /* @node is dying. Leave it alone. */
23637 +                       break;
23638 +               }
23639 +
23640 +               if (unlikely(ret == -E_REPEAT && non_blocking)) {
23641 +                       /* either locking of @node by the current thread will
23642 +                        * lead to the deadlock, or lock modes are
23643 +                        * incompatible. */
23644 +                       break;
23645 +               }
23646 +
23647 +               assert("nikita-1844", (ret == 0)
23648 +                      || ((ret == -E_REPEAT) && !non_blocking));
23649 +               /* If we can get the lock... Try to capture first before
23650 +                  taking the lock. */
23651 +
23652 +               /* first handle commonest case where node and txnh are already
23653 +                * in the same atom. */
23654 +               /* safe to do without taking locks, because:
23655 +                *
23656 +                * 1. read of aligned word is atomic with respect to writes to
23657 +                * this word
23658 +                *
23659 +                * 2. false negatives are handled in reiser4_try_capture().
23660 +                *
23661 +                * 3. false positives are impossible.
23662 +                *
23663 +                * PROOF: left as an exercise to the curious reader.
23664 +                *
23665 +                * Just kidding. Here is one:
23666 +                *
23667 +                * At the time T0 txnh->atom is stored in txnh_atom.
23668 +                *
23669 +                * At the time T1 node->atom is stored in node_atom.
23670 +                *
23671 +                * At the time T2 we observe that
23672 +                *
23673 +                *     txnh_atom != NULL && node_atom == txnh_atom.
23674 +                *
23675 +                * Imagine that at this moment we acquire node and txnh spin
23676 +                * lock in this order. Suppose that under spin lock we have
23677 +                *
23678 +                *     node->atom != txnh->atom,                       (S1)
23679 +                *
23680 +                * at the time T3.
23681 +                *
23682 +                * txnh->atom != NULL still, because txnh is open by the
23683 +                * current thread.
23684 +                *
23685 +                * Suppose node->atom == NULL, that is, node was un-captured
23686 +                * between T1, and T3. But un-capturing of formatted node is
23687 +                * always preceded by the call to reiser4_invalidate_lock(),
23688 +                * which marks znode as JNODE_IS_DYING under zlock spin
23689 +                * lock. Contradiction, because can_lock_object() above checks
23690 +                * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23691 +                *
23692 +                * Suppose that node->atom != node_atom, that is, atom, node
23693 +                * belongs to was fused into another atom: node_atom was fused
23694 +                * into node->atom. Atom of txnh was equal to node_atom at T2,
23695 +                * which means that under spin lock, txnh->atom == node->atom,
23696 +                * because txnh->atom can only follow fusion
23697 +                * chain. Contradicts S1.
23698 +                *
23699 +                * The same for hypothesis txnh->atom != txnh_atom. Hence,
23700 +                * node->atom == node_atom == txnh_atom == txnh->atom. Again
23701 +                * contradicts S1. Hence S1 is false. QED.
23702 +                *
23703 +                */
23704 +
23705 +               if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23706 +                       ;
23707 +               } else {
23708 +                       /*
23709 +                        * unlock zlock spin lock here. It is possible for
23710 +                        * longterm_unlock_znode() to sneak in here, but there
23711 +                        * is no harm: reiser4_invalidate_lock() will mark znode
23712 +                        * as JNODE_IS_DYING and this will be noted by
23713 +                        * can_lock_object() below.
23714 +                        */
23715 +                       spin_unlock_zlock(lock);
23716 +                       spin_lock_znode(node);
23717 +                       ret = reiser4_try_capture(ZJNODE(node), mode,
23718 +                                                 cap_flags);
23719 +                       spin_unlock_znode(node);
23720 +                       spin_lock_zlock(lock);
23721 +                       if (unlikely(ret != 0)) {
23722 +                               /* In the failure case, the txnmgr releases
23723 +                                  the znode's lock (or in some cases, it was
23724 +                                  released a while ago).  There's no need to
23725 +                                  reacquire it so we should return here,
23726 +                                  avoid releasing the lock. */
23727 +                               owner->request.mode = 0;
23728 +                               break;
23729 +                       }
23730 +
23731 +                       /* Check the lock's availability again -- this is
23732 +                          because under some circumstances the capture code
23733 +                          has to release and reacquire the znode spinlock. */
23734 +                       ret = can_lock_object(owner);
23735 +               }
23736 +
23737 +               /* This time, a return of (ret == 0) means we can lock, so we
23738 +                  should break out of the loop. */
23739 +               if (likely(ret != -E_REPEAT || non_blocking))
23740 +                       break;
23741 +
23742 +               /* Lock is unavailable, we have to wait. */
23743 +               ret = reiser4_prepare_to_sleep(owner);
23744 +               if (unlikely(ret != 0))
23745 +                       break;
23746 +
23747 +               assert_spin_locked(&(node->lock.guard));
23748 +               if (hipri) {
23749 +                       /* If we are going in high priority direction then
23750 +                          increase high priority requests counter for the
23751 +                          node */
23752 +                       lock->nr_hipri_requests++;
23753 +                       if (mode == ZNODE_WRITE_LOCK)
23754 +                               lock->nr_hipri_write_requests++;
23755 +                       /* If there are no high priority owners for a node,
23756 +                          then immediately wake up low priority owners, so
23757 +                          they can detect possible deadlock */
23758 +                       if (lock->nr_hipri_owners == 0)
23759 +                               wake_up_all_lopri_owners(node);
23760 +               }
23761 +               list_add_tail(&owner->requestors_link, &lock->requestors);
23762 +
23763 +               /* Ok, here we have prepared a lock request, so unlock
23764 +                  a znode ... */
23765 +               spin_unlock_zlock(lock);
23766 +               /* ... and sleep */
23767 +               reiser4_go_to_sleep(owner);
23768 +               if (owner->request.mode == ZNODE_NO_LOCK)
23769 +                       goto request_is_done;
23770 +               spin_lock_zlock(lock);
23771 +               if (owner->request.mode == ZNODE_NO_LOCK) {
23772 +                       spin_unlock_zlock(lock);
23773 +request_is_done:
23774 +                       if (owner->request.ret_code == 0) {
23775 +                               LOCK_CNT_INC(long_term_locked_znode);
23776 +                               zref(node);
23777 +                       }
23778 +                       return owner->request.ret_code;
23779 +               }
23780 +               remove_lock_request(owner);
23781 +       }
23782 +
23783 +       return lock_tail(owner, ret, mode);
23784 +}
23785 +
23786 +/* lock object invalidation means changing of lock object state to `INVALID'
23787 +   and waiting for all other processes to cancel their lock requests. */
23788 +void reiser4_invalidate_lock(lock_handle * handle      /* path to lock
23789 +                                                        * owner and lock
23790 +                                                        * object is being
23791 +                                                        * invalidated. */ )
23792 +{
23793 +       znode *node = handle->node;
23794 +       lock_stack *owner = handle->owner;
23795 +
23796 +       assert("zam-325", owner == get_current_lock_stack());
23797 +       assert("zam-103", znode_is_write_locked(node));
23798 +       assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23799 +       assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23800 +       assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23801 +       assert("nikita-3097", znode_is_wlocked_once(node));
23802 +       assert_spin_locked(&(node->lock.guard));
23803 +
23804 +       if (handle->signaled)
23805 +               atomic_dec(&owner->nr_signaled);
23806 +
23807 +       ZF_SET(node, JNODE_IS_DYING);
23808 +       unlink_object(handle);
23809 +       node->lock.nr_readers = 0;      /* the single write lock is gone */
23810 +
23811 +       invalidate_all_lock_requests(node);     /* queued requestors get -EINVAL */
23812 +       spin_unlock_zlock(&node->lock); /* entered with the zlock held, leaves without it */
23813 +}
23814 +
23815 +/* Initializes lock_stack; ->nr_signaled, ->wakeup and ->request are not touched here. */
23816 +void init_lock_stack(lock_stack * owner        /* pointer to
23817 +                                        * allocated
23818 +                                        * structure. */ )
23819 +{
23820 +       INIT_LIST_HEAD(&owner->locks);
23821 +       INIT_LIST_HEAD(&owner->requestors_link);
23822 +       spin_lock_init(&owner->sguard);
23823 +       owner->curpri = 1;      /* non-zero curpri is treated as high priority */
23824 +       init_waitqueue_head(&owner->wait);
23825 +}
23826 +
23827 +/* Initializes lock object. */
23828 +void reiser4_init_lock(zlock * lock    /* pointer to allocated
23829 +                                        * uninitialized lock object
23830 +                                        * structure. */ )
23831 +{
23832 +       memset(lock, 0, sizeof(zlock)); /* nr_readers == 0: unlocked; all counters zero */
23833 +       spin_lock_init(&lock->guard);
23834 +       INIT_LIST_HEAD(&lock->requestors);
23835 +       INIT_LIST_HEAD(&lock->owners);
23836 +}
23837 +
23838 +/* Transfer a lock handle (presumably so that variables can be moved between
23839 +   stack and heap locations). */
23840 +static void
23841 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23842 +{
23843 +       znode *node = old->node;
23844 +       lock_stack *owner = old->owner;
23845 +       int signaled;
23846 +       /* @unlink_old != 0: @new replaces @old; 0: @new becomes one more handle on the same node */
23847 +       /* locks_list, modified by link_object() is not protected by
23848 +          anything. This is valid because only current thread ever modifies
23849 +          locks_list of its lock_stack.
23850 +        */
23851 +       assert("nikita-1827", owner == get_current_lock_stack());
23852 +       assert("nikita-1831", new->owner == NULL);
23853 +
23854 +       spin_lock_zlock(&node->lock);
23855 +
23856 +       signaled = old->signaled;
23857 +       if (unlink_old) {
23858 +               unlink_object(old);
23859 +       } else {
23860 +               if (node->lock.nr_readers > 0) {        /* read locked: one more reader */
23861 +                       node->lock.nr_readers += 1;
23862 +               } else {                                /* otherwise: one more recursive write lock */
23863 +                       node->lock.nr_readers -= 1;
23864 +               }
23865 +               if (signaled)
23866 +                       atomic_inc(&owner->nr_signaled);
23867 +               if (owner->curpri)
23868 +                       node->lock.nr_hipri_owners += 1;
23869 +               LOCK_CNT_INC(long_term_locked_znode);
23870 +
23871 +               zref(node);     /* the additional handle holds its own znode reference */
23872 +       }
23873 +       link_object(new, owner, node);
23874 +       new->signaled = signaled;
23875 +
23876 +       spin_unlock_zlock(&node->lock);
23877 +}
23878 +
23879 +void move_lh(lock_handle * new, lock_handle * old)
23880 +{
23881 +       move_lh_internal(new, old, /*unlink_old */ 1);  /* @new takes over @old's lock; @old is unlinked */
23882 +}
23883 +
23884 +void copy_lh(lock_handle * new, lock_handle * old)
23885 +{
23886 +       move_lh_internal(new, old, /*unlink_old */ 0);  /* @old stays linked; @new is an extra handle on the same node */
23887 +}
23888 +
23889 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false
23890 + */
23891 +int reiser4_check_deadlock(void)
23892 +{
23893 +       lock_stack *owner = get_current_lock_stack();
23894 +       return atomic_read(&owner->nr_signaled) != 0;   /* true while the current lock stack still has signaled handles */
23895 +}
23896 +
23897 +/* Before going to sleep we re-check "release lock" requests which might come
23898 +   from threads with hi-pri lock priorities. */
23899 +int reiser4_prepare_to_sleep(lock_stack * owner)
23900 +{
23901 +       assert("nikita-1847", owner == get_current_lock_stack());
23902 +       /* returns 0 if the caller may go to sleep, -E_DEADLOCK otherwise */
23903 +       /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23904 +        * counted in nr_signaled */
23905 +       if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23906 +               assert("zam-959", !owner->curpri);      /* only low priority owners are expected to be signaled */
23907 +               return RETERR(-E_DEADLOCK);
23908 +       }
23909 +       return 0;
23910 +}
23911 +
23912 +/* Wakes up a single thread */
23913 +void __reiser4_wake_up(lock_stack * owner)
23914 +{
23915 +       atomic_set(&owner->wakeup, 1);  /* the condition reiser4_go_to_sleep() waits for */
23916 +       wake_up(&owner->wait);
23917 +}
23918 +
23919 +/* Puts a thread to sleep */
23920 +void reiser4_go_to_sleep(lock_stack * owner)
23921 +{
23922 +       /* Well, we might sleep here, so holding of any spinlocks is no-no */
23923 +       assert("nikita-3027", reiser4_schedulable());
23924 +       /* block until __reiser4_wake_up() sets ->wakeup, then clear the flag again */
23925 +       wait_event(owner->wait, atomic_read(&owner->wakeup));
23926 +       atomic_set(&owner->wakeup, 0);
23927 +}
23928 +
23929 +int lock_stack_isclean(lock_stack * owner)
23930 +{
23931 +       if (list_empty_careful(&owner->locks)) {        /* returns 1 iff @owner's ->locks list is empty */
23932 +               assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23933 +               return 1;
23934 +       }
23935 +       /* at least one lock handle is still linked to @owner */
23936 +       return 0;
23937 +}
23938 +
23939 +#if REISER4_DEBUG
23940 +
23941 +/*
23942 + * debugging functions
23943 + */
23944 +
23945 +static void list_check(struct list_head *head)
23946 +{
23947 +       struct list_head *pos;
23948 +       /* assert that every element on @head has consistent, non-NULL prev/next links */
23949 +       list_for_each(pos, head)
23950 +               assert("", (pos->prev != NULL && pos->next != NULL &&
23951 +                           pos->prev->next == pos && pos->next->prev == pos));
23952 +}
23953 +
23954 +/* check consistency of locking data-structures hanging off the @stack */
23955 +static void check_lock_stack(lock_stack * stack)
23956 +{
23957 +       spin_lock_stack(stack);
23958 +       /* check that stack->locks is not corrupted */
23959 +       list_check(&stack->locks);
23960 +       spin_unlock_stack(stack);
23961 +}
23962 +
23963 +/* check consistency of the lock stack embedded in the current reiser4 context */
23964 +void check_lock_data(void)
23965 +{
23966 +       check_lock_stack(&get_current_context()->stack);
23967 +}
23968 +
23969 +/* check consistency of @node's owners and requestors lists, under its zlock */
23970 +void check_lock_node_data(znode * node)
23971 +{
23972 +       spin_lock_zlock(&node->lock);
23973 +       list_check(&node->lock.owners);
23974 +       list_check(&node->lock.requestors);
23975 +       spin_unlock_zlock(&node->lock);
23976 +}
23977 +
23978 +/* check that given lock request is deadlock safe. This check is, of course,
23979 + * not exhaustive. */
23980 +static int
23981 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23982 +                        znode_lock_request request)
23983 +{
23984 +       lock_stack *owner;
23985 +       /* returns 1 if the request passes the check, 0 if not; @mode is not consulted */
23986 +       owner = get_current_lock_stack();
23987 +       /*
23988 +        * check that hipri lock request is not issued when there are locked
23989 +        * nodes at the higher levels.
23990 +        */
23991 +       if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23992 +           znode_get_level(node) != 0) {
23993 +               lock_handle *item;
23994 +
23995 +               list_for_each_entry(item, &owner->locks, locks_link) {
23996 +                       znode *other;
23997 +
23998 +                       other = item->node;
23999 +                       /* already held nodes at level 0 are exempt from the check */
24000 +                       if (znode_get_level(other) == 0)
24001 +                               continue;
24002 +                       if (znode_get_level(other) > znode_get_level(node))
24003 +                               return 0;
24004 +               }
24005 +       }
24006 +       return 1;
24007 +}
24008 +
24009 +#endif
24010 +
24011 +/* return pointer to static storage with name of lock_mode. For
24012 +    debugging */
24013 +const char *lock_mode_name(znode_lock_mode lock/* lock mode to get name of */)
24014 +{
24015 +       if (lock == ZNODE_READ_LOCK)
24016 +               return "read";
24017 +       else if (lock == ZNODE_WRITE_LOCK)
24018 +               return "write";
24019 +       else {
24020 +               static char buf[30];
24021 +               /* one shared buffer: the text is overwritten by the next call that takes this branch */
24022 +               sprintf(buf, "unknown: %i", lock);
24023 +               return buf;
24024 +       }
24025 +}
24026 +
24027 +/* Make Linus happy.
24028 +   Local variables:
24029 +   c-indentation-style: "K&R"
24030 +   mode-name: "LC"
24031 +   c-basic-offset: 8
24032 +   tab-width: 8
24033 +   fill-column: 79
24034 +   End:
24035 +*/
24036 diff -urN linux-2.6.35.orig/fs/reiser4/lock.h linux-2.6.35/fs/reiser4/lock.h
24037 --- linux-2.6.35.orig/fs/reiser4/lock.h 1970-01-01 01:00:00.000000000 +0100
24038 +++ linux-2.6.35/fs/reiser4/lock.h      2010-08-04 15:44:57.000000000 +0200
24039 @@ -0,0 +1,250 @@
24040 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24041 + * reiser4/README */
24042 +
24043 +/* Long term locking data structures. See lock.c for details. */
24044 +
24045 +#ifndef __LOCK_H__
24046 +#define __LOCK_H__
24047 +
24048 +#include "forward.h"
24049 +#include "debug.h"
24050 +#include "dformat.h"
24051 +#include "key.h"
24052 +#include "coord.h"
24053 +#include "plugin/node/node.h"
24054 +#include "txnmgr.h"
24055 +#include "readahead.h"
24056 +
24057 +#include <linux/types.h>
24058 +#include <linux/spinlock.h>
24059 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
24060 +#include <asm/atomic.h>
24061 +#include <linux/wait.h>
24062 +
24063 +/* Per-znode lock object */
24064 +struct zlock {
24065 +       spinlock_t guard;       /* the "zlock spin lock", taken by spin_lock_zlock() */
24066 +       /* The number of readers if positive; the number of recursively taken
24067 +          write locks if negative. Protected by zlock spin lock. */
24068 +       int nr_readers;
24069 +       /* A number of processes (lock_stacks) that have this object
24070 +          locked with high priority */
24071 +       unsigned nr_hipri_owners;
24072 +       /* A number of attempts to lock znode in high priority direction */
24073 +       unsigned nr_hipri_requests;
24074 +       /* The part of nr_hipri_requests that asks for a write lock
24075 +          (both are adjusted together in lock.c) */
24076 +       unsigned nr_hipri_write_requests;
24077 +       struct list_head owners;        /* lock_handle objects of all lock_stacks which have this lock object locked */
24078 +       /* A linked list of lock_stacks that wait for this lock */
24079 +       struct list_head requestors;
24080 +};
24081 +
24082 +static inline void spin_lock_zlock(zlock *lock)
24083 +{
24084 +       /* check that zlock is not locked */
24085 +       assert("", LOCK_CNT_NIL(spin_locked_zlock));
24086 +       /* check that spinlocks of lower priorities are not held */
24087 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
24088 +       /* take the guard, then account for it in the debugging lock counters */
24089 +       spin_lock(&lock->guard);
24090 +
24091 +       LOCK_CNT_INC(spin_locked_zlock);
24092 +       LOCK_CNT_INC(spin_locked);
24093 +}
24094 +
24095 +static inline void spin_unlock_zlock(zlock *lock)
24096 +{
24097 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24098 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24099 +       /* undo the counter updates of spin_lock_zlock() before releasing the guard */
24100 +       LOCK_CNT_DEC(spin_locked_zlock);
24101 +       LOCK_CNT_DEC(spin_locked);
24102 +
24103 +       spin_unlock(&lock->guard);
24104 +}
24105 +
24106 +#define lock_is_locked(lock)          ((lock)->nr_readers != 0) /* held in either mode */
24107 +#define lock_is_rlocked(lock)         ((lock)->nr_readers > 0)  /* held by reader(s) */
24108 +#define lock_is_wlocked(lock)         ((lock)->nr_readers < 0)  /* write locked */
24109 +#define lock_is_wlocked_once(lock)    ((lock)->nr_readers == -1) /* write locked exactly once */
24110 +#define lock_can_be_rlocked(lock)     ((lock)->nr_readers >= 0) /* not write locked */
24111 +#define lock_mode_compatible(lock, mode)                               \
24112 +             (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24113 +             ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
24114 +
24115 +/* Since we have R/W znode locks we need additional bidirectional `link'
24116 +   objects to implement n<->m relationship between lock owners and lock
24117 +   objects. We call them `lock handles'.
24118 +
24119 +   Locking: see lock.c/"SHORT-TERM LOCKING"
24120 +*/
24121 +struct lock_handle {
24122 +       /* This flag indicates that a signal to yield a lock was passed to
24123 +          lock owner and counted in owner->nr_signaled
24124 +
24125 +          Locking: this is accessed under the zlock spin lock of ->node.
24126 +        */
24127 +       int signaled;
24128 +       /* A link to owner of a lock */
24129 +       lock_stack *owner;
24130 +       /* A link to znode locked */
24131 +       znode *node;
24132 +       /* A list of all locks for a process */
24133 +       struct list_head locks_link;
24134 +       /* A list of all owners for a znode */
24135 +       struct list_head owners_link;
24136 +};
24137 +
24138 +struct lock_request {
24139 +       /* A pointer to uninitialized link object */
24140 +       lock_handle *handle;
24141 +       /* A pointer to the object we want to lock */
24142 +       znode *node;
24143 +       /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24144 +       znode_lock_mode mode;
24145 +       /* how dispatch_lock_requests() returns lock request result code */
24146 +       int ret_code;
24147 +};
24148 +
24149 +/* A lock stack structure for accumulating locks owned by a process */
24150 +struct lock_stack {
24151 +       /* A guard lock protecting a lock stack */
24152 +       spinlock_t sguard;
24153 +       /* number of znodes which were requested by high priority processes */
24154 +       atomic_t nr_signaled;
24155 +       /* Current priority of a process
24156 +
24157 +          This is only accessed by the current thread and thus requires no
24158 +          locking.
24159 +        */
24160 +       int curpri;
24161 +       /* A list of all locks owned by this process. Elements can be added to
24162 +        * this list only by the current thread. ->node pointers in this list
24163 +        * can be only changed by the current thread. */
24164 +       struct list_head locks;
24165 +       /* When lock_stack waits for the lock, it puts itself on double-linked
24166 +          requestors list of that lock */
24167 +       struct list_head requestors_link;
24168 +       /* Current lock request info.
24169 +
24170 +          This is only accessed by the current thread and thus requires no
24171 +          locking.
24172 +        */
24173 +       struct lock_request request;
24174 +       /* the following two fields are the lock stack's
24175 +        * synchronization object to use with the standard linux/wait.h
24176 +        * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
24177 +        * usage details. */
24178 +       wait_queue_head_t wait;
24179 +       atomic_t wakeup;
24180 +#if REISER4_DEBUG
24181 +       int nr_locks;           /* number of lock handles in the above list */
24182 +#endif
24183 +};
24184 +
24185 +/*
24186 +  User-visible znode locking functions
24187 +*/
24188 +
24189 +extern int longterm_lock_znode(lock_handle * handle,
24190 +                              znode * node,
24191 +                              znode_lock_mode mode,
24192 +                              znode_lock_request request);
24193 +
24194 +extern void longterm_unlock_znode(lock_handle * handle);
24195 +
24196 +extern int reiser4_check_deadlock(void);
24197 +
24198 +extern lock_stack *get_current_lock_stack(void);
24199 +
24200 +extern void init_lock_stack(lock_stack * owner);
24201 +extern void reiser4_init_lock(zlock * lock);
24202 +
24203 +static inline void init_lh(lock_handle *lh)
24204 +{
24205 +#if REISER4_DEBUG
24206 +       memset(lh, 0, sizeof *lh);
24207 +       INIT_LIST_HEAD(&lh->locks_link);
24208 +       INIT_LIST_HEAD(&lh->owners_link);
24209 +#else
24210 +       lh->node = NULL;
24211 +#endif
24212 +}
24213 +
24214 +static inline  void done_lh(lock_handle *lh)
24215 +{
24216 +       assert("zam-342", lh != NULL);
24217 +       if (lh->node != NULL)
24218 +               longterm_unlock_znode(lh);
24219 +}
24220 +
24221 +extern void move_lh(lock_handle * new, lock_handle * old);
24222 +extern void copy_lh(lock_handle * new, lock_handle * old);
24223 +
24224 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
24225 +extern void reiser4_go_to_sleep(lock_stack * owner);
24226 +extern void __reiser4_wake_up(lock_stack * owner);
24227 +
24228 +extern int lock_stack_isclean(lock_stack * owner);
24229 +
24230 +/* zlock object state check macros: only used in assertions. Both forms imply
24231 +   that the lock is held by the current thread. */
24232 +extern int znode_is_write_locked(const znode *);
24233 +extern void reiser4_invalidate_lock(lock_handle *);
24234 +
24235 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24236 +#define spin_ordering_pred_stack(stack)                        \
24237 +       (LOCK_CNT_NIL(spin_locked_stack) &&             \
24238 +        LOCK_CNT_NIL(spin_locked_txnmgr) &&            \
24239 +        LOCK_CNT_NIL(spin_locked_inode) &&             \
24240 +        LOCK_CNT_NIL(rw_locked_cbk_cache) &&           \
24241 +        LOCK_CNT_NIL(spin_locked_super_eflush))
24242 +
24243 +static inline void spin_lock_stack(lock_stack *stack)
24244 +{
24245 +       assert("", spin_ordering_pred_stack(stack));
24246 +       spin_lock(&(stack->sguard));
24247 +       LOCK_CNT_INC(spin_locked_stack);
24248 +       LOCK_CNT_INC(spin_locked);
24249 +}
24250 +
24251 +static inline void spin_unlock_stack(lock_stack *stack)
24252 +{
24253 +       assert_spin_locked(&(stack->sguard));
24254 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24255 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24256 +       LOCK_CNT_DEC(spin_locked_stack);
24257 +       LOCK_CNT_DEC(spin_locked);
24258 +       spin_unlock(&(stack->sguard));
24259 +}
24260 +
24261 +static inline void reiser4_wake_up(lock_stack * owner)
24262 +{
24263 +       spin_lock_stack(owner);
24264 +       __reiser4_wake_up(owner);
24265 +       spin_unlock_stack(owner);
24266 +}
24267 +
24268 +const char *lock_mode_name(znode_lock_mode lock);
24269 +
24270 +#if REISER4_DEBUG
24271 +extern void check_lock_data(void);
24272 +extern void check_lock_node_data(znode * node);
24273 +#else
24274 +#define check_lock_data() noop
24275 +#define check_lock_node_data(node) noop /* same arity as the debug prototype */
24276 +#endif
24277 +
24278 +/* __LOCK_H__ */
24279 +#endif
24280 +
24281 +/* Make Linus happy.
24282 +   Local variables:
24283 +   c-indentation-style: "K&R"
24284 +   mode-name: "LC"
24285 +   c-basic-offset: 8
24286 +   tab-width: 8
24287 +   fill-column: 120
24288 +   End:
24289 +*/
24290 diff -urN linux-2.6.35.orig/fs/reiser4/Makefile linux-2.6.35/fs/reiser4/Makefile
24291 --- linux-2.6.35.orig/fs/reiser4/Makefile       1970-01-01 01:00:00.000000000 +0100
24292 +++ linux-2.6.35/fs/reiser4/Makefile    2010-08-04 15:44:57.000000000 +0200
24293 @@ -0,0 +1,98 @@
24294 +#
24295 +# reiser4/Makefile
24296 +#
24297 +
24298 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24299 +
24300 +reiser4-y := \
24301 +                  debug.o \
24302 +                  jnode.o \
24303 +                  znode.o \
24304 +                  key.o \
24305 +                  pool.o \
24306 +                  tree_mod.o \
24307 +                  estimate.o \
24308 +                  carry.o \
24309 +                  carry_ops.o \
24310 +                  lock.o \
24311 +                  tree.o \
24312 +                  context.o \
24313 +                  tap.o \
24314 +                  coord.o \
24315 +                  block_alloc.o \
24316 +                  txnmgr.o \
24317 +                  kassign.o \
24318 +                  flush.o \
24319 +                  wander.o \
24320 +                  eottl.o \
24321 +                  search.o \
24322 +                  page_cache.o \
24323 +                  seal.o \
24324 +                  dscale.o \
24325 +                  flush_queue.o \
24326 +                  ktxnmgrd.o \
24327 +                  blocknrset.o \
24328 +                  super.o \
24329 +                  super_ops.o \
24330 +                  fsdata.o \
24331 +                  export_ops.o \
24332 +                  oid.o \
24333 +                  tree_walk.o \
24334 +                  inode.o \
24335 +                  vfs_ops.o \
24336 +                  as_ops.o \
24337 +                  entd.o\
24338 +                  readahead.o \
24339 +                  status_flags.o \
24340 +                  init_super.o \
24341 +                  safe_link.o \
24342 +           \
24343 +                  plugin/plugin.o \
24344 +                  plugin/plugin_set.o \
24345 +                  plugin/node/node.o \
24346 +                  plugin/object.o \
24347 +                  plugin/cluster.o \
24348 +                  plugin/inode_ops.o \
24349 +                  plugin/inode_ops_rename.o \
24350 +                  plugin/file_ops.o \
24351 +                  plugin/file_ops_readdir.o \
24352 +                  plugin/file_plugin_common.o \
24353 +                  plugin/file/file.o \
24354 +                  plugin/file/tail_conversion.o \
24355 +                  plugin/file/file_conversion.o \
24356 +                  plugin/file/symlink.o \
24357 +                  plugin/file/cryptcompress.o \
24358 +                  plugin/dir_plugin_common.o \
24359 +                  plugin/dir/hashed_dir.o \
24360 +                  plugin/dir/seekable_dir.o \
24361 +                  plugin/node/node40.o \
24362 +           \
24363 +                  plugin/crypto/cipher.o \
24364 +                  plugin/crypto/digest.o \
24365 +           \
24366 +                  plugin/compress/compress.o \
24367 +                  plugin/compress/compress_mode.o \
24368 +           \
24369 +                  plugin/item/static_stat.o \
24370 +                  plugin/item/sde.o \
24371 +                  plugin/item/cde.o \
24372 +                  plugin/item/blackbox.o \
24373 +                  plugin/item/internal.o \
24374 +                  plugin/item/tail.o \
24375 +                  plugin/item/ctail.o \
24376 +                  plugin/item/extent.o \
24377 +                  plugin/item/extent_item_ops.o \
24378 +                  plugin/item/extent_file_ops.o \
24379 +                  plugin/item/extent_flush_ops.o \
24380 +           \
24381 +                  plugin/hash.o \
24382 +                  plugin/fibration.o \
24383 +                  plugin/tail_policy.o \
24384 +                  plugin/item/item.o \
24385 +           \
24386 +                  plugin/security/perm.o \
24387 +                  plugin/space/bitmap.o \
24388 +           \
24389 +                  plugin/disk_format/disk_format40.o \
24390 +                  plugin/disk_format/disk_format.o
24391 +
24392 diff -urN linux-2.6.35.orig/fs/reiser4/oid.c linux-2.6.35/fs/reiser4/oid.c
24393 --- linux-2.6.35.orig/fs/reiser4/oid.c  1970-01-01 01:00:00.000000000 +0100
24394 +++ linux-2.6.35/fs/reiser4/oid.c       2010-08-04 15:44:57.000000000 +0200
24395 @@ -0,0 +1,141 @@
24396 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24397 +
24398 +#include "debug.h"
24399 +#include "super.h"
24400 +#include "txnmgr.h"
24401 +
24402 +/* we used to have oid allocation plugin. It was removed because it
24403 +   was recognized as providing unneeded level of abstraction. If one
24404 +   ever will find it useful - look at yet_unneeded_abstractions/oid
24405 +*/
24406 +
24407 +/*
24408 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24409 + * are provided by disk format plugin that reads them from the disk during
24410 + * mount.
24411 + */
24412 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24413 +{
24414 +       reiser4_super_info_data *sbinfo;
24415 +
24416 +       sbinfo = get_super_private(super);
24417 +
24418 +       sbinfo->next_to_use = next;
24419 +       sbinfo->oids_in_use = nr_files;
24420 +       return 0;
24421 +}
24422 +
24423 +/*
24424 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24425 + * runs out of oids.
24426 + */
24427 +oid_t oid_allocate(struct super_block *super)
24428 +{
24429 +       reiser4_super_info_data *sbinfo;
24430 +       oid_t oid;
24431 +
24432 +       sbinfo = get_super_private(super);
24433 +
24434 +       spin_lock_reiser4_super(sbinfo);
24435 +       if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24436 +               oid = sbinfo->next_to_use++;
24437 +               sbinfo->oids_in_use++;
24438 +       } else
24439 +               oid = ABSOLUTE_MAX_OID;
24440 +       spin_unlock_reiser4_super(sbinfo);
24441 +       return oid;
24442 +}
24443 +
24444 +/*
24445 + * Tell oid allocator that @oid is now free.
24446 + */
24447 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24448 +{
24449 +       reiser4_super_info_data *sbinfo;
24450 +
24451 +       sbinfo = get_super_private(super);
24452 +
24453 +       spin_lock_reiser4_super(sbinfo);
24454 +       sbinfo->oids_in_use--;
24455 +       spin_unlock_reiser4_super(sbinfo);
24456 +       return 0;
24457 +}
24458 +
24459 +/*
24460 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24461 + * without actually allocating it. This is used by disk format plugin to save
24462 + * oid allocator state on the disk.
24463 + */
24464 +oid_t oid_next(const struct super_block *super)
24465 +{
24466 +       reiser4_super_info_data *sbinfo;
24467 +       oid_t oid;
24468 +
24469 +       sbinfo = get_super_private(super);
24470 +
24471 +       spin_lock_reiser4_super(sbinfo);
24472 +       oid = sbinfo->next_to_use;
24473 +       spin_unlock_reiser4_super(sbinfo);
24474 +       return oid;
24475 +}
24476 +
24477 +/*
24478 + * returns number of currently used oids, or -1 if it does not fit into long.
24479 + * This is used by statfs(2) to report number of "inodes" and by disk format
24480 + * plugin to save oid allocator state on the disk.
24481 + */
24482 +long oids_used(const struct super_block *super)
24483 +{
24484 +       reiser4_super_info_data *sbinfo;
24485 +       oid_t used;
24486 +
24487 +       sbinfo = get_super_private(super);
24488 +
24489 +       spin_lock_reiser4_super(sbinfo);
24490 +       used = sbinfo->oids_in_use;
24491 +       spin_unlock_reiser4_super(sbinfo);
24492 +       if (used < (__u64) (~0UL >> 1)) /* largest long, also on 32-bit */
24493 +               return (long)used;
24494 +       else
24495 +               return (long)-1;
24496 +}
24497 +
24498 +/*
24499 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24500 + * at the point when we are irrevocably committed to creation of the new file
24501 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24502 + * error).
24503 + */
24504 +void oid_count_allocated(void)
24505 +{
24506 +       txn_atom *atom;
24507 +
24508 +       atom = get_current_atom_locked();
24509 +       atom->nr_objects_created++;
24510 +       spin_unlock_atom(atom);
24511 +}
24512 +
24513 +/*
24514 + * Count oid as free in atom. This is done after call to oid_release() at the
24515 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24516 + * when oid release cannot be any longer rolled back due to some error).
24517 + */
24518 +void oid_count_released(void)
24519 +{
24520 +       txn_atom *atom;
24521 +
24522 +       atom = get_current_atom_locked();
24523 +       atom->nr_objects_deleted++;
24524 +       spin_unlock_atom(atom);
24525 +}
24526 +
24527 +/*
24528 +   Local variables:
24529 +   c-indentation-style: "K&R"
24530 +   mode-name: "LC"
24531 +   c-basic-offset: 8
24532 +   tab-width: 8
24533 +   fill-column: 120
24534 +   scroll-step: 1
24535 +   End:
24536 +*/
24537 diff -urN linux-2.6.35.orig/fs/reiser4/page_cache.c linux-2.6.35/fs/reiser4/page_cache.c
24538 --- linux-2.6.35.orig/fs/reiser4/page_cache.c   1970-01-01 01:00:00.000000000 +0100
24539 +++ linux-2.6.35/fs/reiser4/page_cache.c        2010-08-04 16:59:56.000000000 +0200
24540 @@ -0,0 +1,690 @@
24541 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24542 + * reiser4/README */
24543 +
24544 +/* Memory pressure hooks. Fake inodes handling. */
24545 +
24546 +/*   GLOSSARY
24547 +
24548 +   . Formatted and unformatted nodes.
24549 +     Elements of reiser4 balanced tree to store data and metadata.
24550 +     Unformatted nodes are pointed to by extent pointers. Such nodes
24551 +     are used to store data of large objects. Unlike unformatted nodes,
24552 +     formatted ones have associated format described by node4X plugin.
24553 +
24554 +   . Jnode (or journal node)
24555 +     The in-memory header which is used to track formatted and unformatted
24556 +     nodes, bitmap nodes, etc. In particular, jnodes are used to track
24557 +     transactional information associated with each block(see reiser4/jnode.c
24558 +     for details).
24559 +
24560 +   . Znode
24561 +     The in-memory header which is used to track formatted nodes. Contains
24562 +     embedded jnode (see reiser4/znode.c for details).
24563 +*/
24564 +
24565 +/* We store all file system meta data (and data, of course) in the page cache.
24566 +
24567 +   What does this mean? Instead of using bread/brelse we create special
24568 +   "fake" inode (one per super block) and store content of formatted nodes
24569 +   into pages bound to this inode in the page cache. In newer kernels bread()
24570 +   already uses inode attached to block device (bd_inode). Advantage of having
24571 +   our own fake inode is that we can install appropriate methods in its
24572 +   address_space operations. Such methods are called by VM on memory pressure
24573 +   (or during background page flushing) and we can use them to react
24574 +   appropriately.
24575 +
24576 +   In initial version we only support one block per page. Support for multiple
24577 +   blocks per page is complicated by relocation.
24578 +
24579 +   To each page, used by reiser4, jnode is attached. jnode is analogous to
24580 +   buffer head. Difference is that jnode is bound to the page permanently:
24581 +   jnode cannot be removed from memory until its backing page is.
24582 +
24583 +   jnode contains pointer to page (->pg field) and page contains pointer to
24584 +   jnode in ->private field. Pointer from jnode to page is protected by
24585 +   jnode's spinlock and pointer from page to jnode is protected by page lock
24586 +   (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24587 +   lock. To go into reverse direction use jnode_lock_page() function that uses
24588 +   standard try-lock-and-release device.
24589 +
24590 +   Properties:
24591 +
24592 +   1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24593 +   reference counter is increased.
24594 +
24595 +   2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24596 +   reference counter is decreased.
24597 +
24598 +   3. on jload() reference counter on jnode page is increased, page is
24599 +   kmapped and `referenced'.
24600 +
24601 +   4. on jrelse() inverse operations are performed.
24602 +
24603 +   5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24604 +
24605 +   DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24606 +   historically.]
24607 +
24608 +   [In the following discussion, `lock' invariably means long term lock on
24609 +   znode.] (What about page locks?)
24610 +
24611 +   There is some special class of deadlock possibilities related to memory
24612 +   pressure. Locks acquired by other reiser4 threads are accounted for in
24613 +   deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24614 +   invoked additional hidden arc is added to the locking graph: thread that
24615 +   tries to allocate memory waits for ->vm_writeback() to finish. If this
24616 +   thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24617 +   prevention is useless.
24618 +
24619 +   Another related problem is possibility for ->vm_writeback() to run out of
24620 +   memory itself. This is not a problem for ext2 and friends, because their
24621 +   ->vm_writeback() don't allocate much memory, but reiser4 flush is
24622 +   definitely able to allocate huge amounts of memory.
24623 +
24624 +   It seems that there is no reliable way to cope with the problems above.
24625 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24626 +   context) wouldn't perform any flushing itself, but rather should just wake
24627 +   up some auxiliary thread dedicated for this purpose (or, the same thread
24628 +   that does periodic commit of old atoms (ktxnmgrd.c)).
24629 +
24630 +   Details:
24631 +
24632 +   1. Page is called `reclaimable' against particular reiser4 mount F if this
24633 +   page can be ultimately released by try_to_free_pages() under presumptions
24634 +   that:
24635 +
24636 +    a. ->vm_writeback() for F is no-op, and
24637 +
24638 +    b. none of the threads accessing F are making any progress, and
24639 +
24640 +    c. other reiser4 mounts obey the same memory reservation protocol as F
24641 +    (described below).
24642 +
24643 +   For example, clean un-pinned page, or page occupied by ext2 data are
24644 +   reclaimable against any reiser4 mount.
24645 +
24646 +   When there is more than one reiser4 mount in a system, condition (c) makes
24647 +   reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24648 +
24649 +   THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24650 +
24651 +   Fake inode is used to bound formatted nodes and each node is indexed within
24652 +   fake inode by its block number. If block size is smaller than page size, it
24653 +   may so happen that block mapped to the page with formatted node is occupied
24654 +   by unformatted node or is unallocated. This leads to some complications,
24655 +   because flushing whole page can lead to an incorrect overwrite of
24656 +   unformatted node that, moreover, can be cached in some other place as
24657 +   part of the file body. To avoid this, buffers for unformatted nodes are
24658 +   never marked dirty. Also pages in the fake inode are never marked dirty.
24659 +   This rules out usage of ->writepage() as memory pressure hook. Instead
24660 +   ->releasepage() is used.
24661 +
24662 +   Josh is concerned that page->buffer is going to die. This should not pose
24663 +   significant problem though, because we need to add some data structures to
24664 +   the page anyway (jnode) and all necessary book keeping can be put there.
24665 +
24666 +*/
24667 +
24668 +/* Life cycle of pages/nodes.
24669 +
24670 +   jnode contains reference to page and page contains reference back to
24671 +   jnode. This reference is counted in page ->count. Thus, page bound to jnode
24672 +   cannot be released back into free pool.
24673 +
24674 +    1. Formatted nodes.
24675 +
24676 +      1. formatted node is represented by znode. When new znode is created its
24677 +      ->pg pointer is NULL initially.
24678 +
24679 +      2. when node content is loaded into znode (by call to zload()) for the
24680 +      first time following happens (in call to ->read_node() or
24681 +      ->allocate_node()):
24682 +
24683 +       1. new page is added to the page cache.
24684 +
24685 +       2. this page is attached to znode and its ->count is increased.
24686 +
24687 +       3. page is kmapped.
24688 +
24689 +      3. if more calls to zload() follow (without corresponding zrelses), page
24690 +      counter is left intact and in its stead ->d_count is increased in znode.
24691 +
24692 +      4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24693 +      ->release_node() is called and page is kunmapped as result.
24694 +
24695 +      5. at some moment node can be captured by a transaction. Its ->x_count
24696 +      is then increased by transaction manager.
24697 +
24698 +      6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24699 +      bit set) following will happen (also see comment at the top of znode.c):
24700 +
24701 +       1. when last lock is released, node will be uncaptured from
24702 +       transaction. This releases the reference that transaction manager
24703 +       acquired at step 5.
24704 +
24705 +       2. when last reference is released, zput() detects that node is
24706 +       actually deleted and calls ->delete_node()
24707 +       operation. page_cache_delete_node() implementation detaches jnode from
24708 +       page and releases page.
24709 +
24710 +      7. otherwise (node wasn't removed from the tree), last reference to
24711 +      znode will be released after transaction manager committed transaction
24712 +      node was in. This implies squallocing of this node (see
24713 +      flush.c). Nothing special happens at this point. Znode is still in the
24714 +      hash table and page is still attached to it.
24715 +
24716 +      8. znode is actually removed from the memory because of the memory
24717 +      pressure, or during umount (znodes_tree_done()). Anyway, znode is
24718 +      removed by the call to zdrop(). At this moment, page is detached from
24719 +      znode and removed from the inode address space.
24720 +
24721 +*/
24722 +
24723 +#include "debug.h"
24724 +#include "dformat.h"
24725 +#include "key.h"
24726 +#include "txnmgr.h"
24727 +#include "jnode.h"
24728 +#include "znode.h"
24729 +#include "block_alloc.h"
24730 +#include "tree.h"
24731 +#include "vfs_ops.h"
24732 +#include "inode.h"
24733 +#include "super.h"
24734 +#include "entd.h"
24735 +#include "page_cache.h"
24736 +#include "ktxnmgrd.h"
24737 +
24738 +#include <linux/types.h>
24739 +#include <linux/fs.h>
24740 +#include <linux/mm.h>          /* for struct page */
24741 +#include <linux/swap.h>                /* for struct page */
24742 +#include <linux/pagemap.h>
24743 +#include <linux/bio.h>
24744 +#include <linux/writeback.h>
24745 +#include <linux/blkdev.h>
24746 +
24747 +static struct bio *page_bio(struct page *, jnode * , int rw, gfp_t gfp);
24748 +
24749 +static struct address_space_operations formatted_fake_as_ops;
24750 +
24751 +static const oid_t fake_ino = 0x1;
24752 +static const oid_t bitmap_ino = 0x2;
24753 +static const oid_t cc_ino = 0x3;
24754 +
24755 +static void
24756 +init_fake_inode(struct super_block *super, struct inode *fake,
24757 +               struct inode **pfake)
24758 +{
24759 +       assert("nikita-2168", fake->i_state & I_NEW);
24760 +       fake->i_mapping->a_ops = &formatted_fake_as_ops;
24761 +       *pfake = fake;
24762 +       /* NOTE-NIKITA something else? */
24763 +       unlock_new_inode(fake);
24764 +}
24765 +
24766 +/**
24767 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24768 + * @super: super block to init fake inode for
24769 + *
24770 + * Initializes fake inode to which formatted nodes are bound in the page cache
24771 + * and inode for bitmaps.
24772 + */
24773 +int reiser4_init_formatted_fake(struct super_block *super)
24774 +{
24775 +       struct inode *fake;
24776 +       struct inode *bitmap;
24777 +       struct inode *cc;
24778 +       reiser4_super_info_data *sinfo;
24779 +
24780 +       assert("nikita-1703", super != NULL);
24781 +
24782 +       sinfo = get_super_private_nocheck(super);
24783 +       fake = iget_locked(super, oid_to_ino(fake_ino));
24784 +
24785 +       if (fake != NULL) {
24786 +               init_fake_inode(super, fake, &sinfo->fake);
24787 +
24788 +               bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24789 +               if (bitmap != NULL) {
24790 +                       init_fake_inode(super, bitmap, &sinfo->bitmap);
24791 +
24792 +                       cc = iget_locked(super, oid_to_ino(cc_ino));
24793 +                       if (cc != NULL) {
24794 +                               init_fake_inode(super, cc, &sinfo->cc);
24795 +                               return 0;
24796 +                       } else {
24797 +                               iput(sinfo->fake);
24798 +                               iput(sinfo->bitmap);
24799 +                               sinfo->fake = NULL;
24800 +                               sinfo->bitmap = NULL;
24801 +                       }
24802 +               } else {
24803 +                       iput(sinfo->fake);
24804 +                       sinfo->fake = NULL;
24805 +               }
24806 +       }
24807 +       return RETERR(-ENOMEM);
24808 +}
24809 +
24810 +/**
24811 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24812 + * @super: super block to release fake inodes for
24813 + *
24814 + * Releases inodes which were used as address spaces of bitmap and formatted
24815 + * nodes.
24816 + */
24817 +void reiser4_done_formatted_fake(struct super_block *super)
24818 +{
24819 +       reiser4_super_info_data *sinfo;
24820 +
24821 +       sinfo = get_super_private_nocheck(super);
24822 +
24823 +       if (sinfo->fake != NULL) {
24824 +               iput(sinfo->fake);
24825 +               sinfo->fake = NULL;
24826 +       }
24827 +
24828 +       if (sinfo->bitmap != NULL) {
24829 +               iput(sinfo->bitmap);
24830 +               sinfo->bitmap = NULL;
24831 +       }
24832 +
24833 +       if (sinfo->cc != NULL) {
24834 +               iput(sinfo->cc);
24835 +               sinfo->cc = NULL;
24836 +       }
24837 +       return;
24838 +}
24839 +
24840 +void reiser4_wait_page_writeback(struct page *page)
24841 +{
24842 +       assert("zam-783", PageLocked(page));
24843 +
24844 +       do {
24845 +               unlock_page(page);
24846 +               wait_on_page_writeback(page);
24847 +               lock_page(page);
24848 +       } while (PageWriteback(page));
24849 +}
24850 +
24851 +/* return tree @page is in */
24852 +reiser4_tree *reiser4_tree_by_page(const struct page *page/* page to query */)
24853 +{
24854 +       assert("nikita-2461", page != NULL);
24855 +       return &get_super_private(page->mapping->host->i_sb)->tree;
24856 +}
24857 +
24858 +/* completion handler for single page bio-based read.
24859 +
24860 +   mpage_end_io_read() would also do. But it's static.
24861 +
24862 +*/
24863 +static void
24864 +end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG)
24865 +{
24866 +       struct page *page;
24867 +
24868 +       page = bio->bi_io_vec[0].bv_page;
24869 +
24870 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24871 +               SetPageUptodate(page);
24872 +       } else {
24873 +               ClearPageUptodate(page);
24874 +               SetPageError(page);
24875 +       }
24876 +       unlock_page(page);
24877 +       bio_put(bio);
24878 +}
24879 +
24880 +/* completion handler for single page bio-based write.
24881 +
24882 +   mpage_end_io_write() would also do. But it's static.
24883 +
24884 +*/
24885 +static void
24886 +end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG)
24887 +{
24888 +       struct page *page;
24889 +
24890 +       page = bio->bi_io_vec[0].bv_page;
24891 +
24892 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
24893 +               SetPageError(page);
24894 +       end_page_writeback(page);
24895 +       bio_put(bio);
24896 +}
24897 +
24898 +/* ->readpage() method for formatted nodes */
24899 +static int formatted_readpage(struct file *f UNUSED_ARG,
24900 +                             struct page *page/* page to read */)
24901 +{
24902 +       assert("nikita-2412", PagePrivate(page) && jprivate(page));
24903 +       return reiser4_page_io(page, jprivate(page), READ,
24904 +                              reiser4_ctx_gfp_mask_get());
24905 +}
24906 +
24907 +/**
24908 + * reiser4_page_io - submit single-page bio request
24909 + * @page: page to perform io for
24910 + * @node: jnode of page
24911 + * @rw: read or write
24912 + * @gfp: gfp mask for bio allocation
24913 + *
24914 + * Submits single page read or write.
24915 + */
24916 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24917 +{
24918 +       struct bio *bio;
24919 +       int result;
24920 +
24921 +       assert("nikita-2094", page != NULL);
24922 +       assert("nikita-2226", PageLocked(page));
24923 +       assert("nikita-2634", node != NULL);
24924 +       assert("nikita-2893", rw == READ || rw == WRITE);
24925 +
24926 +       if (rw) {
24927 +               if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24928 +                       unlock_page(page);
24929 +                       return 0;
24930 +               }
24931 +       }
24932 +
24933 +       bio = page_bio(page, node, rw, gfp);
24934 +       if (!IS_ERR(bio)) {
24935 +               if (rw == WRITE) {
24936 +                       set_page_writeback(page);
24937 +                       unlock_page(page);
24938 +               }
24939 +               reiser4_submit_bio(rw, bio);
24940 +               result = 0;
24941 +       } else {
24942 +               unlock_page(page);
24943 +               result = PTR_ERR(bio);
24944 +       }
24945 +
24946 +       return result;
24947 +}
24948 +
24949 +/* helper: build a single-page bio for @page; returns ERR_PTR() on failure */
24950 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24951 +{
24952 +       struct bio *bio;
24953 +       assert("nikita-2092", page != NULL);
24954 +       assert("nikita-2633", node != NULL);
24955 +
24956 +       /* Simple implementation in the assumption that blocksize == pagesize.
24957 +
24958 +          We only have to submit one block, but submit_bh() will allocate bio
24959 +          anyway, so lets use all the bells-and-whistles of bio code.
24960 +        */
24961 +
24962 +       bio = bio_alloc(gfp, 1);
24963 +       if (bio != NULL) {
24964 +               int blksz;
24965 +               struct super_block *super;
24966 +               reiser4_block_nr blocknr;
24967 +
24968 +               super = page->mapping->host->i_sb;
24969 +               assert("nikita-2029", super != NULL);
24970 +               blksz = super->s_blocksize;
24971 +               assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24972 +
24973 +               spin_lock_jnode(node);
24974 +               blocknr = *jnode_get_io_block(node);
24975 +               spin_unlock_jnode(node);
24976 +
24977 +               assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24978 +               assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24979 +
24980 +               bio->bi_bdev = super->s_bdev;
24981 +               /* fill bio->bi_sector before calling bio_add_page(), because
24982 +                * q->merge_bvec_fn may want to inspect it (see
24983 +                * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24984 +               bio->bi_sector = blocknr * (blksz >> 9);
24985 +
24986 +               if (!bio_add_page(bio, page, blksz, 0)) {
24987 +                       warning("nikita-3452", "Single page bio cannot be constructed");
24988 +                       bio_put(bio);   /* do not leak the bio allocated above */
24989 +                       return ERR_PTR(RETERR(-EINVAL));
24990 +               }
24991 +
24992 +               /* bio -> bi_idx is filled by bio_init() */
24993 +               bio->bi_end_io = (rw == READ) ?
24994 +                   end_bio_single_page_read : end_bio_single_page_write;
24995 +
24996 +               return bio;
24997 +       } else
24998 +               return ERR_PTR(RETERR(-ENOMEM));
24999 +}
25000 +
25001 +#if 0
25002 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25003 +{
25004 +       if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25005 +               return 1;
25006 +       if (ctx->super != s)
25007 +               return 1;
25008 +       if (get_super_private(s)->entd.tsk == current)
25009 +               return 0;
25010 +       if (!lock_stack_isclean(&ctx->stack))
25011 +               return 0;
25012 +       if (ctx->trans->atom != NULL)
25013 +               return 0;
25014 +       return 1;
25015 +}
25016 +#endif
25017 +
25018 +/**
25019 + * reiser4_writepage - writepage of struct address_space_operations
25020 + * @page: page to write; asserted to be locked on entry
25021 + * @wbc: writeback control, handed on unchanged to write_page_by_ent()
25022 + *
25023 + * Delegates the write to write_page_by_ent() and returns its result.
25024 + */
25025 +/* Common memory pressure notification. */
25026 +int reiser4_writepage(struct page *page,
25027 +                     struct writeback_control *wbc)
25028 +{
25029 +       /*
25030 +        * assert("edward-1562",
25031 +        * can_hit_entd(get_current_context_check(), sb));
25032 +        */
25033 +       assert("vs-828", PageLocked(page));
25034 +
25035 +       return write_page_by_ent(page, wbc);
25036 +}
25037 +
25038 +/* ->set_page_dirty() method of formatted address_space */
25039 +static int formatted_set_page_dirty(struct page *page)
25040 +{
25041 +       assert("nikita-2173", page != NULL);
25042 +       BUG();
25043 +       return __set_page_dirty_nobuffers(page);
25044 +}
25045 +
25046 +/* writepages method of address space operations in reiser4 is used to involve
25047 +   into transactions pages which are dirtied via mmap. Only regular files can
25048 +   have such pages. Fake inode is used to access formatted nodes via page
25049 +   cache. As formatted nodes can never be mmaped, fake inode's writepages has
25050 +   nothing to do */
25051 +static int
25052 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25053 +{
25054 +       return 0;
25055 +}
25056 +
25057 +/* address space operations for the fake inode */
25058 +static struct address_space_operations formatted_fake_as_ops = {
25059 +       /* Perform a writeback of a single page as a memory-freeing
25060 +        * operation. */
25061 +       .writepage = reiser4_writepage,
25062 +       /* this is called to read formatted node */
25063 +       .readpage = formatted_readpage,
25064 +       /* ->sync_page() method of fake inode address space operations. Called
25065 +          from wait_on_page() and lock_page().
25066 +
25067 +          This is most annoyingly misnomered method. Actually it is called
25068 +          from wait_on_page_bit() and lock_page() and its purpose is to
25069 +          actually start io by jabbing device drivers.
25070 +        */
25071 +       .sync_page = block_sync_page,
25072 +       /* Write back some dirty pages from this mapping. Called from sync.
25073 +          called during sync (pdflush) */
25074 +       .writepages = writepages_fake,
25075 +       /* Set a page dirty */
25076 +       .set_page_dirty = formatted_set_page_dirty,
25077 +       /* used for read-ahead. Not applicable */
25078 +       .readpages = NULL,
25079 +       .write_begin = NULL,
25080 +       .write_end = NULL,
25081 +       .bmap = NULL,
25082 +       /* called just before page is being detached from inode mapping and
25083 +          removed from memory. Called on truncate, cut/squeeze, and
25084 +          umount. */
25085 +       .invalidatepage = reiser4_invalidatepage,
25086 +       /* this is called by shrink_cache() so that file system can try to
25087 +          release objects (jnodes, buffers, journal heads) attached to page
25088 +          and, may be made page itself free-able.
25089 +        */
25090 +       .releasepage = reiser4_releasepage,
25091 +       .direct_IO = NULL
25092 +};
25093 +
25094 +/* called just before page is released (no longer used by reiser4). Callers:
25095 +   jdelete() and extent2tail(). */
25096 +void reiser4_drop_page(struct page *page)
25097 +{
25098 +       assert("nikita-2181", PageLocked(page));
25099 +       clear_page_dirty_for_io(page);
25100 +       ClearPageUptodate(page);
25101 +#if defined(PG_skipped)
25102 +       ClearPageSkipped(page);
25103 +#endif
25104 +       unlock_page(page);
25105 +}
25106 +
25107 +#define JNODE_GANG_SIZE (16)
25108 +
25109 +/* find all jnodes from range specified and invalidate them */
25110 +static int
25111 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25112 +{
25113 +       reiser4_inode *info;
25114 +       int truncated_jnodes;
25115 +       reiser4_tree *tree;
25116 +       unsigned long index;
25117 +       unsigned long end;
25118 +
25119 +       if (inode_file_plugin(inode) ==
25120 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
25121 +               /*
25122 +                * No need to get rid of jnodes here: if the single jnode of
25123 +                * page cluster did not have page, then it was found and killed
25124 +                * before in
25125 +                * truncate_complete_page_cluster()->jput()->jput_final(),
25126 +                * otherwise it will be dropped by reiser4_invalidatepage()
25127 +                */
25128 +               return 0;
25129 +       truncated_jnodes = 0;
25130 +
25131 +       info = reiser4_inode_data(inode);
25132 +       tree = reiser4_tree_by_inode(inode);
25133 +
25134 +       index = from;
25135 +       end = from + count;
25136 +
25137 +       while (1) {
25138 +               jnode *gang[JNODE_GANG_SIZE];
25139 +               int taken;
25140 +               int i;
25141 +               jnode *node;
25142 +
25143 +               assert("nikita-3466", index <= end);
25144 +
25145 +               read_lock_tree(tree);
25146 +               taken =
25147 +                   radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25148 +                                          (void **)gang, index,
25149 +                                          JNODE_GANG_SIZE);
25150 +               for (i = 0; i < taken; ++i) {
25151 +                       node = gang[i];
25152 +                       if (index_jnode(node) < end)
25153 +                               jref(node);
25154 +                       else
25155 +                               gang[i] = NULL;
25156 +               }
25157 +               read_unlock_tree(tree);
25158 +
25159 +               for (i = 0; i < taken; ++i) {
25160 +                       node = gang[i];
25161 +                       if (node != NULL) {
25162 +                               index = max(index, index_jnode(node));
25163 +                               spin_lock_jnode(node);
25164 +                               assert("edward-1457", node->pg == NULL);
25165 +                               /* this is always called after
25166 +                                  truncate_inode_pages_range(). Therefore, here
25167 +                                  jnode can not have page. New pages can not be
25168 +                                  created because truncate_jnodes_range goes
25169 +                                  under exclusive access on file obtained,
25170 +                                  where as new page creation requires
25171 +                                  non-exclusive access obtained */
25172 +                               JF_SET(node, JNODE_HEARD_BANSHEE);
25173 +                               reiser4_uncapture_jnode(node);
25174 +                               unhash_unformatted_jnode(node);
25175 +                               truncated_jnodes++;
25176 +                               jput(node);
25177 +                       } else
25178 +                               break;
25179 +               }
25180 +               if (i != taken || taken == 0)
25181 +                       break;
25182 +       }
25183 +       return truncated_jnodes;
25184 +}
25185 +
25186 +/* Truncating files in reiser4: problems and solutions.
25187 +
25188 +   VFS calls fs's truncate after it has called truncate_inode_pages()
25189 +   to get rid of pages corresponding to part of file being truncated.
25190 +   In reiser4 it may cause existence of unallocated extents which do
25191 +   not have jnodes. Flush code does not expect that. Solution of this
25192 +   problem is straightforward. As vfs's truncate is implemented using
25193 +   setattr operation, it seems reasonable to have ->setattr() that
25194 +   will cut file body. However, flush code also does not expect dirty
25195 +   pages without parent items, so it is impossible to cut all items,
25196 +   then truncate all pages in two steps. We resolve this problem by
25197 +   cutting items one-by-one. Each such fine-grained step performed
25198 +   under longterm znode lock calls at the end ->kill_hook() method of
25199 +   a killed item to remove its bound pages and jnodes.
25200 +
25201 +   The following function is a common part of mentioned kill hooks.
25202 +   Also, this is called before tail-to-extent conversion (to not manage
25203 +   few copies of the data).
25204 +*/
25205 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25206 +                             unsigned long count, int even_cows)
25207 +{
25208 +       loff_t from_bytes, count_bytes;
25209 +
25210 +       if (count == 0)
25211 +               return;
25212 +       from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25213 +       count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25214 +
25215 +       unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25216 +       truncate_inode_pages_range(mapping, from_bytes,
25217 +                                  from_bytes + count_bytes - 1);
25218 +       truncate_jnodes_range(mapping->host, from, count);
25219 +}
25220 +
25221 +/*
25222 + * Local variables:
25223 + * c-indentation-style: "K&R"
25224 + * mode-name: "LC"
25225 + * c-basic-offset: 8
25226 + * tab-width: 8
25227 + * fill-column: 120
25228 + * scroll-step: 1
25229 + * End:
25230 + */
25231 diff -urN linux-2.6.35.orig/fs/reiser4/page_cache.h linux-2.6.35/fs/reiser4/page_cache.h
25232 --- linux-2.6.35.orig/fs/reiser4/page_cache.h   1970-01-01 01:00:00.000000000 +0100
25233 +++ linux-2.6.35/fs/reiser4/page_cache.h        2010-08-04 15:44:57.000000000 +0200
25234 @@ -0,0 +1,66 @@
25235 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25236 + * reiser4/README */
25237 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25238 +
25239 +#if !defined(__REISER4_PAGE_CACHE_H__)
25240 +#define __REISER4_PAGE_CACHE_H__
25241 +
25242 +#include "forward.h"
25243 +#include "context.h"            /* for reiser4_ctx_gfp_mask_get() */
25244 +
25245 +#include <linux/fs.h>          /* for struct super_block, address_space  */
25246 +#include <linux/mm.h>          /* for struct page  */
25247 +#include <linux/pagemap.h>     /* for lock_page()  */
25248 +#include <linux/vmalloc.h>     /* for __vmalloc()  */
25249 +
25250 +extern int reiser4_init_formatted_fake(struct super_block *);
25251 +extern void reiser4_done_formatted_fake(struct super_block *);
25252 +
25253 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25254 +
25255 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25256 +
25257 +extern void reiser4_wait_page_writeback(struct page *);
25258 +static inline void lock_and_wait_page_writeback(struct page *page)
25259 +{
25260 +       lock_page(page);
25261 +       if (unlikely(PageWriteback(page)))
25262 +               reiser4_wait_page_writeback(page);
25263 +}
25264 +
25265 +#define jprivate(page) ((jnode *)page_private(page))
25266 +
25267 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25268 +extern void reiser4_drop_page(struct page *);
25269 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25270 +                                    unsigned long count, int even_cows);
25271 +extern void capture_reiser4_inodes(struct super_block *,
25272 +                                  struct writeback_control *);
25273 +static inline void *reiser4_vmalloc(unsigned long size)
25274 +{
25275 +       return __vmalloc(size,
25276 +                        reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25277 +                        PAGE_KERNEL);
25278 +}
25279 +
25280 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25281 +
25282 +#if REISER4_DEBUG
25283 +extern void print_page(const char *prefix, struct page *page);
25284 +#else
25285 +#define print_page(prf, p) noop
25286 +#endif
25287 +
25288 +/* __REISER4_PAGE_CACHE_H__ */
25289 +#endif
25290 +
25291 +/* Make Linus happy.
25292 +   Local variables:
25293 +   c-indentation-style: "K&R"
25294 +   mode-name: "LC"
25295 +   c-basic-offset: 8
25296 +   tab-width: 8
25297 +   fill-column: 120
25298 +   scroll-step: 1
25299 +   End:
25300 +*/
25301 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/cluster.c linux-2.6.35/fs/reiser4/plugin/cluster.c
25302 --- linux-2.6.35.orig/fs/reiser4/plugin/cluster.c       1970-01-01 01:00:00.000000000 +0100
25303 +++ linux-2.6.35/fs/reiser4/plugin/cluster.c    2010-08-04 15:44:57.000000000 +0200
25304 @@ -0,0 +1,72 @@
25305 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25306 + * reiser4/README */
25307 +
25308 +/* Contains reiser4 cluster plugins (see
25309 +   http://www.namesys.com/cryptcompress_design.html
25310 +   "Concepts of clustering" for details). */
25311 +
25312 +#include "plugin_header.h"
25313 +#include "plugin.h"
25314 +#include "../inode.h"
25315 +
25316 +static int change_cluster(struct inode *inode,
25317 +                         reiser4_plugin * plugin,
25318 +                         pset_member memb)
25319 +{
25320 +       assert("edward-1324", inode != NULL);
25321 +       assert("edward-1325", plugin != NULL);
25322 +       assert("edward-1326", is_reiser4_inode(inode));
25323 +       assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25324 +
25325 +       /* Can't change the cluster plugin for already existent regular files */
25326 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25327 +               return RETERR(-EINVAL);
25328 +
25329 +       /* If the inode's cluster plugin already matches, nothing to change. */
25330 +       if (inode_cluster_plugin(inode) != NULL &&
25331 +           inode_cluster_plugin(inode)->h.id == plugin->h.id)
25332 +               return 0;
25333 +
25334 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25335 +                              PSET_CLUSTER, plugin);
25336 +}
25337 +
25338 +static reiser4_plugin_ops cluster_plugin_ops = {
25339 +       .init = NULL,
25340 +       .load = NULL,
25341 +       .save_len = NULL,
25342 +       .save = NULL,
25343 +       .change = &change_cluster
25344 +};
25345 +
25346 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC)                        \
25347 +       [CLUSTER_ ## ID ## _ID] = {                             \
25348 +               .h = {                                          \
25349 +                       .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25350 +                       .id = CLUSTER_ ## ID ## _ID,            \
25351 +                       .pops = &cluster_plugin_ops,            \
25352 +                       .label = LABEL,                         \
25353 +                       .desc = DESC,                           \
25354 +                       .linkage = {NULL, NULL}                 \
25355 +               },                                              \
25356 +               .shift = SHIFT                                  \
25357 +       }
25358 +
25359 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25360 +       SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25361 +       SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25362 +       SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25363 +       SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25364 +       SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25365 +};
25366 +
25367 +/*
25368 +  Local variables:
25369 +  c-indentation-style: "K&R"
25370 +  mode-name: "LC"
25371 +  c-basic-offset: 8
25372 +  tab-width: 8
25373 +  fill-column: 120
25374 +  scroll-step: 1
25375 +  End:
25376 +*/
25377 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/cluster.h linux-2.6.35/fs/reiser4/plugin/cluster.h
25378 --- linux-2.6.35.orig/fs/reiser4/plugin/cluster.h       1970-01-01 01:00:00.000000000 +0100
25379 +++ linux-2.6.35/fs/reiser4/plugin/cluster.h    2010-08-04 15:44:57.000000000 +0200
25380 @@ -0,0 +1,410 @@
25381 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25382 +
25383 +/* This file contains size/offset translators, modulators
25384 +   and other helper functions. */
25385 +
25386 +#if !defined(__FS_REISER4_CLUSTER_H__)
25387 +#define __FS_REISER4_CLUSTER_H__
25388 +
25389 +#include "../inode.h"
25390 +
25391 +static inline int inode_cluster_shift(struct inode *inode)
25392 +{
25393 +       assert("edward-92", inode != NULL);
25394 +       assert("edward-93", reiser4_inode_data(inode) != NULL);
25395 +
25396 +       return inode_cluster_plugin(inode)->shift;
25397 +}
25398 +
25399 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25400 +{
25401 +       return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25402 +}
25403 +
25404 +/* cluster size in page units */
25405 +static inline unsigned cluster_nrpages(struct inode *inode)
25406 +{
25407 +       return 1U << cluster_nrpages_shift(inode);
25408 +}
25409 +
25410 +static inline size_t inode_cluster_size(struct inode *inode)
25411 +{
25412 +       assert("edward-96", inode != NULL);
25413 +
25414 +       return 1U << inode_cluster_shift(inode);
25415 +}
25416 +
25417 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25418 +{
25419 +       return idx >> cluster_nrpages_shift(inode);
25420 +}
25421 +
25422 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25423 +{
25424 +       return idx << cluster_nrpages_shift(inode);
25425 +}
25426 +
25427 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25428 +{
25429 +       return clust_to_pg(pg_to_clust(idx, inode), inode);
25430 +}
25431 +
25432 +static inline pgoff_t off_to_pg(loff_t off)
25433 +{
25434 +       return (off >> PAGE_CACHE_SHIFT);
25435 +}
25436 +
25437 +static inline loff_t pg_to_off(pgoff_t idx)
25438 +{
25439 +       return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25440 +}
25441 +
25442 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25443 +{
25444 +       return off >> inode_cluster_shift(inode);
25445 +}
25446 +
25447 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25448 +{
25449 +       return (loff_t) idx << inode_cluster_shift(inode);
25450 +}
25451 +
25452 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25453 +{
25454 +       return clust_to_off(off_to_clust(off, inode), inode);
25455 +}
25456 +
25457 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25458 +{
25459 +       return clust_to_pg(off_to_clust(off, inode), inode);
25460 +}
25461 +
25462 +static inline unsigned off_to_pgoff(loff_t off)
25463 +{
25464 +       return off & (PAGE_CACHE_SIZE - 1);
25465 +}
25466 +
25467 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25468 +{
25469 +       return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25470 +}
25471 +
25472 +static inline  pgoff_t offset_in_clust(struct page *page)
25473 +{
25474 +       assert("edward-1488", page != NULL);
25475 +       assert("edward-1489", page->mapping != NULL);
25476 +
25477 +       return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25478 +}
25479 +
25480 +static inline int first_page_in_cluster(struct page *page)
25481 +{
25482 +       return offset_in_clust(page) == 0;
25483 +}
25484 +
25485 +static inline int last_page_in_cluster(struct page *page)
25486 +{
25487 +       return offset_in_clust(page) ==
25488 +               cluster_nrpages(page->mapping->host) - 1;
25489 +}
25490 +
25491 +static inline unsigned
25492 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25493 +{
25494 +       return off_to_cloff(pg_to_off(idx), inode);
25495 +}
25496 +
25497 +/*********************** Size translators **************************/
25498 +
25499 +/* Translate linear size.
25500 + * New units are (1 << @blkbits) times larger than the old ones.
25501 + * In other words, calculate number of logical blocks, occupied
25502 + * by @count elements
25503 + */
25504 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25505 +{
25506 +       return (count + (1UL << blkbits) - 1) >> blkbits;
25507 +}
25508 +
25509 +/* size in pages */
25510 +static inline pgoff_t size_in_pages(loff_t size)
25511 +{
25512 +       return size_in_blocks(size, PAGE_CACHE_SHIFT);
25513 +}
25514 +
25515 +/* size in logical clusters */
25516 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25517 +{
25518 +       return size_in_blocks(size, inode_cluster_shift(inode));
25519 +}
25520 +
25521 +/* size in pages to the size in page clusters */
25522 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25523 +{
25524 +       return size_in_blocks(size, cluster_nrpages_shift(inode));
25525 +}
25526 +
25527 +/*********************** Size modulators ***************************/
25528 +
25529 +/*
25530 +  Modulate linear size by nominated block size and offset.
25531 +
25532 +  Returns how many of the @size bytes fall into block number @pos when
25533 +  they are laid out in blocks of (1 << @blkbits) bytes: a full block
25534 +  before the last one, the remainder in the last one, and zero past
25535 +  the end (so the function is zero almost everywhere).
25536 +
25537 +  ******
25538 +  *******
25539 +  *******
25540 +  *******
25541 +  ----------> pos
25542 +*/
25543 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25544 +{
25545 +       unsigned long end = size >> blkbits; /* same width as @pos: no truncation */
25546 +       if (pos < end)
25547 +               return 1U << blkbits;
25548 +       if (unlikely(pos > end))
25549 +               return 0;
25550 +       return size & ~(~0ull << blkbits);
25551 +}
25552 +
25553 +/* the same as above, but block size is page size */
25554 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
25555 +{
25556 +       return __mbb(size, pos, PAGE_CACHE_SHIFT);
25557 +}
25558 +
25559 +/* number of file's bytes in the nominated logical cluster */
25560 +static inline unsigned lbytes(cloff_t index, struct inode *inode)
25561 +{
25562 +       return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25563 +}
25564 +
25565 +/* number of file's bytes in the nominated page */
25566 +static inline unsigned pbytes(pgoff_t index, struct inode *inode)
25567 +{
25568 +       return __mbp(i_size_read(inode), index);
25569 +}
25570 +
25571 +/**
25572 + * number of pages occupied by @win->count bytes starting from
25573 + * @win->off at logical cluster defined by @win. This is exactly
25574 + * a number of pages to be modified and dirtied in any cluster operation.
25575 + */
25576 +static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
25577 +{
25578 +       return ((win->off + win->count +
25579 +                (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) -
25580 +               off_to_pg(win->off);
25581 +}
25582 +
25583 +/* return true, if logical cluster is not occupied by the file */
25584 +static inline int new_logical_cluster(struct cluster_handle *clust,
25585 +                                     struct inode *inode)
25586 +{
25587 +       return clust_to_off(clust->index, inode) >= i_size_read(inode);
25588 +}
25589 +
25590 +/* return true, if pages @p1 and @p2 are of the same page cluster */
25591 +static inline int same_page_cluster(struct page *p1, struct page *p2)
25592 +{
25593 +       assert("edward-1490", p1 != NULL);
25594 +       assert("edward-1491", p2 != NULL);
25595 +       assert("edward-1492", p1->mapping != NULL);
25596 +       assert("edward-1493", p2->mapping != NULL);
25597 +
25598 +       return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25599 +               pg_to_clust(page_index(p2), p2->mapping->host));
25600 +}
25601 +
25602 +static inline int cluster_is_complete(struct cluster_handle *clust,
25603 +                                     struct inode *inode)
25604 +{
25605 +       return clust->tc.lsize == inode_cluster_size(inode);
25606 +}
25607 +
25608 +static inline void reiser4_slide_init(struct reiser4_slide *win)
25609 +{
25610 +       assert("edward-1084", win != NULL);
25611 +       memset(win, 0, sizeof *win);
25612 +}
25613 +
25614 +static inline tfm_action
25615 +cluster_get_tfm_act(struct tfm_cluster *tc)
25616 +{
25617 +       assert("edward-1356", tc != NULL);
25618 +       return tc->act;
25619 +}
25620 +
25621 +static inline void
25622 +cluster_set_tfm_act(struct tfm_cluster *tc, tfm_action act)
25623 +{
25624 +       assert("edward-1356", tc != NULL);
25625 +       tc->act = act;
25626 +}
25627 +
25628 +static inline void cluster_init_act(struct cluster_handle *clust,
25629 +                                   tfm_action act,
25630 +                                   struct reiser4_slide *window)
25631 +{
25632 +       assert("edward-84", clust != NULL);
25633 +       memset(clust, 0, sizeof *clust);
25634 +       cluster_set_tfm_act(&clust->tc, act);
25635 +       clust->dstat = INVAL_DISK_CLUSTER;
25636 +       clust->win = window;
25637 +}
25638 +
25639 +static inline void cluster_init_read(struct cluster_handle *clust,
25640 +                                    struct reiser4_slide *window)
25641 +{
25642 +       cluster_init_act(clust, TFMA_READ, window);
25643 +}
25644 +
25645 +static inline void cluster_init_write(struct cluster_handle *clust,
25646 +                                     struct reiser4_slide *window)
25647 +{
25648 +       cluster_init_act(clust, TFMA_WRITE, window);
25649 +}
25650 +
25651 +/* true if @p1 and @p2 are items of the same disk cluster */
25652 +static inline int same_disk_cluster(const coord_t *p1, const coord_t *p2)
25653 +{
25654 +       /* drop this if you have other items to aggregate */
25655 +       assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25656 +
25657 +       return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25658 +}
25659 +
25660 +static inline int dclust_get_extension_dsize(hint_t *hint)
25661 +{
25662 +       return hint->ext_coord.extension.ctail.dsize;
25663 +}
25664 +
25665 +static inline void dclust_set_extension_dsize(hint_t *hint, int dsize)
25666 +{
25667 +       hint->ext_coord.extension.ctail.dsize = dsize;
25668 +}
25669 +
25670 +static inline int dclust_get_extension_shift(hint_t *hint)
25671 +{
25672 +       return hint->ext_coord.extension.ctail.shift;
25673 +}
25674 +
25675 +static inline int dclust_get_extension_ncount(hint_t *hint)
25676 +{
25677 +       return hint->ext_coord.extension.ctail.ncount;
25678 +}
25679 +
25680 +static inline void dclust_inc_extension_ncount(hint_t *hint)
25681 +{
25682 +       hint->ext_coord.extension.ctail.ncount++;
25683 +}
25684 +
25685 +static inline void dclust_init_extension(hint_t *hint)
25686 +{
25687 +       memset(&hint->ext_coord.extension.ctail, 0,
25688 +              sizeof(hint->ext_coord.extension.ctail));
25689 +}
25690 +
25691 +static inline int hint_is_unprepped_dclust(hint_t *hint)
25692 +{
25693 +       assert("edward-1451", hint_is_valid(hint));
25694 +       return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25695 +}
25696 +
25697 +static inline void coord_set_between_clusters(coord_t *coord)
25698 +{
25699 +#if REISER4_DEBUG
25700 +       int result;
25701 +       result = zload(coord->node);
25702 +       assert("edward-1296", !result);
25703 +#endif
25704 +       if (!coord_is_between_items(coord)) {
25705 +               coord->between = AFTER_ITEM;
25706 +               coord->unit_pos = 0;
25707 +       }
25708 +#if REISER4_DEBUG
25709 +       zrelse(coord->node);
25710 +#endif
25711 +}
25712 +
25713 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25714 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25715 +                     znode_lock_mode mode);
25716 +int checkout_logical_cluster(struct cluster_handle *, jnode * , struct inode *);
25717 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25718 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25719 +                                        int even_cows);
25720 +void invalidate_hint_cluster(struct cluster_handle *clust);
25721 +int get_disk_cluster_locked(struct cluster_handle *clust, struct inode *inode,
25722 +                           znode_lock_mode lock_mode);
25723 +void reset_cluster_params(struct cluster_handle *clust);
25724 +int set_cluster_by_page(struct cluster_handle *clust, struct page *page,
25725 +                       int count);
25726 +int prepare_page_cluster(struct inode *inode, struct cluster_handle *clust,
25727 +                        rw_op rw);
25728 +void __put_page_cluster(int from, int count, struct page **pages,
25729 +                       struct inode *inode);
25730 +void put_page_cluster(struct cluster_handle *clust,
25731 +                     struct inode *inode, rw_op rw);
25732 +void put_cluster_handle(struct cluster_handle *clust);
25733 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster *tc,
25734 +                   tfm_stream_id id);
25735 +int tfm_cluster_is_uptodate(struct tfm_cluster *tc);
25736 +void tfm_cluster_set_uptodate(struct tfm_cluster *tc);
25737 +void tfm_cluster_clr_uptodate(struct tfm_cluster *tc);
25738 +
25739 +/* Move cluster handle @clust to the logical cluster which holds the page
25740 +   of index @pgidx in @inode; the cluster parameters are reset first */
25741 +static inline void move_cluster_forward(struct cluster_handle *clust,
25742 +                                       struct inode *inode,
25743 +                                       pgoff_t pgidx)
25744 +{
25745 +       assert("edward-1297", clust != NULL);
25746 +       assert("edward-1298", inode != NULL);
25747 +
25748 +       reset_cluster_params(clust);
25749 +       if (clust->index_valid &&
25750 +           /* Hole in the indices: the target is not the cluster right
25751 +              after the current one. The hint became invalid and cannot be
25752 +              used by find_cluster_item() even if seal/node versions match */
25753 +           pg_to_clust(pgidx, inode) != clust->index + 1) {
25754 +               reiser4_unset_hint(clust->hint);
25755 +               invalidate_hint_cluster(clust);
25756 +       }
25757 +       clust->index = pg_to_clust(pgidx, inode);
25758 +       clust->index_valid = 1; /* from now on clust->index is meaningful */
25759 +}
25760 +
25761 +/* allocate clust->pages with (1 << inode_cluster_shift(inode)) entries */
25762 +static inline int alloc_clust_pages(struct cluster_handle *clust,
25763 +                                   struct inode *inode)
25764 +{
25765 +       assert("edward-791", clust != NULL);
25766 +       assert("edward-792", inode != NULL);
25767 +
25768 +       clust->pages =
25769 +               kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25770 +                       reiser4_ctx_gfp_mask_get());
25771 +       return clust->pages != NULL ? 0 : -ENOMEM;
25772 +}
25773 +
25774 +static inline void free_clust_pages(struct cluster_handle *clust)
25775 +{
25776 +       kfree(clust->pages);
25777 +}
25778 +
25779 +#endif                         /* __FS_REISER4_CLUSTER_H__ */
25780 +
25781 +/* Make Linus happy.
25782 +   Local variables:
25783 +   c-indentation-style: "K&R"
25784 +   mode-name: "LC"
25785 +   c-basic-offset: 8
25786 +   tab-width: 8
25787 +   fill-column: 120
25788 +   scroll-step: 1
25789 +   End:
25790 +*/
25791 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.35/fs/reiser4/plugin/compress/compress.c
25792 --- linux-2.6.35.orig/fs/reiser4/plugin/compress/compress.c     1970-01-01 01:00:00.000000000 +0100
25793 +++ linux-2.6.35/fs/reiser4/plugin/compress/compress.c  2010-08-04 15:44:57.000000000 +0200
25794 @@ -0,0 +1,355 @@
25795 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25796 +/* reiser4 compression transform plugins */
25797 +
25798 +#include "../../debug.h"
25799 +#include "../../inode.h"
25800 +#include "../plugin.h"
25801 +
25802 +#include <linux/lzo.h>
25803 +#include <linux/zlib.h>
25804 +#include <linux/types.h>
25805 +#include <linux/hardirq.h>
25806 +
25807 +/* ->change() method of the compression plugin type: put @plugin into the
25808 +   PSET_COMPRESSION slot of @inode's plugin set (@memb is not looked at) */
25809 +static int change_compression(struct inode *inode, reiser4_plugin * plugin,
25810 +                             pset_member memb)
25811 +{
25812 +       assert("edward-1316", inode != NULL);
25813 +       assert("edward-1317", plugin != NULL);
25814 +       assert("edward-1318", is_reiser4_inode(inode));
25815 +       assert("edward-1319",
25816 +              plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25817 +
25818 +       /* -EINVAL unless @inode's file plugin is of the directory group */
25819 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25820 +               return RETERR(-EINVAL);
25821 +       /* Nothing to change if this compression plugin is already set.
25822 +          The compression slot is checked here, not the hash one. */
25823 +       if (inode_compression_plugin(inode) != NULL &&
25824 +           inode_compression_plugin(inode)->h.id == plugin->h.id)
25825 +               return 0;
25826 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25827 +                              PSET_COMPRESSION, plugin);
25828 +}
25829 +
25830 +static reiser4_plugin_ops compression_plugin_ops = {
25831 +       .init = NULL,
25832 +       .load = NULL,
25833 +       .save_len = NULL,
25834 +       .save = NULL,
25835 +       .change = &change_compression
25836 +};
25837 +
25838 +/******************************************************************************/
25839 +/*                         gzip1 compression                                  */
25840 +/******************************************************************************/
25841 +
25842 +#define GZIP1_DEF_LEVEL                        Z_BEST_SPEED
25843 +#define GZIP1_DEF_WINBITS              15
25844 +#define GZIP1_DEF_MEMLEVEL             MAX_MEM_LEVEL
25845 +
25846 +static int gzip1_init(void)
25847 +{
25848 +       return 0;
25849 +}
25850 +
25851 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25852 +{
25853 +       return 0;
25854 +}
25855 +
25856 +static coa_t gzip1_alloc(tfm_action act)
25857 +{
25858 +       coa_t coa = NULL;
25859 +       int ret = 0;
25860 +       switch (act) {
25861 +       case TFMA_WRITE:        /* compress */
25862 +               coa = reiser4_vmalloc(zlib_deflate_workspacesize());
25863 +               if (!coa) {
25864 +                       ret = -ENOMEM;
25865 +                       break;
25866 +               }
25867 +               break;
25868 +       case TFMA_READ: /* decompress */
25869 +               coa = reiser4_vmalloc(zlib_inflate_workspacesize());
25870 +               if (!coa) {
25871 +                       ret = -ENOMEM;
25872 +                       break;
25873 +               }
25874 +               break;
25875 +       default:
25876 +               impossible("edward-767",
25877 +                          "trying to alloc workspace for unknown tfm action");
25878 +       }
25879 +       if (ret) {
25880 +               warning("edward-768",
25881 +                       "alloc workspace for gzip1 (tfm action = %d) failed\n",
25882 +                       act);
25883 +               return ERR_PTR(ret);
25884 +       }
25885 +       return coa;
25886 +}
25887 +
25888 +static void gzip1_free(coa_t coa, tfm_action act)
25889 +{
25890 +       assert("edward-769", coa != NULL);
25891 +
25892 +       switch (act) {
25893 +       case TFMA_WRITE:        /* compress */
25894 +               vfree(coa);
25895 +               break;
25896 +       case TFMA_READ:         /* decompress */
25897 +               vfree(coa);
25898 +               break;
25899 +       default:
25900 +               impossible("edward-770", "unknown tfm action");
25901 +       }
25902 +       return;
25903 +}
25904 +
25905 +static int gzip1_min_size_deflate(void)
25906 +{
25907 +       return 64;
25908 +}
25909 +
25910 +static void
25911 +gzip1_compress(coa_t coa, __u8 * src_first, size_t src_len,
25912 +              __u8 * dst_first, size_t *dst_len)
25913 +{
25914 +       int ret = 0;
25915 +       struct z_stream_s stream;
25916 +
25917 +       assert("edward-842", coa != NULL);
25918 +       assert("edward-875", src_len != 0);
25919 +       /* raw deflate (negative window bits); *dst_len bounds the output */
25920 +       stream.workspace = coa;
25921 +       ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25922 +                               -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25923 +                               Z_DEFAULT_STRATEGY);
25924 +       if (ret != Z_OK) {
25925 +               warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25926 +               goto rollback;
25927 +       }
25928 +       ret = zlib_deflateReset(&stream);
25929 +       if (ret != Z_OK) {
25930 +               warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25931 +               goto rollback;
25932 +       }
25933 +       stream.next_in = src_first;
25934 +       stream.avail_in = src_len;
25935 +       stream.next_out = dst_first;
25936 +       stream.avail_out = *dst_len;
25937 +       /* one shot: only Z_STREAM_END counts as success */
25938 +       ret = zlib_deflate(&stream, Z_FINISH);
25939 +       if (ret != Z_STREAM_END) {
25940 +               if (ret != Z_OK)        /* Z_OK is not worth a warning */
25941 +                       warning("edward-773",
25942 +                               "zlib_deflate returned %d\n", ret);
25943 +               goto rollback;
25944 +       }
25945 +       *dst_len = stream.total_out;    /* success: the deflated size */
25946 +       return;
25947 +      rollback:
25948 +       *dst_len = src_len;     /* failure: report the input size instead */
25949 +       return;
25950 +}
25951 +
25952 +static void
25953 +gzip1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
25954 +                __u8 * dst_first, size_t *dst_len)
25955 +{
25956 +       int ret = 0;
25957 +       struct z_stream_s stream;
25958 +
25959 +       assert("edward-843", coa != NULL);
25960 +       assert("edward-876", src_len != 0);
25961 +
25962 +       stream.workspace = coa;
25963 +       ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25964 +       if (ret != Z_OK) {
25965 +               warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25966 +               return;
25967 +       }
25968 +       ret = zlib_inflateReset(&stream);
25969 +       if (ret != Z_OK) {
25970 +               warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25971 +               return;
25972 +       }
25973 +
25974 +       stream.next_in = src_first;
25975 +       stream.avail_in = src_len;
25976 +       stream.next_out = dst_first;
25977 +       stream.avail_out = *dst_len;
25978 +
25979 +       ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25980 +       /*
25981 +        * Work around a bug in zlib, which sometimes wants to taste an extra
25982 +        * byte when being used in the (undocumented) raw deflate mode.
25983 +        * (From USAGI).
25984 +        */
25985 +       if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25986 +               u8 zerostuff = 0;
25987 +               stream.next_in = &zerostuff;
25988 +               stream.avail_in = 1;
25989 +               ret = zlib_inflate(&stream, Z_FINISH);
25990 +       }
25991 +       if (ret != Z_STREAM_END) {
25992 +               warning("edward-776", "zlib_inflate returned %d\n", ret);
25993 +               return;
25994 +       }
25995 +       *dst_len = stream.total_out;
25996 +       return;
25997 +}
25998 +
25999 +/******************************************************************************/
26000 +/*                            lzo1 compression                                */
26001 +/******************************************************************************/
26002 +
26003 +static int lzo1_init(void)
26004 +{
26005 +       return 0;
26006 +}
26007 +
26008 +static int lzo1_overrun(unsigned in_len)
26009 +{
26010 +       return in_len / 64 + 16 + 3;
26011 +}
26012 +
26013 +static coa_t lzo1_alloc(tfm_action act)
26014 +{
26015 +       int ret = 0;
26016 +       coa_t coa = NULL;
26017 +
26018 +       switch (act) {
26019 +       case TFMA_WRITE:        /* compress: LZO1X_1_MEM_COMPRESS bytes */
26020 +               coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
26021 +               if (!coa) {
26022 +                       ret = -ENOMEM;
26023 +                       break;
26024 +               }               /* allocated: fall through to the break */
26025 +       case TFMA_READ:         /* decompress: no workspace, NULL returned */
26026 +               break;
26027 +       default:
26028 +               impossible("edward-877",
26029 +                          "trying to alloc workspace for unknown tfm action");
26030 +       }
26031 +       if (ret) {
26032 +               warning("edward-878",
26033 +                       "alloc workspace for lzo1 (tfm action = %d) failed\n",
26034 +                       act);
26035 +               return ERR_PTR(ret);
26036 +       }
26037 +       return coa;
26038 +}
26039 +
26040 +static void lzo1_free(coa_t coa, tfm_action act)
26041 +{
26042 +       assert("edward-879", coa != NULL);
26043 +       /* lzo1_alloc() hands out a workspace for TFMA_WRITE only */
26044 +       switch (act) {
26045 +       case TFMA_WRITE:        /* compress */
26046 +               vfree(coa);
26047 +               break;
26048 +       case TFMA_READ:         /* decompress */
26049 +               impossible("edward-1304",
26050 +                          "trying to free non-allocated workspace");
26051 +       default:                /* also entered from TFMA_READ: no break */
26052 +               impossible("edward-880", "unknown tfm action");
26053 +       }
26054 +       return;
26055 +}
26056 +
26057 +static int lzo1_min_size_deflate(void)
26058 +{
26059 +       return 256;
26060 +}
26061 +
26062 +static void
26063 +lzo1_compress(coa_t coa, __u8 * src_first, size_t src_len,
26064 +             __u8 * dst_first, size_t *dst_len)
26065 +{
26066 +       int result;
26067 +
26068 +       assert("edward-846", coa != NULL);
26069 +       assert("edward-847", src_len != 0);
26070 +
26071 +       result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26072 +       if (unlikely(result != LZO_E_OK)) {
26073 +               warning("edward-849", "lzo1x_1_compress failed\n");
26074 +               goto out;
26075 +       }
26076 +       if (*dst_len >= src_len) {
26077 +               //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26078 +               goto out;
26079 +       }
26080 +       return;
26081 +      out:
26082 +       *dst_len = src_len;
26083 +       return;
26084 +}
26085 +
26086 +/* Decompress @src_len bytes at @src_first into @dst_first; @dst_len is
26087 +   handed to lzo1x_decompress_safe() as is. No workspace is used, so @coa
26088 +   must be NULL. A failure is only reported via warning(). */
26089 +static void lzo1_decompress(coa_t coa, __u8 * src_first, size_t src_len,
26090 +                           __u8 * dst_first, size_t *dst_len)
26091 +{
26092 +       int result;
26093 +
26094 +       assert("edward-851", coa == NULL);
26095 +       assert("edward-852", src_len != 0);
26096 +       result = lzo1x_decompress_safe(src_first, src_len, dst_first, dst_len);
26097 +       if (result != LZO_E_OK)
26098 +               warning("edward-853", "lzo1x_decompress_safe failed\n");
26099 +}
26100 +
26101 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26102 +       [LZO1_COMPRESSION_ID] = {
26103 +               .h = {
26104 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26105 +                       .id = LZO1_COMPRESSION_ID,
26106 +                       .pops = &compression_plugin_ops,
26107 +                       .label = "lzo1",
26108 +                       .desc = "lzo1 compression transform",
26109 +                       .linkage = {NULL, NULL}
26110 +               },
26111 +               .init = lzo1_init,
26112 +               .overrun = lzo1_overrun,
26113 +               .alloc = lzo1_alloc,
26114 +               .free = lzo1_free,
26115 +               .min_size_deflate = lzo1_min_size_deflate,
26116 +               .checksum = reiser4_adler32,
26117 +               .compress = lzo1_compress,
26118 +               .decompress = lzo1_decompress
26119 +       },
26120 +       [GZIP1_COMPRESSION_ID] = {
26121 +               .h = {
26122 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26123 +                       .id = GZIP1_COMPRESSION_ID,
26124 +                       .pops = &compression_plugin_ops,
26125 +                       .label = "gzip1",
26126 +                       .desc = "gzip1 compression transform",
26127 +                       .linkage = {NULL, NULL}
26128 +               },
26129 +               .init = gzip1_init,
26130 +               .overrun = gzip1_overrun,
26131 +               .alloc = gzip1_alloc,
26132 +               .free = gzip1_free,
26133 +               .min_size_deflate = gzip1_min_size_deflate,
26134 +               .checksum = reiser4_adler32,
26135 +               .compress = gzip1_compress,
26136 +               .decompress = gzip1_decompress
26137 +       }
26138 +};
26139 +
26140 +/*
26141 +  Local variables:
26142 +  c-indentation-style: "K&R"
26143 +  mode-name: "LC"
26144 +  c-basic-offset: 8
26145 +  tab-width: 8
26146 +  fill-column: 120
26147 +  scroll-step: 1
26148 +  End:
26149 +*/
26150 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.35/fs/reiser4/plugin/compress/compress.h
26151 --- linux-2.6.35.orig/fs/reiser4/plugin/compress/compress.h     1970-01-01 01:00:00.000000000 +0100
26152 +++ linux-2.6.35/fs/reiser4/plugin/compress/compress.h  2010-08-04 15:44:57.000000000 +0200
26153 @@ -0,0 +1,43 @@
26154 +#if !defined( __FS_REISER4_COMPRESS_H__ )
26155 +#define __FS_REISER4_COMPRESS_H__
26156 +
26157 +#include <linux/types.h>
26158 +#include <linux/string.h>
26159 +
26160 +/* transform direction */
26161 +typedef enum {
26162 +       TFMA_READ,   /* decrypt, decompress */
26163 +       TFMA_WRITE,  /* encrypt, compress */
26164 +       TFMA_LAST
26165 +} tfm_action;
26166 +
26167 +/* supported compression algorithms */
26168 +typedef enum {
26169 +       LZO1_COMPRESSION_ID,
26170 +       GZIP1_COMPRESSION_ID,
26171 +       LAST_COMPRESSION_ID,
26172 +} reiser4_compression_id;
26173 +
26174 +/* the same as pgoff, but units are page clusters */
26175 +typedef unsigned long cloff_t;
26176 +
26177 +/* working data of a (de)compression algorithm */
26178 +typedef void *coa_t;
26179 +
26180 +/* table for all supported (de)compression algorithms */
26181 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
26182 +
26183 +__u32 reiser4_adler32(char *data, __u32 len);
26184 +
26185 +#endif                         /* __FS_REISER4_COMPRESS_H__ */
26186 +
26187 +/* Make Linus happy.
26188 +   Local variables:
26189 +   c-indentation-style: "K&R"
26190 +   mode-name: "LC"
26191 +   c-basic-offset: 8
26192 +   tab-width: 8
26193 +   fill-column: 120
26194 +   scroll-step: 1
26195 +   End:
26196 +*/
26197 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.35/fs/reiser4/plugin/compress/compress_mode.c
26198 --- linux-2.6.35.orig/fs/reiser4/plugin/compress/compress_mode.c        1970-01-01 01:00:00.000000000 +0100
26199 +++ linux-2.6.35/fs/reiser4/plugin/compress/compress_mode.c     2010-08-04 15:44:57.000000000 +0200
26200 @@ -0,0 +1,162 @@
26201 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26202 +/* This file contains Reiser4 compression mode plugins.
26203 +
26204 +   A compression mode plugin is a set of handlers called by the compressor
26205 +   at flush time. They implement heuristics, including the ones that
26206 +   avoid compressing incompressible data; see
26207 +   http://www.namesys.com/cryptcompress_design.html for more details.
26208 +*/
26209 +#include "../../inode.h"
26210 +#include "../plugin.h"
26211 +
26212 +static int should_deflate_none(struct inode * inode, cloff_t index)
26213 +{
26214 +       return 0;
26215 +}
26216 +
26217 +static int should_deflate_common(struct inode * inode, cloff_t index)
26218 +{
26219 +       return compression_is_on(cryptcompress_inode_data(inode));
26220 +}
26221 +
26222 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26223 +{
26224 +       turn_off_compression(cryptcompress_inode_data(inode));
26225 +       return 0;
26226 +}
26227 +
26228 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26229 +{
26230 +       struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26231 +
26232 +       assert("edward-1462",
26233 +              get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26234 +              get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26235 +
26236 +       turn_off_compression(info);
26237 +       if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26238 +               set_lattice_factor(info, get_lattice_factor(info) << 1);
26239 +       return 0;
26240 +}
26241 +
26242 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26243 +{
26244 +       turn_on_compression(cryptcompress_inode_data(inode));
26245 +       set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26246 +       return 0;
26247 +}
26248 +
26249 +/* Check on dynamic lattice: the adaptive compression mode which
26250 +   defines the following behavior:
26251 +
26252 +   Compression is on: try to compress everything and turn
26253 +   it off whenever a cluster is incompressible.
26254 +
26255 +   Compression is off: try to compress clusters of indexes
26256 +   k * FACTOR (k = 0, 1, 2, ...) and turn it on if one of
26257 +   them is compressible. If incompressible, then increase FACTOR */
26258 +
26259 +/* check if @index belongs to the one-dimensional lattice
26260 +   of sparse factor @factor */
26261 +static int is_on_lattice(cloff_t index, int factor)
26262 +{
26263 +       return (factor ? index % factor == 0: index == 0);
26264 +}
26265 +
26266 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26267 +{
26268 +       return should_deflate_common(inode, index) ||
26269 +               is_on_lattice(index,
26270 +                             get_lattice_factor
26271 +                             (cryptcompress_inode_data(inode)));
26272 +}
26273 +
26274 +/* compression mode_plugins */
26275 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26276 +       [NONE_COMPRESSION_MODE_ID] = {
26277 +               .h = {
26278 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26279 +                       .id = NONE_COMPRESSION_MODE_ID,
26280 +                       .pops = NULL,
26281 +                       .label = "none",
26282 +                       .desc = "Compress nothing",
26283 +                       .linkage = {NULL, NULL}
26284 +               },
26285 +               .should_deflate = should_deflate_none,
26286 +               .accept_hook = NULL,
26287 +               .discard_hook = NULL
26288 +       },
26289 +       /* Check-on-dynamic-lattice adaptive compression mode */
26290 +       [LATTD_COMPRESSION_MODE_ID] = {
26291 +               .h = {
26292 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26293 +                       .id = LATTD_COMPRESSION_MODE_ID,
26294 +                       .pops = NULL,
26295 +                       .label = "lattd",
26296 +                       .desc = "Check on dynamic lattice",
26297 +                       .linkage = {NULL, NULL}
26298 +               },
26299 +               .should_deflate = should_deflate_lattd,
26300 +               .accept_hook = accept_hook_lattd,
26301 +               .discard_hook = discard_hook_lattd
26302 +       },
26303 +       /* Check-ultimately compression mode:
26304 +          Turn off compression forever as soon as we meet
26305 +          incompressible data */
26306 +       [ULTIM_COMPRESSION_MODE_ID] = {
26307 +               .h = {
26308 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26309 +                       .id = ULTIM_COMPRESSION_MODE_ID,
26310 +                       .pops = NULL,
26311 +                       .label = "ultim",
26312 +                       .desc = "Check ultimately",
26313 +                       .linkage = {NULL, NULL}
26314 +               },
26315 +               .should_deflate = should_deflate_common,
26316 +               .accept_hook = NULL,
26317 +               .discard_hook = discard_hook_ultim
26318 +       },
26319 +       /* Force-to-compress-everything compression mode */
26320 +       [FORCE_COMPRESSION_MODE_ID] = {
26321 +               .h = {
26322 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26323 +                       .id = FORCE_COMPRESSION_MODE_ID,
26324 +                       .pops = NULL,
26325 +                       .label = "force",
26326 +                       .desc = "Force to compress everything",
26327 +                       .linkage = {NULL, NULL}
26328 +               },
26329 +               .should_deflate = NULL,
26330 +               .accept_hook = NULL,
26331 +               .discard_hook = NULL
26332 +       },
26333 +       /* Convert-to-extent compression mode.
26334 +          In this mode items will be converted to extents and management
26335 +          will be passed to (classic) unix file plugin as soon as ->write()
26336 +          detects that the first complete logical cluster (of index #0) is
26337 +          incompressible. */
26338 +       [CONVX_COMPRESSION_MODE_ID] = {
26339 +               .h = {
26340 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26341 +                       .id = CONVX_COMPRESSION_MODE_ID,
26342 +                       .pops = NULL,
26343 +                       .label = "conv",
26344 +                       .desc = "Convert to extent",
26345 +                       .linkage = {NULL, NULL}
26346 +               },
26347 +               .should_deflate = should_deflate_common,
26348 +               .accept_hook = NULL,
26349 +               .discard_hook = NULL
26350 +       }
26351 +};
26352 +
26353 +/*
26354 +  Local variables:
26355 +  c-indentation-style: "K&R"
26356 +  mode-name: "LC"
26357 +  c-basic-offset: 8
26358 +  tab-width: 8
26359 +  fill-column: 120
26360 +  scroll-step: 1
26361 +  End:
26362 +*/
26363 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.35/fs/reiser4/plugin/compress/Makefile
26364 --- linux-2.6.35.orig/fs/reiser4/plugin/compress/Makefile       1970-01-01 01:00:00.000000000 +0100
26365 +++ linux-2.6.35/fs/reiser4/plugin/compress/Makefile    2010-08-04 15:44:57.000000000 +0200
26366 @@ -0,0 +1,5 @@
26367 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26368 +
26369 +compress_plugins-objs :=       \
26370 +       compress.o              \
26371 +       compress_mode.o
26372 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.35/fs/reiser4/plugin/crypto/cipher.c
26373 --- linux-2.6.35.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 01:00:00.000000000 +0100
26374 +++ linux-2.6.35/fs/reiser4/plugin/crypto/cipher.c      2010-08-04 15:44:57.000000000 +0200
26375 @@ -0,0 +1,37 @@
26376 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
26377 +   licensing governed by reiser4/README */
26378 +/* Reiser4 cipher transform plugins */
26379 +
26380 +#include "../../debug.h"
26381 +#include "../plugin.h"
26382 +
26383 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26384 +       [NONE_CIPHER_ID] = {
26385 +               .h = {
26386 +                       .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26387 +                       .id = NONE_CIPHER_ID,
26388 +                       .pops = NULL,
26389 +                       .label = "none",
26390 +                       .desc = "no cipher transform",
26391 +                       .linkage = {NULL, NULL}
26392 +               },
26393 +               .alloc = NULL,
26394 +               .free = NULL,
26395 +               .scale = NULL,
26396 +               .align_stream = NULL,
26397 +               .setkey = NULL,
26398 +               .encrypt = NULL,
26399 +               .decrypt = NULL
26400 +       }
26401 +};
26402 +
26403 +/* Make Linus happy.
26404 +   Local variables:
26405 +   c-indentation-style: "K&R"
26406 +   mode-name: "LC"
26407 +   c-basic-offset: 8
26408 +   tab-width: 8
26409 +   fill-column: 120
26410 +   scroll-step: 1
26411 +   End:
26412 +*/
26413 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.35/fs/reiser4/plugin/crypto/cipher.h
26414 --- linux-2.6.35.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 01:00:00.000000000 +0100
26415 +++ linux-2.6.35/fs/reiser4/plugin/crypto/cipher.h      2010-08-04 15:44:57.000000000 +0200
26416 @@ -0,0 +1,55 @@
26417 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26418 +/* This file contains definitions for the objects operated
26419 +   by reiser4 key manager, which is something like keyring
26420 +   wrapped by appropriate reiser4 plugin */
26421 +
26422 +#if !defined( __FS_REISER4_CRYPT_H__ )
26423 +#define __FS_REISER4_CRYPT_H__
26424 +
26425 +#include <linux/crypto.h>
26426 +
26427 +/* key info imported from user space */
26428 +struct reiser4_crypto_data {
26429 +       int keysize;    /* uninstantiated key size */
26430 +       __u8 * key;     /* uninstantiated key */
26431 +       int keyid_size; /* size of passphrase */
26432 +       __u8 * keyid;   /* passphrase */
26433 +};
26434 +
26435 +/* This object contains all the infrastructure needed to implement
26436 +   a cipher transform. It is managed (allocated, inherited,
26437 +   validated, bound to the host inode, etc.) by the reiser4 key manager.
26438 +
26439 +   This info can be allocated in two cases:
26440 +   1. importing a key from user space.
26441 +   2. reading inode from disk */
26442 +struct reiser4_crypto_info {
26443 +       struct inode * host;
26444 +       struct crypto_hash      * digest;
26445 +       struct crypto_blkcipher * cipher;
26446 +#if 0
26447 +       cipher_key_plugin * kplug; /* key manager */
26448 +#endif
26449 +       __u8 * keyid;              /* key fingerprint, created by digest plugin,
26450 +                                     using uninstantiated key and passphrase.
26451 +                                     supposed to be stored in disk stat-data */
26452 +       int inst;                  /* this indicates if the cipher key is
26453 +                                     instantiated (case 1 above) */
26454 +       int keysize;               /* uninstantiated key size (bytes), supposed
26455 +                                     to be stored in disk stat-data */
26456 +       int keyload_count;         /* number of objects which have this
26457 +                                     crypto-stat attached */
26458 +};
26459 +
26460 +#endif /* __FS_REISER4_CRYPT_H__ */
26461 +
26462 +/*
26463 +   Local variables:
26464 +   c-indentation-style: "K&R"
26465 +   mode-name: "LC"
26466 +   c-basic-offset: 8
26467 +   tab-width: 8
26468 +   fill-column: 120
26469 +   scroll-step: 1
26470 +   End:
26471 +*/
26472 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.35/fs/reiser4/plugin/crypto/digest.c
26473 --- linux-2.6.35.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 01:00:00.000000000 +0100
26474 +++ linux-2.6.35/fs/reiser4/plugin/crypto/digest.c      2010-08-04 15:44:57.000000000 +0200
26475 @@ -0,0 +1,58 @@
26476 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26477 +
26478 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
26479 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26480 +#include "../../debug.h"
26481 +#include "../plugin_header.h"
26482 +#include "../plugin.h"
26483 +#include "../file/cryptcompress.h"
26484 +
26485 +#include <linux/types.h>
26486 +
26487 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26488 +
26489 +static struct crypto_hash * alloc_sha256 (void)
26490 +{
26491 +#if REISER4_SHA256
26492 +       return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26493 +#else
26494 +       warning("edward-1418", "sha256 unsupported");
26495 +       return ERR_PTR(-EINVAL);
26496 +#endif
26497 +}
26498 +
26499 +static void free_sha256 (struct crypto_hash * tfm)
26500 +{
26501 +#if REISER4_SHA256
26502 +       crypto_free_hash(tfm);
26503 +#endif
26504 +       return;
26505 +}
26506 +
26507 +/* digest plugins: table of digest transforms, indexed by digest plugin id */
26508 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26509 +       [SHA256_32_DIGEST_ID] = {
26510 +               .h = {
26511 +                       .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26512 +                       .id = SHA256_32_DIGEST_ID,
26513 +                       .pops = NULL,
26514 +                       .label = "sha256_32",
26515 +                       .desc = "sha256_32 digest transform",
26516 +                       .linkage = {NULL, NULL}
26517 +               },
26518 +               .fipsize = sizeof(__u32),
26519 +               .alloc = alloc_sha256,  /* allocate the sha256 transform */
26520 +               .free = free_sha256     /* release the sha256 transform */
26521 +       }
26522 +};
26523 +
26524 +/*
26525 +  Local variables:
26526 +  c-indentation-style: "K&R"
26527 +  mode-name: "LC"
26528 +  c-basic-offset: 8
26529 +  tab-width: 8
26530 +  fill-column: 120
26531 +  scroll-step: 1
26532 +  End:
26533 +*/
26534 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.35/fs/reiser4/plugin/dir/dir.h
26535 --- linux-2.6.35.orig/fs/reiser4/plugin/dir/dir.h       1970-01-01 01:00:00.000000000 +0100
26536 +++ linux-2.6.35/fs/reiser4/plugin/dir/dir.h    2010-08-04 15:44:57.000000000 +0200
26537 @@ -0,0 +1,36 @@
26538 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26539 + * reiser4/README */
26540 +
26541 +/* this file contains declarations of methods implementing directory plugins */
26542 +
26543 +#if !defined( __REISER4_DIR_H__ )
26544 +#define __REISER4_DIR_H__
26545 +
26546 +/*#include "../../key.h"
26547 +
26548 +#include <linux/fs.h>*/
26549 +
26550 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26551 +
26552 +/* "hashed" build_entry_key: arguments are directory, entry name, result key */
26553 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
26554 +                           reiser4_key *);
26555 +
26556 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26557 +
26558 +/* "seekable" build_entry_key: arguments are directory, entry name, result key */
26559 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
26560 +                             reiser4_key *);
26561 +
26562 +/* __REISER4_DIR_H__ */
26563 +#endif
26564 +
26565 +/*
26566 +   Local variables:
26567 +   c-indentation-style: "K&R"
26568 +   mode-name: "LC"
26569 +   c-basic-offset: 8
26570 +   tab-width: 8
26571 +   fill-column: 120
26572 +   End:
26573 +*/
26574 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.35/fs/reiser4/plugin/dir/hashed_dir.c
26575 --- linux-2.6.35.orig/fs/reiser4/plugin/dir/hashed_dir.c        1970-01-01 01:00:00.000000000 +0100
26576 +++ linux-2.6.35/fs/reiser4/plugin/dir/hashed_dir.c     2010-08-04 15:44:57.000000000 +0200
26577 @@ -0,0 +1,81 @@
26578 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26579 + * reiser4/README */
26580 +
26581 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26582 +   names to the files. */
26583 +
26584 +/*
26585 + * Hashed directory logically consists of persistent directory
26586 + * entries. Directory entry is a pair of a file name and a key of stat-data of
26587 + * a file that has this name in the given directory.
26588 + *
26589 + * Directory entries are stored in the tree in the form of directory
26590 + * items. Directory item should implement dir_entry_ops portion of item plugin
26591 + * interface (see plugin/item/item.h). Hashed directory interacts with
26592 + * directory item plugin exclusively through dir_entry_ops operations.
26593 + *
26594 + * Currently there are two implementations of directory items: "simple
26595 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26596 + * (plugin/item/cde.[ch]) with the latter being the default.
26597 + *
26598 + * There is, however, one delicate way in which directory code interferes
26599 + * with item plugin: key assignment policy. A key for a directory item is
26600 + * chosen by directory code, and as described in kassign.c, this key contains
26601 + * a portion of file name. Directory item uses this knowledge to avoid storing
26602 + * this portion of file name twice: in the key and in the directory item body.
26603 + *
26604 + */
26605 +
26606 +#include "../../inode.h"
26607 +
26608 +void complete_entry_key(const struct inode *, const char *name,
26609 +                       int len, reiser4_key * result);
26610 +
26611 +/* build_entry_key method of dir plugin for HASHED_DIR_PLUGIN_ID: builds in
26612 +   @result the key of the directory entry named @qname in directory @dir
26613 + */
26614 +void build_entry_key_hashed(const struct inode *dir,   /* directory where entry is
26615 +                                                        * (or will be) in.*/
26616 +                           const struct qstr *qname,   /* name of file referenced
26617 +                                                        * by this entry */
26618 +                           reiser4_key * result        /* resulting key of directory
26619 +                                                        * entry */ )
26620 +{
26621 +       const char *name;
26622 +       int len;
26623 +
26624 +       assert("nikita-1139", dir != NULL);
26625 +       assert("nikita-1140", qname != NULL);
26626 +       assert("nikita-1141", qname->name != NULL);
26627 +       assert("nikita-1142", result != NULL);
26628 +
26629 +       name = qname->name;
26630 +       len = qname->len;
26631 +
26632 +       assert("nikita-2867", strlen(name) == len);
26633 +
26634 +       reiser4_key_init(result);
26635 +       /* locality of directory entry's key is objectid of parent
26636 +          directory */
26637 +       set_key_locality(result, get_inode_oid(dir));
26638 +       /* minor packing locality is constant */
26639 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26640 +       /* dot is special case---we want it to be the smallest (first)
26641 +          entry of a directory, so the name-dependent part of its key
26642 +          is left as reiser4_key_init() set it.
26643 +        */
26644 +       if (len == 1 && name[0] == '.')
26645 +               return;
26646 +
26647 +       /* initialize part of entry key which depends on file name */
26648 +       complete_entry_key(dir, name, len, result);
26649 +}
26650 +
26651 +/* Local variables:
26652 +   c-indentation-style: "K&R"
26653 +   mode-name: "LC"
26654 +   c-basic-offset: 8
26655 +   tab-width: 8
26656 +   fill-column: 120
26657 +   End:
26658 +*/
26659 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.35/fs/reiser4/plugin/dir/Makefile
26660 --- linux-2.6.35.orig/fs/reiser4/plugin/dir/Makefile    1970-01-01 01:00:00.000000000 +0100
26661 +++ linux-2.6.35/fs/reiser4/plugin/dir/Makefile 2010-08-04 15:44:57.000000000 +0200
26662 @@ -0,0 +1,5 @@
26663 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26664 +
26665 +dir_plugins-objs :=    \
26666 +       hashed_dir.o    \
26667 +       seekable_dir.o
26668 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.35/fs/reiser4/plugin/dir/seekable_dir.c
26669 --- linux-2.6.35.orig/fs/reiser4/plugin/dir/seekable_dir.c      1970-01-01 01:00:00.000000000 +0100
26670 +++ linux-2.6.35/fs/reiser4/plugin/dir/seekable_dir.c   2010-08-04 15:44:57.000000000 +0200
26671 @@ -0,0 +1,46 @@
26672 +/* Copyright 2005 by Hans Reiser, licensing governed by
26673 + * reiser4/README */
26674 +
26675 +#include "../../inode.h"
26676 +
26677 +/* build_entry_key method of dir plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID:
26678 +   builds in @result the key of the entry named @name in directory @dir.
26679 +   This is for directories where we want repeatable and restartable readdir()
26680 +   even with a 32bit user level struct dirent (readdir(3)).
26681 +*/
26682 +void
26683 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26684 +                        reiser4_key * result)
26685 +{
26686 +       oid_t objectid;
26687 +
26688 +       assert("nikita-2283", dir != NULL);
26689 +       assert("nikita-2284", name != NULL);
26690 +       assert("nikita-2285", name->name != NULL);
26691 +       assert("nikita-2286", result != NULL);
26692 +
26693 +       reiser4_key_init(result);
26694 +       /* locality of directory entry's key is objectid of parent
26695 +          directory */
26696 +       set_key_locality(result, get_inode_oid(dir));
26697 +       /* minor packing locality is constant */
26698 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26699 +       /* dot is special case---we want it to be the smallest (first)
26700 +          entry of a directory, so the name-dependent part of its key
26701 +          is left as reiser4_key_init() set it.
26702 +        */
26703 +       if ((name->len == 1) && (name->name[0] == '.'))
26704 +               return;
26705 +
26706 +       /* objectid of key is 31 lowest bits of hash. */
26707 +       objectid =
26708 +           inode_hash_plugin(dir)->hash(name->name,
26709 +                                        (int)name->len) & 0x7fffffff;
26710 +
26711 +       assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26712 +       set_key_objectid(result, objectid);
26713 +
26714 +       /* offset is always 0. */
26715 +       set_key_offset(result, (__u64) 0);
26716 +       return;
26717 +}
26718 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.35/fs/reiser4/plugin/dir_plugin_common.c
26719 --- linux-2.6.35.orig/fs/reiser4/plugin/dir_plugin_common.c     1970-01-01 01:00:00.000000000 +0100
26720 +++ linux-2.6.35/fs/reiser4/plugin/dir_plugin_common.c  2010-08-04 15:44:57.000000000 +0200
26721 @@ -0,0 +1,865 @@
26722 +/* Copyright 2005 by Hans Reiser, licensing governed by
26723 +   reiser4/README */
26724 +
26725 +/* this file contains typical implementations for most of methods of
26726 +   directory plugin
26727 +*/
26728 +
26729 +#include "../inode.h"
26730 +
26731 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
26732 +              lock_handle * , znode_lock_mode, reiser4_dir_entry_desc *);
26733 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry,
26734 +                       reiser4_key * key);
26735 +void check_light_weight(struct inode *inode, struct inode *parent);
26736 +
26737 +/* common implementation of get_parent method of dir plugin: looks up ".."
26738 +   in @child and returns a dentry for it, or ERR_PTR() on failure. This is
26739 +   used by NFS kernel server to "climb" up directory tree to check permissions
26740 + */
26741 +struct dentry *get_parent_common(struct inode *child)
26742 +{
26743 +       struct super_block *s;
26744 +       struct inode *parent;
26745 +       struct dentry dotdot;   /* on-stack dentry, only carries the name */
26746 +       struct dentry *dentry;
26747 +       reiser4_key key;
26748 +       int result;
26749 +
26750 +       /*
26751 +        * lookup dotdot entry to get the key of the parent.
26752 +        */
26753 +
26754 +       s = child->i_sb;
26755 +       memset(&dotdot, 0, sizeof(dotdot));
26756 +       dotdot.d_name.name = "..";
26757 +       dotdot.d_name.len = 2;
26758 +       dotdot.d_op = &get_super_private(s)->ops.dentry;
26759 +
26760 +       result = reiser4_lookup_name(child, &dotdot, &key);
26761 +       if (result != 0)
26762 +               return ERR_PTR(result);
26763 +
26764 +       parent = reiser4_iget(s, &key, 1);
26765 +       if (!IS_ERR(parent)) {
26766 +               /*
26767 +                * FIXME-NIKITA dubious: attributes are inherited from @child
26768 +                * to @parent. But:
26769 +                *
26770 +                *     (*) this is the only thing we can do
26771 +                *
26772 +                *     (*) attributes of light-weight object are inherited
26773 +                *     from a parent through which object was looked up first,
26774 +                *     so it is ambiguous anyway.
26775 +                *
26776 +                */
26777 +               check_light_weight(parent, child);
26778 +               reiser4_iget_complete(parent);
26779 +               dentry = d_obtain_alias(parent);
26780 +               if (!IS_ERR(dentry))
26781 +                       dentry->d_op = &get_super_private(s)->ops.dentry;
26782 +       } else if (PTR_ERR(parent) == -ENOENT)
26783 +               dentry = ERR_PTR(RETERR(-ESTALE));
26784 +       else
26785 +               dentry = (void *)parent;        /* pass error pointer on */
26786 +       return dentry;
26787 +}
26788 +
26789 +/* common implementation of is_name_acceptable method of dir plugin: name
26790 +   is acceptable iff @len does not exceed reiser4_max_filename_len(@inode)
26791 + */
26792 +int is_name_acceptable_common(const struct inode *inode, /* directory to check*/
26793 +                             const char *name UNUSED_ARG, /* name to check */
26794 +                             int len/* @name's length */)
26795 +{
26796 +       assert("nikita-733", inode != NULL);
26797 +       assert("nikita-734", name != NULL);
26798 +       assert("nikita-735", len > 0);
26799 +
26800 +       return len <= reiser4_max_filename_len(inode);
26801 +}
26802 +
26803 +/* there is no common implementation of build_entry_key method of dir
26804 +   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26805 +   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
26806 +*/
26807 +
26808 +/* common implementation of build_readdir_key method of dir plugin: builds
26809 +   @result from the readdir position kept in reiser4_file_fsdata of @dir.
26810 +   see reiser4_readdir_common for more details
26811 +*/
26812 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
26813 +                            reiser4_key * result/* where to store key */)
26814 +{
26815 +       reiser4_file_fsdata *fdata;
26816 +       struct inode *inode;
26817 +
26818 +       assert("nikita-1361", dir != NULL);
26819 +       assert("nikita-1362", result != NULL);
26820 +       assert("nikita-1363", dir->f_dentry != NULL);
26821 +       inode = dir->f_dentry->d_inode;
26822 +       assert("nikita-1373", inode != NULL);
26823 +
26824 +       fdata = reiser4_get_file_fsdata(dir);
26825 +       if (IS_ERR(fdata))
26826 +               return PTR_ERR(fdata);
26827 +       assert("nikita-1364", fdata != NULL);
26828 +       return extract_key_from_de_id(get_inode_oid(inode),
26829 +                                     &fdata->dir.readdir.position.
26830 +                                     dir_entry_key, result);
26831 +
26832 +}
26833 +
26834 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26835 +                            int adj);
26836 +
26837 +/* common implementation of add_entry method of dir plugin. Returns 0 on
26838 +   success, -EEXIST if the name is already there, or another error code */
26839 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
26840 +                                                   * in */
26841 +                            struct dentry *where,      /* new name */
26842 +                            reiser4_object_create_data * data, /* parameters of
26843 +                                                               *  new object (unused) */
26844 +                            reiser4_dir_entry_desc * entry /* parameters of
26845 +                                                            * new directory
26846 +                                                            * entry */)
26847 +{
26848 +       int result;
26849 +       coord_t *coord;
26850 +       lock_handle lh;
26851 +       struct reiser4_dentry_fsdata *fsdata;
26852 +       reiser4_block_nr reserve;
26853 +
26854 +       assert("nikita-1114", object != NULL);
26855 +       assert("nikita-1250", where != NULL);
26856 +
26857 +       fsdata = reiser4_get_dentry_fsdata(where);
26858 +       if (unlikely(IS_ERR(fsdata)))
26859 +               return PTR_ERR(fsdata);
26860 +
26861 +       reserve = inode_dir_plugin(object)->estimate.add_entry(object);
26862 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26863 +               return RETERR(-ENOSPC);
26864 +
26865 +       init_lh(&lh);
26866 +       coord = &fsdata->dec.entry_coord;
26867 +       coord_clear_iplug(coord);
26868 +
26869 +       /* check for this entry in a directory. This is plugin method. */
26870 +       result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
26871 +                                   entry);
26872 +       if (likely(result == -ENOENT)) {
26873 +               /* name is not there yet: add new entry. Just pass control
26874 +                  to the directory item plugin. */
26875 +               assert("nikita-1709", inode_dir_item_plugin(object));
26876 +               assert("nikita-2230", coord->node == lh.node);
26877 +               reiser4_seal_done(&fsdata->dec.entry_seal);
26878 +               result =
26879 +                   inode_dir_item_plugin(object)->s.dir.add_entry(object,
26880 +                                                                  coord, &lh,
26881 +                                                                  where,
26882 +                                                                  entry);
26883 +               if (result == 0) {
26884 +                       reiser4_adjust_dir_file(object, where,
26885 +                                               fsdata->dec.pos + 1, +1);
26886 +                       INODE_INC_FIELD(object, i_size);
26887 +               }
26888 +       } else if (result == 0) {
26889 +               assert("nikita-2232", coord->node == lh.node);
26890 +               result = RETERR(-EEXIST);
26891 +       }
26892 +       done_lh(&lh);
26893 +
26894 +       return result;
26895 +}
26896 +
26897 +/**
26898 + * rem_entry - remove entry from directory item
26899 + * @dir: directory the entry is removed from
26900 + * @dentry: name being removed; @dentry->d_inode must be set
26901 + * @entry: description of entry being removed, passed to item plugin
26902 + * @coord: coord of the entry, passed to item plugin
26903 + * @lh: lock handle, passed to item plugin
26904 + *
26905 + * With REISER4_DEBUG, checks that entry at @coord is for @dentry->d_inode
26906 + * (-EIO otherwise); then calls item plugin method to cut entry.
26907 + */
26908 +static int
26909 +rem_entry(struct inode *dir, struct dentry *dentry,
26910 +         reiser4_dir_entry_desc * entry, coord_t *coord, lock_handle * lh)
26911 +{
26912 +       item_plugin *iplug;
26913 +       struct inode *child;
26914 +
26915 +       iplug = inode_dir_item_plugin(dir);
26916 +       child = dentry->d_inode;
26917 +       assert("nikita-3399", child != NULL);
26918 +
26919 +       /* check that we are really destroying an entry for @child */
26920 +       if (REISER4_DEBUG) {
26921 +               int result;
26922 +               reiser4_key key;
26923 +
26924 +               result = iplug->s.dir.extract_key(coord, &key);
26925 +               if (result != 0)
26926 +                       return result;
26927 +               if (get_key_objectid(&key) != get_inode_oid(child)) {
26928 +                       warning("nikita-3397",
26929 +                               "rem_entry: %#llx != %#llx\n",
26930 +                               get_key_objectid(&key),
26931 +                               (unsigned long long)get_inode_oid(child));
26932 +                       return RETERR(-EIO);
26933 +               }
26934 +       }
26935 +       return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
26936 +}
26937 +
26938 +/**
26939 + * reiser4_rem_entry_common - remove entry from a directory
26940 + * @dir: directory to remove entry from
26941 + * @dentry: name that is being removed
26942 + * @entry: description of entry being removed
26943 + *
26944 + * Common implementation of rem_entry method of dir plugin. Returns 0 or error.
26945 + */
26946 +int reiser4_rem_entry_common(struct inode *dir,
26947 +                            struct dentry *dentry,
26948 +                            reiser4_dir_entry_desc * entry)
26949 +{
26950 +       int result;
26951 +       coord_t *coord;
26952 +       lock_handle lh;
26953 +       struct reiser4_dentry_fsdata *fsdata;
26954 +       __u64 tograb;
26955 +
26956 +       assert("nikita-1124", dir != NULL);
26957 +       assert("nikita-1125", dentry != NULL);
26958 +
26959 +       tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
26960 +       result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
26961 +       if (result != 0)
26962 +               return RETERR(-ENOSPC);
26963 +
26964 +       init_lh(&lh);
26965 +
26966 +       /* check for this entry in a directory. This is plugin method. */
26967 +       result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
26968 +       fsdata = reiser4_get_dentry_fsdata(dentry);
26969 +       if (IS_ERR(fsdata)) {
26970 +               done_lh(&lh);
26971 +               return PTR_ERR(fsdata);
26972 +       }
26973 +
26974 +       coord = &fsdata->dec.entry_coord;
26975 +
26976 +       assert("nikita-3404",
26977 +              get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
26978 +              dir->i_size <= 1);
26979 +
26980 +       coord_clear_iplug(coord);
26981 +       if (result == 0) {
26982 +               /* entry was found: remove it. Just pass control to the
26983 +                  directory item plugin. */
26984 +               assert("vs-542", inode_dir_item_plugin(dir));
26985 +               reiser4_seal_done(&fsdata->dec.entry_seal);
26986 +               reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
26987 +               result =
26988 +                   WITH_COORD(coord,
26989 +                              rem_entry(dir, dentry, entry, coord, &lh));
26990 +               if (result == 0) {
26991 +                       if (dir->i_size >= 1)
26992 +                               INODE_DEC_FIELD(dir, i_size);
26993 +                       else {
26994 +                               warning("nikita-2509", "Dir %llu is runt",
26995 +                                       (unsigned long long)
26996 +                                       get_inode_oid(dir));
26997 +                               result = RETERR(-EIO);
26998 +                       }
26999 +
27000 +                       assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
27001 +                              dentry->d_inode->i_size != 2 ||
27002 +                              inode_dir_plugin(dentry->d_inode) == NULL);
27003 +               }
27004 +       }
27005 +       done_lh(&lh);
27006 +
27007 +       return result;
27008 +}
27009 +
27010 +static reiser4_block_nr estimate_init(struct inode *parent,
27011 +                                     struct inode *object);
27012 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
27013 +
27014 +/* common implementation of init method of dir plugin: grabs the space
27015 +   estimated by estimate_init() and creates "." and ".." entries
27016 +*/
27017 +int reiser4_dir_init_common(struct inode *object,      /* new directory */
27018 +                           struct inode *parent,       /* parent directory */
27019 +                           reiser4_object_create_data * data /* info passed
27020 +                                                              * to us, this
27021 +                                                              * is filled by
27022 +                                                              * reiser4()
27023 +                                                              * syscall in
27024 +                                                              * particular */)
27025 +{
27026 +       reiser4_block_nr reserve;
27027 +
27028 +       assert("nikita-680", object != NULL);
27029 +       assert("nikita-681", S_ISDIR(object->i_mode));
27030 +       assert("nikita-682", parent != NULL);
27031 +       assert("nikita-684", data != NULL);
27032 +       assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
27033 +       assert("nikita-687", object->i_mode & S_IFDIR);
27034 +
27035 +       reserve = estimate_init(parent, object);
27036 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
27037 +               return RETERR(-ENOSPC);
27038 +
27039 +       return create_dot_dotdot(object, parent);
27040 +}
27041 +
27042 +/* common implementation of done method of dir plugin: removes "." entry.
27043 +   Returns -ENOSPC if space cannot be grabbed; removal errors are only logged
27044 +*/
27045 +int reiser4_dir_done_common(struct inode *object/* object being deleted */)
27046 +{
27047 +       int result;
27048 +       reiser4_block_nr reserve;
27049 +       struct dentry goodby_dots;
27050 +       reiser4_dir_entry_desc entry;
27051 +
27052 +       assert("nikita-1449", object != NULL);
27053 +
27054 +       if (reiser4_inode_get_flag(object, REISER4_NO_SD))
27055 +               return 0;
27056 +
27057 +       /* of course, this can be rewritten to sweep everything in one
27058 +          reiser4_cut_tree(). */
27059 +       memset(&entry, 0, sizeof entry);
27060 +
27061 +       /* FIXME: this done method is called from reiser4_delete_dir_common
27062 +        * which reserved space already */
27063 +       reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
27064 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
27065 +               return RETERR(-ENOSPC);
27066 +
27067 +       memset(&goodby_dots, 0, sizeof goodby_dots);
27068 +       entry.obj = goodby_dots.d_inode = object;
27069 +       goodby_dots.d_name.name = ".";
27070 +       goodby_dots.d_name.len = 1;
27071 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
27072 +       reiser4_free_dentry_fsdata(&goodby_dots);
27073 +       if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
27074 +               warning("nikita-2252", "Cannot remove dot of %llu: %i",
27075 +                       (unsigned long long)get_inode_oid(object), result);
27076 +       return 0;
27077 +}
27078 +
27079 +/* common implementation of attach method of dir plugin: there is nothing
27080 +   to do here, so it only checks its arguments and returns 0 */
27081 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
27082 +                         struct inode *parent UNUSED_ARG)
27083 +{
27084 +       assert("nikita-2647", child != NULL);
27085 +       assert("nikita-2648", parent != NULL);
27086 +
27087 +       return 0;
27088 +}
27089 +
27090 +/* common implementation of detach method of dir plugin: removes ".." from
27091 +   @object and, if that succeeded, decreases nlink of @parent
27092 +*/
27093 +int reiser4_detach_common(struct inode *object, struct inode *parent)
27094 +{
27095 +       int result;
27096 +       struct dentry goodby_dots;
27097 +       reiser4_dir_entry_desc entry;
27098 +
27099 +       assert("nikita-2885", object != NULL);
27100 +       assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
27101 +
27102 +       memset(&entry, 0, sizeof entry);
27103 +
27104 +       /* NOTE-NIKITA this only works if @parent is -the- parent of
27105 +          @object, viz. object whose key is stored in dotdot
27106 +          entry. Wouldn't work with hard-links on directories. */
27107 +       memset(&goodby_dots, 0, sizeof goodby_dots);
27108 +       entry.obj = goodby_dots.d_inode = parent;
27109 +       goodby_dots.d_name.name = "..";
27110 +       goodby_dots.d_name.len = 2;
27111 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
27112 +       reiser4_free_dentry_fsdata(&goodby_dots);
27113 +       if (result == 0) {
27114 +               /* the dot should be the only entry remaining at this time... */
27115 +               assert("nikita-3400",
27116 +                      object->i_size == 1 && object->i_nlink <= 2);
27117 +#if 0
27118 +               /* and, together with the only name directory can have, they
27119 +                * provide for the last 2 remaining references. If we get
27120 +                * here as part of error handling during mkdir, @object
27121 +                * possibly has no name yet, so its nlink == 1. If we get here
27122 +                * from rename (targeting empty directory), it has no name
27123 +                * already, so its nlink == 1. */
27124 +               assert("nikita-3401",
27125 +                      object->i_nlink == 2 || object->i_nlink == 1);
27126 +#endif
27127 +
27128 +               /* decrement nlink of the directory that the removed ".."
27129 +                  pointed to */
27130 +               reiser4_del_nlink(parent, NULL, 0);
27131 +       }
27132 +       return result;
27133 +}
27134 +
27135 +/* common implementation of estimate.add_entry method of dir plugin:
27136 +   estimates space needed to add an entry, supposing that the entry
27137 +   is added by inserting one unit into an item, see
27138 +   estimate_one_insert_into_item()
27139 +*/
27140 +reiser4_block_nr estimate_add_entry_common(const struct inode *inode)
27141 +{
27142 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
27143 +}
27144 +
27145 +/* common implementation of estimate.rem_entry method of dir plugin: uses
27146 +   estimate_one_item_removal() for the tree that @inode belongs to
27147 +*/
27148 +reiser4_block_nr estimate_rem_entry_common(const struct inode *inode)
27149 +{
27150 +       return estimate_one_item_removal(reiser4_tree_by_inode(inode));
27151 +}
27152 +
27153 +/* common implementation of estimate.unlink method of dir plugin: rem_entry
27154 +   estimate of @object plus twice the update estimate of @parent
27155 +*/
27156 +reiser4_block_nr
27157 +dir_estimate_unlink_common(const struct inode *parent,
27158 +                          const struct inode *object)
27159 +{
27160 +       reiser4_block_nr res;
27161 +
27162 +       /* hashed_rem_entry(object) */
27163 +       res = inode_dir_plugin(object)->estimate.rem_entry(object);
27164 +       /* del_nlink(parent) */
27165 +       res += 2 * inode_file_plugin(parent)->estimate.update(parent);
27166 +
27167 +       return res;
27168 +}
27169 +
27170 +/*
27171 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
27172 + * methods: if @inode is a light-weight file, take its uid and gid, which
27173 + * are not stored in the stat-data in this case, from @parent
27174 + */
27175 +void check_light_weight(struct inode *inode, struct inode *parent)
27176 +{
27177 +       if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
27178 +               inode->i_uid = parent->i_uid;
27179 +               inode->i_gid = parent->i_gid;
27180 +               /* clear light-weight flag. If inode would be read by any
27181 +                  other name, [ug]id wouldn't change. */
27182 +               reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
27183 +       }
27184 +}
27185 +
27186 +/* looks for name specified in @dentry in directory @parent and if name is
27187 +   found - key of the object the found entry points to is stored in @key */
27188 +int reiser4_lookup_name(struct inode *parent,  /* inode of directory to lookup
27189 +                                        * for name in */
27190 +               struct dentry *dentry,  /* name to look for */
27191 +               reiser4_key * key/* place to store key */)
27192 +{
27193 +       int result;
27194 +       coord_t *coord;
27195 +       lock_handle lh;
27196 +       const char *name;
27197 +       int len;
27198 +       reiser4_dir_entry_desc entry;
27199 +       struct reiser4_dentry_fsdata *fsdata;
27200 +
27201 +       assert("nikita-1247", parent != NULL);
27202 +       assert("nikita-1248", dentry != NULL);
27203 +       assert("nikita-1123", dentry->d_name.name != NULL);
27204 +       assert("vs-1486",
27205 +              dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27206 +
27207 +       name = dentry->d_name.name;
27208 +       len = dentry->d_name.len;
27209 +
27210 +       if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27211 +               /* some arbitrary error code to return */
27212 +               return RETERR(-ENAMETOOLONG);
27213 +
27214 +       fsdata = reiser4_get_dentry_fsdata(dentry);
27215 +       if (IS_ERR(fsdata))
27216 +               return PTR_ERR(fsdata);
27217 +
27218 +       coord = &fsdata->dec.entry_coord;
27219 +       coord_clear_iplug(coord);
27220 +       init_lh(&lh);
27221 +
27222 +       /* find entry in a directory. This is plugin method. */
27223 +       result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27224 +                                   &entry);
27225 +       if (result == 0) {
27226 +               /* entry was found, extract object key from it. */
27227 +               result =
27228 +                   WITH_COORD(coord,
27229 +                              item_plugin_by_coord(coord)->s.dir.
27230 +                              extract_key(coord, key));
27231 +       }
27232 +       done_lh(&lh);
27233 +       return result;
27234 +
27235 +}
27236 +
27237 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27238 +static reiser4_block_nr
27239 +estimate_init(struct inode *parent, struct inode *object)
27240 +{
27241 +       reiser4_block_nr res = 0;
27242 +
27243 +       assert("vpf-321", parent != NULL);
27244 +       assert("vpf-322", object != NULL);
27245 +
27246 +       /* add_entry(object) for "." */
27247 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
27248 +       /* reiser4_add_nlink(object) */
27249 +       res += inode_file_plugin(object)->estimate.update(object);
27250 +       /* add_entry(object) for ".." */
27251 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
27252 +       /* reiser4_add_nlink(parent) */
27253 +       res += inode_file_plugin(parent)->estimate.update(parent);
27254 +
27255 +       return res;
27256 +}
27257 +
27258 +/* helper for reiser4_dir_init_common(): add "." and ".." entries to @object */
27259 +static int create_dot_dotdot(struct inode *object/* object to create dot and
27260 +                                                 * dotdot for */ ,
27261 +                            struct inode *parent/* parent of @object */)
27262 +{
27263 +       int result;
27264 +       struct dentry dots_entry;
27265 +       reiser4_dir_entry_desc entry;
27266 +
27267 +       assert("nikita-688", object != NULL);
27268 +       assert("nikita-689", S_ISDIR(object->i_mode));
27269 +       assert("nikita-691", parent != NULL);
27270 +
27271 +       /* We store dot and dotdot as normal directory entries. This is
27272 +          not strictly necessary, because almost all information stored in
27273 +          them is already in the stat-data of the directory; the only
27274 +          thing missing is the objectid of the grand-parent directory,
27275 +          which could easily be added there as an extension.
27276 +
27277 +          It is done this way nevertheless, because not storing dot
27278 +          and dotdot would lead to the following complications:
27279 +
27280 +          . special case handling in ->lookup().
27281 +          . addition of another extension to the sd.
27282 +          . dependency on key allocation policy for stat data.
27283 +
27284 +        */
27285 +       /* "." entry: a zeroed on-stack dentry carries the name and inode */
27286 +       memset(&entry, 0, sizeof entry);
27287 +       memset(&dots_entry, 0, sizeof dots_entry);
27288 +       entry.obj = dots_entry.d_inode = object;
27289 +       dots_entry.d_name.name = ".";
27290 +       dots_entry.d_name.len = 1;
27291 +       result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27292 +       reiser4_free_dentry_fsdata(&dots_entry);
27293 +
27294 +       if (result == 0) {
27295 +               result = reiser4_add_nlink(object, object, 0);
27296 +               if (result == 0) {
27297 +                       entry.obj = dots_entry.d_inode = parent;
27298 +                       dots_entry.d_name.name = "..";
27299 +                       dots_entry.d_name.len = 2;
27300 +                       result = reiser4_add_entry_common(object,
27301 +                                                 &dots_entry, NULL, &entry);
27302 +                       reiser4_free_dentry_fsdata(&dots_entry);
27303 +                       /* if creation of ".." failed, iput() will delete
27304 +                          object with ".". */
27305 +                       if (result == 0) {
27306 +                               result = reiser4_add_nlink(parent, object, 0);
27307 +                               if (result != 0)
27308 +                                       /*
27309 +                                        * if we failed to bump i_nlink, try
27310 +                                        * to remove ".." again
27311 +                                        */
27312 +                                       reiser4_detach_common(object, parent);
27313 +                       }
27314 +               }
27315 +       }
27316 +
27317 +       if (result != 0) {
27318 +               /*
27319 +                * in the case of error, at least update the stat-data of both
27320 +                * inodes so that ->i_nlink updates are not left lingering.
27321 +                */
27322 +               reiser4_update_sd(object);
27323 +               reiser4_update_sd(parent);
27324 +       }
27325 +
27326 +       return result;
27327 +}
27328 +
27329 +/*
27330 + * return 0 iff @coord contains a directory entry named @name, 1 if the entry
27331 + * has another name, RETERR(-EIO) if the item is not a directory item of @dir.
27332 + */
27333 +static int
27334 +check_item(const struct inode *dir, const coord_t *coord, const char *name)
27335 +{
27336 +       item_plugin *iplug;
27337 +       char buf[DE_NAME_BUF_LEN];
27338 +
27339 +       iplug = item_plugin_by_coord(coord);
27340 +       if (iplug == NULL) {
27341 +               warning("nikita-1135", "Cannot get item plugin");
27342 +               print_coord("coord", coord, 1);
27343 +               return RETERR(-EIO);
27344 +       } else if (item_id_by_coord(coord) !=
27345 +                  item_id_by_plugin(inode_dir_item_plugin(dir))) {
27346 +               /* item id of the current item does not match the id of the
27347 +                  items this directory is built of */
27348 +               warning("nikita-1136", "Wrong item plugin");
27349 +               print_coord("coord", coord, 1);
27350 +               return RETERR(-EIO);
27351 +       }
27352 +       assert("nikita-1137", iplug->s.dir.extract_name);
27353 +
27354 +       /* Compare the name stored in this entry with the name we are looking
27355 +          for; "!!" folds any non-zero strcmp() result into 1.
27356 +          NOTE-NIKITA Here should go code for support of something like
27357 +          unicode, code tables, etc.
27358 +        */
27359 +       return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27360 +}
27361 +
27362 +static int
27363 +check_entry(const struct inode *dir, coord_t *coord, const struct qstr *name)
27364 +{
27365 +       return WITH_COORD(coord, check_item(dir, coord, name->name));
27366 +}
27367 +
27368 +/*
27369 + * argument package used by entry_actor() to scan entries with identical keys.
27370 + */
27371 +struct entry_actor_args {
27372 +       /* name we are looking for */
27373 +       const char *name;
27374 +       /* key of the directory entry. entry_actor() scans through the
27375 +        * sequence of items/units having this same key */
27376 +       reiser4_key *key;
27377 +       /* how many entries with a duplicate key were scanned so far. */
27378 +       int non_uniq;
27379 +#if REISER4_USE_COLLISION_LIMIT
27380 +       /* scan limit: entry_actor() gives up once non_uniq exceeds it */
27381 +       int max_non_uniq;
27382 +#endif
27383 +       /* return parameter: set to true if ->name wasn't found */
27384 +       int not_found;
27385 +       /* what type of lock to take when moving to the next node during
27386 +        * the scan */
27387 +       znode_lock_mode mode;
27388 +
27389 +       /* last coord with a matching key that was visited during the scan */
27390 +       coord_t last_coord;
27391 +       /* lock handle of the last node locked during the scan */
27392 +       lock_handle last_lh;
27393 +       /* inode of the directory being scanned */
27394 +       const struct inode *inode;
27395 +};
27396 +
27397 +/* Actor passed by reiser4_find_entry() to reiser4_iterate_tree(): checks
27398 +   whether the unit at @coord is the entry named args->name. */
27399 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27400 +                      coord_t *coord /* current coord */ ,
27401 +                      lock_handle * lh /* current lock handle */ ,
27402 +                      void *entry_actor_arg/* argument to scan */)
27403 +{
27404 +       reiser4_key unit_key;
27405 +       struct entry_actor_args *args;
27406 +
27407 +       assert("nikita-1131", tree != NULL);
27408 +       assert("nikita-1132", coord != NULL);
27409 +       assert("nikita-1133", entry_actor_arg != NULL);
27410 +
27411 +       args = entry_actor_arg;
27412 +       ++args->non_uniq;
27413 +#if REISER4_USE_COLLISION_LIMIT
27414 +       if (args->non_uniq > args->max_non_uniq) {
27415 +               args->not_found = 1;
27416 +               /* hash collision overflow. */
27417 +               return RETERR(-EBUSY);
27418 +       }
27419 +#endif
27420 +
27421 +       /*
27422 +        * did we just reach the end of the sequence of items/units with
27423 +        * identical keys? If so, report "not found" and stop at this point.
27424 +        */
27425 +       if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27426 +               assert("nikita-1791",
27427 +                      keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27428 +               args->not_found = 1;
27429 +               args->last_coord.between = AFTER_UNIT;
27430 +               return 0;
27431 +       }
27432 +
27433 +       coord_dup(&args->last_coord, coord);
27434 +       /*
27435 +        * did the scan just move to the next node?
27436 +        */
27437 +       if (args->last_lh.node != lh->node) {
27438 +               int lock_result;
27439 +
27440 +               /*
27441 +                * if so, drop the old lock and lock the new node with the
27442 +                * mode requested by the caller
27443 +                */
27444 +               done_lh(&args->last_lh);
27445 +               assert("nikita-1896", znode_is_any_locked(lh->node));
27446 +               lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27447 +                                                 args->mode, ZNODE_LOCK_HIPRI);
27448 +               if (lock_result != 0)
27449 +                       return lock_result;
27450 +       }
27451 +       return check_item(args->inode, coord, args->name);
27452 +}
27452 +
27453 +/* Look for the name carried by @de within directory @dir.
27454 +
27455 +   This is called during lookup, creation and removal of directory
27456 +   entries and on reiser4_rename_common. Returns 0 when the entry is found
27457 +   and -ENOENT when the scan of same-key entries ends without a match.
27458 +   First calculate key that directory entry for the name would have. Search
27459 +   for this key in the tree. If such key is found, scan all items with
27460 +   the same key, checking name in each directory entry along the way.
27461 +*/
27462 +int reiser4_find_entry(struct inode *dir,      /* directory to scan */
27463 +                      struct dentry *de,       /* name to search for */
27464 +                      lock_handle * lh,        /* resulting lock handle */
27465 +                      znode_lock_mode mode,    /* required lock mode */
27466 +                      reiser4_dir_entry_desc * entry   /* parameters of found
27467 +                                                          directory entry */)
27468 +{
27469 +       const struct qstr *name;
27470 +       seal_t *seal;
27471 +       coord_t *coord;
27472 +       int result;
27473 +       __u32 flags;
27474 +       struct de_location *dec;
27475 +       struct reiser4_dentry_fsdata *fsdata;
27476 +
27477 +       assert("nikita-1130", lh != NULL);
27478 +       assert("nikita-1128", dir != NULL);
27479 +
27480 +       name = &de->d_name;
27481 +       assert("nikita-1129", name != NULL);
27482 +
27483 +       /* dentry private data don't require a lock, because dentry
27484 +          manipulations are protected by i_mutex on the parent.
27485 +
27486 +          This is not so for inodes, because there is no -the- parent in
27487 +          the inode case.
27488 +        */
27489 +       fsdata = reiser4_get_dentry_fsdata(de);
27490 +       if (IS_ERR(fsdata))
27491 +               return PTR_ERR(fsdata);
27492 +       dec = &fsdata->dec;
27493 +
27494 +       coord = &dec->entry_coord;
27495 +       coord_clear_iplug(coord);
27496 +       seal = &dec->entry_seal;
27497 +       /* compose key of directory entry for @name */
27498 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27499 +
27500 +       if (reiser4_seal_is_set(seal)) {
27501 +               /* a seal was recorded earlier: try it before a tree lookup */
27502 +               result = reiser4_seal_validate(seal, coord, &entry->key,
27503 +                                              lh, mode, ZNODE_LOCK_LOPRI);
27504 +               if (result == 0) {
27505 +                       /* key was found. Check that it is really the item we
27506 +                          are looking for. */
27507 +                       result = check_entry(dir, coord, name);
27508 +                       if (result == 0)
27509 +                               return 0;
27510 +               }
27511 +       }
27512 +       flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27513 +       /*
27514 +        * find place in the tree where directory item should be located.
27515 +        */
27516 +       result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27517 +                                      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27518 +                                      flags, NULL/*ra_info */);
27519 +       if (result == CBK_COORD_FOUND) {
27520 +               struct entry_actor_args arg;
27521 +
27522 +               /* fast path: no hash collisions */
27523 +               result = check_entry(dir, coord, name);
27524 +               if (result == 0) {
27525 +                       reiser4_seal_init(seal, coord, &entry->key);
27526 +                       dec->pos = 0;
27527 +               } else if (result > 0) {
27528 +                       /* Iterate through all units with the same keys. */
27529 +                       arg.name = name->name;
27530 +                       arg.key = &entry->key;
27531 +                       arg.not_found = 0;
27532 +                       arg.non_uniq = 0;
27533 +#if REISER4_USE_COLLISION_LIMIT
27534 +                       arg.max_non_uniq = max_hash_collisions(dir);
27535 +                       assert("nikita-2851", arg.max_non_uniq > 1);
27536 +#endif
27537 +                       arg.mode = mode;
27538 +                       arg.inode = dir;
27539 +                       coord_init_zero(&arg.last_coord);
27540 +                       init_lh(&arg.last_lh);
27541 +
27542 +                       result = reiser4_iterate_tree
27543 +                               (reiser4_tree_by_inode(dir),
27544 +                                coord, lh,
27545 +                                entry_actor, &arg, mode, 1);
27546 +                       /* the scan ran past the entries with this key, or the
27547 +                          end of the tree was reached: the name is absent. */
27548 +                       if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27549 +                               /* step back to what entry_actor() recorded */
27550 +                               done_lh(lh);
27551 +
27552 +                               result = zload(arg.last_coord.node);
27553 +                               if (result == 0) {
27554 +                                       coord_clear_iplug(&arg.last_coord);
27555 +                                       coord_dup(coord, &arg.last_coord);
27556 +                                       move_lh(lh, &arg.last_lh);
27557 +                                       result = RETERR(-ENOENT);
27558 +                                       zrelse(arg.last_coord.node);
27559 +                                       --arg.non_uniq;
27560 +                               }
27561 +                       }
27562 +
27563 +                       done_lh(&arg.last_lh);
27564 +                       if (result == 0)
27565 +                               reiser4_seal_init(seal, coord, &entry->key);
27566 +
27567 +                       if (result == 0 || result == -ENOENT) {
27568 +                               assert("nikita-2580", arg.non_uniq > 0);
27569 +                               dec->pos = arg.non_uniq - 1;
27570 +                       }
27571 +               }
27572 +       } else
27573 +               dec->pos = -1;
27574 +       return result;
27575 +}
27576 +
27577 +/*
27578 +   Local variables:
27579 +   c-indentation-style: "K&R"
27580 +   mode-name: "LC"
27581 +   c-basic-offset: 8
27582 +   tab-width: 8
27583 +   fill-column: 120
27584 +   scroll-step: 1
27585 +   End:
27586 +*/
27587 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format40.c
27588 --- linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format40.c     1970-01-01 01:00:00.000000000 +0100
27589 +++ linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format40.c  2010-08-04 15:44:57.000000000 +0200
27590 @@ -0,0 +1,655 @@
27591 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27592 +
27593 +#include "../../debug.h"
27594 +#include "../../dformat.h"
27595 +#include "../../key.h"
27596 +#include "../node/node.h"
27597 +#include "../space/space_allocator.h"
27598 +#include "disk_format40.h"
27599 +#include "../plugin.h"
27600 +#include "../../txnmgr.h"
27601 +#include "../../jnode.h"
27602 +#include "../../tree.h"
27603 +#include "../../super.h"
27604 +#include "../../wander.h"
27605 +#include "../../inode.h"
27606 +#include "../../ktxnmgrd.h"
27607 +#include "../../status_flags.h"
27608 +
27609 +#include <linux/types.h>       /* for __u??  */
27610 +#include <linux/fs.h>          /* for struct super_block  */
27611 +#include <linux/buffer_head.h>
27612 +
27613 +/* reiser 4.0 default disk layout */
27614 +
27615 +/* Amount of free blocks needed to perform release_format40 when fs gets
27616 +   mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27617 +   & tx record. */
27618 +#define RELEASE_RESERVED 4
27619 +
27620 +/* The greatest supported format40 version number */
27621 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27622 +
27623 +/* This flag indicates that backup should be updated
27624 +   (the update is performed by fsck) */
27625 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
27626 +
27627 +/* functions to access fields of format40_disk_super_block */
27628 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27629 +{
27630 +       return le64_to_cpu(get_unaligned(&sb->block_count));
27631 +}
27632 +
27633 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27634 +{
27635 +       return le64_to_cpu(get_unaligned(&sb->free_blocks));
27636 +}
27637 +
27638 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27639 +{
27640 +       return le64_to_cpu(get_unaligned(&sb->root_block));
27641 +}
27642 +
27643 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27644 +{
27645 +       return le16_to_cpu(get_unaligned(&sb->tree_height));
27646 +}
27647 +
27648 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27649 +{
27650 +       return le64_to_cpu(get_unaligned(&sb->file_count));
27651 +}
27652 +
27653 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
27654 +{
27655 +       return le64_to_cpu(get_unaligned(&sb->oid));
27656 +}
27657 +
27658 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27659 +{
27660 +       return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27661 +}
27662 +
27663 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
27664 +{
27665 +       return le64_to_cpu(get_unaligned(&sb->flags));
27666 +}
27667 +
27668 +static __u32 get_format40_version(const format40_disk_super_block * sb)
27669 +{
27670 +       return le32_to_cpu(get_unaligned(&sb->version)) &
27671 +               ~FORMAT40_UPDATE_BACKUP;
27672 +}
27673 +
27674 +static int update_backup_version(const format40_disk_super_block * sb)
27675 +{
27676 +       return (le32_to_cpu(get_unaligned(&sb->version)) &
27677 +               FORMAT40_UPDATE_BACKUP);
27678 +}
27679 +
27680 +static int update_disk_version(const format40_disk_super_block * sb)
27681 +{
27682 +       return (get_format40_version(sb) < FORMAT40_VERSION);
27683 +}
27684 +
27685 +static int incomplete_compatibility(const format40_disk_super_block * sb)
27686 +{
27687 +       return (get_format40_version(sb) > FORMAT40_VERSION);
27688 +}
27689 +
27690 +static format40_super_info *get_sb_info(struct super_block *super)
27691 +{
27692 +       return &get_super_private(super)->u.format40;
27693 +}
27694 +
27695 +static int consult_diskmap(struct super_block *s)
27696 +{
27697 +       format40_super_info *info;
27698 +       journal_location *jloc;
27699 +
27700 +       info = get_sb_info(s);
27701 +       jloc = &get_super_private(s)->jloc;
27702 +       /* Default format-specific locations, if there is nothing in
27703 +        * diskmap */
27704 +       jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27705 +       jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27706 +       info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
27707 +#ifdef CONFIG_REISER4_BADBLOCKS
27708 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27709 +                                 &jloc->footer);
27710 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27711 +                                 &jloc->header);
27712 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27713 +                                 &info->loc.super);
27714 +#endif
27715 +       return 0;
27716 +}
27717 +
27718 +/* read the super block at the location selected by consult_diskmap() and
27719 +   verify the format40 magic. Only this one location is probed. Returns the
27720 +   buffer head (caller must brelse() it) or ERR_PTR(-EIO / -EINVAL) */
27721 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27722 +                                                           *s)
27723 +{
27724 +       struct buffer_head *super_bh;
27725 +       format40_disk_super_block *disk_sb;
27726 +       format40_super_info *info;
27727 +
27728 +       assert("umka-487", s != NULL);
27729 +
27730 +       info = get_sb_info(s);
27731 +
27732 +       super_bh = sb_bread(s, info->loc.super);
27733 +       if (super_bh == NULL)
27734 +               return ERR_PTR(RETERR(-EIO));
27735 +
27736 +       disk_sb = (format40_disk_super_block *) super_bh->b_data;
27737 +       if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27738 +               brelse(super_bh);
27739 +               return ERR_PTR(RETERR(-EINVAL));
27740 +       }
27741 +       /* seed the in-memory block counters from the on-disk super block */
27742 +       reiser4_set_block_count(s, get_format40_block_count(disk_sb));
27743 +       reiser4_set_data_blocks(s, get_format40_block_count(disk_sb) -
27744 +                               get_format40_free_blocks(disk_sb));
27745 +       reiser4_set_free_blocks(s, get_format40_free_blocks(disk_sb));
27746 +
27747 +       return super_bh;
27748 +}
27749 +
27750 +/* find the most recent version of super block. This is called after journal is
27751 +   replayed */
27752 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
27753 +{
27754 +       /* Here the most recent superblock copy has to be read. However, as
27755 +          journal replay isn't complete, we are using
27756 +          find_a_disk_format40_super_block() function. */
27757 +       return find_a_disk_format40_super_block(s);
27758 +}
27759 +
27760 +static int get_super_jnode(struct super_block *s)
27761 +{
27762 +       reiser4_super_info_data *sbinfo = get_super_private(s);
27763 +       jnode *sb_jnode;
27764 +       int ret;
27765 +
27766 +       sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27767 +
27768 +       ret = jload(sb_jnode);
27769 +
27770 +       if (ret) {
27771 +               reiser4_drop_io_head(sb_jnode);
27772 +               return ret;
27773 +       }
27774 +
27775 +       pin_jnode_data(sb_jnode);
27776 +       jrelse(sb_jnode);
27777 +
27778 +       sbinfo->u.format40.sb_jnode = sb_jnode;
27779 +
27780 +       return 0;
27781 +}
27782 +
27783 +static void done_super_jnode(struct super_block *s)
27784 +{
27785 +       jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27786 +
27787 +       if (sb_jnode) {
27788 +               unpin_jnode_data(sb_jnode);
27789 +               reiser4_drop_io_head(sb_jnode);
27790 +       }
27791 +}
27792 +
27793 +typedef enum format40_init_stage {
27794 +       NONE_DONE = 0,
27795 +       CONSULT_DISKMAP,
27796 +       FIND_A_SUPER,
27797 +       INIT_JOURNAL_INFO,
27798 +       INIT_STATUS,
27799 +       JOURNAL_REPLAY,
27800 +       READ_SUPER,
27801 +       KEY_CHECK,
27802 +       INIT_OID,
27803 +       INIT_TREE,
27804 +       JOURNAL_RECOVER,
27805 +       INIT_SA,
27806 +       INIT_JNODE,
27807 +       ALL_DONE
27808 +} format40_init_stage;
27809 +
27810 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27811 +{
27812 +       format40_disk_super_block *sb_copy;
27813 +
27814 +       sb_copy = kmalloc(sizeof(format40_disk_super_block),
27815 +                         reiser4_ctx_gfp_mask_get());
27816 +       if (sb_copy == NULL)
27817 +               return ERR_PTR(RETERR(-ENOMEM));
27818 +       memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
27819 +              sizeof(format40_disk_super_block));
27820 +       return sb_copy;
27821 +}
27822 +
27823 +static int check_key_format(const format40_disk_super_block *sb_copy)
27824 +{
27825 +       if (!equi(REISER4_LARGE_KEY,
27826 +                 get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
27827 +               warning("nikita-3228", "Key format mismatch. "
27828 +                       "Only %s keys are supported.",
27829 +                       REISER4_LARGE_KEY ? "large" : "small");
27830 +               return RETERR(-EINVAL);
27831 +       }
27832 +       return 0;
27833 +}
27834 +
27835 +/**
27836 + * try_init_format40 - run the format40 initialization steps in order
27837 + * @super: super block being initialized
27838 + * @stage: out: last initialization stage that completed successfully
27839 + * Returns 0 (with *@stage == ALL_DONE) or the error of the failed step.
27840 + */
27841 +static int try_init_format40(struct super_block *super,
27842 +                            format40_init_stage *stage)
27843 +{
27844 +       int result;
27845 +       struct buffer_head *super_bh;
27846 +       reiser4_super_info_data *sbinfo;
27847 +       format40_disk_super_block *sb_copy;
27848 +       tree_level height;
27849 +       reiser4_block_nr root_block;
27850 +       node_plugin *nplug;
27851 +
27852 +       assert("vs-475", super != NULL);
27853 +       assert("vs-474", get_super_private(super));
27854 +
27855 +       *stage = NONE_DONE;
27856 +
27857 +       result = consult_diskmap(super);
27858 +       if (result)
27859 +               return result;
27860 +       *stage = CONSULT_DISKMAP;
27861 +
27862 +       super_bh = find_a_disk_format40_super_block(super);
27863 +       if (IS_ERR(super_bh))
27864 +               return PTR_ERR(super_bh);
27865 +       brelse(super_bh);
27866 +       *stage = FIND_A_SUPER;
27867 +
27868 +       /* ok, we are sure that filesystem format is a format40 format */
27869 +
27870 +       /* map jnodes for journal control blocks (header, footer) to disk  */
27871 +       result = reiser4_init_journal_info(super);
27872 +       if (result)
27873 +               return result;
27874 +       *stage = INIT_JOURNAL_INFO;
27875 +
27876 +       /* ok, we are sure that filesystem format is a format40 format */
27877 +       /* Now check its state */
27878 +       result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
27879 +       if (result != 0 && result != -EINVAL)
27880 +               /* -EINVAL means there is no magic, so probably just old
27881 +                * fs. */
27882 +               return result;
27883 +       *stage = INIT_STATUS;
27884 +
27885 +       result = reiser4_status_query(NULL, NULL);
27886 +       if (result == REISER4_STATUS_MOUNT_WARN)
27887 +               notice("vpf-1363", "Warning: mounting %s with errors.",
27888 +                      super->s_id);
27889 +       if (result == REISER4_STATUS_MOUNT_RO)
27890 +               notice("vpf-1364", "Warning: mounting %s with fatal errors,"
27891 +                      " forcing read-only mount.", super->s_id);
27892 +       result = reiser4_journal_replay(super);
27893 +       if (result)
27894 +               return result;
27895 +       *stage = JOURNAL_REPLAY;
27896 +
27897 +       super_bh = read_super_block(super);
27898 +       if (IS_ERR(super_bh))
27899 +               return PTR_ERR(super_bh);
27900 +       *stage = READ_SUPER;
27901 +
27902 +       /* allocate and make a copy of format40_disk_super_block */
27903 +       sb_copy = copy_sb(super_bh);
27904 +       brelse(super_bh);
27905 +
27906 +       if (IS_ERR(sb_copy))
27907 +               return PTR_ERR(sb_copy);
27908 +       printk("reiser4: %s: found disk format 4.0.%u.\n",
27909 +              super->s_id,
27910 +              get_format40_version(sb_copy));
27911 +       if (incomplete_compatibility(sb_copy))
27912 +               printk("reiser4: Warning: The last completely supported "
27913 +                      "version of disk format40 is %u. Some objects of "
27914 +                      "the semantic tree can be unaccessible.\n",
27915 +                      FORMAT40_VERSION);
27916 +       /* make sure that key format of kernel and filesystem match */
27917 +       result = check_key_format(sb_copy);
27918 +       if (result) {
27919 +               kfree(sb_copy);
27920 +               return result;
27921 +       }
27922 +       *stage = KEY_CHECK;
27923 +
27924 +       result = oid_init_allocator(super, get_format40_file_count(sb_copy),
27925 +                                   get_format40_oid(sb_copy));
27926 +       if (result) {
27927 +               kfree(sb_copy);
27928 +               return result;
27929 +       }
27930 +       *stage = INIT_OID;
27931 +
27932 +       /* get things necessary to init reiser4_tree */
27933 +       root_block = get_format40_root_block(sb_copy);
27934 +       height = get_format40_tree_height(sb_copy);
27935 +       nplug = node_plugin_by_id(NODE40_ID);
27936 +
27937 +       /* initialize reiser4_super_info_data */
27938 +       sbinfo = get_super_private(super);
27939 +       assert("", sbinfo->tree.super == super);
27940 +       /* init reiser4_tree for the filesystem */
27941 +       result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
27942 +       if (result) {
27943 +               kfree(sb_copy);
27944 +               return result;
27945 +       }
27946 +       *stage = INIT_TREE;
27947 +
27948 +       /*
27949 +        * initialize reiser4_super_info_data with data from format40 super
27950 +        * block
27951 +        */
27952 +       sbinfo->default_uid = 0;
27953 +       sbinfo->default_gid = 0;
27954 +       sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
27955 +       /* number of blocks in filesystem and reserved space */
27956 +       reiser4_set_block_count(super, get_format40_block_count(sb_copy));
27957 +       sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
27958 +       sbinfo->version = get_format40_version(sb_copy);
27959 +       /* last reader of sb_copy: the copy is freed only after this check */
27960 +       if (update_backup_version(sb_copy))
27961 +               printk("reiser4: Warning: metadata backup is not updated. "
27962 +                      "Please run 'fsck.reiser4 --fix' on %s.\n",
27963 +                      super->s_id);
27964 +       kfree(sb_copy);
27965 +
27966 +       sbinfo->fsuid = 0;
27967 +       sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
27968 +                                                * are not supported */
27969 +       sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);     /* all nodes in
27970 +                                                                * layout 40 are
27971 +                                                                * of one
27972 +                                                                * plugin */
27973 +       /* sbinfo->tmgr is initialized already */
27974 +
27975 +       /* recover sb data which were logged separately from sb block */
27976 +
27977 +       /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
27978 +        * oid_init_allocator() and reiser4_set_free_blocks() with new
27979 +        * data. What's the reason to call them above? */
27980 +       result = reiser4_journal_recover_sb_data(super);
27981 +       if (result != 0)
27982 +               return result;
27983 +       *stage = JOURNAL_RECOVER;
27984 +
27985 +       /*
27986 +        * Set number of used blocks.  The number of used blocks is stored
27987 +        * neither in on-disk super block nor in the journal footer blocks.  At
27988 +        * this moment actual values of total blocks and free block counters
27989 +        * are set in the reiser4 super block (in-memory structure) and we can
27990 +        * calculate number of used blocks from them.
27991 +        */
27992 +       reiser4_set_data_blocks(super,
27993 +                               reiser4_block_count(super) -
27994 +                               reiser4_free_blocks(super));
27995 +
27996 +#if REISER4_DEBUG
27997 +       sbinfo->min_blocks_used = 16 /* reserved area */  +
27998 +               2 /* super blocks */  +
27999 +               2 /* journal footer and header */ ;
28000 +#endif
28001 +
28002 +       /* init disk space allocator */
28003 +       result = sa_init_allocator(reiser4_get_space_allocator(super),
28004 +                                  super, NULL);
28005 +       if (result)
28006 +               return result;
28007 +       *stage = INIT_SA;
28008 +
28009 +       result = get_super_jnode(super);
28010 +       if (result == 0)
28011 +               *stage = ALL_DONE;
28012 +       return result;
28013 +}
28014 +
28015 +/* plugin->u.format.get_ready: run the staged format40 init; on failure undo the stages that completed */
28016 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
28017 +{
28018 +       int result;
28019 +       format40_init_stage stage;
28020 +
28021 +       result = try_init_format40(s, &stage);
28022 +       switch (stage) {        /* enter at the last stage reached and unwind downwards */
28023 +       case ALL_DONE:
28024 +               assert("nikita-3458", result == 0);
28025 +               break;
28026 +       case INIT_JNODE:
28027 +               done_super_jnode(s);    /* fall through */
28028 +       case INIT_SA:
28029 +               sa_destroy_allocator(reiser4_get_space_allocator(s), s);       /* fall through */
28030 +       case JOURNAL_RECOVER:
28031 +       case INIT_TREE:
28032 +               reiser4_done_tree(&get_super_private(s)->tree); /* fall through */
28033 +       case INIT_OID:
28034 +       case KEY_CHECK:
28035 +       case READ_SUPER:
28036 +       case JOURNAL_REPLAY:
28037 +       case INIT_STATUS:
28038 +               reiser4_status_finish();        /* fall through */
28039 +       case INIT_JOURNAL_INFO:
28040 +               reiser4_done_journal_info(s);   /* fall through */
28041 +       case FIND_A_SUPER:
28042 +       case CONSULT_DISKMAP:
28043 +       case NONE_DONE:
28044 +               break;
28045 +       default:
28046 +               impossible("nikita-3457", "init stage: %i", stage);
28047 +       }
28048 +       /* report low space only after a successful init; a failed init keeps its own error code */
28049 +       if (result == 0 && !rofs_super(s) &&
28050 +           reiser4_free_blocks(s) < RELEASE_RESERVED)
28051 +               return RETERR(-ENOSPC);
28052 +       return result;
28053 +}
28054 +
28055 +static void pack_format40_super(const struct super_block *s, char *data)
28056 +{
28057 +       format40_disk_super_block *super_data =
28058 +           (format40_disk_super_block *) data;
28059 +       /* @data is written through as a format40_disk_super_block image */
28060 +       reiser4_super_info_data *sbinfo = get_super_private(s);
28061 +
28062 +       assert("zam-591", data != NULL);
28063 +       /* store the current counters little-endian, using put_unaligned() for every field */
28064 +       put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
28065 +                     &super_data->free_blocks);
28066 +
28067 +       put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
28068 +                     &super_data->root_block);
28069 +
28070 +       put_unaligned(cpu_to_le64(oid_next(s)),
28071 +                     &super_data->oid);
28072 +
28073 +       put_unaligned(cpu_to_le64(oids_used(s)),
28074 +                     &super_data->file_count);
28075 +
28076 +       put_unaligned(cpu_to_le16(sbinfo->tree.height),
28077 +                     &super_data->tree_height);
28078 +       /* when update_disk_version() says so, store FORMAT40_VERSION OR-ed with FORMAT40_UPDATE_BACKUP */
28079 +       if (update_disk_version(super_data)) {
28080 +               __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
28081 +
28082 +               put_unaligned(cpu_to_le32(version), &super_data->version);
28083 +       }
28084 +}
28085 +
28086 +/* plugin->u.format.log_super
28087 +   repack the super block into the data of the format40 super jnode and return
28088 +   that jnode, which should be added to transaction when the super block gets logged */
28089 +jnode *log_super_format40(struct super_block *s)
28090 +{
28091 +       jnode *sb_jnode;
28092 +
28093 +       sb_jnode = get_super_private(s)->u.format40.sb_jnode;
28094 +       /* NOTE(review): nothing checks whether jload() succeeded before jdata() is used - confirm it cannot fail here */
28095 +       jload(sb_jnode);
28096 +
28097 +       pack_format40_super(s, jdata(sb_jnode));
28098 +
28099 +       jrelse(sb_jnode);
28100 +
28101 +       return sb_jnode;
28102 +}
28103 +
28104 +/* plugin->u.format.release: commit pending state unless the fs is read-only, then tear down format40 objects; always returns 0 */
28105 +int release_format40(struct super_block *s)
28106 +{
28107 +       int ret;
28108 +       reiser4_super_info_data *sbinfo;
28109 +
28110 +       sbinfo = get_super_private(s);
28111 +       assert("zam-579", sbinfo != NULL);
28112 +
28113 +       if (!rofs_super(s)) {
28114 +               ret = reiser4_capture_super_block(s);
28115 +               if (ret != 0)
28116 +                       warning("vs-898",
28117 +                               "reiser4_capture_super_block failed: %d",
28118 +                               ret);
28119 +               /* failures here are only logged; the release carries on regardless */
28120 +               ret = txnmgr_force_commit_all(s, 1);
28121 +               if (ret != 0)
28122 +                       warning("jmacd-74438", "txn_force failed: %d", ret);
28123 +
28124 +               all_grabbed2free();
28125 +       }
28126 +
28127 +       sa_destroy_allocator(&sbinfo->space_allocator, s);
28128 +       reiser4_done_journal_info(s);
28129 +       done_super_jnode(s);
28130 +
28131 +       rcu_barrier();
28132 +       reiser4_done_tree(&sbinfo->tree);
28133 +       /* run rcu_barrier() once more, because some znodes were "released" in
28134 +        * reiser4_done_tree(). */
28135 +       rcu_barrier();
28136 +
28137 +       return 0;
28138 +}
28139 +
28140 +#define FORMAT40_ROOT_LOCALITY 41
28141 +#define FORMAT40_ROOT_OBJECTID 42
28142 +
28143 +/* plugin->u.format.root_dir_key: return the constant key built from the two ids above; @super is unused */
28144 +const reiser4_key *root_dir_key_format40(const struct super_block *super
28145 +                                        UNUSED_ARG)
28146 +{
28147 +       static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
28148 +               .el = {
28149 +                       __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
28150 +#if REISER4_LARGE_KEY
28151 +                       ON_LARGE_KEY(0ull,)
28152 +#endif
28153 +                       __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
28154 +                       0ull
28155 +               }
28156 +       };
28157 +       /* the key is a function-local static object, so the same address is returned on every call */
28158 +       return &FORMAT40_ROOT_DIR_KEY;
28159 +}
28160 +
28161 +/* plugin->u.format.check_open.
28162 +   Check the opened object for validity. For now it checks only that the oid
28163 +   and the locality do not exceed the largest used oid; this can be improved
28164 +   later and its work may depend on the mount options. Returns 0 or -EIO. */
28165 +int check_open_format40(const struct inode *object)
28166 +{
28167 +       oid_t max, oid;
28168 +       /* upper bound for both checks: one below the value oid_next() reports */
28169 +       max = oid_next(object->i_sb) - 1;
28170 +
28171 +       /* Check the oid. */
28172 +       oid = get_inode_oid(object);
28173 +       if (oid > max) {
28174 +               warning("vpf-1360", "The object with the oid %llu "
28175 +                       "greater then the max used oid %llu found.",
28176 +                       (unsigned long long)oid, (unsigned long long)max);
28177 +
28178 +               return RETERR(-EIO);
28179 +       }
28180 +
28181 +       /* Check the locality (the oid variable is reused to hold it). */
28182 +       oid = reiser4_inode_data(object)->locality_id;
28183 +       if (oid > max) {
28184 +               warning("vpf-1361", "The object with the locality %llu "
28185 +                       "greater then the max used oid %llu found.",
28186 +                       (unsigned long long)oid, (unsigned long long)max);
28187 +
28188 +               return RETERR(-EIO);
28189 +       }
28190 +
28191 +       return 0;
28192 +}
28193 +
28194 +/* plugin->u.format.version_update.
28195 +   Perform all version update operations from the on-disk
28196 +   format40_disk_super_block.version on disk to FORMAT40_VERSION.
28197 +   Returns 0 if nothing had to be done, else the get_uber_znode() error or the force_commit_atom() result. */
28198 +int version_update_format40(struct super_block *super) {
28199 +       txn_handle * trans;
28200 +       lock_handle lh;
28201 +       txn_atom *atom;
28202 +       int ret;
28203 +
28204 +       /* Nothing to do if RO mount or the on-disk version is not less. */
28205 +       if (super->s_flags & MS_RDONLY)
28206 +               return 0;
28207 +
28208 +       if (get_super_private(super)->version >= FORMAT40_VERSION)
28209 +               return 0;
28210 +
28211 +       printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28212 +              "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28213 +              "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28214 +
28215 +       /* Mark the uber znode dirty to call log_super on write_logs. */
28216 +       init_lh(&lh);
28217 +       ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28218 +                            ZNODE_LOCK_HIPRI, &lh);
28219 +       if (ret != 0)
28220 +               return ret;
28221 +
28222 +       znode_make_dirty(lh.node);
28223 +       done_lh(&lh);
28224 +
28225 +       /* The backup blocks are not updated here; the message above asks for an fsck run to do that. */
28226 +
28227 +       /* Force write_logs immediately. */
28228 +       trans = get_current_context()->trans;
28229 +       atom = get_current_atom_locked();
28230 +       assert("vpf-1906", atom != NULL);
28231 +       /* NOTE(review): presumably force_commit_atom() expects the atom and txnh locks taken here and drops them - confirm */
28232 +       spin_lock_txnh(trans);
28233 +       return force_commit_atom(trans);
28234 +}
28235 +
28236 +/* Make Linus happy.
28237 +   Local variables:
28238 +   c-indentation-style: "K&R"
28239 +   mode-name: "LC"
28240 +   c-basic-offset: 8
28241 +   tab-width: 8
28242 +   fill-column: 120
28243 +   scroll-step: 1
28244 +   End:
28245 +*/
28246 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format40.h
28247 --- linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format40.h     1970-01-01 01:00:00.000000000 +0100
28248 +++ linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format40.h  2010-08-04 15:44:57.000000000 +0200
28249 @@ -0,0 +1,109 @@
28250 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28251 +
28252 +/* this file contains:
28253 +   - definition of ondisk super block of standard disk layout for
28254 +     reiser 4.0 (layout 40)
28255 +   - definition of layout 40 specific portion of in-core super block
28256 +   - declarations of functions implementing methods of layout plugin
28257 +     for layout 40
28258 +   - declarations of functions used to get/set fields in layout 40 super block
28259 +*/
28260 +
28261 +#ifndef __DISK_FORMAT40_H__
28262 +#define __DISK_FORMAT40_H__
28263 +
28264 +/* magic for default reiser4 layout */
28265 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28266 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
28267 +
28268 +#include "../../dformat.h"
28269 +
28270 +#include <linux/fs.h>          /* for struct super_block  */
28271 +
28272 +typedef enum {
28273 +       FORMAT40_LARGE_KEYS     /* NOTE(review): presumably a bit number within format40_disk_super_block.flags - confirm */
28274 +} format40_flags;
28275 +
28276 +/* ondisk super block for format 40. It is 512 bytes long; the leading numbers are byte offsets and each comment describes the field above it */
28277 +typedef struct format40_disk_super_block {
28278 +       /*   0 */ d64 block_count;
28279 +       /* number of blocks in a filesystem */
28280 +       /*   8 */ d64 free_blocks;
28281 +       /* number of free blocks */
28282 +       /*  16 */ d64 root_block;
28283 +       /* filesystem tree root block */
28284 +       /*  24 */ d64 oid;
28285 +       /* smallest free objectid */
28286 +       /*  32 */ d64 file_count;
28287 +       /* number of files in a filesystem */
28288 +       /*  40 */ d64 flushes;
28289 +       /* number of times super block was
28290 +          flushed. Needed if format 40
28291 +          will have a few super blocks */
28292 +       /*  48 */ d32 mkfs_id;
28293 +       /* unique identifier of fs */
28294 +       /*  52 */ char magic[16];
28295 +       /* magic string ReIsEr40FoRmAt */
28296 +       /*  68 */ d16 tree_height;
28297 +       /* height of filesystem tree */
28298 +       /*  70 */ d16 formatting_policy;
28299 +       /* not used anymore */
28300 +       /*  72 */ d64 flags;
28301 +       /*  80 */ d32 version;
28302 +       /* on-disk format version number
28303 +          initially assigned by mkfs as the greatest format40
28304 +          version number supported by reiser4progs and updated
28305 +          at mount time in accordance with the greatest format40
28306 +          version number supported by kernel.
28307 +          Is used by fsck to catch possible corruption and
28308 +          for various compatibility issues */
28309 +       /*  84 */ char not_used[428];
28310 +} format40_disk_super_block;
28311 +
28312 +/* format 40 specific part of reiser4_super_info_data */
28313 +typedef struct format40_super_info {
28314 +/*     format40_disk_super_block actual_sb; */
28315 +       jnode *sb_jnode;        /* jnode whose data log_super_format40() fills with the packed super block */
28316 +       struct {
28317 +               reiser4_block_nr super; /* NOTE(review): presumably the block number of the format40 super block - confirm */
28318 +       } loc;
28319 +} format40_super_info;
28320 +
28321 +/* Defines for journal header and footer respectively. */
28322 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28323 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28324 +
28325 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28326 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28327 +
28328 +#define FORMAT40_STATUS_BLOCKNR \
28329 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
28330 +
28331 +/* Diskmap declarations */
28332 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28333 +#define FORMAT40_SUPER 1
28334 +#define FORMAT40_JH 2
28335 +#define FORMAT40_JF 3
28336 +
28337 +/* declarations of functions implementing methods of layout plugin for
28338 +   format 40. The functions themselves are in disk_format40.c */
28339 +extern int init_format_format40(struct super_block *, void *data);
28340 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28341 +extern int release_format40(struct super_block *s);
28342 +extern jnode *log_super_format40(struct super_block *s);
28343 +extern int check_open_format40(const struct inode *object);
28344 +extern int version_update_format40(struct super_block *super);
28345 +
28346 +/* __DISK_FORMAT40_H__ */
28347 +#endif
28348 +
28349 +/* Make Linus happy.
28350 +   Local variables:
28351 +   c-indentation-style: "K&R"
28352 +   mode-name: "LC"
28353 +   c-basic-offset: 8
28354 +   tab-width: 8
28355 +   fill-column: 120
28356 +   scroll-step: 1
28357 +   End:
28358 +*/
28359 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format.c
28360 --- linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format.c       1970-01-01 01:00:00.000000000 +0100
28361 +++ linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format.c    2010-08-04 15:44:57.000000000 +0200
28362 @@ -0,0 +1,38 @@
28363 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28364 +
28365 +#include "../../debug.h"
28366 +#include "../plugin_header.h"
28367 +#include "disk_format40.h"
28368 +#include "disk_format.h"
28369 +#include "../plugin.h"
28370 +
28371 +/* initialization of disk layout plugins: the table is indexed by disk format id and wires in the format40 methods */
28372 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28373 +       [FORMAT40_ID] = {
28374 +               .h = {
28375 +                       .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28376 +                       .id = FORMAT40_ID,
28377 +                       .pops = NULL,
28378 +                       .label = "reiser40",
28379 +                       .desc = "standard disk layout for reiser40",
28380 +                       .linkage = {NULL, NULL}
28381 +               },
28382 +               .init_format = init_format_format40,
28383 +               .root_dir_key = root_dir_key_format40,
28384 +               .release = release_format40,
28385 +               .log_super = log_super_format40,
28386 +               .check_open = check_open_format40,
28387 +               .version_update = version_update_format40
28388 +       }
28389 +};
28390 +
28391 +/* Make Linus happy.
28392 +   Local variables:
28393 +   c-indentation-style: "K&R"
28394 +   mode-name: "LC"
28395 +   c-basic-offset: 8
28396 +   tab-width: 8
28397 +   fill-column: 120
28398 +   scroll-step: 1
28399 +   End:
28400 +*/
28401 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format.h
28402 --- linux-2.6.35.orig/fs/reiser4/plugin/disk_format/disk_format.h       1970-01-01 01:00:00.000000000 +0100
28403 +++ linux-2.6.35/fs/reiser4/plugin/disk_format/disk_format.h    2010-08-04 15:44:57.000000000 +0200
28404 @@ -0,0 +1,27 @@
28405 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28406 +
28407 +/* identifiers for disk layouts, they are also used as indexes in array of disk
28408 +   plugins */
28409 +
28410 +#if !defined( __REISER4_DISK_FORMAT_H__ )
28411 +#define __REISER4_DISK_FORMAT_H__
28412 +
28413 +typedef enum {
28414 +       /* standard reiser4 disk layout plugin id */
28415 +       FORMAT40_ID,
28416 +       LAST_FORMAT_ID  /* one past the last id; not a format itself */
28417 +} disk_format_id;
28418 +
28419 +/* __REISER4_DISK_FORMAT_H__ */
28420 +#endif
28421 +
28422 +/* Make Linus happy.
28423 +   Local variables:
28424 +   c-indentation-style: "K&R"
28425 +   mode-name: "LC"
28426 +   c-basic-offset: 8
28427 +   tab-width: 8
28428 +   fill-column: 120
28429 +   scroll-step: 1
28430 +   End:
28431 +*/
28432 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.35/fs/reiser4/plugin/disk_format/Makefile
28433 --- linux-2.6.35.orig/fs/reiser4/plugin/disk_format/Makefile    1970-01-01 01:00:00.000000000 +0100
28434 +++ linux-2.6.35/fs/reiser4/plugin/disk_format/Makefile 2010-08-04 15:44:57.000000000 +0200
28435 @@ -0,0 +1,5 @@
28436 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
28437 +
28438 +df_plugins-objs :=     \
28439 +       disk_format40.o \
28440 +       disk_format.o
28441 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/fibration.c linux-2.6.35/fs/reiser4/plugin/fibration.c
28442 --- linux-2.6.35.orig/fs/reiser4/plugin/fibration.c     1970-01-01 01:00:00.000000000 +0100
28443 +++ linux-2.6.35/fs/reiser4/plugin/fibration.c  2010-08-04 15:44:57.000000000 +0200
28444 @@ -0,0 +1,175 @@
28445 +/* Copyright 2004 by Hans Reiser, licensing governed by
28446 + * reiser4/README */
28447 +
28448 +/* Directory fibrations */
28449 +
28450 +/*
28451 + * Suppose we have a directory tree with sources of some project. During
28452 + * compilation .o files are created within this tree. This makes access
28453 + * to the original source files less efficient, because source files are
28454 + * now "diluted" by object files: default directory plugin uses prefix
28455 + * of a file name as a part of the key for directory entry (and this
28456 + * part is also inherited by the key of file body). This means that
28457 + * foo.o will be located close to foo.c and foo.h in the tree.
28458 + *
28459 + * To avoid this effect the directory plugin fills the highest 7 (unused
28460 + * originally) bits of the second component of the directory entry key
28461 + * by bit-pattern depending on the file name (see
28462 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28463 + * "fibre". Fibre of the file name key is inherited by key of stat data
28464 + * and keys of file body (in the case of REISER4_LARGE_KEY).
28465 + *
28466 + * Fibre for a given file is chosen by per-directory fibration
28467 + * plugin. Names within given fibre are ordered lexicographically.
28468 + */
28469 +
28470 +#include "../debug.h"
28471 +#include "plugin_header.h"
28472 +#include "plugin.h"
28473 +#include "../super.h"
28474 +#include "../inode.h"
28475 +
28476 +#include <linux/types.h>
28477 +
28478 +static const int fibre_shift = 57;
28479 +/* place fibre number n in the bits above fibre_shift of a 64-bit value (64 - 57 = 7 bits survive the shift) */
28480 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
28481 +
28482 +/*
28483 + * Trivial fibration: every name gets fibre 0 (the arguments are ignored),
28484 + * so all files of a directory are just ordered lexicographically.
28485 + */
28486 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28487 +{
28488 +       return FIBRE_NO(0);
28489 +}
28490 +
28491 +/*
28492 + * dot-o fibration: place .o files (fibre 1) after all others (fibre 0).
28493 + */
28494 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28495 +{
28496 +       /* names of at most two characters never count as object files */
28497 +       if (len <= 2)
28498 +               return FIBRE_NO(0);
28499 +       /* the comparison yields 1 for a ".o" suffix and 0 otherwise */
28500 +       return FIBRE_NO(name[len - 1] == 'o' && name[len - 2] == '.');
28501 +}
28502 +
28503 +/*
28504 + * ext.1 fibration: a name ending in ".<c>" with at least one character
28505 + * before the dot goes into the fibre numbered by the character <c> (file
28506 + * "foo.h" goes into fibre "h"); every other name goes into fibre 0.
28507 + */
28508 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28509 +{
28510 +       if (len <= 2 || name[len - 2] != '.')
28511 +               return FIBRE_NO(0);
28512 +       /* the extension character itself is the fibre number */
28513 +       return FIBRE_NO(name[len - 1]);
28514 +}
28515 +
28516 +/*
28517 + * ext.3 fibration: names ending in ".xyz" get a fibre derived from the sum
28518 + * of the three extension characters; all other names share fibre 0.
28519 + */
28520 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28521 +{
28522 +       if (len <= 4 || name[len - 4] != '.')
28523 +               return FIBRE_NO(0);
28524 +       /* the sum of the three extension characters selects the fibre */
28525 +       return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28526 +}
28527 +
28528 +static int change_fibration(struct inode *inode,
28529 +                           reiser4_plugin * plugin,
28530 +                           pset_member memb)
28531 +{
28532 +       int result;
28533 +       /* Install @plugin as the fibration plugin of @inode; @memb is not used. Returns 0 when nothing needs changing. */
28534 +       assert("nikita-3503", inode != NULL);
28535 +       assert("nikita-3504", plugin != NULL);
28536 +
28537 +       assert("nikita-3505", is_reiser4_inode(inode));
28538 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28539 +       assert("nikita-3507",
28540 +              plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28541 +       /* only act when the inode has no fibration plugin yet or has one with a different id */
28542 +       result = 0;
28543 +       if (inode_fibration_plugin(inode) == NULL ||
28544 +           inode_fibration_plugin(inode)->h.id != plugin->h.id) {
28545 +               if (is_dir_empty(inode) == 0)   /* 0 is taken as "empty", given the -ENOTEMPTY branch below */
28546 +                       result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28547 +                                                PSET_FIBRATION, plugin);
28548 +               else
28549 +                       result = RETERR(-ENOTEMPTY);
28550 +
28551 +       }
28552 +       return result;
28553 +}
28554 +
28555 +static reiser4_plugin_ops fibration_plugin_ops = {
28556 +       .init = NULL,
28557 +       .load = NULL,
28558 +       .save_len = NULL,
28559 +       .save = NULL,
28560 +       .change = change_fibration      /* the only operation provided */
28561 +};
28562 +
28563 +/* fibration plugins, indexed by fibration id; all entries share fibration_plugin_ops and differ in ->fibre */
28564 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28565 +       [FIBRATION_LEXICOGRAPHIC] = {
28566 +               .h = {
28567 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28568 +                       .id = FIBRATION_LEXICOGRAPHIC,
28569 +                       .pops = &fibration_plugin_ops,
28570 +                       .label = "lexicographic",
28571 +                       .desc = "no fibration",
28572 +                       .linkage = {NULL, NULL}
28573 +               },
28574 +               .fibre = fibre_trivial
28575 +       },
28576 +       [FIBRATION_DOT_O] = {
28577 +               .h = {
28578 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28579 +                       .id = FIBRATION_DOT_O,
28580 +                       .pops = &fibration_plugin_ops,
28581 +                       .label = "dot-o",
28582 +                       .desc = "fibrate .o files separately",
28583 +                       .linkage = {NULL, NULL}
28584 +               },
28585 +               .fibre = fibre_dot_o
28586 +       },
28587 +       [FIBRATION_EXT_1] = {
28588 +               .h = {
28589 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28590 +                       .id = FIBRATION_EXT_1,
28591 +                       .pops = &fibration_plugin_ops,
28592 +                       .label = "ext-1",
28593 +                       .desc = "fibrate file by single character extension",
28594 +                       .linkage = {NULL, NULL}
28595 +               },
28596 +               .fibre = fibre_ext_1
28597 +       },
28598 +       [FIBRATION_EXT_3] = {
28599 +               .h = {
28600 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28601 +                       .id = FIBRATION_EXT_3,
28602 +                       .pops = &fibration_plugin_ops,
28603 +                       .label = "ext-3",
28604 +                       .desc = "fibrate file by three character extension",
28605 +                       .linkage = {NULL, NULL}
28606 +               },
28607 +               .fibre = fibre_ext_3
28608 +       }
28609 +};
28610 +
28611 +/*
28612 + * Local variables:
28613 + * c-indentation-style: "K&R"
28614 + * mode-name: "LC"
28615 + * c-basic-offset: 8
28616 + * tab-width: 8
28617 + * fill-column: 79
28618 + * End:
28619 + */
28620 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/fibration.h linux-2.6.35/fs/reiser4/plugin/fibration.h
28621 --- linux-2.6.35.orig/fs/reiser4/plugin/fibration.h     1970-01-01 01:00:00.000000000 +0100
28622 +++ linux-2.6.35/fs/reiser4/plugin/fibration.h  2010-08-04 15:44:57.000000000 +0200
28623 @@ -0,0 +1,37 @@
28624 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28625 +
28626 +/* Fibration plugin used by hashed directory plugin to segment content
28627 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28628 +
28629 +#if !defined(__FS_REISER4_PLUGIN_FIBRATION_H__)
28630 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
28631 +
28632 +#include "plugin_header.h"
28633 +
28634 +typedef struct fibration_plugin {
28635 +       /* generic fields */
28636 +       plugin_header h;
28637 +       /* returns the fibre value for the name @name of length @len in directory @dir */
28638 +        __u64(*fibre) (const struct inode *dir, const char *name, int len);
28639 +} fibration_plugin;
28640 +
28641 +typedef enum {
28642 +       FIBRATION_LEXICOGRAPHIC,        /* "lexicographic": no fibration */
28643 +       FIBRATION_DOT_O,        /* "dot-o": .o files fibrated separately */
28644 +       FIBRATION_EXT_1,        /* "ext-1": by single character extension */
28645 +       FIBRATION_EXT_3,        /* "ext-3": by three character extension */
28646 +       LAST_FIBRATION_ID       /* one past the last id; not a fibration itself */
28647 +} reiser4_fibration_id;
28648 +
28649 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28650 +#endif
28651 +
28652 +/* Make Linus happy.
28653 +   Local variables:
28654 +   c-indentation-style: "K&R"
28655 +   mode-name: "LC"
28656 +   c-basic-offset: 8
28657 +   tab-width: 8
28658 +   fill-column: 120
28659 +   End:
28660 +*/
28661 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.35/fs/reiser4/plugin/file/cryptcompress.c
28662 --- linux-2.6.35.orig/fs/reiser4/plugin/file/cryptcompress.c    1970-01-01 01:00:00.000000000 +0100
28663 +++ linux-2.6.35/fs/reiser4/plugin/file/cryptcompress.c 2010-08-04 15:44:57.000000000 +0200
28664 @@ -0,0 +1,3803 @@
28665 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28666 +   reiser4/README */
28667 +/*
28668 + * Written by Edward Shishkin.
28669 + *
28670 + * Implementations of inode/file/address_space operations
28671 + * specific for cryptcompress file plugin which manages
28672 + * regular files built of compressed and(or) encrypted bodies.
28673 + * See http://dev.namesys.com/CryptcompressPlugin for details.
28674 + */
28675 +
28676 +#include "../../inode.h"
28677 +#include "../cluster.h"
28678 +#include "../object.h"
28679 +#include "../../tree_walk.h"
28680 +#include "cryptcompress.h"
28681 +
28682 +#include <linux/pagevec.h>
28683 +#include <asm/uaccess.h>
28684 +#include <linux/swap.h>
28685 +#include <linux/writeback.h>
28686 +#include <linux/random.h>
28687 +#include <linux/scatterlist.h>
28688 +
28689 +/*
28690 +               Managing primary and secondary caches by Reiser4
28691 +               cryptcompress file plugin. Synchronization scheme.
28692 +
28693 +
28694 +                                             +------------------+
28695 +                        +------------------->|    tfm stream    |
28696 +                        |                    | (compressed data)|
28697 +                  flush |                    +------------------+
28698 +                        +-----------------+           |
28699 +                        |(->)longterm lock|           V
28700 +--+        writepages() |                 |        +-***-+  reiser4        +---+
28701 +  |                     |                 +--+     | *** |  storage tree   |   |
28702 +  |                     |                    |     +-***-+  (primary cache)|   |
28703 +u | write()   (secondary| cache)             V    /   |   \                |   |
28704 +s | ---->  +----+ +----+ +----+ +----+     +-***** ******* **----+  ---->  | d |
28705 +e |        |    | |page cluster |    |     | **disk cluster**    |         | i |
28706 +r | <----  +----+ +----+ +----+ +----+     +-***** **********----+  <----  | s |
28707 +  | read()              ^                      ^      |                    | k |
28708 +  |                     |     (->)longterm lock|      |           page_io()|   |
28709 +  |                     |                      +------+                    |   |
28710 +--+         readpages() |                             |                    +---+
28711 +                        |                             V
28712 +                        |                    +------------------+
28713 +                        +--------------------|    tfm stream    |
28714 +                                             |   (plain text)   |
28715 +                                             +------------------+
28716 +*/
28717 +
28718 +/* get cryptcompress specific portion of inode: the address of file_plugin_data.cryptcompress_info inside the reiser4 inode data */
28719 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28720 +{
28721 +       return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28722 +}
28723 +
28724 +/* plugin->u.file.init_inode_data: reset the cryptcompress part of @inode and set its defaults; @crd and @create are only passed on to init_inode_ordering() */
28725 +void init_inode_data_cryptcompress(struct inode *inode,
28726 +                                  reiser4_object_create_data * crd,
28727 +                                  int create)
28728 +{
28729 +       struct cryptcompress_info *data;
28730 +
28731 +       data = cryptcompress_inode_data(inode);
28732 +       assert("edward-685", data != NULL);
28733 +       /* start from an all-zero state (this also clears ->crypt), then set the non-zero defaults */
28734 +       memset(data, 0, sizeof(*data));
28735 +
28736 +       mutex_init(&data->checkin_mutex);
28737 +       data->trunc_index = ULONG_MAX;
28738 +       turn_on_compression(data);
28739 +       set_lattice_factor(data, MIN_LATTICE_FACTOR);
28740 +       init_inode_ordering(inode, crd, create);
28741 +}
28742 +
28743 +/* The following is a part of reiser4 cipher key manager
28744 +   which is called when opening/creating a cryptcompress file */
28745 +
28746 +/* get/set cipher key info (kept in the ->crypt field of the cryptcompress inode data) */
28747 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28748 +{
28749 +       assert("edward-90", inode != NULL);
28750 +       assert("edward-91", reiser4_inode_data(inode) != NULL);
28751 +       return cryptcompress_inode_data(inode)->crypt;
28752 +}
28753 +
28754 +static void set_inode_crypto_info (struct inode * inode,
28755 +                                  struct reiser4_crypto_info * info)
28756 +{
28757 +       cryptcompress_inode_data(inode)->crypt = info;  /* plain pointer store; a previous value is simply overwritten */
28758 +}
28759 +
28760 +/* allocate a cipher key info with @inode as host; returns the new info or ERR_PTR(-ENOMEM), never NULL */
28761 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28762 +{
28763 +       struct reiser4_crypto_info *info;
28764 +       int fipsize;
28765 +
28766 +       info = kzalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
28767 +       if (!info)
28768 +               return ERR_PTR(-ENOMEM);
28769 +       /* the keyid buffer is sized by the digest plugin's fipsize and is not zeroed (kmalloc) */
28770 +       fipsize = inode_digest_plugin(inode)->fipsize;
28771 +       info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
28772 +       if (!info->keyid) {
28773 +               kfree(info);
28774 +               return ERR_PTR(-ENOMEM);
28775 +       }
28776 +       info->host = inode;
28777 +       return info;
28778 +}
28779 +
28780 +#if 0
28781 +/* allocate low-level info for cipher and digest transforms (compiled out
28782 +   by the surrounding #if 0); a digest failure releases the cipher transform again */
28783 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28784 +{
28785 +       struct crypto_blkcipher * ctfm = NULL;
28786 +       struct crypto_hash      * dtfm = NULL;
28787 +       cipher_plugin * cplug = inode_cipher_plugin(info->host);
28788 +       digest_plugin * dplug = inode_digest_plugin(info->host);
28789 +       /* a plugin without ->alloc leaves its transform NULL, which is still stored into @info */
28790 +       if (cplug->alloc) {
28791 +               ctfm = cplug->alloc();
28792 +               if (IS_ERR(ctfm)) {
28793 +                       warning("edward-1364",
28794 +                               "Can not allocate info for %s\n",
28795 +                               cplug->h.desc);
28796 +                       return RETERR(PTR_ERR(ctfm));
28797 +               }
28798 +       }
28799 +       info_set_cipher(info, ctfm);
28800 +       if (dplug->alloc) {
28801 +               dtfm = dplug->alloc();
28802 +               if (IS_ERR(dtfm)) {
28803 +                       warning("edward-1365",
28804 +                               "Can not allocate info for %s\n",
28805 +                               dplug->h.desc);
28806 +                       goto unhappy_with_digest;
28807 +               }
28808 +       }
28809 +       info_set_digest(info, dtfm);
28810 +       return 0;
28811 + unhappy_with_digest:
28812 +       if (cplug->free) {
28813 +               cplug->free(ctfm);      /* NOTE(review): ctfm is still NULL here when cplug->alloc is unset - confirm ->free accepts that */
28814 +               info_set_cipher(info, NULL);
28815 +       }
28816 +       return RETERR(PTR_ERR(dtfm));
28817 +}
28818 +#endif
28819 +
28820 +static void
28821 +free_crypto_tfms(struct reiser4_crypto_info * info)
28822 +{      /* release the cipher and digest transforms held by @info */
28823 +       assert("edward-1366", info != NULL);
28824 +       if (!info_get_cipher(info)) {
28825 +               assert("edward-1601", !info_get_digest(info));
28826 +               return; /* no cipher set, so no digest is expected either */
28827 +       }
28828 +       inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28829 +       info_set_cipher(info, NULL);
28830 +       inode_digest_plugin(info->host)->free(info_get_digest(info));
28831 +       info_set_digest(info, NULL);
28832 +       return;
28833 +}
28834 +
28835 +#if 0
28836 +/* create a key fingerprint for disk stat-data */
28837 +static int create_keyid (struct reiser4_crypto_info * info,
28838 +                        struct reiser4_crypto_data * data)
28839 +{
28840 +       int ret = -ENOMEM;
28841 +       size_t blk, pad;
28842 +       __u8 * dmem;
28843 +       __u8 * cmem;
28844 +       struct hash_desc      ddesc;
28845 +       struct blkcipher_desc cdesc;
28846 +       struct scatterlist sg;
28847 +
28848 +       assert("edward-1367", info != NULL);
28849 +       assert("edward-1368", info->keyid != NULL);
28850 +
28851 +       ddesc.tfm = info_get_digest(info);
28852 +       ddesc.flags = 0;
28853 +       cdesc.tfm = info_get_cipher(info);
28854 +       cdesc.flags = 0;
28855 +
28856 +       dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
28857 +                      reiser4_ctx_gfp_mask_get());
28858 +       if (!dmem)
28859 +               goto exit1;
28860 +
28861 +       blk = crypto_blkcipher_blocksize(cdesc.tfm);
28862 +
28863 +       pad = data->keyid_size % blk;
28864 +       pad = (pad ? blk - pad : 0);
28865 +
28866 +       cmem = kmalloc((size_t)data->keyid_size + pad,
28867 +                      reiser4_ctx_gfp_mask_get());
28868 +       if (!cmem)
28869 +               goto exit2;
28870 +       memcpy(cmem, data->keyid, data->keyid_size);
28871 +       memset(cmem + data->keyid_size, 0, pad);
28872 +
28873 +       sg_init_one(&sg, cmem, data->keyid_size + pad);
28874 +
28875 +       ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
28876 +                                      data->keyid_size + pad);
28877 +       if (ret) {
28878 +               warning("edward-1369",
28879 +                       "encryption failed flags=%x\n", cdesc.flags);
28880 +               goto exit3;
28881 +       }
28882 +       ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
28883 +       if (ret) {
28884 +               warning("edward-1602",
28885 +                       "digest failed flags=%x\n", ddesc.flags);
28886 +               goto exit3;
28887 +       }
28888 +       memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
28889 + exit3:
28890 +       kfree(cmem);
28891 + exit2:
28892 +       kfree(dmem);
28893 + exit1:
28894 +       return ret;
28895 +}
28896 +#endif
28897 +
28898 +static void destroy_keyid(struct reiser4_crypto_info * info)
28899 +{      /* free the keyid buffer of @info; info->keyid itself is not reset */
28900 +       assert("edward-1370", info != NULL);
28901 +       assert("edward-1371", info->keyid != NULL);
28902 +       kfree(info->keyid);
28903 +       return;
28904 +}
28905 +
28906 +static void __free_crypto_info (struct inode * inode)
28907 +{      /* free transforms, keyid and the key info attached to @inode */
28908 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
28909 +       assert("edward-1372", info != NULL);
28910 +       /* the inode's pointer to @info is not cleared here */
28911 +       free_crypto_tfms(info);
28912 +       destroy_keyid(info);
28913 +       kfree(info);
28914 +}
28915 +
28916 +#if 0
28917 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
28918 +{
28919 +       assert("edward-1373", info != NULL);
28920 +       assert("edward-1374", info->inst == 0);
28921 +       info->inst = 1;
28922 +}
28923 +#endif
28924 +
28925 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
28926 +{      /* clear the ->inst flag of @info */
28927 +       assert("edward-1375", info != NULL);
28928 +       info->inst = 0;
28929 +}
28930 +
28931 +#if 0
28932 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
28933 +{
28934 +       return info->inst;
28935 +}
28936 +
28937 +static int inode_has_cipher_key(struct inode * inode)
28938 +{
28939 +       assert("edward-1376", inode != NULL);
28940 +       return inode_crypto_info(inode) &&
28941 +               is_crypto_info_instantiated(inode_crypto_info(inode));
28942 +}
28943 +#endif
28944 +
28945 +static void free_crypto_info (struct inode * inode)
28946 +{      /* clear the ->inst flag of @inode's key info, then free it */
28947 +       uninstantiate_crypto_info(inode_crypto_info(inode));
28948 +       __free_crypto_info(inode);
28949 +}
28950 +
28951 +static int need_cipher(struct inode * inode)
28952 +{      /* true unless @inode's cipher plugin is the NONE_CIPHER_ID one */
28953 +       return inode_cipher_plugin(inode) !=
28954 +               cipher_plugin_by_id(NONE_CIPHER_ID);
28955 +}
28956 +
28957 +/* Parse @data, which contains an (uninstantiated) cipher key imported
28958 +   from user space, and create a low-level cipher info for the @object.
28959 +   On success, the returned info contains an instantiated key */
28960 +#if 0
28961 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
28962 +                                 struct reiser4_crypto_data * data)
28963 +{
28964 +       int ret;
28965 +       struct reiser4_crypto_info * info;
28966 +
28967 +       assert("edward-1377", data != NULL);
28968 +       assert("edward-1378", need_cipher(object));
28969 +
28970 +       if (inode_file_plugin(object) !=
28971 +           file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
28972 +               return ERR_PTR(-EINVAL);
28973 +
28974 +       info = reiser4_alloc_crypto_info(object);
28975 +       if (IS_ERR(info))
28976 +               return info;
28977 +       ret = alloc_crypto_tfms(info);
28978 +       if (ret)
28979 +               goto err;
28980 +       /* instantiating a key */
28981 +       ret = crypto_blkcipher_setkey(info_get_cipher(info),
28982 +                                     data->key,
28983 +                                     data->keysize);
28984 +       if (ret) {
28985 +               warning("edward-1379",
28986 +                       "setkey failed flags=%x",
28987 +                       crypto_blkcipher_get_flags(info_get_cipher(info)));
28988 +               goto err;
28989 +       }
28990 +       info->keysize = data->keysize;
28991 +       ret = create_keyid(info, data);
28992 +       if (ret)
28993 +               goto err;
28994 +       instantiate_crypto_info(info);
28995 +       return info;
28996 + err:
28997 +       __free_crypto_info(object);
28998 +       return ERR_PTR(ret);
28999 +}
29000 +#endif
29001 +
29002 +/* increment the load counter of @info; done when the crypto-stat
29003 +   is attached to an object (unload_crypto_info() decrements it) */
29004 +static void load_crypto_info(struct reiser4_crypto_info * info)
29005 +{
29006 +       assert("edward-1380", info != NULL);
29007 +       inc_keyload_count(info);
29008 +}
29009 +
29010 +static void unload_crypto_info(struct inode * inode)
29011 +{      /* decrement the load counter; @inode must have key info attached */
29012 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
29013 +       assert("edward-1381", info->keyload_count > 0);
29014 +
29015 +       dec_keyload_count(inode_crypto_info(inode));
29016 +       if (info->keyload_count == 0)
29017 +               /* final release */
29018 +               free_crypto_info(inode);
29019 +}
29020 +
29021 +/* attach an existing crypto-stat @info to @inode and bump its load counter */
29022 +void reiser4_attach_crypto_info(struct inode * inode,
29023 +                               struct reiser4_crypto_info * info)
29024 +{
29025 +       assert("edward-1382", inode != NULL);
29026 +       assert("edward-1383", info != NULL);
29027 +       assert("edward-1384", inode_crypto_info(inode) == NULL);
29028 +       /* @inode must not already carry key info (asserted above) */
29029 +       set_inode_crypto_info(inode, info);
29030 +       load_crypto_info(info);
29031 +}
29032 +
29033 +/* returns 1 if @host's file plugin is the cryptcompress one, else 0 */
29034 +#if REISER4_DEBUG
29035 +static int host_allows_crypto_info(struct inode * host)
29036 +{
29037 +       int ret;
29038 +       file_plugin * fplug = inode_file_plugin(host);
29039 +
29040 +       switch (fplug->h.id) {
29041 +       case CRYPTCOMPRESS_FILE_PLUGIN_ID:
29042 +               ret = 1;
29043 +               break;
29044 +       default:
29045 +               ret = 0;        /* any other file plugin */
29046 +       }
29047 +       return ret;
29048 +}
29049 +#endif  /*  REISER4_DEBUG  */
29050 +
29051 +static void reiser4_detach_crypto_info(struct inode * inode)
29052 +{      /* unload @inode's key info (if any) and clear the inode's pointer */
29053 +       assert("edward-1385", inode != NULL);
29054 +       assert("edward-1386", host_allows_crypto_info(inode));
29055 +       /* unload frees the info when its load counter drops to zero */
29056 +       if (inode_crypto_info(inode))
29057 +               unload_crypto_info(inode);
29058 +       set_inode_crypto_info(inode, NULL);
29059 +}
29060 +
29061 +#if 0
29062 +
29063 +/* compare fingerprints of @child and @parent */
29064 +static int keyid_eq(struct reiser4_crypto_info * child,
29065 +                   struct reiser4_crypto_info * parent)
29066 +{
29067 +       return !memcmp(child->keyid,
29068 +                      parent->keyid,
29069 +                      info_digest_plugin(parent)->fipsize);
29070 +}
29071 +
29072 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
29073 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
29074 +{
29075 +       if (!need_cipher(child))
29076 +               return 0;
29077 +       /* the child is created */
29078 +       if (!inode_crypto_info(child))
29079 +               return 1;
29080 +       /* the child is looked up */
29081 +       if (!inode_crypto_info(parent))
29082 +               return 0;
29083 +       return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
29084 +               inode_digest_plugin(child) == inode_digest_plugin(parent) &&
29085 +               inode_crypto_info(child)->keysize ==
29086 +               inode_crypto_info(parent)->keysize &&
29087 +               keyid_eq(inode_crypto_info(child), inode_crypto_info(parent)));
29088 +}
29089 +#endif
29090 +
29091 +/* helper functions for ->create() method of the cryptcompress plugin */
29092 +static int inode_set_crypto(struct inode * object)
29093 +{      /* flag CRYPTO_STAT in extmask when key info is attached to @object */
29094 +       reiser4_inode * info;
29095 +       if (!inode_crypto_info(object)) {
29096 +               if (need_cipher(object))
29097 +                       return RETERR(-EINVAL); /* cipher set, no key info */
29098 +               /* the file is not to be encrypted */
29099 +               return 0;
29100 +       }
29101 +       info = reiser4_inode_data(object);
29102 +       info->extmask |= (1 << CRYPTO_STAT);
29103 +       return 0;
29104 +}
29105 +
29106 +static int inode_init_compression(struct inode * object)
29107 +{      /* run the compression plugin's optional ->init(); 0 if it has none */
29108 +       int result = 0;
29109 +       assert("edward-1461", object != NULL);
29110 +       if (inode_compression_plugin(object)->init)
29111 +               result = inode_compression_plugin(object)->init();
29112 +       return result;
29113 +}
29114 +
29115 +/* -EINVAL if cluster size < page size or cluster shift >= bits in int */
29116 +static int inode_check_cluster(struct inode * object)
29117 +{
29118 +       assert("edward-696", object != NULL);
29119 +       if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {
29120 +               warning("edward-1320", "Can not support '%s' "
29121 +                       "logical clusters (less then page size)",
29122 +                       inode_cluster_plugin(object)->h.label);
29123 +               return RETERR(-EINVAL);
29124 +       }
29125 +       if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) {
29126 +               warning("edward-1463", "Can not support '%s' "
29127 +                       "logical clusters (too big for transform)",
29128 +                       inode_cluster_plugin(object)->h.label);
29129 +               return RETERR(-EINVAL);
29130 +       }
29131 +       return 0;
29132 +}
29133 +
29134 +/* plugin->destroy_inode(): detach the crypto info from @inode */
29135 +void destroy_inode_cryptcompress(struct inode * inode)
29136 +{
29137 +       assert("edward-1464", INODE_PGCOUNT(inode) == 0);
29138 +       reiser4_detach_crypto_info(inode);
29139 +       return;
29140 +}
29141 +
29142 +/* plugin->create_object():
29143 +. mark the file plugin in plugin_mask
29144 +. flag the crypto stat (if key info is attached), init compression,
29145 +  check the cluster size
29146 +. save everything in disk stat-data; on any failure detach crypto info
29147 +*/
29148 +int create_object_cryptcompress(struct inode *object, struct inode *parent,
29149 +                               reiser4_object_create_data * data)
29150 +{
29151 +       int result;
29152 +       reiser4_inode *info;
29153 +       /* @parent and @data are only inspected by the assertions below */
29154 +       assert("edward-23", object != NULL);
29155 +       assert("edward-24", parent != NULL);
29156 +       assert("edward-30", data != NULL);
29157 +       assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
29158 +       assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
29159 +
29160 +       info = reiser4_inode_data(object);
29161 +
29162 +       assert("edward-29", info != NULL);
29163 +
29164 +       /* set file bit */
29165 +       info->plugin_mask |= (1 << PSET_FILE);
29166 +
29167 +       /* set crypto */
29168 +       result = inode_set_crypto(object);
29169 +       if (result)
29170 +               goto error;
29171 +       /* set compression */
29172 +       result = inode_init_compression(object);
29173 +       if (result)
29174 +               goto error;
29175 +       /* set cluster */
29176 +       result = inode_check_cluster(object);
29177 +       if (result)
29178 +               goto error;
29179 +
29180 +       /* save everything in disk stat-data */
29181 +       result = write_sd_by_inode_common(object);
29182 +       if (!result)
29183 +               return 0;
29184 + error:
29185 +       reiser4_detach_crypto_info(object);
29186 +       return result;
29187 +}
29188 +
29189 +/* plugin->open(): does nothing and always returns 0 */
29190 +int open_cryptcompress(struct inode * inode, struct file * file)
29191 +{
29192 +       return 0;
29193 +}
29194 +
29195 +/* returns the block size of the cipher transform in @inode's key info */
29196 +static unsigned int
29197 +cipher_blocksize(struct inode * inode)
29198 +{
29199 +       assert("edward-758", need_cipher(inode));
29200 +       assert("edward-1400", inode_crypto_info(inode) != NULL);
29201 +       return crypto_blkcipher_blocksize
29202 +               (info_get_cipher(inode_crypto_info(inode)));
29203 +}
29204 +
29205 +/* returns @src_off translated by the cipher plugin's ->scale() method */
29206 +static loff_t inode_scaled_offset (struct inode * inode,
29207 +                                  const loff_t src_off /* input offset */)
29208 +{
29209 +       assert("edward-97", inode != NULL);
29210 +       /* no cipher, or offset of the min/max key: return it unscaled */
29211 +       if (!need_cipher(inode) ||
29212 +           src_off == get_key_offset(reiser4_min_key()) ||
29213 +           src_off == get_key_offset(reiser4_max_key()))
29214 +               return src_off;
29215 +
29216 +       return inode_cipher_plugin(inode)->scale(inode,
29217 +                                                cipher_blocksize(inode),
29218 +                                                src_off);
29219 +}
29220 +
29221 +/* returns disk cluster size: the logical cluster size of @inode, scaled */
29222 +size_t inode_scaled_cluster_size(struct inode * inode)
29223 +{
29224 +       assert("edward-110", inode != NULL);
29225 +
29226 +       return inode_scaled_offset(inode, inode_cluster_size(inode));
29227 +}
29228 +
29229 +/* set clust->old_nrpages and clust->nr_pages for the cluster @clust */
29230 +static void set_cluster_nrpages(struct cluster_handle * clust,
29231 +                               struct inode *inode)
29232 +{
29233 +       struct reiser4_slide * win;
29234 +
29235 +       assert("edward-180", clust != NULL);
29236 +       assert("edward-1040", inode != NULL);
29237 +       /* old_nrpages always comes from lbytes() of the cluster index */
29238 +       clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29239 +       win = clust->win;
29240 +       if (!win) {
29241 +               clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29242 +               return; /* no window: same value as old_nrpages */
29243 +       }
29244 +       assert("edward-1176", clust->op != LC_INVAL);
29245 +       assert("edward-1064", win->off + win->count + win->delta != 0);
29246 +
29247 +       if (win->stat == HOLE_WINDOW &&
29248 +           win->off == 0 && win->count == inode_cluster_size(inode)) {
29249 +               /* special case: writing a "fake" logical cluster */
29250 +               clust->nr_pages = 0;
29251 +               return;
29252 +       }
29253 +       clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta,
29254 +                                           lbytes(clust->index, inode)));
29255 +       return; /* pages cover the larger of window end and lbytes() */
29256 +}
29257 +
29258 +/* plugin->key_by_inode()
29259 +   build key of a disk cluster; always returns 0 */
29260 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29261 +                              reiser4_key * key)
29262 +{
29263 +       assert("edward-64", inode != 0);
29264 +       /* the max-key offset is not passed through off_to_clust_to_off() */
29265 +       if (likely(off != get_key_offset(reiser4_max_key())))
29266 +               off = off_to_clust_to_off(off, inode);
29267 +       if (inode_crypto_info(inode))
29268 +               off = inode_scaled_offset(inode, off);
29269 +       /* build the key for offset 0, then set the computed offset */
29270 +       key_by_inode_and_offset_common(inode, 0, key);
29271 +       set_key_offset(key, (__u64)off);
29272 +       return 0;
29273 +}
29274 +
29275 +/* plugin->flow_by_inode() */
29276 +/* flow is used to read/write disk clusters; returns key_by_inode result */
29277 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29278 +                               int user,       /* 1: @buf is of user space,
29279 +                                                  0: kernel space */
29280 +                               loff_t size,    /* @buf size */
29281 +                               loff_t off,     /* offset to start io from */
29282 +                               rw_op op,       /* READ or WRITE */
29283 +                               flow_t * f      /* resulting flow */)
29284 +{
29285 +       assert("edward-436", f != NULL);
29286 +       assert("edward-149", inode != NULL);
29287 +       assert("edward-150", inode_file_plugin(inode) != NULL);
29288 +       assert("edward-1465", user == 0); /* we use flow to read/write
29289 +                                           disk clusters located in
29290 +                                           kernel space */
29291 +       f->length = size;
29292 +       memcpy(&f->data, &buf, sizeof(buf)); /* copies the pointer value only */
29293 +       f->user = user;
29294 +       f->op = op;
29295 +
29296 +       return key_by_inode_cryptcompress(inode, off, &f->key);
29297 +}
29298 +
29299 +/* Try to revalidate a previously set @hint for @key under @lock_mode.
29300 +   Returns -E_REPEAT when the hint is not set, was set with another lock
29301 +   mode, or was set for another key offset; otherwise returns the result
29302 +   of reiser4_seal_validate(). */
29303 +static int
29304 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29305 +                           znode_lock_mode lock_mode)
29306 +{
29307 +       assert("edward-704", hint != NULL);
29308 +       assert("edward-1089", !hint_is_valid(hint));
29309 +       assert("edward-706", hint->lh.owner == NULL);
29310 +
29311 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
29312 +               /* hint either not set or set by different operation */
29313 +               return RETERR(-E_REPEAT);
29314 +
29315 +       if (get_key_offset(key) != hint->offset)
29316 +               /* hint is set for different key */
29317 +               return RETERR(-E_REPEAT);
29318 +
29319 +       assert("edward-707", reiser4_schedulable());
29320 +
29321 +       return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
29322 +                                    key, &hint->lh, lock_mode,
29323 +                                    ZNODE_LOCK_LOPRI);
29324 +}
29325 +
29326 +/* reserve disk space when writing a logical cluster; 0 or grab error */
29327 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29328 +{
29329 +       int result = 0;
29330 +
29331 +       assert("edward-965", reiser4_schedulable());
29332 +       assert("edward-439", inode != NULL);
29333 +       assert("edward-440", clust != NULL);
29334 +       assert("edward-441", clust->pages != NULL);
29335 +
29336 +       if (clust->nr_pages == 0) {
29337 +               assert("edward-1152", clust->win != NULL);
29338 +               assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29339 +               /* don't reserve disk space for fake logical cluster */
29340 +               return 0;
29341 +       }
29342 +       assert("edward-442", jprivate(clust->pages[0]) != NULL);
29343 +       /* grab room for both the insert and the update estimates */
29344 +       result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
29345 +                                         estimate_update_cluster(inode),
29346 +                                         BA_CAN_COMMIT);
29347 +       if (result)
29348 +               return result;
29349 +       clust->reserved = 1;
29350 +       grabbed2cluster_reserved(estimate_insert_cluster(inode) +
29351 +                                estimate_update_cluster(inode));
29352 +#if REISER4_DEBUG
29353 +       clust->reserved_prepped = estimate_update_cluster(inode);
29354 +       clust->reserved_unprepped = estimate_insert_cluster(inode);
29355 +#endif
29356 +       /* there can be space grabbed by txnmgr_force_commit_all */
29357 +       return 0;
29358 +}
29359 +
29360 +/* free @count reserved blocks if writing a logical cluster fails */
29361 +static void free_reserved4cluster(struct inode *inode,
29362 +                                 struct cluster_handle *ch, int count)
29363 +{
29364 +       assert("edward-967", ch->reserved == 1);
29365 +       /* @inode is not used in this function */
29366 +       cluster_reserved2free(count);
29367 +       ch->reserved = 0;
29368 +}
29369 +
29370 +/* The core search procedure of the cryptcompress plugin.
29371 +   If the returned value is not cbk_errored, then the current znode is locked */
29372 +static int find_cluster_item(hint_t * hint,
29373 +                            const reiser4_key * key, /* key of the item we are
29374 +                                                        looking for */
29375 +                            znode_lock_mode lock_mode /* which lock */ ,
29376 +                            ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29377 +{
29378 +       int result;
29379 +       reiser4_key ikey;
29380 +       int went_right = 0;
29381 +       coord_t *coord = &hint->ext_coord.coord;
29382 +       coord_t orig = *coord;  /* saved for the roll back at not_found */
29383 +       /* NOTE(review): @hint is already dereferenced above this assert */
29384 +       assert("edward-152", hint != NULL);
29385 +
29386 +       if (!hint_is_valid(hint)) {
29387 +               result = cryptcompress_hint_validate(hint, key, lock_mode);
29388 +               if (result == -E_REPEAT)
29389 +                       goto traverse_tree;
29390 +               else if (result) {
29391 +                       assert("edward-1216", 0);
29392 +                       return result;
29393 +               }
29394 +               hint_set_valid(hint);
29395 +       }
29396 +       assert("edward-709", znode_is_any_locked(coord->node));
29397 +
29398 +       /* In-place lookup is going on here: we just need to check
29399 +          whether the item next to @coord matches @key */
29400 +
29401 +       if (equal_to_rdk(coord->node, key)) {
29402 +               result = goto_right_neighbor(coord, &hint->lh);
29403 +               if (result == -E_NO_NEIGHBOR) {
29404 +                       assert("edward-1217", 0);
29405 +                       return RETERR(-EIO);
29406 +               }
29407 +               if (result)
29408 +                       return result;
29409 +               assert("edward-1218", equal_to_ldk(coord->node, key));
29410 +               went_right = 1;
29411 +       } else {
29412 +               coord->item_pos++;      /* try the next item in this node */
29413 +               coord->unit_pos = 0;
29414 +               coord->between = AT_UNIT;
29415 +       }
29416 +       result = zload(coord->node);
29417 +       if (result)
29418 +               return result;
29419 +       assert("edward-1219", !node_is_empty(coord->node));
29420 +
29421 +       if (!coord_is_existing_item(coord)) {
29422 +               zrelse(coord->node);
29423 +               goto not_found;
29424 +       }
29425 +       item_key_by_coord(coord, &ikey);
29426 +       zrelse(coord->node);
29427 +       if (!keyeq(key, &ikey))
29428 +               goto not_found;
29429 +       /* Ok, item is found, update node counts */
29430 +       if (went_right)
29431 +               dclust_inc_extension_ncount(hint);
29432 +       return CBK_COORD_FOUND;
29433 +
29434 + not_found:
29435 +       assert("edward-1220", coord->item_pos > 0);
29436 +       //coord->item_pos--;
29437 +       /* roll back */
29438 +       *coord = orig;
29439 +       ON_DEBUG(coord_update_v(coord));
29440 +       return CBK_COORD_NOTFOUND;
29441 +
29442 + traverse_tree:
29443 +       assert("edward-713", hint->lh.owner == NULL);
29444 +       assert("edward-714", reiser4_schedulable());
29445 +       /* hint could not be reused: reset it and do a full lookup by key */
29446 +       reiser4_unset_hint(hint);
29447 +       dclust_init_extension(hint);
29448 +       coord_init_zero(coord);
29449 +       result = coord_by_key(current_tree, key, coord, &hint->lh,
29450 +                             lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29451 +                             CBK_UNIQUE | flags, ra_info);
29452 +       if (cbk_errored(result))
29453 +               return result;
29454 +       if(result == CBK_COORD_FOUND)
29455 +               dclust_inc_extension_ncount(hint);
29456 +       hint_set_valid(hint);
29457 +       return result;
29458 +}
29459 +
29460 +/* This function is called by deflate[inflate] manager when
29461 +   creating a transformed/plain stream to check if we should
29462 +   create/cut some overhead. On WRITE_OP @oh is set to the stream
29463 +   length modulo the cipher block size; on READ_OP to the last byte
29464 +   of the output stream. Returns true if alignment/cut is needed. */
29465 +static int need_cut_or_align(struct inode * inode,
29466 +                            struct cluster_handle * ch, rw_op rw, int * oh)
29467 +{
29468 +       struct tfm_cluster * tc = &ch->tc;
29469 +       switch (rw) {
29470 +       case WRITE_OP: /* estimate align */
29471 +               *oh = tc->len % cipher_blocksize(inode);
29472 +               if (*oh != 0)
29473 +                       return 1;
29474 +               break;
29475 +       case READ_OP:  /* estimate cut */
29476 +               *oh = *(tfm_output_data(ch) + tc->len - 1);
29477 +               break;
29478 +       default:
29479 +               impossible("edward-1401", "bad option");
29480 +       }
29481 +       return (tc->len != tc->lsize);
29482 +}
29483 +
29484 +/* create (WRITE_OP) or cut (READ_OP) the overhead of a cipher stream */
29485 +static void align_or_cut_overhead(struct inode * inode,
29486 +                                 struct cluster_handle * ch, rw_op rw)
29487 +{
29488 +       unsigned int oh;
29489 +       cipher_plugin * cplug = inode_cipher_plugin(inode);
29490 +
29491 +       assert("edward-1402", need_cipher(inode));
29492 +       /* NOTE(review): &oh is unsigned int *, the callee takes int * */
29493 +       if (!need_cut_or_align(inode, ch, rw, &oh))
29494 +               return;
29495 +       switch (rw) {
29496 +       case WRITE_OP: /* do align */
29497 +               ch->tc.len +=
29498 +                       cplug->align_stream(tfm_input_data(ch) +
29499 +                                           ch->tc.len, ch->tc.len,
29500 +                                           cipher_blocksize(inode));
29501 +               *(tfm_input_data(ch) + ch->tc.len - 1) =
29502 +                       cipher_blocksize(inode) - oh; /* control byte */
29503 +               break;
29504 +       case READ_OP: /* do cut */
29505 +               assert("edward-1403", oh <= cipher_blocksize(inode));
29506 +               ch->tc.len -= oh;
29507 +               break;
29508 +       default:
29509 +               impossible("edward-1404", "bad option");
29510 +       }
29511 +       return;
29512 +}
29513 +
29514 +static unsigned max_cipher_overhead(struct inode * inode)
29515 +{      /* cipher block size, or 0 without a cipher or ->align_stream */
29516 +       if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29517 +               return 0;
29518 +       return cipher_blocksize(inode);
29519 +}
29520 +
29521 +static int deflate_overhead(struct inode *inode)
29522 +{      /* DC_CHECKSUM_SIZE if the compression plugin has ->checksum, else 0 */
29523 +       return (inode_compression_plugin(inode)->
29524 +               checksum ? DC_CHECKSUM_SIZE : 0);
29525 +}
29526 +
29527 +static unsigned deflate_overrun(struct inode * inode, int ilen)
29528 +{      /* coa_overrun() of @inode's compression plugin for input length @ilen */
29529 +       return coa_overrun(inode_compression_plugin(inode), ilen);
29530 +}
29531 +
29532 +/* Estimating compressibility of a logical cluster by various
29533 +   policies represented by the compression mode plugin.
29534 +   If this returns false, then the compressor won't be called for
29535 +   the cluster of index @index. A missing plugin method counts as "yes".
29536 +*/
29537 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
29538 +                          struct inode *inode)
29539 +{
29540 +       compression_plugin *cplug = inode_compression_plugin(inode);
29541 +       compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29542 +
29543 +       assert("edward-1321", tc->len != 0);
29544 +       assert("edward-1322", cplug != NULL);
29545 +       assert("edward-1323", mplug != NULL);
29546 +
29547 +       return /* estimate by size */
29548 +               (cplug->min_size_deflate ?
29549 +                tc->len >= cplug->min_size_deflate() :
29550 +                1) &&
29551 +               /* estimate by compression mode plugin */
29552 +               (mplug->should_deflate ?
29553 +                mplug->should_deflate(inode, index) :
29554 +                1);
29555 +}
29556 +
29557 +/* Evaluating results of compression transform. Returns true if the
29558 +   result plus checksum and cipher overhead is smaller than the input */
29559 +static int save_compressed(int size_before, int size_after, struct inode *inode)
29560 +{
29561 +       return (size_after + deflate_overhead(inode) +
29562 +               max_cipher_overhead(inode) < size_before);
29563 +}
29564 +
29565 +/* Guess result of the evaluation above: true if tc->len is below lsize */
29566 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29567 +                       int encrypted /* is cluster encrypted */ )
29568 +{
29569 +       struct tfm_cluster * tc = &ch->tc;
29570 +
29571 +       assert("edward-142", tc != 0);
29572 +       assert("edward-143", inode != NULL);
29573 +       /* for an encrypted cluster compare against the scaled lsize */
29574 +       return tc->len <
29575 +           (encrypted ?
29576 +            inode_scaled_offset(inode, tc->lsize) :
29577 +            tc->lsize);
29578 +}
29579 +
29580 +/* If results of compression were accepted, then we add
29581 +   a checksum to catch possible disk cluster corruption.
29582 +   The following is a format of the data stored in disk clusters:
29583 +
29584 +                  data                   This is (transformed) logical cluster.
29585 +                  cipher_overhead        This is created by ->align() method
29586 +                                          of cipher plugin. May be absent.
29587 +                  checksum          (4)  This is created by ->checksum method
29588 +                                          of compression plugin to check
29589 +                                          integrity. May be absent.
29590 +
29591 +                  Crypto overhead format:
29592 +
29593 +                  data
29594 +                  control_byte      (1)   contains aligned overhead size:
29595 +                                          1 <= overhead <= cipher_blksize
29596 +*/
29597 +/* Append a little-endian 32-bit checksum of the output stream to its end */
29598 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29599 +{
29600 +       __u32 checksum;
29601 +
29602 +       assert("edward-1309", tc != NULL);
29603 +       assert("edward-1310", tc->len > 0);
29604 +       assert("edward-1311", cplug->checksum != NULL);
29605 +       /* checksum covers the first tc->len bytes and is stored right after them */
29606 +       checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
29607 +       put_unaligned(cpu_to_le32(checksum),
29608 +                (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29609 +       tc->len += (int)DC_CHECKSUM_SIZE; /* stream length now includes it */
29610 +}
29611 +
29612 +/* Check a disk cluster checksum stored in the tail of the input stream.
29613 +   Returns 0 if it is correct (and cuts it off tc->len), otherwise returns 1 */
29614 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29615 +{
29616 +       assert("edward-1312", tc != NULL);
29617 +       assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29618 +       assert("edward-1314", cplug->checksum != NULL);
29619 +       /* compare checksum of the data part with the stored little-endian value */
29620 +       if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
29621 +                           tc->len - (int)DC_CHECKSUM_SIZE) !=
29622 +           le32_to_cpu(get_unaligned((d32 *)
29623 +                                     (tfm_stream_data(tc, INPUT_STREAM)
29624 +                                      + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29625 +               warning("edward-156",
29626 +                       "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
29627 +                       (int)le32_to_cpu /* first value: checksum found on disk */
29628 +                       (get_unaligned((d32 *)
29629 +                                      (tfm_stream_data(tc, INPUT_STREAM) +
29630 +                                       tc->len - (int)DC_CHECKSUM_SIZE))),
29631 +                       (int)cplug->checksum /* second value: recomputed checksum */
29632 +                       (tfm_stream_data(tc, INPUT_STREAM),
29633 +                        tc->len - (int)DC_CHECKSUM_SIZE));
29634 +               return 1;
29635 +       }
29636 +       tc->len -= (int)DC_CHECKSUM_SIZE; /* tc->len is left untouched on mismatch */
29637 +       return 0;
29638 +}
29639 +
29640 +/* Get stream @id of @tc, allocating or enlarging it; returns 0 or alloc error */
29641 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29642 +                   tfm_stream_id id)
29643 +{
29644 +       size_t size = inode_scaled_cluster_size(inode);
29645 +
29646 +       assert("edward-901", tc != NULL);
29647 +       assert("edward-1027", inode_compression_plugin(inode) != NULL);
29648 +       /* for write transforms the stream also has to hold deflate overrun */
29649 +       if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29650 +               size += deflate_overrun(inode, inode_cluster_size(inode));
29651 +       /* missing input stream: try alternate_streams() before allocating */
29652 +       if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29653 +               alternate_streams(tc);
29654 +       if (!get_tfm_stream(tc, id))
29655 +               return alloc_tfm_stream(tc, size, id);
29656 +
29657 +       assert("edward-902", tfm_stream_is_set(tc, id));
29658 +       /* an existing stream is only ever grown here, never shrunk */
29659 +       if (tfm_stream_size(tc, id) < size)
29660 +               return realloc_tfm_stream(tc, size, id);
29661 +       return 0;
29662 +}
29663 +
29664 +/* Common deflate manager: optional compression, then encryption, then checksum */
29665 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29666 +{
29667 +       int result = 0;
29668 +       int compressed = 0; /* set once a compression result was accepted */
29669 +       int encrypted = 0; /* set once encryption succeeded */
29670 +       struct tfm_cluster * tc = &clust->tc;
29671 +       compression_plugin * coplug;
29672 +
29673 +       assert("edward-401", inode != NULL);
29674 +       assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29675 +       assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29676 +       assert("edward-498", !tfm_cluster_is_uptodate(tc));
29677 +
29678 +       coplug = inode_compression_plugin(inode);
29679 +       if (should_compress(tc, clust->index, inode)) {
29680 +               /* try to compress, discard bad results */
29681 +               size_t dst_len;
29682 +               compression_mode_plugin * mplug =
29683 +                       inode_compression_mode_plugin(inode);
29684 +               assert("edward-602", coplug != NULL);
29685 +               assert("edward-1423", coplug->compress != NULL);
29686 +               /* NOTE(review): on the two failures below result stays nonzero */
29687 +               result = grab_coa(tc, coplug);
29688 +               if (result) {
29689 +                   warning("edward-1424",
29690 +                           "alloc_coa failed with ret=%d, skipped compression",
29691 +                           result);
29692 +                   goto cipher; /* returned at the end unless cipher path resets it */
29693 +               }
29694 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29695 +               if (result) {
29696 +                   warning("edward-1425",
29697 +                        "alloc stream failed with ret=%d, skipped compression",
29698 +                           result);
29699 +                   goto cipher;
29700 +               }
29701 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29702 +               coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29703 +                                tfm_input_data(clust), tc->len,
29704 +                                tfm_output_data(clust), &dst_len);
29705 +               /* make sure we didn't overwrite extra bytes */
29706 +               assert("edward-603",
29707 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29708 +
29709 +               /* evaluate results of compression transform */
29710 +               if (save_compressed(tc->len, dst_len, inode)) {
29711 +                       /* good result, accept */
29712 +                       tc->len = dst_len;
29713 +                       if (mplug->accept_hook != NULL) {
29714 +                              result = mplug->accept_hook(inode, clust->index);
29715 +                              if (result)
29716 +                                      warning("edward-1426",
29717 +                                              "accept_hook failed with ret=%d",
29718 +                                              result);
29719 +                       }
29720 +                       compressed = 1;
29721 +               }
29722 +               else {
29723 +                       /* bad result, discard */
29724 +#if 0
29725 +                       if (cluster_is_complete(clust, inode))
29726 +                             warning("edward-1496",
29727 +                                     "incompressible cluster %lu (inode %llu)",
29728 +                                     clust->index,
29729 +                                     (unsigned long long)get_inode_oid(inode));
29730 +#endif
29731 +                       if (mplug->discard_hook != NULL &&
29732 +                           cluster_is_complete(clust, inode)) {
29733 +                               result = mplug->discard_hook(inode,
29734 +                                                            clust->index);
29735 +                               if (result)
29736 +                                     warning("edward-1427",
29737 +                                             "discard_hook failed with ret=%d",
29738 +                                             result);
29739 +                       }
29740 +               }
29741 +       }
29742 + cipher:
29743 +       if (need_cipher(inode)) {
29744 +               cipher_plugin * ciplug;
29745 +               struct blkcipher_desc desc;
29746 +               struct scatterlist src;
29747 +               struct scatterlist dst;
29748 +
29749 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): never read below */
29750 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
29751 +               desc.flags = 0;
29752 +               if (compressed) /* presumably makes compressed output the cipher input */
29753 +                       alternate_streams(tc);
29754 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29755 +               if (result)
29756 +                       return result;
29757 +
29758 +               align_or_cut_overhead(inode, clust, WRITE_OP);
29759 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
29760 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
29761 +
29762 +               result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29763 +               if (result) {
29764 +                       warning("edward-1405",
29765 +                               "encryption failed flags=%x\n", desc.flags);
29766 +                       return result;
29767 +               }
29768 +               encrypted = 1;
29769 +       }
29770 +       if (compressed && coplug->checksum != NULL) /* checksum only for deflated data */
29771 +               dc_set_checksum(coplug, tc);
29772 +       if (!compressed && !encrypted) /* nothing was transformed at all */
29773 +               alternate_streams(tc);
29774 +       return result;
29775 +}
29776 +
29777 +/* Common inflate manager: verify checksum, then decrypt, then decompress. */
29778 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29779 +{
29780 +       int result = 0;
29781 +       int transformed = 0; /* set once decryption or decompression was done */
29782 +       struct tfm_cluster * tc = &clust->tc;
29783 +       compression_plugin * coplug;
29784 +
29785 +       assert("edward-905", inode != NULL);
29786 +       assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29787 +       assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29788 +       assert("edward-1349", tc->act == TFMA_READ);
29789 +       assert("edward-907", !tfm_cluster_is_uptodate(tc));
29790 +
29791 +       /* Handle a checksum (if any) */
29792 +       coplug = inode_compression_plugin(inode);
29793 +       if (need_inflate(clust, inode, need_cipher(inode)) &&
29794 +           coplug->checksum != NULL) {
29795 +               result = dc_check_checksum(coplug, tc);
29796 +               if (unlikely(result)) {
29797 +                       warning("edward-1460",
29798 +                               "Inode %llu: disk cluster %lu looks corrupted",
29799 +                               (unsigned long long)get_inode_oid(inode),
29800 +                               clust->index);
29801 +                       return RETERR(-EIO);
29802 +               }
29803 +       }
29804 +       if (need_cipher(inode)) {
29805 +               cipher_plugin * ciplug;
29806 +               struct blkcipher_desc desc;
29807 +               struct scatterlist src;
29808 +               struct scatterlist dst;
29809 +
29810 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): never read below */
29811 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
29812 +               desc.flags = 0;
29813 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29814 +               if (result)
29815 +                       return result;
29816 +               assert("edward-909", tfm_cluster_is_set(tc));
29817 +
29818 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
29819 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
29820 +
29821 +               result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29822 +               if (result) {
29823 +                       warning("edward-1600", "decrypt failed flags=%x\n",
29824 +                               desc.flags);
29825 +                       return result;
29826 +               }
29827 +               align_or_cut_overhead(inode, clust, READ_OP);
29828 +               transformed = 1;
29829 +       }
29830 +       if (need_inflate(clust, inode, 0)) {
29831 +               size_t dst_len = inode_cluster_size(inode);
29832 +               if(transformed) /* presumably makes decrypted output the new input */
29833 +                       alternate_streams(tc);
29834 +
29835 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29836 +               if (result)
29837 +                       return result;
29838 +               assert("edward-1305", coplug->decompress != NULL);
29839 +               assert("edward-910", tfm_cluster_is_set(tc));
29840 +
29841 +               coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
29842 +                                  tfm_input_data(clust), tc->len,
29843 +                                  tfm_output_data(clust), &dst_len);
29844 +               /* check length: only asserted, i.e. verified in debug builds */
29845 +               tc->len = dst_len;
29846 +               assert("edward-157", dst_len == tc->lsize);
29847 +               transformed = 1;
29848 +       }
29849 +       if (!transformed) /* stream was neither encrypted nor deflated */
29850 +               alternate_streams(tc);
29851 +       return result;
29852 +}
29853 +
29854 +/* This is the implementation of the readpage method of struct
29855 +   address_space_operations for the cryptcompress plugin. @page comes locked. */
29856 +int readpage_cryptcompress(struct file *file, struct page *page)
29857 +{
29858 +       reiser4_context *ctx;
29859 +       struct cluster_handle clust;
29860 +       item_plugin *iplug;
29861 +       int result;
29862 +
29863 +       assert("edward-88", PageLocked(page));
29864 +       assert("vs-976", !PageUptodate(page));
29865 +       assert("edward-89", page->mapping && page->mapping->host);
29866 +
29867 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
29868 +       if (IS_ERR(ctx)) {
29869 +               unlock_page(page); /* every early exit here unlocks the page itself */
29870 +               return PTR_ERR(ctx);
29871 +       }
29872 +       assert("edward-113",
29873 +              ergo(file != NULL,
29874 +                   page->mapping == file->f_dentry->d_inode->i_mapping));
29875 +
29876 +       if (PageUptodate(page)) { /* runtime twin of assertion vs-976 above */
29877 +               warning("edward-1338", "page is already uptodate\n");
29878 +               unlock_page(page);
29879 +               reiser4_exit_context(ctx);
29880 +               return 0;
29881 +       }
29882 +       cluster_init_read(&clust, NULL);
29883 +       clust.file = file;
29884 +       iplug = item_plugin_by_id(CTAIL_ID); /* reading is delegated to this item plugin */
29885 +       if (!iplug->s.file.readpage) {
29886 +               unlock_page(page);
29887 +               put_cluster_handle(&clust);
29888 +               reiser4_exit_context(ctx);
29889 +               return -EINVAL;
29890 +       }
29891 +       result = iplug->s.file.readpage(&clust, page);
29892 +       /* NOTE(review): page is not unlocked here - presumably done by the plugin */
29893 +       put_cluster_handle(&clust);
29894 +       reiser4_txn_restart(ctx);
29895 +       reiser4_exit_context(ctx);
29896 +       return result;
29897 +}
29898 +
29899 +/* How many pages of @clust have to be checked in for its current operation */
29900 +static int get_new_nrpages(struct cluster_handle * clust)
29901 +{
29902 +       if (clust->op == LC_APPOV)
29903 +               /* LC_APPOV: every page of the handle */
29904 +               return clust->nr_pages;
29905 +       if (clust->op == LC_TRUNC) {
29906 +               assert("edward-1179", clust->win != NULL);
29907 +               return size_in_pages(clust->win->off + clust->win->count);
29908 +       }
29909 +       /* no other operation is expected here */
29910 +       impossible("edward-1180", "bad page cluster option");
29911 +       return 0;
29912 +}
29913 +
29914 +static void set_cluster_pages_dirty(struct cluster_handle * clust,
29915 +                                   struct inode * inode)
29916 +{
29917 +       /* Dirty, under page lock, each page that is about to be checked in */
29918 +       int idx = 0;
29919 +       const int to_dirty = get_new_nrpages(clust);
29920 +
29921 +       while (idx < to_dirty) {
29922 +               struct page *cur = clust->pages[idx++];
29923 +
29924 +               assert("edward-968", cur != NULL);
29925 +               lock_page(cur);
29926 +               assert("edward-1065", PageUptodate(cur));
29927 +               set_page_dirty_notag(cur);
29928 +               unlock_page(cur);
29929 +               mark_page_accessed(cur);
29930 +       }
29931 +}
29932 +
29933 +/* Grab a page cluster for read/write operations.
29934 +   Attach a jnode for write operations (when preparing for modifications, which
29935 +   are supposed to be committed).
29936 +
29937 +   We allocate only one jnode per page cluster; this jnode is bound to the
29938 +   first page of this cluster, so we have an extra reference (that will be put
29939 +   as soon as jnode is evicted from memory); other references will be cleaned
29940 +   up in flush time (assume that check in page cluster was successful).
29941 +   Returns 0 on success; on failure every page grabbed here is put back. */
29942 +int grab_page_cluster(struct inode * inode,
29943 +                     struct cluster_handle * clust, rw_op rw)
29944 +{
29945 +       int i;
29946 +       int result = 0;
29947 +       jnode *node = NULL;
29948 +
29949 +       assert("edward-182", clust != NULL);
29950 +       assert("edward-183", clust->pages != NULL);
29951 +       assert("edward-1466", clust->node == NULL);
29952 +       assert("edward-1428", inode != NULL);
29953 +       assert("edward-1429", inode->i_mapping != NULL);
29954 +       assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
29955 +
29956 +       if (clust->nr_pages == 0)
29957 +               return 0;
29958 +
29959 +       for (i = 0; i < clust->nr_pages; i++) {
29960 +               assert("edward-1044", clust->pages[i] == NULL);
29961 +               clust->pages[i] =
29962 +                      find_or_create_page(inode->i_mapping,
29963 +                                          clust_to_pg(clust->index, inode) + i,
29964 +                                          reiser4_ctx_gfp_mask_get());
29965 +               if (!clust->pages[i]) {
29966 +                       result = RETERR(-ENOMEM);
29967 +                       break;
29968 +               }
29969 +               if (i == 0 && rw == WRITE_OP) {
29970 +                       node = jnode_of_page(clust->pages[i]);
29971 +                       if (IS_ERR(node)) {
29972 +                               result = PTR_ERR(node);
29973 +                               unlock_page(clust->pages[i]);
29974 +                               /* cleanup loop below skips index i: put it here */
29975 +                               put_cluster_page(clust->pages[i]);
29976 +                               break;
29977 +                       }
29978 +                       JF_SET(node, JNODE_CLUSTER_PAGE);
29979 +                       assert("edward-920", jprivate(clust->pages[0]));
29980 +               }
29981 +               INODE_PGCOUNT_INC(inode);
29982 +               unlock_page(clust->pages[i]);
29983 +       }
29984 +       if (unlikely(result)) {
29985 +               while (i) {
29986 +                       put_cluster_page(clust->pages[--i]);
29987 +                       INODE_PGCOUNT_DEC(inode);
29988 +               }
29989 +               if (node && !IS_ERR(node))
29990 +                       jput(node);
29991 +               return result;
29992 +       }
29993 +       clust->node = node;
29994 +       return 0;
29995 +}
29996 +
29997 +static void truncate_page_cluster_range(struct inode * inode,
29998 +                                       struct page ** pages, /* not used here */
29999 +                                       cloff_t index,
30000 +                                       int from, int count,
30001 +                                       int even_cows)
30002 +{ /* Invalidate @count pages of cluster @index, starting at its page @from */
30003 +       assert("edward-1467", count > 0);
30004 +       reiser4_invalidate_pages(inode->i_mapping,
30005 +                                clust_to_pg(index, inode) + from,
30006 +                                count, even_cows);
30007 +}
30008 +
30009 +/* Put @count pages of @pages starting from index @from; INODE_PGCOUNT_DEC each */
30010 +void __put_page_cluster(int from, int count,
30011 +                       struct page ** pages, struct inode  * inode)
30012 +{
30013 +       int i;
30014 +       assert("edward-1468", pages != NULL);
30015 +       assert("edward-1469", inode != NULL);
30016 +       assert("edward-1470", from >= 0 && count >= 0);
30017 +
30018 +       for (i = 0; i < count; i++) {
30019 +               assert("edward-1471", pages[from + i] != NULL);
30020 +               assert("edward-1472", /* pages must have consecutive indexes */
30021 +                      pages[from + i]->index == pages[from]->index + i);
30022 +               /* array slots are left as they are, only references are dropped */
30023 +               put_cluster_page(pages[from + i]);
30024 +               INODE_PGCOUNT_DEC(inode);
30025 +       }
30026 +}
30027 +
30028 +/*
30029 + * This is the dual of grab_page_cluster; however, if @rw == WRITE_OP,
30030 + * this function is called only when something has failed before the
30031 + * page cluster was checked in. Puts all pages and the jnode (if any).
30032 + */
30033 +void put_page_cluster(struct cluster_handle * clust,
30034 +                     struct inode * inode, rw_op rw)
30035 +{
30036 +       assert("edward-445", clust != NULL);
30037 +       assert("edward-922", clust->pages != NULL);
30038 +       assert("edward-446",
30039 +              ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
30040 +
30041 +       __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
30042 +       if (rw == WRITE_OP) {
30043 +               if (unlikely(clust->node)) { /* jnode attached by the grab */
30044 +                       assert("edward-447",
30045 +                              clust->node == jprivate(clust->pages[0]));
30046 +                       jput(clust->node);
30047 +                       clust->node = NULL;
30048 +               }
30049 +       }
30050 +}
30051 +
30052 +#if REISER4_DEBUG
30053 +int cryptcompress_inode_ok(struct inode *inode)
30054 +{
30055 +       /* Debug check: PSET_FILE bit must be set in the inode's plugin mask
30056 +          and the cluster shift must pass cluster_shift_ok() */
30057 +       int has_file_bit =
30058 +               (reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)) != 0;
30059 +       return has_file_bit && cluster_shift_ok(inode_cluster_shift(inode));
30060 +}
30061 +
30062 +static int window_ok(struct reiser4_slide * win, struct inode *inode)
30063 +{
30064 +       assert("edward-1115", win != NULL);
30065 +       assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
30066 +       /* window starts before the cluster end and off+count+delta fits in it */
30067 +       return (win->off != inode_cluster_size(inode)) &&
30068 +           (win->off + win->count + win->delta <= inode_cluster_size(inode));
30069 +}
30070 +
30071 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
30072 +{
30073 +       assert("edward-279", clust != NULL);
30074 +       /* Debug check: pages array is present and the window, if any, is sane */
30075 +       if (clust->pages == NULL)
30076 +               return 0;
30077 +       return clust->win == NULL || window_ok(clust->win, inode);
30078 +}
30079 +#if 0 /* function form is compiled out; the macro in the #else branch is used */
30080 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
30081 +{
30082 +       int found;
30083 +       struct page * page;
30084 +
30085 +       found = find_get_pages(inode->i_mapping, start, 1, &page);
30086 +       if (found)
30087 +               put_cluster_page(page);
30088 +       return !found;
30089 +}
30090 +#else /* stub: the check always evaluates to 1 */
30091 +#define pages_truncate_ok(inode, start) 1
30092 +#endif
30093 +
30094 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
30095 +{ /* Debug check: returns 1 when jlookup() finds no jnode for cluster @index */
30096 +       jnode *node;
30097 +       node = jlookup(current_tree, get_inode_oid(inode),
30098 +                      clust_to_pg(index, inode));
30099 +       if (likely(!node))
30100 +               return 1;
30101 +       jput(node); /* drop the jnode that the lookup returned */
30102 +       return 0;
30103 +}
30104 +
30105 +static int find_fake_appended(struct inode *inode, cloff_t * index);
30106 +/* Debug check: find_fake_appended() succeeds and reports index equal to @aidx */
30107 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
30108 +{
30109 +       int result;
30110 +       cloff_t raidx;
30111 +
30112 +       result = find_fake_appended(inode, &raidx);
30113 +       return !result && (aidx == raidx);
30114 +}
30115 +#endif
30116 +
30117 +/* Predict the next window state: a hole with zero delta remains a hole */
30118 +static inline window_stat next_window_stat(struct reiser4_slide * win)
30119 +{
30120 +       assert("edward-1130", win != NULL);
30121 +       return ((win->stat != HOLE_WINDOW || win->delta != 0) ?
30122 +               DATA_WINDOW : HOLE_WINDOW);
30123 +}
30124 +
30125 +/* Guess and set next cluster index and window params; no-op without a window */
30126 +static void move_update_window(struct inode * inode,
30127 +                              struct cluster_handle * clust,
30128 +                              loff_t file_off, loff_t to_file)
30129 +{
30130 +       struct reiser4_slide * win;
30131 +
30132 +       assert("edward-185", clust != NULL);
30133 +       assert("edward-438", clust->pages != NULL);
30134 +       assert("edward-281", cluster_ok(clust, inode));
30135 +
30136 +       win = clust->win;
30137 +       if (!win)
30138 +               return;
30139 +
30140 +       switch (win->stat) {
30141 +       case DATA_WINDOW:
30142 +               /* increment: next cluster, window starts at its beginning */
30143 +               clust->index++;
30144 +               win->stat = DATA_WINDOW;
30145 +               win->off = 0;
30146 +               win->count = min((loff_t)inode_cluster_size(inode), to_file);
30147 +               break;
30148 +       case HOLE_WINDOW:
30149 +               switch (next_window_stat(win)) {
30150 +               case HOLE_WINDOW:
30151 +                       /* skip: jump to the cluster that contains @file_off */
30152 +                       clust->index = off_to_clust(file_off, inode);
30153 +                       win->stat = HOLE_WINDOW;
30154 +                       win->off = 0;
30155 +                       win->count = off_to_cloff(file_off, inode);
30156 +                       win->delta = min((loff_t)(inode_cluster_size(inode) -
30157 +                                                 win->count), to_file);
30158 +                       break;
30159 +               case DATA_WINDOW:
30160 +                       /* stay: same cluster, the delta part becomes the data */
30161 +                       win->stat = DATA_WINDOW;
30162 +                       /* off+count+delta=inv */
30163 +                       win->off = win->off + win->count;
30164 +                       win->count = win->delta;
30165 +                       win->delta = 0;
30166 +                       break;
30167 +               default:
30168 +                       impossible("edward-282", "wrong next window state");
30169 +               }
30170 +               break;
30171 +       default:
30172 +               impossible("edward-283", "wrong current window state");
30173 +       }
30174 +       assert("edward-1068", cluster_ok(clust, inode));
30175 +}
30176 +
30177 +static int update_sd_cryptcompress(struct inode *inode)
30178 +{
30179 +       int ret;
30180 +
30181 +       assert("edward-978", reiser4_schedulable());
30182 +
30183 +       /* reserve space for one stat data update */
30184 +       ret = reiser4_grab_space_force(estimate_update_common(inode),
30185 +                                      BA_CAN_COMMIT);
30186 +       if (ret != 0)
30187 +               return ret;
30188 +
30189 +       /* stamp change and modification times, then write the stat data */
30190 +       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
30191 +       return reiser4_update_sd(inode);
30192 +}
30193 +
30194 +static void uncapture_cluster_jnode(jnode * node)
30195 +{
30196 +       txn_atom *atom;
30197 +       /* caller must hold the jnode guard (asserted right below) */
30198 +       assert_spin_locked(&(node->guard));
30199 +
30200 +       atom = jnode_get_atom(node);
30201 +       if (atom == NULL) { /* not captured: just release the jnode lock */
30202 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
30203 +               spin_unlock_jnode(node);
30204 +               return;
30205 +       }
30206 +       reiser4_uncapture_block(node); /* presumably drops the jnode lock - confirm */
30207 +       spin_unlock_atom(atom);
30208 +       jput(node);
30209 +}
30210 +
30211 +static void put_found_pages(struct page **pages, int nr)
30212 +{ /* Call put_cluster_page() on each of the first @nr entries of @pages */
30213 +       int i;
30214 +       for (i = 0; i < nr; i++) {
30215 +               assert("edward-1045", pages[i] != NULL);
30216 +               put_cluster_page(pages[i]);
30217 +       }
30218 +}
30219 +
30220 +/*             Lifecycle of a logical cluster in the system.
30221 + *
30222 + *
30223 + * Logical cluster of a cryptcompress file is represented in the system by
30224 + * . page cluster (in memory, primary cache, contains plain text);
30225 + * . disk cluster (in memory, secondary cache, contains transformed text).
30226 + * Primary cache is to reduce number of transform operations (compression,
30227 + * encryption), i.e. to implement transform-caching strategy.
30228 + * Secondary cache is to reduce number of I/O operations, i.e. for usual
30229 + * write-caching strategy. Page cluster is a set of pages, i.e. mapping of
30230 + * a logical cluster to the primary cache. Disk cluster is a set of items
30231 + * of the same type defined by some reiser4 item plugin id.
30232 + *
30233 + *              1. Performing modifications
30234 + *
30235 + * Every modification of a cryptcompress file is considered as a set of
30236 + * operations performed on file's logical clusters. Every such "atomic"
30237 + * modification is truncate, append and(or) overwrite some bytes of a
30238 + * logical cluster performed in the primary cache with the following
30239 + * synchronization with the secondary cache (in flush time). Disk clusters,
30240 + * which live in the secondary cache, are supposed to be synchronized with
30241 + * disk. The mechanism of synchronization of primary and secondary caches
30242 + * includes so-called checkin/checkout technique described below.
30243 + *
30244 + *              2. Submitting modifications
30245 + *
30246 + * Each page cluster has an associated jnode (a special in-memory header to
30247 + * keep track of transactions in reiser4), which is attached to its first
30248 + * page when grabbing page cluster for modifications (see grab_page_cluster).
30249 + * Submitting modifications (see checkin_logical_cluster) is going per logical
30250 + * cluster and includes:
30251 + * . checkin_cluster_size;
30252 + * . checkin_page_cluster.
30253 + * checkin_cluster_size() is resolved to a file size update (which completely
30254 + * defines the new size of the logical cluster, i.e. the number of file's
30255 + * bytes in the logical cluster).
30256 + * checkin_page_cluster() captures jnode of a page cluster and installs
30257 + * jnode's dirty flag (if needed) to indicate that modifications are
30258 + * successfully checked in.
30259 + *
30260 + *              3. Checking out modifications
30261 + *
30262 + * Is going per logical cluster in flush time (see checkout_logical_cluster).
30263 + * This is the time of synchronizing primary and secondary caches.
30264 + * checkout_logical_cluster() includes:
30265 + * . checkout_page_cluster (retrieving checked in pages).
30266 + * . uncapture jnode (including clear dirty flag and unlock)
30267 + *
30268 + *              4. Committing modifications
30269 + *
30270 + * Proceeding a synchronization of primary and secondary caches. When checking
30271 + * out page cluster (the phase above) pages are locked/flushed/unlocked
30272 + * one-by-one in ascending order of their indexes to contiguous stream, which
30273 + * is supposed to be transformed (compressed, encrypted), chopped up into items
30274 + * and committed to disk as a disk cluster.
30275 + *
30276 + *              5. Managing page references
30277 + *
30278 + * Every checked in page has a special additional "control" reference,
30279 + * which is dropped at checkout. We need this to avoid unexpected evicting
30280 + * pages from memory before checkout. Control references are managed so
30281 + * they are not accumulated with every checkin:
30282 + *
30283 + *            0
30284 + * checkin -> 1
30285 + *            0 -> checkout
30286 + * checkin -> 1
30287 + * checkin -> 1
30288 + * checkin -> 1
30289 + *            0 -> checkout
30290 + *           ...
30291 + *
30292 + * Every page cluster has its own unique "cluster lock". Update/drop
30293 + * references are serialized via this lock. Number of checked in cluster
30294 + * pages is calculated by i_size under cluster lock. File size is updated
30295 + * at every checkin action also under cluster lock (except cases of
30296 + * appending/truncating fake logical clusters).
30297 + *
30298 + * Proof of correctness:
30299 + *
30300 + * Since we update file size under cluster lock, in the case of non-fake
30301 + * logical cluster with its lock held we do have expected number of checked
30302 + * in pages. On the other hand, append/truncate of fake logical clusters
30303 + * doesn't change number of checked in pages of any cluster.
30304 + *
30305 + * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
30306 + * Currently, I don't see any reason to create a special lock for those
30307 + * needs.
30308 + */
30309 +
30310 +static inline void lock_cluster(jnode * node)
30311 +{ /* the cluster lock is the spinlock of the cluster's jnode */
30312 +       spin_lock_jnode(node);
30313 +}
30314 +/* Release the cluster lock taken by lock_cluster(). */
30315 +static inline void unlock_cluster(jnode * node)
30316 +{
30317 +       spin_unlock_jnode(node);
30318 +}
30319 +/* Uncapture @node; presumably this also drops the cluster lock - confirm. */
30320 +static inline void unlock_cluster_uncapture(jnode * node)
30321 +{
30322 +       uncapture_cluster_jnode(node);
30323 +}
30324 +
30325 +/* Set new file size from the window of @clust. Cluster lock is required. */
30326 +static void checkin_file_size(struct cluster_handle * clust,
30327 +                             struct inode * inode)
30328 +{
30329 +       loff_t new_size;
30330 +       struct reiser4_slide * win;
30331 +
30332 +       assert("edward-1181", clust != NULL);
30333 +       assert("edward-1182", inode != NULL);
30334 +       assert("edward-1473", clust->pages != NULL);
30335 +       assert("edward-1474", clust->pages[0] != NULL);
30336 +       assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30337 +       assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30338 +
30339 +
30340 +       win = clust->win;
30341 +       assert("edward-1183", win != NULL);
30342 +       /* start from the file offset at which the window begins */
30343 +       new_size = clust_to_off(clust->index, inode) + win->off;
30344 +
30345 +       switch (clust->op) {
30346 +       case LC_APPOV:
30347 +               if (new_size + win->count <= i_size_read(inode))
30348 +                       /* overwrite only: window ends within i_size */
30349 +                       return;
30350 +               new_size += win->count;
30351 +               break;
30352 +       case LC_TRUNC:
30353 +               break; /* new size is the window start computed above */
30354 +       default:
30355 +               impossible("edward-1184", "bad page cluster option");
30356 +               break;
30357 +       }
30358 +       inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30359 +       i_size_write(inode, new_size);
30360 +       return;
30361 +}
30362 +/* Check in the file size, but only if @clust has a window attached. */
30363 +static inline void checkin_cluster_size(struct cluster_handle * clust,
30364 +                                       struct inode * inode)
30365 +{
30366 +       if (clust->win)
30367 +               checkin_file_size(clust, inode);
30368 +}
30369 +/* Check in a page cluster. Called with cluster lock held; releases it. */
30370 +static int checkin_page_cluster(struct cluster_handle * clust,
30371 +                               struct inode * inode)
30372 +{
30373 +       int result;
30374 +       jnode * node;
30375 +       int old_nrpages = clust->old_nrpages;
30376 +       int new_nrpages = get_new_nrpages(clust);
30377 +
30378 +       node = clust->node;
30379 +
30380 +       assert("edward-221", node != NULL);
30381 +       assert("edward-971", clust->reserved == 1);
30382 +       assert("edward-1263",
30383 +              clust->reserved_prepped == estimate_update_cluster(inode));
30384 +       assert("edward-1264", clust->reserved_unprepped == 0);
30385 +
30386 +       if (JF_ISSET(node, JNODE_DIRTY)) {
30387 +               /*
30388 +                * page cluster was already checked in, but not yet
30389 +                * checked out, so release the related resources
30390 +                */
30391 +               free_reserved4cluster(inode, clust,
30392 +                                     estimate_update_cluster(inode));
30393 +               __put_page_cluster(0, clust->old_nrpages,
30394 +                                  clust->pages, inode);
30395 +       } else {
30396 +               result = capture_cluster_jnode(node);
30397 +               if (unlikely(result)) {
30398 +                       unlock_cluster(node);
30399 +                       return result;
30400 +               }
30401 +               jnode_make_dirty_locked(node);
30402 +               clust->reserved = 0;
30403 +       }
30404 +       unlock_cluster(node);
30405 +
30406 +       if (new_nrpages < old_nrpages) {
30407 +               /* truncate >= 1 complete pages */
30408 +               __put_page_cluster(new_nrpages,
30409 +                                  old_nrpages - new_nrpages,
30410 +                                  clust->pages, inode);
30411 +               truncate_page_cluster_range(inode,
30412 +                                           clust->pages, clust->index,
30413 +                                           new_nrpages,
30414 +                                           old_nrpages - new_nrpages,
30415 +                                           0);
30416 +       }
30417 +#if REISER4_DEBUG
30418 +       clust->reserved_prepped -= estimate_update_cluster(inode);
30419 +#endif
30420 +       return 0;
30421 +}
30422 +
30423 +/* Submit modifications of a logical cluster; puts and clears clust->node */
30424 +static int checkin_logical_cluster(struct cluster_handle * clust,
30425 +                                  struct inode *inode)
30426 +{
30427 +       int result = 0;
30428 +       jnode * node;
30429 +
30430 +       node = clust->node;
30431 +       /* NOTE(review): clust is dereferenced above, before edward-1029 */
30432 +       assert("edward-1035", node != NULL);
30433 +       assert("edward-1029", clust != NULL);
30434 +       assert("edward-1030", clust->reserved == 1);
30435 +       assert("edward-1031", clust->nr_pages != 0);
30436 +       assert("edward-1032", clust->pages != NULL);
30437 +       assert("edward-1033", clust->pages[0] != NULL);
30438 +       assert("edward-1446", jnode_is_cluster_page(node));
30439 +       assert("edward-1476", node == jprivate(clust->pages[0]));
30440 +
30441 +       lock_cluster(node);
30442 +       checkin_cluster_size(clust, inode);
30443 +       /* this will unlock cluster, on success as well as on failure */
30444 +       result = checkin_page_cluster(clust, inode);
30445 +       jput(node);
30446 +       clust->node = NULL;
30447 +       return result;
30448 +}
30449 +
30450 +/*
30451 + * Store into @clust->tc.len the size of the logical cluster (lbytes()) as
30452 + * checked in by the latest modifying session (cluster lock is required)
30453 + */
30454 +static inline void checkout_cluster_size(struct cluster_handle * clust,
30455 +                                        struct inode * inode)
30456 +{
30457 +       struct tfm_cluster *tc = &clust->tc;
30458 +
30459 +       tc->len = lbytes(clust->index, inode);
30460 +       assert("edward-1478", tc->len != 0);
30461 +}
30462 +
30463 +/*
30464 + * Retrieve a page cluster with the latest submitted modifications and copy
30465 + * its pages to the previously allocated input stream. Unlocks the cluster.
30466 + */
30467 +static void checkout_page_cluster(struct cluster_handle * clust,
30468 +                                 jnode * node, struct inode * inode)
30469 +{
30470 +       int i;
30471 +       int found;
30472 +       int to_put;
30473 +       struct tfm_cluster *tc = &clust->tc;
30474 +
30475 +       /* find and put checked in pages: cluster is locked,
30476 +        * so we must get the expected number (to_put) of pages
30477 +        */
30478 +       to_put = size_in_pages(lbytes(clust->index, inode));
30479 +       found = find_get_pages(inode->i_mapping,
30480 +                              clust_to_pg(clust->index, inode),
30481 +                              to_put, clust->pages);
30482 +       BUG_ON(found != to_put);
30483 +
30484 +       __put_page_cluster(0, to_put, clust->pages, inode);
30485 +       unlock_cluster_uncapture(node);
30486 +
30487 +       /* Flush found pages.
30488 +        *
30489 +        * Note that we don't disable modifications while flushing;
30490 +        * moreover, some found pages can be truncated, as we have
30491 +        * released the cluster lock.
30492 +        */
30493 +       for (i = 0; i < found; i++) {
30494 +               int in_page;
30495 +               char * data;
30496 +               assert("edward-1479",
30497 +                      clust->pages[i]->index == clust->pages[0]->index + i);
30498 +
30499 +               lock_page(clust->pages[i]);
30500 +               if (!PageUptodate(clust->pages[i])) {
30501 +                       /* page was truncated: stop copying here */
30502 +                       assert("edward-1480",
30503 +                              i_size_read(inode) <= page_offset(clust->pages[i]));
30504 +                       assert("edward-1481",
30505 +                              clust->pages[i]->mapping != inode->i_mapping);
30506 +                       unlock_page(clust->pages[i]);
30507 +                       break;
30508 +               }
30509 +               /* Update the number of bytes in the logical cluster,
30510 +                * as it could be partially truncated. Note that only
30511 +                * partial truncate is possible (complete truncate can
30512 +                * not get here, as it is performed via ->kill_hook()
30513 +                * called by cut_file_items(), and the latter must
30514 +                * wait for the znode locked with the parent coord).
30515 +                */
30516 +               checkout_cluster_size(clust, inode);
30517 +
30518 +               /* this can be zero, as the new file size is
30519 +                  checked in before truncating pages */
30520 +               in_page = __mbp(tc->len, i);
30521 +
30522 +               data = kmap(clust->pages[i]);
30523 +               memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30524 +                      data, in_page);
30525 +               kunmap(clust->pages[i]);
30526 +
30527 +               if (PageDirty(clust->pages[i]))
30528 +                       cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30529 +
30530 +               unlock_page(clust->pages[i]);
30531 +
30532 +               if (in_page < PAGE_CACHE_SIZE)
30533 +                       /* end of the file */
30534 +                       break;
30535 +       }
30536 +       put_found_pages(clust->pages, found); /* pairs with find_get_pages */
30537 +       tc->lsize = tc->len;
30538 +       return;
30539 +}
30540 +
30541 +/* Check out modifications of a logical cluster; returns 0 or -E_REPEAT */
30542 +int checkout_logical_cluster(struct cluster_handle * clust,
30543 +                            jnode * node, struct inode *inode)
30544 +{
30545 +       int result;
30546 +       struct tfm_cluster *tc = &clust->tc;
30547 +
30548 +       assert("edward-980", node != NULL);
30549 +       assert("edward-236", inode != NULL);
30550 +       assert("edward-237", clust != NULL);
30551 +       assert("edward-240", !clust->win);
30552 +       assert("edward-241", reiser4_schedulable());
30553 +       assert("edward-718", cryptcompress_inode_ok(inode));
30554 +
30555 +       result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30556 +       if (result) {
30557 +               warning("edward-1430", "alloc stream failed with ret=%d",
30558 +                       result);
30559 +               return RETERR(-E_REPEAT);
30560 +       }
30561 +       lock_cluster(node);
30562 +
30563 +       if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30564 +               /* race with another flush: nothing to check out */
30565 +               warning("edward-982",
30566 +                       "checking out logical cluster %lu of inode %llu: "
30567 +                       "jnode is not dirty", clust->index,
30568 +                       (unsigned long long)get_inode_oid(inode));
30569 +               unlock_cluster(node);
30570 +               return RETERR(-E_REPEAT);
30571 +       }
30572 +       cluster_reserved2grabbed(estimate_update_cluster(inode));
30573 +
30574 +       /* this will unlock cluster */
30575 +       checkout_page_cluster(clust, node, inode);
30576 +       return 0;
30577 +}
30578 +
30579 +/* Set @hint (seal, offset, lock mode) for the cluster of the index @index */
30580 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
30581 +                            cloff_t index, znode_lock_mode mode)
30582 +{
30583 +       reiser4_key key;
30584 +       assert("edward-722", cryptcompress_inode_ok(inode));
30585 +       assert("edward-723",
30586 +              inode_file_plugin(inode) ==
30587 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30588 +
30589 +       inode_file_plugin(inode)->key_by_inode(inode,
30590 +                                              clust_to_off(index, inode),
30591 +                                              &key);
30592 +
30593 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30594 +       hint->offset = get_key_offset(&key);
30595 +       hint->mode = mode;
30596 +}
30597 +/* Finish with the hint's lock handle (done_lh) and clear its valid flag. */
30598 +void invalidate_hint_cluster(struct cluster_handle * clust)
30599 +{
30600 +       assert("edward-1291", clust != NULL);
30601 +       assert("edward-1292", clust->hint != NULL);
30602 +
30603 +       done_lh(&clust->hint->lh);
30604 +       hint_clr_valid(clust->hint);
30605 +}
30606 +/* Point the hint at the next cluster (index + 1), then invalidate it. */
30607 +static void put_hint_cluster(struct cluster_handle * clust,
30608 +                            struct inode *inode, znode_lock_mode mode)
30609 +{
30610 +       assert("edward-1286", clust != NULL);
30611 +       assert("edward-1287", clust->hint != NULL);
30612 +
30613 +       set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30614 +       invalidate_hint_cluster(clust);
30615 +}
30616 +/* Advance the update window, then update sd and throttle the writer. */
30617 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
30618 +                                     struct inode *inode, loff_t off,
30619 +                                     loff_t to_file,
30620 +                                     int nr_dirtied)
30621 +{
30622 +       int result;
30623 +       struct cryptcompress_info * info;
30624 +
30625 +       assert("edward-724", inode != NULL);
30626 +       assert("edward-725", cryptcompress_inode_ok(inode));
30627 +       assert("edward-1547",
30628 +              nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));
30629 +
30630 +       /* set next window params */
30631 +       move_update_window(inode, clust, off, to_file);
30632 +
30633 +       result = update_sd_cryptcompress(inode);
30634 +       if (result)
30635 +               return result;
30636 +       assert("edward-726", clust->hint->lh.owner == NULL);
30637 +       info = cryptcompress_inode_data(inode);
30638 +       /* checkin_mutex is dropped around the throttle and re-taken after */
30639 +       mutex_unlock(&info->checkin_mutex);
30640 +       reiser4_throttle_write(inode, nr_dirtied);
30641 +       mutex_lock(&info->checkin_mutex);
30642 +       return 0;
30643 +}
30644 +
30645 +/* Write zeroes over the hole window of the page cluster, process it and,
30646 +   if the window holds only zeroes, try to check the cluster in */
30647 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
30648 +                     loff_t file_off, loff_t to_file)
30649 +{
30650 +       int result = 0;
30651 +       unsigned cl_off, cl_count = 0;
30652 +       unsigned to_pg, pg_off;
30653 +       struct reiser4_slide * win;
30654 +
30655 +       assert("edward-190", clust != NULL);
30656 +       assert("edward-1069", clust->win != NULL);
30657 +       assert("edward-191", inode != NULL);
30658 +       assert("edward-727", cryptcompress_inode_ok(inode));
30659 +       assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30660 +       assert("edward-1154",
30661 +              ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30662 +
30663 +       win = clust->win;
30664 +
30665 +       assert("edward-1070", win != NULL);
30666 +       assert("edward-201", win->stat == HOLE_WINDOW);
30667 +       assert("edward-192", cluster_ok(clust, inode));
30668 +
30669 +       if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30670 +               /* This part of the hole will be represented by a "fake"
30671 +                * logical cluster, i.e. one which doesn't have an appropriate
30672 +                * disk cluster until someone modifies this logical cluster
30673 +                * and makes it dirty.
30674 +                * So go forward here..
30675 +                */
30676 +               move_update_window(inode, clust, file_off, to_file);
30677 +               return 0;
30678 +       }
30679 +       cl_count = win->count;  /* number of zeroes to write */
30680 +       cl_off = win->off;
30681 +       pg_off = off_to_pgoff(win->off);
30682 +       /* zero the window page by page; only the first page starts at pg_off */
30683 +       while (cl_count) {
30684 +               struct page *page;
30685 +               page = clust->pages[off_to_pg(cl_off)];
30686 +
30687 +               assert("edward-284", page != NULL);
30688 +
30689 +               to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30690 +               lock_page(page);
30691 +               zero_user(page, pg_off, to_pg);
30692 +               SetPageUptodate(page);
30693 +               set_page_dirty_notag(page);
30694 +               mark_page_accessed(page);
30695 +               unlock_page(page);
30696 +
30697 +               cl_off += to_pg;
30698 +               cl_count -= to_pg;
30699 +               pg_off = 0;
30700 +       }
30701 +       if (!win->delta) {
30702 +               /* only zeroes in this window, try to capture
30703 +                */
30704 +               result = checkin_logical_cluster(clust, inode);
30705 +               if (result)
30706 +                       return result;
30707 +               put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30708 +               result = balance_dirty_page_cluster(clust,
30709 +                                                   inode, file_off, to_file,
30710 +                                                   win_count_to_nrpages(win));
30711 +       } else
30712 +               move_update_window(inode, clust, file_off, to_file);
30713 +       return result;
30714 +}
30715 +
30716 +/*
30717 +  The main disk search procedure for the cryptcompress plugin, which
30718 +  . scans all items of the disk cluster with the lock mode @mode
30719 +  . maybe reads each one (if @read)
30720 +  . maybe makes its znode dirty (if write lock mode was specified)
30721 +  . sets @clust->dstat and returns 0 or an error code
30722 +  NOTE-EDWARD: Callers should handle the case when disk cluster
30723 +  is incomplete (-EIO)
30724 +*/
30725 +int find_disk_cluster(struct cluster_handle * clust,
30726 +                     struct inode *inode, int read, znode_lock_mode mode)
30727 +{
30728 +       flow_t f;
30729 +       hint_t *hint;
30730 +       int result = 0;
30731 +       int was_grabbed;
30732 +       ra_info_t ra_info;
30733 +       file_plugin *fplug;
30734 +       item_plugin *iplug;
30735 +       struct tfm_cluster *tc;
30736 +       struct cryptcompress_info * info;
30737 +
30738 +       assert("edward-138", clust != NULL);
30739 +       assert("edward-728", clust->hint != NULL);
30740 +       assert("edward-226", reiser4_schedulable());
30741 +       assert("edward-137", inode != NULL);
30742 +       assert("edward-729", cryptcompress_inode_ok(inode));
30743 +
30744 +       hint = clust->hint;
30745 +       fplug = inode_file_plugin(inode);
30746 +       was_grabbed = get_current_context()->grabbed_blocks;
30747 +       info = cryptcompress_inode_data(inode);
30748 +       tc = &clust->tc;
30749 +
30750 +       assert("edward-462", !tfm_cluster_is_uptodate(tc));
30751 +       assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30752 +
30753 +       dclust_init_extension(hint);
30754 +
30755 +       /* set key of the first disk cluster item */
30756 +       fplug->flow_by_inode(inode,
30757 +                            (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30758 +                            0 /* kernel space */ ,
30759 +                            inode_scaled_cluster_size(inode),
30760 +                            clust_to_off(clust->index, inode), READ_OP, &f);
30761 +       if (mode == ZNODE_WRITE_LOCK) {
30762 +               /* reserve for flush to make dirty all the leaf nodes
30763 +                  which contain the disk cluster */
30764 +               result =
30765 +                   reiser4_grab_space_force(estimate_dirty_cluster(inode),
30766 +                                            BA_CAN_COMMIT);
30767 +               if (result)
30768 +                       goto out;
30769 +       }
30770 +
30771 +       ra_info.key_to_stop = f.key;
30772 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30773 +       /* one iteration per item, until the whole flow has been consumed */
30774 +       while (f.length) {
30775 +               result = find_cluster_item(hint, &f.key, mode,
30776 +                                          NULL, FIND_EXACT,
30777 +                                          (mode == ZNODE_WRITE_LOCK ?
30778 +                                           CBK_FOR_INSERT : 0));
30779 +               switch (result) {
30780 +               case CBK_COORD_NOTFOUND:
30781 +                       result = 0;
30782 +                       if (inode_scaled_offset
30783 +                           (inode, clust_to_off(clust->index, inode)) ==
30784 +                           get_key_offset(&f.key)) {
30785 +                               /* first item not found, this is treated
30786 +                                  as disk cluster is absent */
30787 +                               clust->dstat = FAKE_DISK_CLUSTER;
30788 +                               goto out;
30789 +                       }
30790 +                       /* we are outside the cluster, stop search here */
30791 +                       assert("edward-146",
30792 +                              f.length != inode_scaled_cluster_size(inode));
30793 +                       goto ok;
30794 +               case CBK_COORD_FOUND:
30795 +                       assert("edward-148",
30796 +                              hint->ext_coord.coord.between == AT_UNIT);
30797 +                       assert("edward-460",
30798 +                              hint->ext_coord.coord.unit_pos == 0);
30799 +
30800 +                       coord_clear_iplug(&hint->ext_coord.coord);
30801 +                       result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30802 +                       if (unlikely(result))
30803 +                               goto out;
30804 +                       iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30805 +                       assert("edward-147",
30806 +                              item_id_by_coord(&hint->ext_coord.coord) ==
30807 +                              CTAIL_ID);
30808 +
30809 +                       result = iplug->s.file.read(NULL, &f, hint);
30810 +                       if (result) {
30811 +                               zrelse(hint->ext_coord.coord.node);
30812 +                               goto out;
30813 +                       }
30814 +                       if (mode == ZNODE_WRITE_LOCK) {
30815 +                               /* Don't make dirty more nodes than was
30816 +                                  estimated (see comments before
30817 +                                  estimate_dirty_cluster). Missed nodes will be
30818 +                                  read up in flush time if they are evicted from
30819 +                                  memory */
30820 +                               if (dclust_get_extension_ncount(hint) <=
30821 +                                   estimate_dirty_cluster(inode))
30822 +                                  znode_make_dirty(hint->ext_coord.coord.node);
30823 +
30824 +                               znode_set_convertible(hint->ext_coord.coord.
30825 +                                                     node);
30826 +                       }
30827 +                       zrelse(hint->ext_coord.coord.node);
30828 +                       break;
30829 +               default:
30830 +                       goto out;
30831 +               }
30832 +       }
30833 + ok:
30834 +       /* at least one item was found  */
30835 +       /* NOTE-EDWARD: Callers should handle the case
30836 +          when disk cluster is incomplete (-EIO) */
30837 +       tc->len = inode_scaled_cluster_size(inode) - f.length;
30838 +       tc->lsize = lbytes(clust->index, inode);
30839 +       assert("edward-1196", tc->len > 0);
30840 +       assert("edward-1406", tc->lsize > 0);
30841 +
30842 +       if (hint_is_unprepped_dclust(clust->hint)) {
30843 +               clust->dstat = UNPR_DISK_CLUSTER;
30844 +       } else if (clust->index == info->trunc_index) {
30845 +               clust->dstat = TRNC_DISK_CLUSTER;
30846 +       } else {
30847 +               clust->dstat = PREP_DISK_CLUSTER;
30848 +               dclust_set_extension_dsize(clust->hint, tc->len);
30849 +       }
30850 + out:
30851 +       assert("edward-1339",
30852 +              get_current_context()->grabbed_blocks >= was_grabbed);
30853 +       grabbed2free(get_current_context(),
30854 +                    get_current_super_private(),
30855 +                    get_current_context()->grabbed_blocks - was_grabbed);
30856 +       return result;
30857 +}
30858 +/* Find, in @lock_mode, the tree position of the first key of @clust. */
30859 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
30860 +                           znode_lock_mode lock_mode)
30861 +{
30862 +       reiser4_key key;
30863 +       ra_info_t ra_info;
30864 +
30865 +       assert("edward-730", reiser4_schedulable());
30866 +       assert("edward-731", clust != NULL);
30867 +       assert("edward-732", inode != NULL);
30868 +
30869 +       if (hint_is_valid(clust->hint)) {
30870 +               assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
30871 +               assert("edward-1294",
30872 +                      znode_is_write_locked(clust->hint->lh.node));
30873 +               /* already have a valid locked position: no search needed */
30874 +               return (clust->dstat ==
30875 +                       FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
30876 +                       CBK_COORD_FOUND);
30877 +       }
30878 +       key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
30879 +                                  &key);
30880 +       ra_info.key_to_stop = key; /* NOTE(review): ra_info is never passed on */
30881 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30882 +
30883 +       return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
30884 +                                CBK_FOR_INSERT);
30885 +}
30886 +
30887 +/* Read needed cluster pages before modifying.
30888 +   On success, @clust->hint contains a locked position in the tree.
30889 +   Also:
30890 +   . find and set disk cluster state
30891 +   . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
30892 +*/
30893 +static int read_some_cluster_pages(struct inode * inode,
30894 +                                  struct cluster_handle * clust)
30895 +{
30896 +       int i;
30897 +       int result = 0;
30898 +       item_plugin *iplug;
30899 +       struct reiser4_slide * win = clust->win;
30900 +       znode_lock_mode mode = ZNODE_WRITE_LOCK;
30901 +
30902 +       iplug = item_plugin_by_id(CTAIL_ID); /* NOTE(review): not used below */
30903 +
30904 +       assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
30905 +
30906 +#if REISER4_DEBUG
30907 +       if (clust->nr_pages == 0) {
30908 +               /* start write hole from fake disk cluster */
30909 +               assert("edward-1117", win != NULL);
30910 +               assert("edward-1118", win->stat == HOLE_WINDOW);
30911 +               assert("edward-1119", new_logical_cluster(clust, inode));
30912 +       }
30913 +#endif
30914 +       if (new_logical_cluster(clust, inode)) {
30915 +               /* new page cluster is about to be written: nothing to
30916 +                  read, just zero the tail of its last page beyond the
30917 +                  window */
30918 +               assert("edward-734", reiser4_schedulable());
30919 +               assert("edward-735", clust->hint->lh.owner == NULL);
30920 +
30921 +               if (clust->nr_pages) {
30922 +                       int off;
30923 +                       struct page * pg;
30924 +                       assert("edward-1419", clust->pages != NULL);
30925 +                       pg = clust->pages[clust->nr_pages - 1];
30926 +                       assert("edward-1420", pg != NULL);
30927 +                       off = off_to_pgoff(win->off+win->count+win->delta);
30928 +                       if (off) {
30929 +                               lock_page(pg);
30930 +                               zero_user_segment(pg, off, PAGE_CACHE_SIZE);
30931 +                               unlock_page(pg);
30932 +                       }
30933 +               }
30934 +               clust->dstat = FAKE_DISK_CLUSTER;
30935 +               return 0;
30936 +       }
30937 +       /*
30938 +          Here we should search for disk cluster to figure out its real state.
30939 +          Also there is one more important reason to do disk search: we need
30940 +          to make disk cluster _dirty_ if it exists
30941 +        */
30942 +
30943 +       /* if a window is specified, read only the pages
30944 +          that will be modified partially */
30945 +
30946 +       for (i = 0; i < clust->nr_pages; i++) {
30947 +               struct page *pg = clust->pages[i];
30948 +
30949 +               lock_page(pg);
30950 +               if (PageUptodate(pg)) {
30951 +                       unlock_page(pg);
30952 +                       continue;
30953 +               }
30954 +               unlock_page(pg);
30955 +
30956 +               if (win &&
30957 +                   i >= size_in_pages(win->off) &&
30958 +                   i < off_to_pg(win->off + win->count + win->delta))
30959 +                       /* page will be completely overwritten */
30960 +                       continue;
30961 +
30962 +               if (win && (i == clust->nr_pages - 1) &&
30963 +                   /* the last page is
30964 +                      partially modified,
30965 +                      not uptodate .. */
30966 +                   (size_in_pages(i_size_read(inode)) <= pg->index)) {
30967 +                       /* .. and appended,
30968 +                          so set zeroes to the rest */
30969 +                       int offset;
30970 +                       lock_page(pg);
30971 +                       assert("edward-1260",
30972 +                              size_in_pages(win->off + win->count +
30973 +                                            win->delta) - 1 == i);
30974 +
30975 +                       offset =
30976 +                           off_to_pgoff(win->off + win->count + win->delta);
30977 +                       zero_user_segment(pg, offset, PAGE_CACHE_SIZE);
30978 +                       unlock_page(pg);
30979 +                       /* still not uptodate */
30980 +                       break;
30981 +               }
30982 +               lock_page(pg);
30983 +               result = do_readpage_ctail(inode, clust, pg, mode);
30984 +
30985 +               assert("edward-1526", ergo(!result, PageUptodate(pg)));
30986 +               unlock_page(pg);
30987 +               if (result) {
30988 +                       warning("edward-219", "do_readpage_ctail failed");
30989 +                       goto out;
30990 +               }
30991 +       }
30992 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
30993 +               /* disk cluster unclaimed, but we need to make its znodes dirty
30994 +                * to make flush update convert its content
30995 +                */
30996 +               result = find_disk_cluster(clust, inode,
30997 +                                          0 /* do not read items */,
30998 +                                          mode);
30999 +       }
31000 + out:
31001 +       tfm_cluster_clr_uptodate(&clust->tc);
31002 +       return result;
31003 +}
31004 +/* Return 1 if an unprepped disk cluster should be created, 0 otherwise. */
31005 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
31006 +                                          struct inode * inode)
31007 +{
31008 +       assert("edward-737", clust != NULL);
31009 +
31010 +       switch (clust->dstat) {
31011 +       case PREP_DISK_CLUSTER:
31012 +       case UNPR_DISK_CLUSTER:
31013 +               return 0;
31014 +       case FAKE_DISK_CLUSTER:
31015 +               if (clust->win &&
31016 +                   clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
31017 +                       assert("edward-1172",
31018 +                              new_logical_cluster(clust, inode));
31019 +                       return 0;
31020 +               }
31021 +               return 1;
31022 +       default:
31023 +               impossible("edward-1173", "bad disk cluster state");
31024 +               return 0;
31025 +       }
31026 +}
31027 +/* Insert an unprepped disk cluster for @clust if its state requires one. */
31028 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
31029 +                                               struct inode *inode)
31030 +{
31031 +       int result;
31032 +
31033 +       assert("edward-1123", reiser4_schedulable());
31034 +       assert("edward-737", clust != NULL);
31035 +       assert("edward-738", inode != NULL);
31036 +       assert("edward-739", cryptcompress_inode_ok(inode));
31037 +       assert("edward-1053", clust->hint != NULL);
31038 +
31039 +       if (!should_create_unprepped_cluster(clust, inode)) {
31040 +               if (clust->reserved) {
31041 +                       cluster_reserved2free(estimate_insert_cluster(inode));
31042 +#if REISER4_DEBUG
31043 +                       assert("edward-1267",
31044 +                              clust->reserved_unprepped ==
31045 +                              estimate_insert_cluster(inode));
31046 +                       clust->reserved_unprepped -=
31047 +                               estimate_insert_cluster(inode);
31048 +#endif
31049 +               }
31050 +               return 0;
31051 +       }
31052 +       assert("edward-1268", clust->reserved);
31053 +       cluster_reserved2grabbed(estimate_insert_cluster(inode));
31054 +#if REISER4_DEBUG
31055 +       assert("edward-1441",
31056 +              clust->reserved_unprepped == estimate_insert_cluster(inode));
31057 +       clust->reserved_unprepped -= estimate_insert_cluster(inode);
31058 +#endif
31059 +       result = ctail_insert_unprepped_cluster(clust, inode);
31060 +       if (result)
31061 +               return result;
31062 +       /* account a whole logical cluster of bytes for the new disk cluster */
31063 +       inode_add_bytes(inode, inode_cluster_size(inode));
31064 +
31065 +       assert("edward-743", cryptcompress_inode_ok(inode));
31066 +       assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
31067 +
31068 +       clust->dstat = UNPR_DISK_CLUSTER;
31069 +       return 0;
31070 +}
31071 +
31072 +/* . Grab page cluster for read, write, setattr, etc. operations;
31073 + * . Truncate its complete pages, if needed (not visible in this function;
31074 + *   presumably done by grab_page_cluster(), whose result is returned). */
31075 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
31076 +                        rw_op rw)
31077 +{
31078 +       assert("edward-177", inode != NULL);
31079 +       assert("edward-741", cryptcompress_inode_ok(inode));
31080 +       assert("edward-740", clust->pages != NULL);
31081 +
31082 +       set_cluster_nrpages(clust, inode);
31083 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
31084 +       return grab_page_cluster(inode, clust, rw);
31085 +}
31086 +
31087 +/* Truncate complete page cluster of index @index.
31088 + * This is called by ->kill_hook() method of item plugin when deleting a
31089 + * disk cluster of such index. @even_cows is only passed down to
31090 + * truncate_page_cluster_range(). */
31091 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
31092 +                                   int even_cows)
31093 +{
31094 +       int found;
31095 +       int nr_pages;
31096 +       jnode *node;
31097 +       struct page *pages[MAX_CLUSTER_NRPAGES];
31098 +
31099 +       node = jlookup(current_tree, get_inode_oid(inode),
31100 +                      clust_to_pg(index, inode));
31101 +       nr_pages = size_in_pages(lbytes(index, inode));
31102 +       assert("edward-1483", nr_pages != 0);
31103 +       if (!node)
31104 +               goto truncate; /* no jnode for this cluster */
31105 +       found = find_get_pages(inode->i_mapping,
31106 +                              clust_to_pg(index, inode),
31107 +                              cluster_nrpages(inode), pages);
31108 +       if (!found) {
31109 +               assert("edward-1484", jnode_truncate_ok(inode, index));
31110 +               return; /* NOTE(review): jlookup() ref is not jput() here */
31111 +       }
31112 +       lock_cluster(node);
31113 +
31114 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
31115 +           && index == 0)
31116 +               /* converting to unix_file is in progress */
31117 +               JF_CLR(node, JNODE_CLUSTER_PAGE);
31118 +       if (JF_ISSET(node, JNODE_DIRTY)) {
31119 +               /*
31120 +                * @nr_pages were checked in, but not yet checked out -
31121 +                * we need to release them. (also there can be pages
31122 +                * attached to page cache by read(), etc. - don't take
31123 +                * them into account).
31124 +                */
31125 +               assert("edward-1198", found >= nr_pages);
31126 +
31127 +               /* free disk space grabbed for disk cluster converting */
31128 +               cluster_reserved2grabbed(estimate_update_cluster(inode));
31129 +               grabbed2free(get_current_context(),
31130 +                            get_current_super_private(),
31131 +                            estimate_update_cluster(inode));
31132 +               __put_page_cluster(0, nr_pages, pages, inode);
31133 +
31134 +               /* This will clear dirty bit, uncapture and unlock jnode */
31135 +               unlock_cluster_uncapture(node);
31136 +       } else
31137 +               unlock_cluster(node);
31138 +       jput(node);                         /* jlookup */
31139 +       put_found_pages(pages, found); /* find_get_pages */
31140 + truncate:
31141 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
31142 +           index == 0)
31143 +               return; /* cluster 0 is kept while conversion is running */
31144 +       truncate_page_cluster_range(inode, pages, index, 0,
31145 +                                   cluster_nrpages(inode),
31146 +                                   even_cows);
31147 +       assert("edward-1201",
31148 +              ergo(!reiser4_inode_get_flag(inode,
31149 +                                           REISER4_FILE_CONV_IN_PROGRESS),
31150 +                   jnode_truncate_ok(inode, index)));
31151 +       return;
31152 +}
31153 +
31154 +/*
31155 + * Set cluster handle @clust of a logical cluster before
31156 + * modifications which are supposed to be committed.
31157 + *
31158 + * . grab cluster pages;
31159 + * . reserve disk space;
31160 + * . maybe read pages from disk and set the disk cluster dirty;
31161 + * . maybe write hole and check in (partially zeroed) logical cluster;
31162 + * . create 'unprepped' disk cluster for new or fake logical one.
31163 + * Returns 0 on success, otherwise an error code. */
31164 +static int prepare_logical_cluster(struct inode *inode,
31165 +                                  loff_t file_off, /* write position
31166 +                                                      in the file */
31167 +                                  loff_t to_file, /* bytes of users data
31168 +                                                     to write to the file */
31169 +                                  struct cluster_handle * clust,
31170 +                                  logical_cluster_op op)
31171 +{
31172 +       int result = 0;
31173 +       struct reiser4_slide * win = clust->win;
31174 +
31175 +       reset_cluster_params(clust);
31176 +       cluster_set_tfm_act(&clust->tc, TFMA_READ);
31177 +#if REISER4_DEBUG
31178 +       clust->ctx = get_current_context();
31179 +#endif
31180 +       assert("edward-1190", op != LC_INVAL);
31181 +
31182 +       clust->op = op;
31183 +
31184 +       result = prepare_page_cluster(inode, clust, WRITE_OP);
31185 +       if (result)
31186 +               return result; /* no pages grabbed, nothing to undo */
31187 +       assert("edward-1447",
31188 +              ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
31189 +       assert("edward-1448",
31190 +              ergo(clust->nr_pages != 0,
31191 +                   jnode_is_cluster_page(jprivate(clust->pages[0]))));
31192 +
31193 +       result = reserve4cluster(inode, clust);
31194 +       if (result)
31195 +               goto err1;
31196 +       result = read_some_cluster_pages(inode, clust);
31197 +       if (result) {
31198 +               free_reserved4cluster(inode,
31199 +                                     clust,
31200 +                                     estimate_update_cluster(inode) +
31201 +                                     estimate_insert_cluster(inode));
31202 +               goto err1;
31203 +       }
31204 +       assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31205 +
31206 +       result = cryptcompress_make_unprepped_cluster(clust, inode);
31207 +       if (result)
31208 +               goto err2;
31209 +       if (win && win->stat == HOLE_WINDOW) {
31210 +               result = write_hole(inode, clust, file_off, to_file);
31211 +               if (result)
31212 +                       goto err2;
31213 +       }
31214 +       return 0;
31215 + err2:
31216 +       free_reserved4cluster(inode, clust,
31217 +                             estimate_update_cluster(inode));
31218 + err1:
31219 +       put_page_cluster(clust, inode, WRITE_OP);
31220 +       assert("edward-1125", result == -ENOSPC); /* NOTE(review): only? */
31221 +       return result;
31222 +}
31223 +
31224 +/* set window to [@o1, @o2), cut at the end of the cluster containing @o1 */
31225 +static void set_window(struct cluster_handle * clust,
31226 +                      struct reiser4_slide * win, struct inode *inode,
31227 +                      loff_t o1, loff_t o2)
31228 +{
31229 +       assert("edward-295", clust != NULL);
31230 +       assert("edward-296", inode != NULL);
31231 +       assert("edward-1071", win != NULL);
31232 +       assert("edward-297", o1 <= o2);
31233 +
31234 +       clust->index = off_to_clust(o1, inode);
31235 +
31236 +       win->off = off_to_cloff(o1, inode);
31237 +       win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31238 +                        o2 - o1);
31239 +       win->delta = 0;
31240 +
31241 +       clust->win = win;
31242 +}
31243 +
31244 +static int set_cluster_by_window(struct inode *inode,
31245 +                                struct cluster_handle * clust,
31246 +                                struct reiser4_slide * win, size_t length,
31247 +                                loff_t file_off)
31248 +{
31249 +       int result;
31250 +
31251 +       assert("edward-197", clust != NULL);
31252 +       assert("edward-1072", win != NULL);
31253 +       assert("edward-198", inode != NULL);
31254 +
31255 +       result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31256 +       if (result)
31257 +               return result;
31258 +
31259 +       if (file_off > i_size_read(inode)) {
31260 +               /* write starts beyond i_size: hole in cryptcompress file */
31261 +               loff_t hole_size;
31262 +               hole_size = file_off - inode->i_size;
31263 +
31264 +               set_window(clust, win, inode, inode->i_size, file_off);
31265 +               win->stat = HOLE_WINDOW;
31266 +               if (win->off + hole_size < inode_cluster_size(inode))
31267 +                       /* there is also user's data to append to the hole */
31268 +                       win->delta = min(inode_cluster_size(inode) -
31269 +                                        (win->off + win->count), length);
31270 +               return 0;
31271 +       }
31272 +       set_window(clust, win, inode, file_off, file_off + length);
31273 +       win->stat = DATA_WINDOW; /* no hole: window covers user's data */
31274 +       return 0;
31275 +}
31276 +
31277 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31278 +                       int count)
31279 +{
31280 +       int result = 0;
31281 +       int (*setting_actor)(struct cluster_handle * clust, int count);
31282 +
31283 +       assert("edward-1358", clust != NULL);
31284 +       assert("edward-1359", page != NULL);
31285 +       assert("edward-1360", page->mapping != NULL);
31286 +       assert("edward-1361", page->mapping->host != NULL);
31287 +
31288 +       setting_actor =
31289 +               (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31290 +       result = setting_actor(clust, count); /* reset if present, else alloc */
31291 +       clust->index = pg_to_clust(page->index, page->mapping->host);
31292 +       return result; /* index is set even when the actor failed */
31293 +}
31294 +
31295 +/* reset the params that do not get updated: dstat, tc.uptodate, tc.len */
31296 +void reset_cluster_params(struct cluster_handle * clust)
31297 +{
31298 +       assert("edward-197", clust != NULL);
31299 +
31300 +       clust->dstat = INVAL_DISK_CLUSTER;
31301 +       clust->tc.uptodate = 0;
31302 +       clust->tc.len = 0;
31303 +}
31304 +
31305 +/* the heart of write_cryptcompress: returns bytes written, else an error */
31306 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31307 +                                    const char __user *buf, size_t to_write,
31308 +                                    loff_t pos, struct dispatch_context *cont)
31309 +{
31310 +       int i;
31311 +       hint_t *hint;
31312 +       int result = 0;
31313 +       size_t count;
31314 +       struct reiser4_slide win;
31315 +       struct cluster_handle clust;
31316 +       struct cryptcompress_info * info;
31317 +
31318 +       assert("edward-154", buf != NULL);
31319 +       assert("edward-161", reiser4_schedulable());
31320 +       assert("edward-748", cryptcompress_inode_ok(inode));
31321 +       assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31322 +       assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31323 +
31324 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31325 +       if (hint == NULL)
31326 +               return RETERR(-ENOMEM);
31327 +
31328 +       result = load_file_hint(file, hint);
31329 +       if (result) {
31330 +               kfree(hint);
31331 +               return result;
31332 +       }
31333 +       count = to_write; /* bytes still to be written */
31334 +
31335 +       reiser4_slide_init(&win);
31336 +       cluster_init_read(&clust, &win);
31337 +       clust.hint = hint;
31338 +       info = cryptcompress_inode_data(inode);
31339 +
31340 +       mutex_lock(&info->checkin_mutex);
31341 +
31342 +       result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31343 +       if (result)
31344 +               goto out;
31345 +
31346 +       if (next_window_stat(&win) == HOLE_WINDOW) {
31347 +               /* write hole in this iteration
31348 +                  separated from the loop below */
31349 +               result = write_dispatch_hook(file, inode,
31350 +                                            pos, &clust, cont);
31351 +               if (result)
31352 +                       goto out;
31353 +               result = prepare_logical_cluster(inode, pos, count, &clust,
31354 +                                                LC_APPOV);
31355 +               if (result)
31356 +                       goto out;
31357 +       }
31358 +       do {
31359 +               const char __user * src;
31360 +               unsigned page_off, to_page;
31361 +
31362 +               assert("edward-750", reiser4_schedulable());
31363 +
31364 +               result = write_dispatch_hook(file, inode,
31365 +                                            pos + to_write - count,
31366 +                                            &clust, cont);
31367 +               if (result)
31368 +                       goto out;
31369 +               if (cont->state == DISPATCH_ASSIGNED_NEW)
31370 +                       /* done_lh was called in write_dispatch_hook */
31371 +                       goto out_no_longterm_lock;
31372 +
31373 +               result = prepare_logical_cluster(inode, pos, count, &clust,
31374 +                                                LC_APPOV);
31375 +               if (result)
31376 +                       goto out;
31377 +
31378 +               assert("edward-751", cryptcompress_inode_ok(inode));
31379 +               assert("edward-204", win.stat == DATA_WINDOW);
31380 +               assert("edward-1288", hint_is_valid(clust.hint));
31381 +               assert("edward-752",
31382 +                      znode_is_write_locked(hint->ext_coord.coord.node));
31383 +               put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31384 +
31385 +               /* set write position in page */
31386 +               page_off = off_to_pgoff(win.off);
31387 +
31388 +               /* copy user's data to cluster pages */
31389 +               for (i = off_to_pg(win.off), src = buf;
31390 +                    i < size_in_pages(win.off + win.count);
31391 +                    i++, src += to_page) {
31392 +                       to_page = __mbp(win.off + win.count, i) - page_off;
31393 +                       assert("edward-1039",
31394 +                              page_off + to_page <= PAGE_CACHE_SIZE);
31395 +                       assert("edward-287", clust.pages[i] != NULL);
31396 +
31397 +                       fault_in_pages_readable(src, to_page);
31398 +
31399 +                       lock_page(clust.pages[i]);
31400 +                       result =
31401 +                           __copy_from_user((char *)kmap(clust.pages[i]) +
31402 +                                            page_off, src, to_page);
31403 +                       kunmap(clust.pages[i]);
31404 +                       if (unlikely(result)) {
31405 +                               unlock_page(clust.pages[i]);
31406 +                               result = -EFAULT;
31407 +                               goto err2;
31408 +                       }
31409 +                       SetPageUptodate(clust.pages[i]);
31410 +                       set_page_dirty_notag(clust.pages[i]);
31411 +                       flush_dcache_page(clust.pages[i]);
31412 +                       mark_page_accessed(clust.pages[i]);
31413 +                       unlock_page(clust.pages[i]);
31414 +                       page_off = 0; /* following pages start at offset 0 */
31415 +               }
31416 +               assert("edward-753", cryptcompress_inode_ok(inode));
31417 +
31418 +               result = checkin_logical_cluster(&clust, inode);
31419 +               if (result)
31420 +                       goto err2;
31421 +
31422 +               buf   += win.count;
31423 +               count -= win.count;
31424 +
31425 +               result = balance_dirty_page_cluster(&clust, inode, 0, count,
31426 +                                                   win_count_to_nrpages(&win));
31427 +               if (result)
31428 +                       goto err1;
31429 +               assert("edward-755", hint->lh.owner == NULL);
31430 +               reset_cluster_params(&clust);
31431 +               continue; /* success: skip the error labels below */
31432 +       err2:
31433 +               put_page_cluster(&clust, inode, WRITE_OP);
31434 +       err1:
31435 +               if (clust.reserved)
31436 +                       free_reserved4cluster(inode,
31437 +                                             &clust,
31438 +                                             estimate_update_cluster(inode));
31439 +               break;
31440 +       } while (count);
31441 + out:
31442 +       done_lh(&hint->lh);
31443 +       save_file_hint(file, hint);
31444 + out_no_longterm_lock:
31445 +       mutex_unlock(&info->checkin_mutex);
31446 +       kfree(hint);
31447 +       put_cluster_handle(&clust);
31448 +       assert("edward-195",
31449 +              ergo((to_write == count),
31450 +                   (result < 0 || cont->state == DISPATCH_ASSIGNED_NEW)));
31451 +       return (to_write - count) ? (to_write - count) : result;
31452 +}
31453 +
31454 +/**
31455 + * plugin->write()
31456 + * @file: file to write to
31457 + * @buf: address of user-space buffer
31458 + * @count: number of bytes to write
31459 + * @off: position in file to write to; advanced by the bytes written
31460 + * @cont: dispatch context handed to do_write_cryptcompress() */
31461 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31462 +                           size_t count, loff_t *off,
31463 +                           struct dispatch_context *cont)
31464 +{
31465 +       ssize_t result;
31466 +       struct inode *inode;
31467 +       reiser4_context *ctx;
31468 +       loff_t pos = *off;
31469 +       struct cryptcompress_info *info;
31470 +
31471 +       assert("edward-1449", cont->state == DISPATCH_INVAL_STATE);
31472 +
31473 +       inode = file->f_dentry->d_inode;
31474 +       assert("edward-196", cryptcompress_inode_ok(inode));
31475 +
31476 +       info = cryptcompress_inode_data(inode);
31477 +       ctx = get_current_context();
31478 +
31479 +       result = generic_write_checks(file, &pos, &count, 0);
31480 +       if (unlikely(result != 0)) {
31481 +               context_set_commit_async(ctx);
31482 +               return result;
31483 +       }
31484 +       if (unlikely(count == 0))
31485 +               return 0;
31486 +       result = file_remove_suid(file);
31487 +       if (unlikely(result != 0)) {
31488 +               context_set_commit_async(ctx);
31489 +               return result;
31490 +       }
31491 +       /* remove_suid might create a transaction */
31492 +       reiser4_txn_restart(ctx);
31493 +
31494 +       result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
31495 +
31496 +       if (unlikely(result < 0)) {
31497 +               context_set_commit_async(ctx);
31498 +               return result;
31499 +       }
31500 +       /* update position in a file */
31501 +       *off = pos + result;
31502 +       return result;
31503 +}
31504 +
31505 +/* plugin->readpages; @nr_pages is not used by this implementation */
31506 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31507 +                           struct list_head *pages, unsigned nr_pages)
31508 +{
31509 +       reiser4_context * ctx;
31510 +       int ret;
31511 +
31512 +       ctx = reiser4_init_context(mapping->host->i_sb);
31513 +       if (IS_ERR(ctx)) {
31514 +               ret = PTR_ERR(ctx);
31515 +               goto err;
31516 +       }
31517 +       /* cryptcompress file can be built of ctail items only */
31518 +       ret = readpages_ctail(file, mapping, pages);
31519 +       reiser4_txn_restart(ctx);
31520 +       reiser4_exit_context(ctx);
31521 +       if (ret) {
31522 +err:           /* also entered by goto when context init failed */
31523 +               put_pages_list(pages);
31524 +       }
31525 +       return ret;
31526 +}
31527 +
31528 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31529 +{
31530 +       /* reserve one block to update stat data item; the file plugin's
31531 +        * update estimate is asserted to be the common one used below */
31532 +       assert("edward-1193", inode_file_plugin(inode)->estimate.update ==
31533 +              estimate_update_common);
31534 +       return estimate_update_common(inode);
31535 +}
31536 +
31537 +/**
31538 + * plugin->read
31539 + * @file: file to read from
31540 + * @buf: address of user-space buffer
31541 + * @size: number of bytes to read
31542 + * @off: position in file to read from
31543 + */
31544 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31545 +                          loff_t * off)
31546 +{
31547 +       ssize_t result;
31548 +       struct inode *inode;
31549 +       reiser4_context *ctx;
31550 +       struct cryptcompress_info *info;
31551 +       reiser4_block_nr needed;
31552 +
31553 +       inode = file->f_dentry->d_inode;
31554 +       assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31555 +
31556 +       ctx = reiser4_init_context(inode->i_sb);
31557 +       if (IS_ERR(ctx))
31558 +               return PTR_ERR(ctx);
31559 +
31560 +       info = cryptcompress_inode_data(inode);
31561 +       needed = cryptcompress_estimate_read(inode);
31562 +
31563 +       result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31564 +       if (result != 0) {
31565 +               reiser4_exit_context(ctx);
31566 +               return result;
31567 +       }
31568 +       result = do_sync_read(file, buf, size, off); /* the actual read */
31569 +
31570 +       context_set_commit_async(ctx);
31571 +       reiser4_exit_context(ctx);
31572 +
31573 +       return result;
31574 +}
31575 +
31576 +/* Look for a disk cluster and keep lookup result in @found.
31577 + * If @index > 0, then find disk cluster of the index (@index - 1);
31578 + * If @index == 0, then find the rightmost disk cluster.
31579 + * Keep incremented index of the found disk cluster in @found.
31580 + * @found == 0 means that disk cluster was not found (in the last
31581 + * case (@index == 0) it means that file doesn't have disk clusters).
31582 + * Returns 0 or an error; @found is not written on error. */
31583 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31584 +                              cloff_t index)
31585 +{
31586 +       int result;
31587 +       reiser4_key key;
31588 +       loff_t offset;
31589 +       hint_t *hint;
31590 +       lock_handle *lh;
31591 +       lookup_bias bias;
31592 +       coord_t *coord;
31593 +       item_plugin *iplug;
31594 +
31595 +       assert("edward-1131", inode != NULL);
31596 +       assert("edward-95", cryptcompress_inode_ok(inode));
31597 +
31598 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31599 +       if (hint == NULL)
31600 +               return RETERR(-ENOMEM);
31601 +       hint_init_zero(hint);
31602 +       lh = &hint->lh;
31603 +
31604 +       bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31605 +       offset =
31606 +           (index ? clust_to_off(index, inode) -
31607 +            1 : get_key_offset(reiser4_max_key()));
31608 +
31609 +       key_by_inode_cryptcompress(inode, offset, &key);
31610 +
31611 +       /* find the requested item, or the last one of this object */
31612 +       result =
31613 +           find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31614 +                             bias, 0);
31615 +       if (cbk_errored(result)) {
31616 +               done_lh(lh);
31617 +               kfree(hint);
31618 +               return result;
31619 +       }
31620 +       if (result == CBK_COORD_NOTFOUND) {
31621 +               /* no real disk clusters */
31622 +               done_lh(lh);
31623 +               kfree(hint);
31624 +               *found = 0;
31625 +               return 0;
31626 +       }
31627 +       /* disk cluster is found */
31628 +       coord = &hint->ext_coord.coord;
31629 +       coord_clear_iplug(coord);
31630 +       result = zload(coord->node);
31631 +       if (unlikely(result)) {
31632 +               done_lh(lh);
31633 +               kfree(hint);
31634 +               return result;
31635 +       }
31636 +       iplug = item_plugin_by_coord(coord);
31637 +       assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31638 +       assert("edward-1202", ctail_ok(coord));
31639 +
31640 +       item_key_by_coord(coord, &key);
31641 +       *found = off_to_clust(get_key_offset(&key), inode) + 1;
31642 +
31643 +       assert("edward-1132", ergo(index, index == *found));
31644 +
31645 +       zrelse(coord->node);
31646 +       done_lh(lh);
31647 +       kfree(hint);
31648 +       return 0;
31649 +}
31650 +
31651 +static int find_fake_appended(struct inode *inode, cloff_t * index)
31652 +{
31653 +       return lookup_disk_cluster(inode, index,
31654 +                                  0 /* 0: find the rightmost real one */ );
31655 +}
31656 +
31657 +/* Set left coord when unit is not found after node_lookup().
31658 +   This takes into account that there can be holes in a sequence
31659 +   of disk clusters: AFTER_UNIT is turned into AFTER_ITEM. */
31660 +
31661 +static void adjust_left_coord(coord_t * left_coord)
31662 +{
31663 +       switch (left_coord->between) {
31664 +       case AFTER_UNIT:
31665 +               left_coord->between = AFTER_ITEM; /* fall through */
31666 +       case AFTER_ITEM:
31667 +       case BEFORE_UNIT:
31668 +               break;
31669 +       default:
31670 +               impossible("edward-1204", "bad left coord to cut");
31671 +       }
31672 +       return;
31673 +}
31674 +
31675 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
31676 +
31677 +/* plugin->cut_tree_worker: kill [@from_key, @to_key], moving leftwards */
31678 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31679 +                                 const reiser4_key * to_key,
31680 +                                 reiser4_key * smallest_removed,
31681 +                                 struct inode *object, int truncate,
31682 +                                 int *progress)
31683 +{
31684 +       lock_handle next_node_lock;
31685 +       coord_t left_coord;
31686 +       int result;
31687 +
31688 +       assert("edward-1158", tap->coord->node != NULL);
31689 +       assert("edward-1159", znode_is_write_locked(tap->coord->node));
31690 +       assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31691 +
31692 +       *progress = 0;
31693 +       init_lh(&next_node_lock);
31694 +
31695 +       while (1) {
31696 +               znode *node;    /* node from which items are cut */
31697 +               node_plugin *nplug;     /* node plugin for @node */
31698 +
31699 +               node = tap->coord->node;
31700 +
31701 +               /* Move next_node_lock to the next node on the left. */
31702 +               result =
31703 +                   reiser4_get_left_neighbor(&next_node_lock, node,
31704 +                                             ZNODE_WRITE_LOCK,
31705 +                                             GN_CAN_USE_UPPER_LEVELS);
31706 +               if (result != 0 && result != -E_NO_NEIGHBOR)
31707 +                       break;
31708 +               /* FIXME-EDWARD: Check can we delete the node as a whole. */
31709 +               result = reiser4_tap_load(tap);
31710 +               if (result)
31711 +                       break; /* not return: next_node_lock must be released */
31712 +
31713 +               /* Prepare the second (right) point for cut_node() */
31714 +               if (*progress)
31715 +                       coord_init_last_unit(tap->coord, node);
31716 +
31717 +               else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31718 +                       /* set rightmost unit for the items without lookup method */
31719 +                       tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31720 +
31721 +               nplug = node->nplug;
31722 +
31723 +               assert("edward-1161", nplug);
31724 +               assert("edward-1162", nplug->lookup);
31725 +
31726 +               /* left_coord is leftmost unit cut from @node */
31727 +               result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31728 +
31729 +               if (IS_CBKERR(result))
31730 +                       break;
31731 +
31732 +               if (result == CBK_COORD_NOTFOUND)
31733 +                       adjust_left_coord(&left_coord);
31734 +
31735 +               /* adjust coordinates so that they are set to existing units */
31736 +               if (coord_set_to_right(&left_coord)
31737 +                   || coord_set_to_left(tap->coord)) {
31738 +                       result = 0;
31739 +                       break;
31740 +               }
31741 +
31742 +               if (coord_compare(&left_coord, tap->coord) ==
31743 +                   COORD_CMP_ON_RIGHT) {
31744 +                       /* keys from @from_key to @to_key are not in the tree */
31745 +                       result = 0;
31746 +                       break;
31747 +               }
31748 +
31749 +               /* cut data from one node */
31750 +               *smallest_removed = *reiser4_min_key();
31751 +               result = kill_node_content(&left_coord,
31752 +                                          tap->coord,
31753 +                                          from_key,
31754 +                                          to_key,
31755 +                                          smallest_removed,
31756 +                                          next_node_lock.node,
31757 +                                          object, truncate);
31758 +               reiser4_tap_relse(tap);
31759 +
31760 +               if (result)
31761 +                       break;
31762 +
31763 +               ++(*progress); /* one more node has been cut */
31764 +
31765 +               /* Check whether all items with keys >= from_key were removed
31766 +                * from the tree. */
31767 +               if (keyle(smallest_removed, from_key))
31768 +                       /* result = 0; */
31769 +                       break;
31770 +
31771 +               if (next_node_lock.node == NULL)
31772 +                       break;
31773 +
31774 +               result = reiser4_tap_move(tap, &next_node_lock);
31775 +               done_lh(&next_node_lock);
31776 +               if (result)
31777 +                       break;
31778 +
31779 +               /* Break long cut_tree operation (deletion of a large file) if
31780 +                * atom requires commit. */
31781 +               if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31782 +                   && current_atom_should_commit()) {
31783 +                       result = -E_REPEAT;
31784 +                       break;
31785 +               }
31786 +       }
31787 +       done_lh(&next_node_lock); /* common exit: drop the neighbor lock */
31788 +       return result;
31789 +}
31790 +
31791 +/* Append or expand hole at the end of @inode in two steps:
31792 + * 1) set zeroes from the old i_size to the end of the rightmost non-fake
31793 + *    logical cluster, or to @new_size if that comes first (a HOLE_WINDOW);
31794 + * 2) expand hole via fake logical clusters (just increase i_size).
31795 + * Returns 0 on success or a non-zero error code (e.g. -ENOMEM). */
31796 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31797 +                                    loff_t new_size)
31798 +{
31799 +       int result = 0;
31800 +       hint_t *hint;
31801 +       lock_handle *lh;
31802 +       loff_t hole_size;
31803 +       int nr_zeroes;
31804 +       struct reiser4_slide win;
31805 +       struct cluster_handle clust;
31806 +
31807 +       assert("edward-1133", inode->i_size < new_size);
31808 +       assert("edward-1134", reiser4_schedulable());
31809 +       assert("edward-1135", cryptcompress_inode_ok(inode));
31810 +       assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31811 +       assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0); /* old size has a non-zero offset in its logical cluster */
31812 +
31813 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31814 +       if (hint == NULL)
31815 +               return RETERR(-ENOMEM);
31816 +       hint_init_zero(hint);
31817 +       lh = &hint->lh; /* released via done_lh() at "out" */
31818 +
31819 +       reiser4_slide_init(&win);
31820 +       cluster_init_read(&clust, &win);
31821 +       clust.hint = hint;
31822 +
31823 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31824 +       if (result)
31825 +               goto out;
31826 +       if (off_to_cloff(inode->i_size, inode) == 0)
31827 +               goto append_fake; /* zero offset in cluster: skip step 1, only grow i_size */
31828 +       hole_size = new_size - inode->i_size;
31829 +       nr_zeroes =
31830 +               inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode); /* bytes left in the current logical cluster */
31831 +       if (hole_size < nr_zeroes)
31832 +               nr_zeroes = hole_size; /* the hole ends inside this logical cluster */
31833 +       set_window(&clust, &win, inode, inode->i_size,
31834 +                  inode->i_size + nr_zeroes);
31835 +       win.stat = HOLE_WINDOW;
31836 +
31837 +       assert("edward-1137",
31838 +              clust.index == off_to_clust(inode->i_size, inode));
31839 +
31840 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
31841 +
31842 +       assert("edward-1271", !result || result == -ENOSPC);
31843 +       if (result)
31844 +               goto out;
31845 +       assert("edward-1139",
31846 +              clust.dstat == PREP_DISK_CLUSTER ||
31847 +              clust.dstat == UNPR_DISK_CLUSTER);
31848 +
31849 +       assert("edward-1431", hole_size >= nr_zeroes);
31850 +       if (hole_size == nr_zeroes)
31851 +       /* nothing to append anymore (i_size is not set here; presumably already updated by prepare_logical_cluster() - TODO confirm) */
31852 +               goto out;
31853 + append_fake:
31854 +       INODE_SET_SIZE(inode, new_size);
31855 + out:
31856 +       done_lh(lh);
31857 +       kfree(hint);
31858 +       put_cluster_handle(&clust);
31859 +       return result;
31860 +}
31861 +
31862 +static int update_cryptcompress_size(struct inode *inode, loff_t new_size,
31863 +                                    int update_sd) /* size-update callback handed to cut_file_items() by prune_cryptcompress() */
31864 +{
31865 +       return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1) /* low bits of @new_size under the (cluster size - 1) mask */
31866 +               ? 0 : reiser4_update_file_size(inode, new_size, update_sd)); /* non-zero low bits: do nothing; otherwise update the file size */
31867 +}
31868 +
31869 +/* Prune cryptcompress file in two steps:
31870 + * 1) cut all nominated logical clusters except the leftmost one which
31871 + *    is to be partially truncated. Note that there can be "holes"
31872 + *    represented by fake logical clusters.
31873 + * 2) set zeroes and capture leftmost partially truncated logical
31874 + *    cluster, if it is not fake; otherwise prune fake logical cluster
31875 + *    (just decrease i_size).
31876 + * @aidx is supplied by the caller; returns 0 or an error code. */
31877 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
31878 +                              int update_sd, cloff_t aidx)
31879 +{
31880 +       int result = 0;
31881 +       unsigned nr_zeroes;
31882 +       loff_t to_prune;
31883 +       loff_t old_size;
31884 +       cloff_t ridx;
31885 +
31886 +       hint_t *hint;
31887 +       lock_handle *lh;
31888 +       struct reiser4_slide win;
31889 +       struct cluster_handle clust;
31890 +
31891 +       assert("edward-1140", inode->i_size >= new_size);
31892 +       assert("edward-1141", reiser4_schedulable());
31893 +       assert("edward-1142", cryptcompress_inode_ok(inode));
31894 +       assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
31895 +
31896 +       old_size = inode->i_size; /* NOTE(review): old_size is assigned here and again below but never read in this function */
31897 +
31898 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31899 +       if (hint == NULL)
31900 +               return RETERR(-ENOMEM);
31901 +       hint_init_zero(hint);
31902 +       lh = &hint->lh; /* released via done_lh() at "out" */
31903 +
31904 +       reiser4_slide_init(&win);
31905 +       cluster_init_read(&clust, &win);
31906 +       clust.hint = hint;
31907 +
31908 +       /* calculate index of the rightmost logical cluster
31909 +          that will be completely truncated */
31910 +       ridx = size_in_lc(new_size, inode);
31911 +
31912 +       /* truncate all disk clusters starting from @ridx */
31913 +       assert("edward-1174", ridx <= aidx);
31914 +       old_size = inode->i_size;
31915 +       if (ridx != aidx) {
31916 +               struct cryptcompress_info * info;
31917 +               info = cryptcompress_inode_data(inode);
31918 +               result = cut_file_items(inode,
31919 +                                       clust_to_off(ridx, inode),
31920 +                                       update_sd,
31921 +                                       clust_to_off(aidx, inode),
31922 +                                       update_cryptcompress_size);
31923 +               info->trunc_index = ULONG_MAX; /* reset regardless of whether the cut succeeded */
31924 +               if (result)
31925 +                       goto out;
31926 +       }
31927 +       /*
31928 +        * there can be pages of fake logical clusters, truncate them
31929 +        */
31930 +       truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
31931 +       assert("edward-1524",
31932 +              pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
31933 +       /*
31934 +        * now perform partial truncate of last logical cluster
31935 +        */
31936 +       if (!off_to_cloff(new_size, inode)) {
31937 +               /* no partial truncate is needed */
31938 +               assert("edward-1145", inode->i_size == new_size);
31939 +               goto truncate_fake;
31940 +       }
31941 +       assert("edward-1146", new_size < inode->i_size);
31942 +
31943 +       to_prune = inode->i_size - new_size;
31944 +
31945 +       /* check if the last logical cluster is fake */
31946 +       result = lookup_disk_cluster(inode, &aidx, ridx);
31947 +       if (result)
31948 +               goto out;
31949 +       if (!aidx)
31950 +               /* yup, this is fake one */
31951 +               goto truncate_fake;
31952 +
31953 +       assert("edward-1148", aidx == ridx);
31954 +
31955 +       /* do partial truncate of the last page cluster,
31956 +          and try to capture this one */
31957 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31958 +       if (result)
31959 +               goto out;
31960 +       nr_zeroes = (off_to_pgoff(new_size) ?
31961 +                    PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0); /* bytes from @new_size to the end of its page, 0 if page-aligned */
31962 +       set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
31963 +       win.stat = HOLE_WINDOW;
31964 +
31965 +       assert("edward-1149", clust.index == ridx - 1);
31966 +
31967 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
31968 +       if (result)
31969 +               goto out;
31970 +       assert("edward-1151",
31971 +              clust.dstat == PREP_DISK_CLUSTER ||
31972 +              clust.dstat == UNPR_DISK_CLUSTER);
31973 +
31974 +       assert("edward-1191", inode->i_size == new_size);
31975 +       assert("edward-1206", body_truncate_ok(inode, ridx));
31976 + truncate_fake:
31977 +       /* drop all the pages that don't have jnodes (i.e. pages
31978 +          which can not be truncated by cut_file_items() because
31979 +          of holes represented by fake disk clusters) including
31980 +          the pages of partially truncated cluster which was
31981 +          released by prepare_logical_cluster() */
31982 +       INODE_SET_SIZE(inode, new_size);
31983 +       truncate_inode_pages(inode->i_mapping, new_size);
31984 + out:
31985 +       assert("edward-1334", !result || result == -ENOSPC);
31986 +       assert("edward-1497",
31987 +              pages_truncate_ok(inode, size_in_pages(new_size)));
31988 +
31989 +       done_lh(lh);
31990 +       kfree(hint);
31991 +       put_cluster_handle(&clust);
31992 +       return result;
31993 +}
31994 +
31995 +/* Prepare cryptcompress file for truncate: prune or append rightmost fake
31996 + * logical clusters (if any) by adjusting i_size; @aidx is the index obtained
31997 + * by the caller from find_fake_appended(). Returns 0 or an error code. */
31998 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
31999 +                              loff_t new_size, int update_sd)
32000 +{
32001 +       int result = 0;
32002 +       loff_t bytes; /* loff_t, not int: a size delta can exceed INT_MAX */
32003 +
32004 +       if (new_size > inode->i_size) {
32005 +               /* append */
32006 +               if (inode->i_size < clust_to_off(aidx, inode))
32007 +                       /* no fake bytes */
32008 +                       return 0;
32009 +               bytes = new_size - inode->i_size;
32010 +               INODE_SET_SIZE(inode, inode->i_size + bytes);
32011 +       } else {
32012 +               /* prune */
32013 +               if (inode->i_size <= clust_to_off(aidx, inode))
32014 +                       /* no fake bytes */
32015 +                       return 0;
32016 +               bytes = inode->i_size -
32017 +                       max(new_size, clust_to_off(aidx, inode));
32018 +               if (!bytes)
32019 +                       return 0;
32020 +               INODE_SET_SIZE(inode, inode->i_size - bytes);
32021 +               /* In the case of fake prune we need to drop page cluster.
32022 +                  There are only 2 cases for partially truncated page:
32023 +                  1. If it is dirty, therefore it is anonymous
32024 +                  (was dirtied via mmap), and will be captured
32025 +                  later via ->capture().
32026 +                  2. If it is clean, therefore it is filled by zeroes.
32027 +                  In both cases we don't need to make it dirty and
32028 +                  capture here.
32029 +                */
32030 +               truncate_inode_pages(inode->i_mapping, inode->i_size);
32031 +       }
32032 +       if (update_sd)
32033 +               result = update_sd_cryptcompress(inode);
32034 +       return result;
32035 +}
32036 +
32037 +/**
32038 + * This is called in setattr_cryptcompress when it is used to truncate,
32039 + * and in delete_object_cryptcompress. Both callers hold info->checkin_mutex.
32040 + */
32041 +static int cryptcompress_truncate(struct inode *inode, /* old size */
32042 +                                 loff_t new_size,      /* new size */
32043 +                                 int update_sd)
32044 +{
32045 +       int result;
32046 +       cloff_t aidx;
32047 +
32048 +       result = find_fake_appended(inode, &aidx);
32049 +       if (result)
32050 +               return result;
32051 +       assert("edward-1208",
32052 +              ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
32053 +
32054 +       result = start_truncate_fake(inode, aidx, new_size, update_sd); /* first adjust i_size over fake clusters */
32055 +       if (result)
32056 +               return result;
32057 +       if (inode->i_size == new_size)
32058 +               /* nothing to truncate anymore */
32059 +               return 0;
32060 +       result = (inode->i_size < new_size ?
32061 +                 cryptcompress_append_hole(inode, new_size) :
32062 +                 prune_cryptcompress(inode, new_size, update_sd, aidx)); /* grow via hole, or cut real clusters */
32063 +       if (!result && update_sd)
32064 +               result = update_sd_cryptcompress(inode);
32065 +       return result;
32066 +}
32067 +
32068 +/**
32069 + * Capture a page cluster.
32070 + * @clust must be set up by a caller (its dstat must be INVAL_DISK_CLUSTER).
32071 + */
32072 +static int capture_page_cluster(struct cluster_handle * clust,
32073 +                               struct inode * inode)
32074 +{
32075 +       int result;
32076 +
32077 +       assert("edward-1073", clust != NULL);
32078 +       assert("edward-1074", inode != NULL);
32079 +       assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
32080 +
32081 +       result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
32082 +       if (result)
32083 +               return result; /* preparation failed: the steps below are skipped */
32084 +
32085 +       set_cluster_pages_dirty(clust, inode);
32086 +       result = checkin_logical_cluster(clust, inode);
32087 +       put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
32088 +       if (unlikely(result))
32089 +               put_page_cluster(clust, inode, WRITE_OP); /* only on check-in failure */
32090 +       return result;
32091 +}
32092 +
32093 +/* Starting from @index, find pages tagged PAGECACHE_TAG_REISER4_MOVED that
32094 + * belong to the same page cluster as the first one found. For each of them:
32095 + * take a reference, clear the tag, advance @index. Return number of found pages. */
32096 +static int find_anon_page_cluster(struct address_space * mapping,
32097 +                                 pgoff_t * index, struct page ** pages)
32098 +{
32099 +       int i = 0;
32100 +       int found;
32101 +       spin_lock_irq(&mapping->tree_lock);
32102 +       do {
32103 +               /* looking for one page */
32104 +               found = radix_tree_gang_lookup_tag(&mapping->page_tree,
32105 +                                                  (void **)&pages[i],
32106 +                                                  *index, 1,
32107 +                                                  PAGECACHE_TAG_REISER4_MOVED);
32108 +               if (!found)
32109 +                       break;
32110 +               if (!same_page_cluster(pages[0], pages[i]))
32111 +                       break; /* page of another cluster: left tagged, not counted */
32112 +
32113 +               /* found */
32114 +               page_cache_get(pages[i]);
32115 +               *index = pages[i]->index + 1;
32116 +
32117 +               radix_tree_tag_clear(&mapping->page_tree,
32118 +                                    pages[i]->index,
32119 +                                    PAGECACHE_TAG_REISER4_MOVED);
32120 +               if (last_page_in_cluster(pages[i++]))
32121 +                       break;
32122 +       } while (1);
32123 +       spin_unlock_irq(&mapping->tree_lock);
32124 +       return i;
32125 +}
32126 +
32127 +#define MAX_PAGES_TO_CAPTURE  (1024) /* cap on to_capture per pass in writepages_cryptcompress() */
32128 +
32129 +/* Capture anonymous page clusters of @mapping, searching from *@index */
32130 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
32131 +                             int to_capture) /* page budget; returns pages captured, or an error code */
32132 +{
32133 +       int count = 0;
32134 +       int found = 0;
32135 +       int result = 0;
32136 +       hint_t *hint;
32137 +       lock_handle *lh;
32138 +       struct inode * inode;
32139 +       struct cluster_handle clust;
32140 +       struct page * pages[MAX_CLUSTER_NRPAGES];
32141 +
32142 +       assert("edward-1127", mapping != NULL);
32143 +       assert("edward-1128", mapping->host != NULL);
32144 +       assert("edward-1440", mapping->host->i_mapping == mapping);
32145 +
32146 +       inode = mapping->host;
32147 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32148 +       if (hint == NULL)
32149 +               return RETERR(-ENOMEM);
32150 +       hint_init_zero(hint);
32151 +       lh = &hint->lh; /* released via done_lh() at "out" */
32152 +
32153 +       cluster_init_read(&clust, NULL);
32154 +       clust.hint = hint;
32155 +
32156 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32157 +       if (result)
32158 +               goto out;
32159 +
32160 +       while (to_capture > 0) {
32161 +               found = find_anon_page_cluster(mapping, index, pages);
32162 +               if (!found) {
32163 +                       *index = (pgoff_t) - 1; /* no tagged pages left: report the maximal index */
32164 +                       break;
32165 +               }
32166 +               move_cluster_forward(&clust, inode, pages[0]->index);
32167 +               result = capture_page_cluster(&clust, inode);
32168 +
32169 +               put_found_pages(pages, found); /* find_anon_page_cluster */
32170 +               if (result)
32171 +                       break;
32172 +               to_capture -= clust.nr_pages;
32173 +               count += clust.nr_pages;
32174 +       }
32175 +       if (result) {
32176 +               warning("edward-1077",
32177 +                       "Capture failed (inode %llu, result=%i, captured=%d)\n",
32178 +                       (unsigned long long)get_inode_oid(inode), result, count);
32179 +       } else {
32180 +               assert("edward-1078", ergo(found > 0, count > 0));
32181 +               if (to_capture <= 0)
32182 +                       /* there may be left more pages */
32183 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
32184 +               result = count;
32185 +       }
32186 +      out:
32187 +       done_lh(lh);
32188 +       kfree(hint);
32189 +       put_cluster_handle(&clust);
32190 +       return result;
32191 +}
32192 +
32193 +/* Returns true if inode's mapping has dirty pages which do not belong to
32194 +   any atom, i.e. pages tagged PAGECACHE_TAG_REISER4_MOVED in the page tree */
32195 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
32196 +{
32197 +       int result;
32198 +       spin_lock_irq(&inode->i_mapping->tree_lock);
32199 +       result = radix_tree_tagged(&inode->i_mapping->page_tree,
32200 +                                  PAGECACHE_TAG_REISER4_MOVED);
32201 +       spin_unlock_irq(&inode->i_mapping->tree_lock);
32202 +       return result;
32203 +}
32204 +
32205 +/* plugin->writepages: capture anonymous pages of @mapping; with WB_SYNC_ALL, also force a commit after each pass */
32206 +int writepages_cryptcompress(struct address_space *mapping,
32207 +                            struct writeback_control *wbc)
32208 +{
32209 +       int result = 0;
32210 +       long to_capture;
32211 +       pgoff_t nrpages;
32212 +       pgoff_t index = 0;
32213 +       struct inode *inode;
32214 +       struct cryptcompress_info *info;
32215 +
32216 +       inode = mapping->host;
32217 +       if (!cryptcompress_inode_has_anon_pages(inode))
32218 +               goto end; /* nothing to capture; result stays 0 */
32219 +       info = cryptcompress_inode_data(inode);
32220 +       nrpages = size_in_pages(i_size_read(inode));
32221 +
32222 +       if (wbc->sync_mode != WB_SYNC_ALL)
32223 +               to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32224 +       else
32225 +               to_capture = MAX_PAGES_TO_CAPTURE;
32226 +       do {
32227 +               reiser4_context *ctx;
32228 +
32229 +               ctx = reiser4_init_context(inode->i_sb);
32230 +               if (IS_ERR(ctx)) {
32231 +                       result = PTR_ERR(ctx);
32232 +                       break;
32233 +               }
32234 +               /* avoid recursive calls to ->sync_inodes */
32235 +               ctx->nobalance = 1;
32236 +
32237 +               assert("edward-1079",
32238 +                      lock_stack_isclean(get_current_lock_stack()));
32239 +
32240 +               reiser4_txn_restart_current();
32241 +
32242 +               if (get_current_context()->entd) {
32243 +                       if (mutex_trylock(&info->checkin_mutex) == 0) {
32244 +                               /* the mutex might be occupied by
32245 +                                  entd caller */
32246 +                               result = RETERR(-EBUSY);
32247 +                               reiser4_exit_context(ctx);
32248 +                               break;
32249 +                       }
32250 +               } else
32251 +                       mutex_lock(&info->checkin_mutex);
32252 +
32253 +               result = capture_anon_pages(inode->i_mapping, &index,
32254 +                                           to_capture);
32255 +               mutex_unlock(&info->checkin_mutex);
32256 +
32257 +               if (result < 0) {
32258 +                       reiser4_exit_context(ctx);
32259 +                       break;
32260 +               }
32261 +               wbc->nr_to_write -= result;
32262 +               if (wbc->sync_mode != WB_SYNC_ALL) {
32263 +                       reiser4_exit_context(ctx);
32264 +                       break; /* NOTE(review): the non-negative page count is returned as-is here - confirm callers expect that */
32265 +               }
32266 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
32267 +               reiser4_exit_context(ctx);
32268 +       } while (result >= 0 && index < nrpages);
32269 +
32270 + end:
32271 +       if (is_in_reiser4_context()) {
32272 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32273 +                       /* there are already pages to flush, flush them out,
32274 +                          do not delay until end of reiser4_sync_inodes */
32275 +                       reiser4_writeout(inode->i_sb, wbc);
32276 +                       get_current_context()->nr_captured = 0;
32277 +               }
32278 +       }
32279 +       return result;
32280 +}
32281 +
32282 +/* plugin->ioctl: no ioctl commands are implemented; every call fails with -ENOSYS */
32283 +int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32284 +                       unsigned int cmd, unsigned long arg)
32285 +{
32286 +       return RETERR(-ENOSYS);
32287 +}
32288 +
32289 +/* plugin->mmap: reserve space for a stat-data update, then call generic_file_mmap() */
32290 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32291 +{
32292 +       int result;
32293 +       struct inode *inode;
32294 +       reiser4_context *ctx;
32295 +
32296 +       inode = file->f_dentry->d_inode;
32297 +       ctx = reiser4_init_context(inode->i_sb);
32298 +       if (IS_ERR(ctx))
32299 +               return PTR_ERR(ctx);
32300 +       /*
32301 +        * generic_file_mmap will do update_atime. Grab space for stat data
32302 +        * update.
32303 +        */
32304 +       result = reiser4_grab_space_force
32305 +               (inode_file_plugin(inode)->estimate.update(inode),
32306 +                BA_CAN_COMMIT);
32307 +       if (result) {
32308 +               reiser4_exit_context(ctx);
32309 +               return result; /* space reservation failed: do not map */
32310 +       }
32311 +       result = generic_file_mmap(file, vma);
32312 +       reiser4_exit_context(ctx);
32313 +       return result;
32314 +}
32315 +
32316 +/* plugin->delete_object: truncate the body to zero, drop all pages, then remove stat data */
32317 +int delete_object_cryptcompress(struct inode *inode)
32318 +{
32319 +       int result;
32320 +       struct cryptcompress_info * info;
32321 +
32322 +       assert("edward-429", inode->i_nlink == 0);
32323 +
32324 +       reiser4_txn_restart_current();
32325 +       info = cryptcompress_inode_data(inode);
32326 +
32327 +       mutex_lock(&info->checkin_mutex);
32328 +       result = cryptcompress_truncate(inode, 0, 0); /* new size 0, no stat-data update */
32329 +       mutex_unlock(&info->checkin_mutex);
32330 +
32331 +       if (result) { /* a truncate failure is only logged; deletion continues */
32332 +               warning("edward-430",
32333 +                       "cannot truncate cryptcompress file  %llu: %i",
32334 +                       (unsigned long long)get_inode_oid(inode),
32335 +                       result);
32336 +       }
32337 +       truncate_inode_pages(inode->i_mapping, 0);
32338 +       assert("edward-1487", pages_truncate_ok(inode, 0));
32339 +       /* and remove stat data */
32340 +       return reiser4_delete_object_common(inode);
32341 +}
32342 +
32343 +/*
32344 + * plugin->setattr
32345 + * This implements actual truncate (see comments in reiser4/page_cache.c).
32346 + * Requests without ATTR_SIZE are passed to reiser4_setattr_common(). */
32347 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32348 +{
32349 +       int result;
32350 +       struct inode *inode;
32351 +       struct cryptcompress_info * info;
32352 +
32353 +       inode = dentry->d_inode;
32354 +       info = cryptcompress_inode_data(inode);
32355 +
32356 +       if (attr->ia_valid & ATTR_SIZE) {
32357 +               if (i_size_read(inode) != attr->ia_size) {
32358 +                       reiser4_context *ctx;
32359 +                       loff_t old_size;
32360 +
32361 +                       ctx = reiser4_init_context(dentry->d_inode->i_sb);
32362 +                       if (IS_ERR(ctx))
32363 +                               return PTR_ERR(ctx);
32364 +                       result = setattr_dispatch_hook(inode);
32365 +                       if (result) {
32366 +                               context_set_commit_async(ctx);
32367 +                               reiser4_exit_context(ctx);
32368 +                               return result;
32369 +                       }
32370 +                       old_size = i_size_read(inode); /* kept for the warning below */
32371 +                       inode_check_scale(inode, old_size, attr->ia_size);
32372 +
32373 +                       mutex_lock(&info->checkin_mutex);
32374 +                       result = cryptcompress_truncate(inode,
32375 +                                                       attr->ia_size,
32376 +                                                       1/* update sd */);
32377 +                       mutex_unlock(&info->checkin_mutex);
32378 +                       if (result) {
32379 +                            warning("edward-1192",
32380 +                                    "truncate_cryptcompress failed: oid %llu, "
32381 +                                    "old size %lld, new size %lld, retval %d",
32382 +                                    (unsigned long long)
32383 +                                    get_inode_oid(inode), old_size,
32384 +                                    attr->ia_size, result);
32385 +                       }
32386 +                       context_set_commit_async(ctx);
32387 +                       reiser4_exit_context(ctx);
32388 +               } else
32389 +                       result = 0; /* size unchanged: nothing to do */
32390 +       } else
32391 +               result = reiser4_setattr_common(dentry, attr);
32392 +       return result;
32393 +}
32394 +
32395 +/* plugin->release: free the reiser4 per-file data of @file inside a reiser4 context */
32396 +int release_cryptcompress(struct inode *inode, struct file *file)
32397 +{
32398 +       reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32399 +
32400 +       if (IS_ERR(ctx))
32401 +               return PTR_ERR(ctx);
32402 +       reiser4_free_file_fsdata(file);
32403 +       reiser4_exit_context(ctx);
32404 +       return 0;
32405 +}
32406 +
32407 +/* plugin->prepare_write (named write_begin_cryptcompress here): thin wrapper around do_prepare_write() */
32408 +int write_begin_cryptcompress(struct file *file, struct page *page,
32409 +                         unsigned from, unsigned to)
32410 +{
32411 +       return do_prepare_write(file, page, from, to);
32412 +}
32413 +
32414 +/* plugin->commit_write (named write_end_cryptcompress here): unlock @page and capture the page cluster containing it */
32415 +int write_end_cryptcompress(struct file *file, struct page *page,
32416 +                         unsigned from, unsigned to) /* @file, @from and @to are not used in the body */
32417 +{
32418 +       int ret;
32419 +       hint_t *hint;
32420 +       lock_handle *lh;
32421 +       struct inode * inode;
32422 +       struct cluster_handle clust;
32423 +
32424 +       unlock_page(page); /* the page is unlocked before any allocation or capture */
32425 +
32426 +       inode = page->mapping->host;
32427 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
32428 +       if (hint == NULL)
32429 +               return RETERR(-ENOMEM);
32430 +       hint_init_zero(hint);
32431 +       lh = &hint->lh; /* released via done_lh() at "out" */
32432 +
32433 +       cluster_init_read(&clust, NULL);
32434 +       clust.hint = hint;
32435 +
32436 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
32437 +       if (ret)
32438 +               goto out;
32439 +       clust.index = pg_to_clust(page->index, inode);
32440 +       ret = capture_page_cluster(&clust, inode);
32441 +       if (ret)
32442 +               warning("edward-1557",
32443 +                       "Capture failed (inode %llu, result=%i)",
32444 +                       (unsigned long long)get_inode_oid(inode), ret);
32445 + out:
32446 +       done_lh(lh);
32447 +       kfree(hint);
32448 +       put_cluster_handle(&clust);
32449 +       return ret;
32450 +}
32451 +
32452 +/* plugin->bmap: no logical-to-physical block mapping is provided; always reports "not mapped" */
32453 +sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32454 +{
32455 +       return 0; /* sector_t is unsigned: -EINVAL would look like a huge valid block number */
32456 +}
32457 +
32458 +/*
32459 +  Local variables:
32460 +  c-indentation-style: "K&R"
32461 +  mode-name: "LC"
32462 +  c-basic-offset: 8
32463 +  tab-width: 8
32464 +  fill-column: 80
32465 +  scroll-step: 1
32466 +  End:
32467 +*/
32468 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.35/fs/reiser4/plugin/file/cryptcompress.h
32469 --- linux-2.6.35.orig/fs/reiser4/plugin/file/cryptcompress.h    1970-01-01 01:00:00.000000000 +0100
32470 +++ linux-2.6.35/fs/reiser4/plugin/file/cryptcompress.h 2010-08-04 15:44:57.000000000 +0200
32471 @@ -0,0 +1,616 @@
32472 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32473 +/* See http://www.namesys.com/cryptcompress_design.html */
32474 +
32475 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32476 +#define __FS_REISER4_CRYPTCOMPRESS_H__
32477 +
32478 +#include "../../page_cache.h"
32479 +#include "../compress/compress.h"
32480 +#include "../crypto/cipher.h"
32481 +
32482 +#include <linux/pagemap.h>
32483 +
32484 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT /* smallest cluster shift: one page */
32485 +#define MAX_CLUSTER_SHIFT 16 /* largest cluster shift, see cluster_shift_ok() */
32486 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) /* pages in a cluster of the largest shift */
32487 +#define DC_CHECKSUM_SIZE 4 /* presumably bytes of disk cluster checksum - TODO confirm */
32488 +
32489 +#define MIN_LATTICE_FACTOR 1
32490 +#define MAX_LATTICE_FACTOR 32
32491 +
32492 +/* This mask contains all non-standard plugins that might
32493 +   be present in the reiser4-specific part of an inode managed by
32494 +   the cryptcompress file plugin (one bit per PSET_* member below) */
32495 +#define cryptcompress_mask                             \
32496 +       ((1 << PSET_FILE) |                             \
32497 +        (1 << PSET_CLUSTER) |                          \
32498 +        (1 << PSET_CIPHER) |                           \
32499 +        (1 << PSET_DIGEST) |                           \
32500 +        (1 << PSET_COMPRESSION) |                      \
32501 +        (1 << PSET_COMPRESSION_MODE))
32502 +
32503 +#if REISER4_DEBUG /* helper compiled only in debug builds */
32504 +static inline int cluster_shift_ok(int shift) /* non-zero iff @shift is within [MIN_CLUSTER_SHIFT, MAX_CLUSTER_SHIFT] */
32505 +{
32506 +       return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32507 +}
32508 +#endif
32509 +
32510 +#if REISER4_DEBUG /* debug builds: accessors for the atomic pgcount counter kept in cryptcompress inode data */
32511 +#define INODE_PGCOUNT(inode)                                           \
32512 +({                                                                     \
32513 +       assert("edward-1530", inode_file_plugin(inode) ==               \
32514 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
32515 +       atomic_read(&cryptcompress_inode_data(inode)->pgcount);         \
32516 + })
32517 +#define INODE_PGCOUNT_INC(inode)                                       \
32518 +do {                                                                   \
32519 +       assert("edward-1531", inode_file_plugin(inode) ==               \
32520 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
32521 +       atomic_inc(&cryptcompress_inode_data(inode)->pgcount);          \
32522 +} while (0)
32523 +#define INODE_PGCOUNT_DEC(inode)                                       \
32524 +do {                                                                   \
32525 +       if (inode_file_plugin(inode) ==                                 \
32526 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))            \
32527 +               atomic_dec(&cryptcompress_inode_data(inode)->pgcount);  \
32528 +} while (0)
32529 +#else /* !REISER4_DEBUG: the counter macros expand to 0 or to nothing */
32530 +#define INODE_PGCOUNT(inode) (0)
32531 +#define INODE_PGCOUNT_INC(inode)
32532 +#define INODE_PGCOUNT_DEC(inode)
32533 +#endif /* REISER4_DEBUG */
32534 +
32535 +struct tfm_stream { /* one transform stream: a buffer and its size */
32536 +       __u8 *data; /* set by alloc_ts_data(), released by free_ts_data() */
32537 +       size_t size; /* the size that was passed to alloc_ts_data(); 0 when no data */
32538 +};
32539 +
32540 +typedef enum { /* indices into a tfm_unit */
32541 +       INPUT_STREAM,
32542 +       OUTPUT_STREAM,
32543 +       LAST_STREAM /* number of stream ids; array length of tfm_unit */
32544 +} tfm_stream_id;
32545 +
32546 +typedef struct tfm_stream * tfm_unit[LAST_STREAM];
32547 +
32548 +static inline __u8 *ts_data(struct tfm_stream * stm)
32549 +{
32550 +       assert("edward-928", stm != NULL);
32551 +       return stm->data;
32552 +}
32553 +
32554 +static inline size_t ts_size(struct tfm_stream * stm)
32555 +{
32556 +       assert("edward-929", stm != NULL);
32557 +       return stm->size;
32558 +}
32559 +
32560 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32561 +{
32562 +       assert("edward-930", stm != NULL);
32563 +
32564 +       stm->size = size;
32565 +}
32566 +
32567 +static inline int alloc_ts(struct tfm_stream ** stm)
32568 +{
32569 +       assert("edward-931", stm);
32570 +       assert("edward-932", *stm == NULL);
32571 +
32572 +       *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32573 +       if (!*stm)
32574 +               return -ENOMEM;
32575 +       return 0;
32576 +}
32577 +
32578 +static inline void free_ts(struct tfm_stream * stm)
32579 +{
32580 +       assert("edward-933", !ts_data(stm));
32581 +       assert("edward-934", !ts_size(stm));
32582 +
32583 +       kfree(stm);
32584 +}
32585 +
32586 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32587 +{
32588 +       assert("edward-935", !ts_data(stm));
32589 +       assert("edward-936", !ts_size(stm));
32590 +       assert("edward-937", size != 0);
32591 +
32592 +       stm->data = reiser4_vmalloc(size);
32593 +       if (!stm->data)
32594 +               return -ENOMEM;
32595 +       set_ts_size(stm, size);
32596 +       return 0;
32597 +}
32598 +
32599 +static inline void free_ts_data(struct tfm_stream * stm)
32600 +{
32601 +       assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32602 +
32603 +       if (ts_data(stm))
32604 +               vfree(ts_data(stm));
32605 +       memset(stm, 0, sizeof *stm);
32606 +}
32607 +
32608 +/* Write modes for item conversion in flush convert phase */
32609 +typedef enum {
32610 +       CRC_APPEND_ITEM = 1,
32611 +       CRC_OVERWRITE_ITEM = 2,
32612 +       CRC_CUT_ITEM = 3
32613 +} cryptcompress_write_mode_t;
32614 +
32615 +typedef enum {
32616 +       LC_INVAL  = 0,   /* invalid value */
32617 +       LC_APPOV = 1,    /* append and/or overwrite */
32618 +       LC_TRUNC = 2     /* truncate */
32619 +} logical_cluster_op;
32620 +
32621 +/* Transform cluster.
32622 + * Intermediate state between page cluster and disk cluster.
32623 + * It is used for data transform (compression/encryption).
32624 + */
32625 +struct tfm_cluster {
32626 +       coa_set coa;      /* compression algorithms info */
32627 +       tfm_unit tun;     /* plain and transformed streams */
32628 +       tfm_action act;   /* action used to pick/allocate a coa slot */
32629 +       int uptodate;     /* flag: 0 or 1, asserted by the accessors */
32630 +       int lsize;        /* number of bytes in logical cluster */
32631 +       int len;          /* length of the transform stream */
32632 +};
32633 +
32634 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32635 +                           tfm_action act)
32636 +{
32637 +       return tc->coa[id][act];
32638 +}
32639 +
32640 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32641 +                          tfm_action act, coa_t coa)
32642 +{
32643 +       tc->coa[id][act] = coa;
32644 +}
32645 +
32646 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32647 +{
32648 +       coa_t coa;
32649 +
32650 +       coa = cplug->alloc(tc->act);
32651 +       if (IS_ERR(coa))
32652 +               return PTR_ERR(coa);
32653 +       set_coa(tc, cplug->h.id, tc->act, coa);
32654 +       return 0;
32655 +}
32656 +
32657 +static inline int
32658 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32659 +{
32660 +       return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32661 +               alloc_coa(tc, cplug) : 0);
32662 +}
32663 +
32664 +static inline void free_coa_set(struct tfm_cluster * tc)
32665 +{
32666 +       tfm_action j;
32667 +       reiser4_compression_id i;
32668 +       compression_plugin *cplug;
32669 +
32670 +       assert("edward-810", tc != NULL);
32671 +
32672 +       for (j = 0; j < TFMA_LAST; j++)
32673 +               for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32674 +                       if (!get_coa(tc, i, j))
32675 +                               continue;
32676 +                       cplug = compression_plugin_by_id(i);
32677 +                       assert("edward-812", cplug->free != NULL);
32678 +                       cplug->free(get_coa(tc, i, j), j);
32679 +                       set_coa(tc, i, j, 0);
32680 +               }
32681 +       return;
32682 +}
32683 +
32684 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32685 +                                                tfm_stream_id id)
32686 +{
32687 +       return tc->tun[id];
32688 +}
32689 +
32690 +static inline void set_tfm_stream(struct tfm_cluster * tc,
32691 +                                 tfm_stream_id id, struct tfm_stream * ts)
32692 +{
32693 +       tc->tun[id] = ts;
32694 +}
32695 +
32696 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32697 +{
32698 +       return ts_data(get_tfm_stream(tc, id));
32699 +}
32700 +
32701 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32702 +                                      tfm_stream_id id, __u8 * data)
32703 +{
32704 +       get_tfm_stream(tc, id)->data = data;
32705 +}
32706 +
32707 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32708 +{
32709 +       return ts_size(get_tfm_stream(tc, id));
32710 +}
32711 +
32712 +static inline void
32713 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32714 +{
32715 +       get_tfm_stream(tc, id)->size = size;
32716 +}
32717 +
32718 +static inline int
32719 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32720 +{
32721 +       assert("edward-939", tc != NULL);
32722 +       assert("edward-940", !get_tfm_stream(tc, id));
32723 +
32724 +       tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32725 +                             reiser4_ctx_gfp_mask_get());
32726 +       if (!tc->tun[id])
32727 +               return -ENOMEM;
32728 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
32729 +}
32730 +
32731 +static inline int
32732 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32733 +{
32734 +       assert("edward-941", tfm_stream_size(tc, id) < size);
32735 +       free_ts_data(get_tfm_stream(tc, id)); /* old data is not copied */
32736 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
32737 +}
32738 +
32739 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32740 +{
32741 +       free_ts_data(get_tfm_stream(tc, id));
32742 +       free_ts(get_tfm_stream(tc, id));
32743 +       set_tfm_stream(tc, id, 0);
32744 +}
32745 +
32746 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32747 +{
32748 +       return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32749 +}
32750 +
32751 +static inline void free_tfm_unit(struct tfm_cluster * tc)
32752 +{
32753 +       tfm_stream_id id;
32754 +       for (id = 0; id < LAST_STREAM; id++) {
32755 +               if (!get_tfm_stream(tc, id))
32756 +                       continue;
32757 +               free_tfm_stream(tc, id);
32758 +       }
32759 +}
32760 +
32761 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
32762 +{
32763 +       assert("edward-942", tc != NULL);
32764 +       free_coa_set(tc);
32765 +       free_tfm_unit(tc);
32766 +}
32767 +
32768 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32769 +{
32770 +       assert("edward-943", tc != NULL);
32771 +       assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32772 +       return (tc->uptodate == 1);
32773 +}
32774 +
32775 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32776 +{
32777 +       assert("edward-945", tc != NULL);
32778 +       assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32779 +       tc->uptodate = 1;
32780 +       return;
32781 +}
32782 +
32783 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32784 +{
32785 +       assert("edward-947", tc != NULL);
32786 +       assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32787 +       tc->uptodate = 0;
32788 +       return;
32789 +}
32790 +
32791 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32792 +{
32793 +       return (get_tfm_stream(tc, id) &&
32794 +               tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32795 +}
32796 +
32797 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32798 +{
32799 +       int i;
32800 +       for (i = 0; i < LAST_STREAM; i++)
32801 +               if (!tfm_stream_is_set(tc, i))
32802 +                       return 0;
32803 +       return 1;
32804 +}
32805 +
32806 +static inline void alternate_streams(struct tfm_cluster * tc)
32807 +{
32808 +       struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32809 +
32810 +       set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32811 +       set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32812 +}
32813 +
32814 +/* Set of states to indicate a kind of data
32815 + * that will be written to the window */
32816 +typedef enum {
32817 +       DATA_WINDOW,            /* user's data */
32818 +       HOLE_WINDOW             /* zeroes (such kind of data can be written
32819 +                                * if we start to write from offset > i_size) */
32820 +} window_stat;
32821 +
32822 +/* Window (of logical cluster size) discretely sliding along a file.
32823 + * Is used to locate hole region in a logical cluster to be properly
32824 + * represented on disk.
32825 + * We split a write to cryptcompress file into writes to its logical
32826 + * clusters. Before writing to a logical cluster we set a window, i.e.
32827 + * calculate values of the following fields:
32828 + */
32829 +struct reiser4_slide {
32830 +       unsigned off;           /* offset to write from */
32831 +       unsigned count;         /* number of bytes to write */
32832 +       unsigned delta;         /* number of bytes to append to the hole */
32833 +       window_stat stat;       /* what kind of data will be written starting
32834 +                                  from @off */
32835 +};
32836 +
32837 +/* Possible states of a disk cluster */
32838 +typedef enum {
32839 +       INVAL_DISK_CLUSTER,     /* unknown state */
32840 +       PREP_DISK_CLUSTER,      /* disk cluster has been converted by flush
32841 +                                * at least once */
32842 +       UNPR_DISK_CLUSTER,      /* disk cluster was just created and should be
32843 +                                * converted by flush */
32844 +       FAKE_DISK_CLUSTER,      /* disk cluster exists neither in memory
32845 +                                * nor on disk */
32846 +       TRNC_DISK_CLUSTER       /* disk cluster is partially truncated */
32847 +} disk_cluster_stat;
32848 +
32849 +/* The following structure represents various stages of the same logical
32850 + * cluster of index @index:
32851 + * . fixed slide
32852 + * . page cluster         (stage in primary cache)
32853 + * . transform cluster    (transition stage)
32854 + * . disk cluster         (stage in secondary cache)
32855 + * This structure is used in transition and synchronizing operations, e.g.
32856 + * transform cluster is a transition state when synchronizing page cluster
32857 + * and disk cluster.
32858 + * FIXME: Encapsulate page cluster, disk cluster.
32859 + */
32860 +struct cluster_handle {
32861 +       cloff_t index;           /* offset in a file (unit is a cluster size) */
32862 +       int index_valid;         /* for validating the index above, if needed */
32863 +       struct file *file;       /* host file */
32864 +
32865 +       /* logical cluster */
32866 +       struct reiser4_slide *win; /* sliding window to locate holes */
32867 +       logical_cluster_op op;   /* logical cluster operation (truncate or
32868 +                                   append/overwrite) */
32869 +       /* transform cluster */
32870 +       struct tfm_cluster tc;   /* contains all needed info to synchronize
32871 +                                   page cluster and disk cluster */
32872 +        /* page cluster */
32873 +       int nr_pages;            /* number of pages of current checkin action */
32874 +       int old_nrpages;         /* number of pages of last checkin action */
32875 +       struct page **pages;     /* attached pages */
32876 +       jnode * node;            /* jnode for capture */
32877 +
32878 +       /* disk cluster */
32879 +       hint_t *hint;            /* current position in the tree */
32880 +       disk_cluster_stat dstat; /* state of the current disk cluster */
32881 +       int reserved;            /* is space for disk cluster reserved */
32882 +#if REISER4_DEBUG
32883 +       reiser4_context *ctx;
32884 +       int reserved_prepped;
32885 +       int reserved_unprepped;
32886 +#endif
32887 +
32888 +};
32889 +
32890 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
32891 +{
32892 +       return tfm_stream_data(&clust->tc, INPUT_STREAM);
32893 +}
32894 +
32895 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
32896 +{
32897 +       return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
32898 +}
32899 +
32900 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
32901 +                                     int nrpages)
32902 +{
32903 +       assert("edward-1057", clust->pages != NULL);
32904 +       memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
32905 +       return 0;
32906 +}
32907 +
32908 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
32909 +                                     int nrpages)
32910 +{
32911 +       assert("edward-949", clust != NULL);
32912 +       assert("edward-1362", clust->pages == NULL);
32913 +       assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
32914 +
32915 +       clust->pages = kzalloc(sizeof(*clust->pages) * nrpages,
32916 +                              reiser4_ctx_gfp_mask_get());
32917 +       if (!clust->pages)
32918 +               return RETERR(-ENOMEM);
32919 +       return 0;
32920 +}
32921 +
32922 +static inline void move_cluster_pgset(struct cluster_handle *clust,
32923 +                                     struct page ***pages, int * nr_pages)
32924 +{
32925 +       assert("edward-1545", clust != NULL && clust->pages != NULL);
32926 +       assert("edward-1546", pages != NULL && *pages == NULL);
32927 +       *pages = clust->pages;
32928 +       *nr_pages = clust->nr_pages;
32929 +       clust->pages = NULL;    /* array is now held via *pages only */
32930 +}
32931 +
32932 +static inline void free_cluster_pgset(struct cluster_handle * clust)
32933 +{
32934 +       assert("edward-951", clust->pages != NULL);
32935 +       kfree(clust->pages);
32936 +       clust->pages = NULL;
32937 +}
32938 +
32939 +static inline void put_cluster_handle(struct cluster_handle * clust)
32940 +{
32941 +       assert("edward-435", clust != NULL);
32942 +
32943 +       put_tfm_cluster(&clust->tc);
32944 +       if (clust->pages)
32945 +               free_cluster_pgset(clust);
32946 +       memset(clust, 0, sizeof *clust);
32947 +}
32948 +
32949 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
32950 +{
32951 +       assert("edward-1410", data != NULL);
32952 +       data->keyload_count++;
32953 +}
32954 +
32955 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
32956 +{
32957 +       assert("edward-1411", data != NULL);
32958 +       assert("edward-1412", data->keyload_count > 0);
32959 +       data->keyload_count--;
32960 +}
32961 +
32962 +static inline int capture_cluster_jnode(jnode * node)
32963 +{
32964 +       return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32965 +}
32966 +
32967 +/* cryptcompress specific part of reiser4_inode */
32968 +struct cryptcompress_info {
32969 +       struct mutex checkin_mutex;  /* This is to serialize
32970 +                                     * checkin_logical_cluster operations */
32971 +       cloff_t trunc_index;         /* Index of the leftmost truncated disk
32972 +                                     * cluster (to resolve races with read) */
32973 +       struct reiser4_crypto_info *crypt;
32974 +       /*
32975 +        * the following 2 fields are controlled by compression mode plugin
32976 +        */
32977 +       int compress_toggle;          /* Current status of compressibility */
32978 +       int lattice_factor;           /* Factor of dynamic lattice. FIXME: Have
32979 +                                      * a compression_toggle to keep the factor
32980 +                                      */
32981 +#if REISER4_DEBUG
32982 +       atomic_t pgcount;             /* number of grabbed pages */
32983 +#endif
32984 +};
32985 +
32986 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
32987 +{
32988 +       info->compress_toggle = val;
32989 +}
32990 +
32991 +static inline int get_compression_toggle (struct cryptcompress_info * info)
32992 +{
32993 +       return info->compress_toggle;
32994 +}
32995 +
32996 +static inline int compression_is_on(struct cryptcompress_info * info)
32997 +{
32998 +       return get_compression_toggle(info) == 1;
32999 +}
33000 +
33001 +static inline void turn_on_compression(struct cryptcompress_info * info)
33002 +{
33003 +       set_compression_toggle(info, 1);
33004 +}
33005 +
33006 +static inline void turn_off_compression(struct cryptcompress_info * info)
33007 +{
33008 +       set_compression_toggle(info, 0);
33009 +}
33010 +
33011 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
33012 +{
33013 +       info->lattice_factor = val;
33014 +}
33015 +
33016 +static inline int get_lattice_factor(struct cryptcompress_info * info)
33017 +{
33018 +       return info->lattice_factor;
33019 +}
33020 +
33021 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
33022 +int equal_to_rdk(znode *, const reiser4_key *);
33023 +int goto_right_neighbor(coord_t *, lock_handle *);
33024 +int cryptcompress_inode_ok(struct inode *inode);
33025 +int coord_is_unprepped_ctail(const coord_t * coord);
33026 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
33027 +                            struct page * page, znode_lock_mode mode);
33028 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
33029 +                                         struct inode * inode);
33030 +extern int readpages_cryptcompress(struct file*, struct address_space*,
33031 +                                  struct list_head*, unsigned);
33032 +int bind_cryptcompress(struct inode *child, struct inode *parent);
33033 +void destroy_inode_cryptcompress(struct inode * inode);
33034 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
33035 +                     rw_op rw);
33036 +int write_dispatch_hook(struct file *file, struct inode * inode,
33037 +                       loff_t pos, struct cluster_handle * clust,
33038 +                       struct dispatch_context * cont);
33039 +int setattr_dispatch_hook(struct inode * inode);
33040 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
33041 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
33042 +                               int (*can_inherit)(struct inode * child,
33043 +                                                  struct inode * parent));
33044 +void reiser4_attach_crypto_info(struct inode * inode,
33045 +                               struct reiser4_crypto_info * info);
33046 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
33047 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
33048 +
33049 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
33050 +{
33051 +       return info->cipher;
33052 +}
33053 +
33054 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
33055 +                                  struct crypto_blkcipher * tfm)
33056 +{
33057 +       info->cipher = tfm;
33058 +}
33059 +
33060 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
33061 +{
33062 +       return info->digest;
33063 +}
33064 +
33065 +static inline void info_set_digest(struct reiser4_crypto_info * info,
33066 +                                  struct crypto_hash * tfm)
33067 +{
33068 +       info->digest = tfm;
33069 +}
33070 +
33071 +static inline void put_cluster_page(struct page * page)
33072 +{
33073 +       page_cache_release(page);
33074 +}
33075 +
33076 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
33077 +
33078 +/* Make Linus happy.
33079 +   Local variables:
33080 +   c-indentation-style: "K&R"
33081 +   mode-name: "LC"
33082 +   c-basic-offset: 8
33083 +   tab-width: 8
33084 +   fill-column: 120
33085 +   scroll-step: 1
33086 +   End:
33087 +*/
33088 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/file.c linux-2.6.35/fs/reiser4/plugin/file/file.c
33089 --- linux-2.6.35.orig/fs/reiser4/plugin/file/file.c     1970-01-01 01:00:00.000000000 +0100
33090 +++ linux-2.6.35/fs/reiser4/plugin/file/file.c  2010-08-04 18:08:14.000000000 +0200
33091 @@ -0,0 +1,2688 @@
33092 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
33093 + * reiser4/README */
33094 +
33095 +/*
33096 + * this file contains implementations of inode/file/address_space/file plugin
33097 + * operations specific for "unix file plugin" (plugin id is
33098 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
33099 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
33100 + * no items but stat data)
33101 + */
33102 +
33103 +#include "../../inode.h"
33104 +#include "../../super.h"
33105 +#include "../../tree_walk.h"
33106 +#include "../../carry.h"
33107 +#include "../../page_cache.h"
33108 +#include "../../ioctl.h"
33109 +#include "../object.h"
33110 +#include "../cluster.h"
33111 +#include "../../safe_link.h"
33112 +
33113 +#include <linux/writeback.h>
33114 +#include <linux/pagevec.h>
33115 +#include <linux/syscalls.h>
33116 +
33117 +
33118 +static int unpack(struct file *file, struct inode *inode, int forever);
33119 +static void drop_access(struct unix_file_info *);
33120 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33121 +                        znode_lock_mode lock_mode);
33122 +
33123 +/* Get exclusive access and make sure that file is not partially
33124 + * converted (It may happen that another process is doing tail
33125 + * conversion. If so, wait until it completes)
33126 + */
33127 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
33128 +                                               struct inode *inode)
33129 +{
33130 +        do {
33131 +               get_exclusive_access(uf_info);
33132 +               if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
33133 +                       break;
33134 +               drop_exclusive_access(uf_info);
33135 +               schedule();
33136 +       } while (1);
33137 +}
33138 +
33139 +/* get unix file plugin specific portion of inode */
33140 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
33141 +{
33142 +       return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
33143 +}
33144 +
33145 +/**
33146 + * equal_to_rdk - compare key and znode's right delimiting key
33147 + * @node: node whose right delimiting key to compare with @key
33148 + * @key: key to compare with @node's right delimiting key
33149 + *
33150 + * Returns true if @key is equal to right delimiting key of @node.
33151 + */
33152 +int equal_to_rdk(znode *node, const reiser4_key *key)
33153 +{
33154 +       int result;
33155 +
33156 +       read_lock_dk(znode_get_tree(node));
33157 +       result = keyeq(key, znode_get_rd_key(node));
33158 +       read_unlock_dk(znode_get_tree(node));
33159 +       return result;
33160 +}
33161 +
33162 +#if REISER4_DEBUG
33163 +
33164 +/**
33165 + * equal_to_ldk - compare key and znode's left delimiting key
33166 + * @node: node whose left delimiting key to compare with @key
33167 + * @key: key to compare with @node's left delimiting key
33168 + *
33169 + * Returns true if @key is equal to left delimiting key of @node.
33170 + */
33171 +int equal_to_ldk(znode *node, const reiser4_key *key)
33172 +{
33173 +       int result;
33174 +
33175 +       read_lock_dk(znode_get_tree(node));
33176 +       result = keyeq(key, znode_get_ld_key(node));
33177 +       read_unlock_dk(znode_get_tree(node));
33178 +       return result;
33179 +}
33180 +
33181 +/**
33182 + * check_coord - check whether coord corresponds to key
33183 + * @coord: coord to check
33184 + * @key: key @coord has to correspond to
33185 + *
33186 + * Returns true if @coord is set as if it was set as result of lookup with @key
33187 + * in coord->node.
33188 + */
33189 +static int check_coord(const coord_t *coord, const reiser4_key *key)
33190 +{
33191 +       coord_t twin;
33192 +
33193 +       node_plugin_by_node(coord->node)->lookup(coord->node, key,
33194 +                                                FIND_MAX_NOT_MORE_THAN, &twin);
33195 +       return coords_equal(coord, &twin);
33196 +}
33197 +
33198 +#endif /* REISER4_DEBUG */
33199 +
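33200 +/**
33201 + * init_uf_coord - initialize extended coord
33202 + * @uf_coord: extended coord to initialize
33203 + * @lh: lock handle to initialize and attach to @uf_coord
33204 + *
33205 + * Zeroes the coord and its extension and marks the extension as not valid.
33206 + */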
33207 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
33208 +{
33209 +       coord_init_zero(&uf_coord->coord);
33210 +       coord_clear_iplug(&uf_coord->coord);
33211 +       uf_coord->lh = lh;
33212 +       init_lh(lh);
33213 +       memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
33214 +       uf_coord->valid = 0;
33215 +}
33216 +
33217 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
33218 +{
33219 +       assert("vs-1333", uf_coord->valid == 0);
33220 +
33221 +       if (coord_is_between_items(&uf_coord->coord))
33222 +               return;
33223 +
33224 +       assert("vs-1348",
33225 +              item_plugin_by_coord(&uf_coord->coord)->s.file.
33226 +              init_coord_extension);
33227 +
33228 +       item_body_by_coord(&uf_coord->coord);
33229 +       item_plugin_by_coord(&uf_coord->coord)->s.file.
33230 +           init_coord_extension(uf_coord, offset);
33231 +}
33232 +
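33233 +/**
33234 + * goto_right_neighbor - lock right neighbor, drop current node lock
33235 + * @coord: coord in the currently locked node; moved to the right neighbor
33236 + * @lh: lock handle of the current node; takes over the neighbor's lock
33237 + *
33238 + * Obtain lock on right neighbor and drop lock on current node.
33239 + */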
33240 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33241 +{
33242 +       int result;
33243 +       lock_handle lh_right;
33244 +
33245 +       assert("vs-1100", znode_is_locked(coord->node));
33246 +
33247 +       init_lh(&lh_right);
33248 +       result = reiser4_get_right_neighbor(&lh_right, coord->node,
33249 +                                           znode_is_wlocked(coord->node) ?
33250 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33251 +                                           GN_CAN_USE_UPPER_LEVELS);
33252 +       if (result) {
33253 +               done_lh(&lh_right);
33254 +               return result;
33255 +       }
33256 +
33257 +       /*
33258 +        * we hold two longterm locks on neighboring nodes. Unlock left of
33259 +        * them
33260 +        */
33261 +       done_lh(lh);
33262 +
33263 +       coord_init_first_unit_nocheck(coord, lh_right.node);
33264 +       move_lh(lh, &lh_right);
33265 +
33266 +       return 0;
33267 +
33268 +}
33269 +
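33270 +/**
33271 + * set_file_state - set or verify the container type of a unix file
33272 + * @uf_info: unix file info whose container field is set or checked
33273 + * @cbk_result: result of the tree lookup; nothing is done if it is an error
33274 + * @level: level of the node the lookup ended at (leaf or twig)
33275 + *
33276 + * This is to be used by find_file_item() and find_file_state() to
33277 + * determine the real state of the file.
33278 + */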
33279 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33280 +                          tree_level level)
33281 +{
33282 +       if (cbk_errored(cbk_result))
33283 +               /* error happened in find_file_item */
33284 +               return;
33285 +
33286 +       assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33287 +
33288 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33289 +               if (cbk_result == CBK_COORD_NOTFOUND)
33290 +                       uf_info->container = UF_CONTAINER_EMPTY;
33291 +               else if (level == LEAF_LEVEL)
33292 +                       uf_info->container = UF_CONTAINER_TAILS;
33293 +               else
33294 +                       uf_info->container = UF_CONTAINER_EXTENTS;
33295 +       } else {
33296 +               /*
33297 +                * file state is known, check whether it is set correctly if
33298 +                * file is not being tail converted
33299 +                */
33300 +               if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33301 +                                           REISER4_PART_IN_CONV)) {
33302 +                       assert("vs-1162",
33303 +                              ergo(level == LEAF_LEVEL &&
33304 +                                   cbk_result == CBK_COORD_FOUND,
33305 +                                   uf_info->container == UF_CONTAINER_TAILS));
33306 +                       assert("vs-1165",
33307 +                              ergo(level == TWIG_LEVEL &&
33308 +                                   cbk_result == CBK_COORD_FOUND,
33309 +                                   uf_info->container == UF_CONTAINER_EXTENTS));
33310 +               }
33311 +       }
33312 +}
33313 +
33314 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33315 +                         const reiser4_key *key, znode_lock_mode lock_mode,
33316 +                         struct inode *inode)
33317 +{
33318 +       return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33319 +                                    FIND_MAX_NOT_MORE_THAN,
33320 +                                    TWIG_LEVEL, LEAF_LEVEL,
33321 +                                    (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33322 +                                    (CBK_UNIQUE | CBK_FOR_INSERT),
33323 +                                    NULL /* ra_info */ );
33324 +}
33325 +
33326 +/**
33327 + * find_file_item - look for file item in the tree
33328 + * @hint: provides coordinate, lock handle, seal
33329 + * @key: key for search
33330 + * @lock_mode: mode of lock to put on returned node
33331 + * @inode: inode of the file; must not be NULL. Its unix-file container
33332 + *         state is set or checked from the lookup result
33333 + *
33334 + * This finds position in the tree corresponding to @key. It first tries to use
33335 + * @hint's seal if it is set.
33336 + */
33337 +int find_file_item(hint_t *hint, const reiser4_key *key,
33338 +                  znode_lock_mode lock_mode,
33339 +                  struct inode *inode)
33340 +{
33341 +       int result;
33342 +       coord_t *coord;
33343 +       lock_handle *lh;
33344 +
33345 +       assert("nikita-3030", reiser4_schedulable());
33346 +       assert("vs-1707", hint != NULL);
33347 +       assert("vs-47", inode != NULL);
33348 +
33349 +       coord = &hint->ext_coord.coord;
33350 +       lh = hint->ext_coord.lh;
33351 +       init_lh(lh);
33352 +
33353 +       result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33354 +       if (!result) {
33355 +               if (coord->between == AFTER_UNIT &&
33356 +                   equal_to_rdk(coord->node, key)) {
33357 +                       result = goto_right_neighbor(coord, lh);
33358 +                       if (result == -E_NO_NEIGHBOR)
33359 +                               return RETERR(-EIO);
33360 +                       if (result)
33361 +                               return result;
33362 +                       assert("vs-1152", equal_to_ldk(coord->node, key));
33363 +                       /*
33364 +                        * we moved to different node. Invalidate coord
33365 +                        * extension, zload is necessary to init it again
33366 +                        */
33367 +                       hint->ext_coord.valid = 0;
33368 +               }
33369 +
33370 +               set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33371 +                              znode_get_level(coord->node));
33372 +
33373 +               return CBK_COORD_FOUND;
33374 +       }
33375 +
33376 +       coord_init_zero(coord);
33377 +       result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33378 +       set_file_state(unix_file_inode_data(inode), result,
33379 +                      znode_get_level(coord->node));
33380 +
33381 +       /* FIXME: we might already have coord extension initialized */
33382 +       hint->ext_coord.valid = 0;
33383 +       return result;
33384 +}
33385 +
33386 +/* plugin->u.file.write_flow = NULL
33387 +   plugin->u.file.read_flow = NULL */
33388 +
33389 +void hint_init_zero(hint_t * hint)
33390 +{
33391 +       memset(hint, 0, sizeof(*hint));
33392 +       init_lh(&hint->lh);
33393 +       hint->ext_coord.lh = &hint->lh;
33394 +}
33395 +
33396 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33397 +{
33398 +       int result;
33399 +       reiser4_key key;
33400 +       coord_t coord;
33401 +       lock_handle lh;
33402 +
33403 +       assert("vs-1628", ea_obtained(uf_info));
33404 +
33405 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33406 +               key_by_inode_and_offset_common(inode, 0, &key);
33407 +               init_lh(&lh);
33408 +               result = find_file_item_nohint(&coord, &lh, &key,
33409 +                                              ZNODE_READ_LOCK, inode);
33410 +               set_file_state(uf_info, result, znode_get_level(coord.node));
33411 +               done_lh(&lh);
33412 +               if (!cbk_errored(result))
33413 +                       result = 0;
33414 +       } else
33415 +               result = 0;
33416 +       assert("vs-1074",
33417 +              ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33418 +       reiser4_txn_restart_current();
33419 +       return result;
33420 +}
33421 +
33422 +/**
33423 + * Estimate and reserve space needed to truncate page
33424 + * which gets partially truncated: one block for page
33425 + * itself, stat-data update (estimate_one_insert_into_item)
33426 + * and one item insertion (estimate_one_insert_into_item)
33427 + * which may happen if page corresponds to hole extent and
33428 + * unallocated one will have to be created
33429 + */
33430 +static int reserve_partial_page(reiser4_tree * tree)
33431 +{
33432 +       grab_space_enable();
33433 +       return reiser4_grab_reserved(reiser4_get_current_sb(),
33434 +                                    1 +
33435 +                                    2 * estimate_one_insert_into_item(tree),
33436 +                                    BA_CAN_COMMIT);
33437 +}
33438 +
33439 +/* estimate and reserve space needed to cut one item and update one stat data */
33440 +static int reserve_cut_iteration(reiser4_tree * tree)
33441 +{
33442 +       __u64 estimate = estimate_one_item_removal(tree)
33443 +           + estimate_one_insert_into_item(tree);
33444 +
33445 +       assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33446 +
33447 +       grab_space_enable();
33448 +       /* We need to double our estimate now that we can delete more than one
33449 +          node. */
33450 +       return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33451 +                                    BA_CAN_COMMIT);
33452 +}
33453 +
33454 +int reiser4_update_file_size(struct inode *inode, loff_t new_size,
33455 +                            int update_sd)
33456 +{
33457 +       int result = 0;
33458 +
33459 +       INODE_SET_SIZE(inode, new_size);
33460 +       if (update_sd) {
33461 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33462 +               result = reiser4_update_sd(inode);
33463 +       }
33464 +       return result;
33465 +}
33466 +
33467 +/**
33468 + * Cut file items with offsets from @new_size to @cur_size - 1, reserving space
33469 + * before each cut. @update_actor gets the smallest removed offset after an
33470 + * interrupted (-E_REPEAT) cut that made progress, and @new_size on completion.
33471 + */
33472 +int cut_file_items(struct inode *inode, loff_t new_size,
33473 +                  int update_sd, loff_t cur_size,
33474 +                  int (*update_actor) (struct inode *, loff_t, int))
33475 +{
33476 +       reiser4_key from_key, to_key;
33477 +       reiser4_key smallest_removed;
33478 +       file_plugin *fplug = inode_file_plugin(inode);
33479 +       int result;
33480 +       int progress = 0;
33481 +
33482 +       assert("vs-1248",
33483 +              fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33484 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33485 +
33486 +       fplug->key_by_inode(inode, new_size, &from_key);
33487 +       to_key = from_key;
33488 +       set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33489 +       /* this loop normally runs just once */
33490 +       while (1) {
33491 +               result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33492 +               if (result)
33493 +                       break;
33494 +
33495 +               result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33496 +                                                &smallest_removed, inode, 1,
33497 +                                                &progress);
33498 +               if (result == -E_REPEAT) {
33499 +                       /**
33500 +                        * -E_REPEAT is a signal to interrupt a long
33501 +                        * file truncation process
33502 +                        */
33503 +                       if (progress) {
33504 +                               result = update_actor(inode,
33505 +                                             get_key_offset(&smallest_removed),
33506 +                                             update_sd);
33507 +                               if (result)
33508 +                                       break;
33509 +                       }
33510 +                       /* the below does up(sbinfo->delete_mutex).
33511 +                        * Do not get fooled */
33512 +                       reiser4_release_reserved(inode->i_sb);
33513 +                       /**
33514 +                        * reiser4_cut_tree_object() was interrupted probably
33515 +                        * because current atom requires commit, we have to
33516 +                        * release transaction handle to allow atom commit.
33517 +                        */
33518 +                       reiser4_txn_restart_current();
33519 +                       continue;
33520 +               }
33521 +               if (result
33522 +                   && !(result == CBK_COORD_NOTFOUND && new_size == 0
33523 +                        && inode->i_size == 0))
33524 +                       break;
33525 +
33526 +               set_key_offset(&smallest_removed, new_size);
33527 +               /* Final sd update after the file gets its correct size */
33528 +               result = update_actor(inode, get_key_offset(&smallest_removed),
33529 +                                     update_sd);
33530 +               break;
33531 +       }
33532 +
33533 +       /* the below does up(sbinfo->delete_mutex). Do not get fooled */
33534 +       reiser4_release_reserved(inode->i_sb);
33535 +
33536 +       return result;
33537 +}
33538 +
33539 +int find_or_create_extent(struct page *page);
33540 +
33541 +/* part of truncate_file_body: it is called when truncate is used to make file
33542 +   shorter */
33543 +static int shorten_file(struct inode *inode, loff_t new_size)
33544 +{
33545 +       int result;
33546 +       struct page *page;
33547 +       int padd_from;
33548 +       unsigned long index;
33549 +       struct unix_file_info *uf_info;
33550 +
33551 +       /*
33552 +        * all items of ordinary reiser4 file are grouped together. That is why
33553 +        * we can use reiser4_cut_tree. Plan B files (for instance) can not be
33554 +        * truncated that simply
33555 +        */
33556 +       result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33557 +                               get_key_offset(reiser4_max_key()),
33558 +                               reiser4_update_file_size);
33559 +       if (result)
33560 +               return result;
33561 +
33562 +       uf_info = unix_file_inode_data(inode);
33563 +       assert("vs-1105", new_size == inode->i_size);
33564 +       if (new_size == 0) {
33565 +               uf_info->container = UF_CONTAINER_EMPTY;
33566 +               return 0;
33567 +       }
33568 +
33569 +       result = find_file_state(inode, uf_info);
33570 +       if (result)
33571 +               return result;
33572 +       if (uf_info->container == UF_CONTAINER_TAILS)
33573 +               /*
33574 +                * No need to worry about zeroing last page after new file
33575 +                * end
33576 +                */
33577 +               return 0;
33578 +
33579 +       padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33580 +       if (!padd_from)
33581 +               /* file is truncated to page boundary */
33582 +               return 0;
33583 +
33584 +       result = reserve_partial_page(reiser4_tree_by_inode(inode));
33585 +       if (result) {
33586 +               reiser4_release_reserved(inode->i_sb);
33587 +               return result;
33588 +       }
33589 +
33590 +       /* last page is partially truncated - zero its content */
33591 +       index = (inode->i_size >> PAGE_CACHE_SHIFT);
33592 +       page = read_mapping_page(inode->i_mapping, index, NULL);
33593 +       if (IS_ERR(page)) {
33594 +               /*
33595 +                * the below does up(sbinfo->delete_mutex). Do not get
33596 +                * confused
33597 +                */
33598 +               reiser4_release_reserved(inode->i_sb);
33599 +               if (likely(PTR_ERR(page) == -EINVAL)) {
33600 +                       /* looks like file is built of tail items */
33601 +                       return 0;
33602 +               }
33603 +               return PTR_ERR(page);
33604 +       }
33605 +       wait_on_page_locked(page);
33606 +       if (!PageUptodate(page)) {
33607 +               page_cache_release(page);
33608 +               /*
33609 +                * the below does up(sbinfo->delete_mutex). Do not get
33610 +                * confused
33611 +                */
33612 +               reiser4_release_reserved(inode->i_sb);
33613 +               return RETERR(-EIO);
33614 +       }
33615 +
33616 +       /*
33617 +        * if page corresponds to hole extent unit - unallocated one will be
33618 +        * created here. This is not necessary
33619 +        */
33620 +       result = find_or_create_extent(page);
33621 +
33622 +       /*
33623 +        * FIXME: cut_file_items has already updated inode. Probably it would
33624 +        * be better to update it here when file is really truncated
33625 +        */
33626 +       if (result) {
33627 +               page_cache_release(page);
33628 +               /*
33629 +                * the below does up(sbinfo->delete_mutex). Do not get
33630 +                * confused
33631 +                */
33632 +               reiser4_release_reserved(inode->i_sb);
33633 +               return result;
33634 +       }
33635 +
33636 +       lock_page(page);
33637 +       assert("vs-1066", PageLocked(page));
33638 +       zero_user_segment(page, padd_from, PAGE_CACHE_SIZE);
33639 +       unlock_page(page);
33640 +       page_cache_release(page);
33641 +       /* the below does up(sbinfo->delete_mutex). Do not get confused */
33642 +       reiser4_release_reserved(inode->i_sb);
33643 +       return 0;
33644 +}
33645 +
33646 +/**
33647 + * should_have_notail - ask formatting plugin whether tails must not be used
33648 + * @uf_info: unix file info whose formatting plugin (->tplug) is consulted
33649 + * @new_size: file size to ask about
33650 + *
33651 + * Returns 0 if file of size @new_size is to be stored in tail items; non-zero
33652 + * if in unformatted nodes. 1 is returned when no formatting plugin is set.
33653 + */
33654 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33655 +{
33656 +       if (!uf_info->tplug)
33657 +               return 1;
33658 +       return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33659 +                                         new_size);
33660 +
33661 +}
33662 +
33663 +/**
33664 + * truncate_file_body - change length of file
33665 + * @inode: inode of file
33666 + * @attr: attributes being set; @attr->ia_size is the new file length
33667 + *
33668 + * Adjusts items file @inode is built of to match the new size. It may either
33669 + * cut items or add them to represent a hole at the end of file. The caller has
33670 + * to obtain exclusive access to the file.
33671 + */
33672 +static int truncate_file_body(struct inode *inode, struct iattr *attr)
33673 +{
33674 +       int result;
33675 +       loff_t new_size = attr->ia_size;
33676 +
33677 +       if (inode->i_size < new_size) {
33678 +               /* expanding truncate */
33679 +               struct unix_file_info *uf_info = unix_file_inode_data(inode);
33680 +
33681 +               result = find_file_state(inode, uf_info);
33682 +               if (result)
33683 +                       return result;
33684 +
33685 +               if (should_have_notail(uf_info, new_size)) {
33686 +                       /*
33687 +                        * file of size @new_size has to be built of
33688 +                        * extents. If it is built of tails - convert to
33689 +                        * extents
33690 +                        */
33691 +                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
33692 +                               /*
33693 +                                * if file is being converted by another process
33694 +                                * - wait until it completes
33695 +                                */
33696 +                               while (1) {
33697 +                                       if (reiser4_inode_get_flag(inode,
33698 +                                                                  REISER4_PART_IN_CONV)) {
33699 +                                               drop_exclusive_access(uf_info);
33700 +                                               schedule();
33701 +                                               get_exclusive_access(uf_info);
33702 +                                               continue;
33703 +                                       }
33704 +                                       break;
33705 +                               }
33706 +
33707 +                               if (uf_info->container ==  UF_CONTAINER_TAILS) {
33708 +                                       result = tail2extent(uf_info);
33709 +                                       if (result)
33710 +                                               return result;
33711 +                               }
33712 +                       }
33713 +                       result = reiser4_write_extent(NULL, inode, NULL,
33714 +                                                     0, &new_size);
33715 +                       if (result)
33716 +                               return result;
33717 +                       uf_info->container = UF_CONTAINER_EXTENTS;
33718 +               } else {
33719 +                       if (uf_info->container ==  UF_CONTAINER_EXTENTS) {
33720 +                               result = reiser4_write_extent(NULL, inode, NULL,
33721 +                                                             0, &new_size);
33722 +                               if (result)
33723 +                                       return result;
33724 +                       } else {
33725 +                               result = reiser4_write_tail(NULL, inode, NULL,
33726 +                                                           0, &new_size);
33727 +                               if (result)
33728 +                                       return result;
33729 +                               uf_info->container = UF_CONTAINER_TAILS;
33730 +                       }
33731 +               }
33732 +               BUG_ON(result > 0);
33733 +               result = reiser4_update_file_size(inode, new_size, 1);
33734 +               BUG_ON(result != 0);
33735 +       } else
33736 +               result = shorten_file(inode, new_size);
33737 +       return result;
33738 +}
33739 +
33740 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33741 +
33742 +/**
33743 + * load_file_hint - copy hint from struct file to local variable
33744 + * @file: file to get hint from
33745 + * @hint: structure to fill
33746 + *
33747 + * Reiser4 specific portion of struct file may contain information (hint)
33748 + * stored on exiting from previous read or write. That information includes
33749 + * seal of znode and coord within that znode where previous read or write
33750 + * stopped. This function copies that information to @hint if it was stored or
33751 + * initializes @hint by 0s otherwise.
33752 + */
33753 +int load_file_hint(struct file *file, hint_t *hint)
33754 +{
33755 +       reiser4_file_fsdata *fsdata;
33756 +
33757 +       if (file) {
33758 +               fsdata = reiser4_get_file_fsdata(file);
33759 +               if (IS_ERR(fsdata))
33760 +                       return PTR_ERR(fsdata);
33761 +
33762 +               spin_lock_inode(file->f_dentry->d_inode);
33763 +               if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33764 +                       *hint = fsdata->reg.hint;
33765 +                       init_lh(&hint->lh);
33766 +                       hint->ext_coord.lh = &hint->lh;
33767 +                       spin_unlock_inode(file->f_dentry->d_inode);
33768 +                       /*
33769 +                        * force re-validation of the coord on the first
33770 +                        * iteration of the read/write loop.
33771 +                        */
33772 +                       hint->ext_coord.valid = 0;
33773 +                       assert("nikita-19892", coords_equal(&hint->seal.coord1,
33774 +                                                           &hint->ext_coord.
33775 +                                                           coord));
33776 +                       return 0;
33777 +               }
33778 +               memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33779 +               spin_unlock_inode(file->f_dentry->d_inode);
33780 +       }
33781 +       hint_init_zero(hint);
33782 +       return 0;
33783 +}
33784 +
33785 +/**
33786 + * save_file_hint - copy hint to reiser4 private struct file's part
33787 + * @file: file to save hint in
33788 + * @hint: hint to save
33789 + *
33790 + * This copies @hint to reiser4 private part of struct file. It can help
33791 + * speedup future accesses to the file.
33792 + */
33793 +void save_file_hint(struct file *file, const hint_t *hint)
33794 +{
33795 +       reiser4_file_fsdata *fsdata;
33796 +
33797 +       assert("edward-1337", hint != NULL);
33798 +
33799 +       if (!file || !reiser4_seal_is_set(&hint->seal))
33800 +               return;
33801 +       fsdata = reiser4_get_file_fsdata(file);
33802 +       assert("vs-965", !IS_ERR(fsdata));
33803 +       assert("nikita-19891",
33804 +              coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33805 +       assert("vs-30", hint->lh.owner == NULL);
33806 +       spin_lock_inode(file->f_dentry->d_inode);
33807 +       fsdata->reg.hint = *hint;
33808 +       spin_unlock_inode(file->f_dentry->d_inode);
33809 +       return;
33810 +}
33811 +
33812 +void reiser4_unset_hint(hint_t * hint)
33813 +{
33814 +       assert("vs-1315", hint);
33815 +       hint->ext_coord.valid = 0;
33816 +       reiser4_seal_done(&hint->seal);
33817 +       done_lh(&hint->lh);
33818 +}
33819 +
33820 +/* coord must already be set properly by the caller. reiser4_set_hint only
33821 +   seals it for @key, records key offset and lock mode and calls done_lh() */
33822 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33823 +                     znode_lock_mode mode)
33824 +{
33825 +       ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33826 +       assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33827 +
33828 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33829 +       hint->offset = get_key_offset(key);
33830 +       hint->mode = mode;
33831 +       done_lh(&hint->lh);
33832 +}
33833 +
33834 +int hint_is_set(const hint_t * hint)
33835 +{
33836 +       return reiser4_seal_is_set(&hint->seal);
33837 +}
33838 +
33839 +#if REISER4_DEBUG
33840 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
33841 +{
33842 +       return (get_key_locality(k1) == get_key_locality(k2) &&
33843 +               get_key_type(k1) == get_key_type(k2) &&
33844 +               get_key_band(k1) == get_key_band(k2) &&
33845 +               get_key_ordering(k1) == get_key_ordering(k2) &&
33846 +               get_key_objectid(k1) == get_key_objectid(k2));
33847 +}
33848 +#endif
33849 +
33850 +static int
33851 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33852 +             znode_lock_mode lock_mode)
33853 +{
33854 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
33855 +               /* hint either not set or set by different operation */
33856 +               return RETERR(-E_REPEAT);
33857 +
33858 +       assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
33859 +
33860 +       if (check_key && get_key_offset(key) != hint->offset)
33861 +               /* hint is set for different key */
33862 +               return RETERR(-E_REPEAT);
33863 +
33864 +       assert("vs-31", hint->ext_coord.lh == &hint->lh);
33865 +       return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
33866 +                                    hint->ext_coord.lh, lock_mode,
33867 +                                    ZNODE_LOCK_LOPRI);
33868 +}
33869 +
33870 +/**
33871 + * Look for place at twig level for extent corresponding to page,
33872 + * call extent's writepage method to create unallocated extent if
33873 + * it does not exist yet, initialize jnode, capture page
33874 + */
33875 +int find_or_create_extent(struct page *page)
33876 +{
33877 +       int result;
33878 +       struct inode *inode;
33879 +       int plugged_hole;
33880 +
33881 +       jnode *node;
33882 +
33883 +       assert("vs-1065", page->mapping && page->mapping->host);
33884 +       inode = page->mapping->host;
33885 +
33886 +       lock_page(page);
33887 +       node = jnode_of_page(page);
33888 +       if (IS_ERR(node)) {
33889 +               unlock_page(page);
33890 +               return PTR_ERR(node);
33891 +       }
33892 +       JF_SET(node, JNODE_WRITE_PREPARED);
33893 +       unlock_page(page);
33894 +
33895 +       if (node->blocknr == 0) {
33896 +               plugged_hole = 0;
33897 +               result = reiser4_update_extent(inode, node, page_offset(page),
33898 +                                              &plugged_hole);
33899 +               if (result) {
33900 +                       JF_CLR(node, JNODE_WRITE_PREPARED);
33901 +                       jput(node);
33902 +                       warning("edward-1549",
33903 +                               "reiser4_update_extent failed: %d", result);
33904 +                       return result;
33905 +               }
33906 +               if (plugged_hole)
33907 +                       reiser4_update_sd(inode);
33908 +       } else {
33909 +               spin_lock_jnode(node);
33910 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33911 +               BUG_ON(result != 0);
33912 +               jnode_make_dirty_locked(node);
33913 +               spin_unlock_jnode(node);
33914 +       }
33915 +
33916 +       BUG_ON(node->atom == NULL);
33917 +       JF_CLR(node, JNODE_WRITE_PREPARED);
33918 +
33919 +       if (get_current_context()->entd) {
33920 +               entd_context *ent = get_entd_context(node->tree->super);
33921 +
33922 +               if (ent->cur_request->page == page)
33923 +                       /* the following reference will be
33924 +                          dropped in reiser4_writeout */
33925 +                       ent->cur_request->node = jref(node);
33926 +       }
33927 +       jput(node);
33928 +       return 0;
33929 +}
33930 +
33931 +/**
33932 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
33933 + * @inode: inode to check
33934 + *
33935 + * Returns non-zero if inode's mapping has pages tagged
33936 + * PAGECACHE_TAG_REISER4_MOVED in its page tree, i.e. dirty pages which do not
33937 + * belong to any atom. The tag is tested under the mapping's tree_lock.
33938 + * NOTE(review): jnodes tagged EFLUSH_TAG_ANONYMOUS are not examined here.
33939 + */
33940 +static int has_anonymous_pages(struct inode *inode)
33941 +{
33942 +       int result;
33943 +
33944 +       spin_lock_irq(&inode->i_mapping->tree_lock);
33945 +       result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
33946 +       spin_unlock_irq(&inode->i_mapping->tree_lock);
33947 +       return result;
33948 +}
33949 +
33950 +/**
33951 + * capture_page_and_create_extent - reserve space and capture a page
33952 + * @page: page to be captured
33953 + *
33954 + * Grabs space for extent creation and stat data update, then calls
33955 + * find_or_create_extent(). On error SetPageError() is set and it is returned.
33956 + */
33957 +static int capture_page_and_create_extent(struct page *page)
33958 +{
33959 +       int result;
33960 +       struct inode *inode;
33961 +
33962 +       assert("vs-1084", page->mapping && page->mapping->host);
33963 +       inode = page->mapping->host;
33964 +       assert("vs-1139",
33965 +              unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
33966 +       /* page must lie within the file: its offset is below i_size */
33967 +       assert("vs-1393",
33968 +              inode->i_size > page_offset(page));
33969 +
33970 +       /* page capture may require extent creation (if it does not exist yet)
33971 +          and stat data's update (number of blocks changes on extent
33972 +          creation) */
33973 +       grab_space_enable();
33974 +       result = reiser4_grab_space(2 * estimate_one_insert_into_item
33975 +                                   (reiser4_tree_by_inode(inode)),
33976 +                                   BA_CAN_COMMIT);
33977 +       if (likely(!result))
33978 +               result = find_or_create_extent(page);
33979 +
33980 +       if (result != 0)
33981 +               SetPageError(page);
33982 +       return result;
33983 +}
33984 +
33985 +/* plugin->write_end() */
33986 +int write_end_unix_file(struct file *file, struct page *page,
33987 +                       unsigned from, unsigned to)
33988 +{
33989 +       unlock_page(page);
33990 +       return capture_page_and_create_extent(page);
33991 +}
33992 +
33993 +/*
33994 + * Support for "anonymous" pages and jnodes.
33995 + *
33996 + * When file is write-accessed through mmap pages can be dirtied from the user
33997 + * level. In this case kernel is not notified until one of following happens:
33998 + *
33999 + *     (1) msync()
34000 + *
34001 + *     (2) truncate() (either explicit or through unlink)
34002 + *
34003 + *     (3) VM scanner starts reclaiming mapped pages, dirtying them before
34004 + *     starting write-back.
34005 + *
34006 + * As a result of (3) ->writepage may be called on a dirty page without
34007 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
34008 + * (iozone) generate huge number of anonymous pages.
34009 + *
34010 + * reiser4_sync_sb() method tries to insert anonymous pages into
34011 + * tree. This is done by capture_anonymous_*() functions below.
34012 + */
34013 +
34014 +/**
34015 + * capture_anonymous_page - involve page into transaction
34016 + * @page: page to deal with
34017 + *
34018 + * Makes sure @page has metadata in the tree and a jnode, and captures it.
34019 + * Returns 1 on success, 0 if @page is under writeback, else the error code.
34020 + */
34021 +static int capture_anonymous_page(struct page *page)
34022 +{
34023 +       int result;
34024 +
34025 +       if (PageWriteback(page))
34026 +               /* FIXME: do nothing? */
34027 +               return 0;
34028 +
34029 +       result = capture_page_and_create_extent(page);
34030 +       if (result == 0) {
34031 +               result = 1;
34032 +       } else
34033 +               warning("nikita-3329",
34034 +                               "Cannot capture anon page: %i", result);
34035 +
34036 +       return result;
34037 +}
34038 +
34039 +/**
34040 + * capture_anonymous_pages - find and capture pages dirtied via mmap
34041 + * @mapping: address space where to look for pages
34042 + * @index: start index
34043 + * @to_capture: maximum number of pages to capture
34044 + *
34045 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page and
34046 + * captures (involves into atom) them. Returns number of captured pages or an
34047 + * error; *@index is updated for the next call, or set to -1 if none found.
34048 + */
34049 +static int
34050 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
34051 +                       unsigned int to_capture)
34052 +{
34053 +       int result;
34054 +       struct pagevec pvec;
34055 +       unsigned int i, count;
34056 +       int nr;
34057 +
34058 +       pagevec_init(&pvec, 0);
34059 +       count = min(pagevec_space(&pvec), to_capture);
34060 +       nr = 0;
34061 +
34062 +       /* find pages tagged MOVED */
34063 +       spin_lock_irq(&mapping->tree_lock);
34064 +       pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
34065 +                                            (void **)pvec.pages, *index, count,
34066 +                                            PAGECACHE_TAG_REISER4_MOVED);
34067 +       if (pagevec_count(&pvec) == 0) {
34068 +               /*
34069 +                * there are no pages tagged MOVED in mapping->page_tree
34070 +                * starting from *index
34071 +                */
34072 +               spin_unlock_irq(&mapping->tree_lock);
34073 +               *index = (pgoff_t)-1;
34074 +               return 0;
34075 +       }
34076 +
34077 +       /* clear MOVED tag for all found pages */
34078 +       for (i = 0; i < pagevec_count(&pvec); i++) {
34079 +               page_cache_get(pvec.pages[i]);
34080 +               radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
34081 +                                    PAGECACHE_TAG_REISER4_MOVED);
34082 +       }
34083 +       spin_unlock_irq(&mapping->tree_lock);
34084 +
34085 +
34086 +       *index = pvec.pages[i - 1]->index + 1;
34087 +
34088 +       for (i = 0; i < pagevec_count(&pvec); i++) {
34089 +               result = capture_anonymous_page(pvec.pages[i]);
34090 +               if (result == 1)
34091 +                       nr++;
34092 +               else {
34093 +                       if (result < 0) {
34094 +                               warning("vs-1454",
34095 +                                       "failed to capture page: "
34096 +                                       "result=%d, captured=%d)\n",
34097 +                                       result, i);
34098 +
34099 +                               /*
34100 +                                * set MOVED tag to all pages which left not
34101 +                                * captured
34102 +                                */
34103 +                               spin_lock_irq(&mapping->tree_lock);
34104 +                               for (; i < pagevec_count(&pvec); i ++) {
34105 +                                       radix_tree_tag_set(&mapping->page_tree,
34106 +                                                          pvec.pages[i]->index,
34107 +                                                          PAGECACHE_TAG_REISER4_MOVED);
34108 +                               }
34109 +                               spin_unlock_irq(&mapping->tree_lock);
34110 +
34111 +                               pagevec_release(&pvec);
34112 +                               return result;
34113 +                       } else {
34114 +                               /*
34115 +                                * result == 0. capture_anonymous_page returns
34116 +                                * 0 for Writeback-ed page. Set MOVED tag on
34117 +                                * that page
34118 +                                */
34119 +                               spin_lock_irq(&mapping->tree_lock);
34120 +                               radix_tree_tag_set(&mapping->page_tree,
34121 +                                                  pvec.pages[i]->index,
34122 +                                                  PAGECACHE_TAG_REISER4_MOVED);
34123 +                               spin_unlock_irq(&mapping->tree_lock);
34124 +                               if (i == 0)
34125 +                                       *index = pvec.pages[0]->index;
34126 +                               else
34127 +                                       *index = pvec.pages[i - 1]->index + 1;
34128 +                       }
34129 +               }
34130 +       }
34131 +       pagevec_release(&pvec);
34132 +       return nr;
34133 +}
34134 +
34135 +/**
34136 + * capture_anonymous_jnodes - stub for capturing anonymous jnodes
34137 + * @mapping: address space of the file (not used by this stub)
34138 + * @from: start index; set to @to on return
34139 + * @to: end index
34140 + * @to_capture: maximum number of jnodes to capture (not used by this stub)
34141 + *
34142 + * This implementation captures nothing: it only advances *@from to @to, so
34143 + * the range @from-@to is reported as fully processed, and returns 0 as the
34144 + * number of captured jnodes.
34145 + */
34146 +static int
34147 +capture_anonymous_jnodes(struct address_space *mapping,
34148 +                        pgoff_t *from, pgoff_t to, int to_capture)
34149 +{
34150 +       *from = to;
34151 +       return 0;
34152 +}
34153 +
34154 +/*
34155 + * Commit atom of the jnode of @page; retried while the sync is -E_REPEAT.
34156 + */
34157 +static int sync_page(struct page *page)
34158 +{
34159 +       int result;
34160 +       do {
34161 +               jnode *node;
34162 +               txn_atom *atom;
34163 +
34164 +               lock_page(page);
34165 +               node = jprivate(page);
34166 +               if (node != NULL) {
34167 +                       spin_lock_jnode(node);
34168 +                       atom = jnode_get_atom(node);
34169 +                       spin_unlock_jnode(node);
34170 +               } else
34171 +                       atom = NULL; /* page has no jnode */
34172 +               unlock_page(page);
34173 +               result = reiser4_sync_atom(atom);
34174 +       } while (result == -E_REPEAT);
34175 +       /*
34176 +        * ZAM-FIXME-HANS: document the logic of this loop, is it just to
34177 +        * handle the case where more pages get added to the atom while we are
34178 +        * syncing it?
34179 +        */
34180 +       assert("nikita-3485", ergo(result == 0,
34181 +                                  get_current_context()->trans->atom == NULL));
34182 +       return result;
34183 +}
34184 +
34185 +/*
34186 + * Commit atoms of all pages of @inode: call sync_page() for each page found
34187 + * in mapping's page tree, stopping at the first non-zero result.
34188 + */
34189 +static int sync_page_list(struct inode *inode)
34190 +{
34191 +       int result;
34192 +       struct address_space *mapping;
34193 +       unsigned long from;     /* start index for radix_tree_gang_lookup */
34194 +       unsigned int found;     /* return value for radix_tree_gang_lookup */
34195 +
34196 +       mapping = inode->i_mapping;
34197 +       from = 0;
34198 +       result = 0;
34199 +       spin_lock_irq(&mapping->tree_lock);
34200 +       while (result == 0) {
34201 +               struct page *page;
34202 +
34203 +               found =
34204 +                   radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34205 +                                          from, 1);
34206 +               assert("edward-1550", found < 2);
34207 +               if (found == 0)
34208 +                       break;
34209 +               /**
34210 +                * page may not leave radix tree because it is protected from
34211 +                * truncating by inode->i_mutex locked by sys_fsync
34212 +                */
34213 +               page_cache_get(page);
34214 +               spin_unlock_irq(&mapping->tree_lock);
34215 +
34216 +               from = page->index + 1;
34217 +
34218 +               result = sync_page(page);
34219 +
34220 +               page_cache_release(page);
34221 +               spin_lock_irq(&mapping->tree_lock);
34222 +       }
34223 +
34224 +       spin_unlock_irq(&mapping->tree_lock);
34225 +       return result;
34226 +}
34227 +
34228 +static int commit_file_atoms(struct inode *inode)
34229 +{
34230 +       int result;
34231 +       struct unix_file_info *uf_info;
34232 +
34233 +       uf_info = unix_file_inode_data(inode);
34234 +
34235 +       get_exclusive_access(uf_info);
34236 +       /*
34237 +        * find what items file is made from
34238 +        */
34239 +       result = find_file_state(inode, uf_info);
34240 +       drop_exclusive_access(uf_info);
34241 +       if (result != 0)
34242 +               return result;
34243 +
34244 +       /*
34245 +        * file state cannot change because we are under ->i_mutex
34246 +        */
34247 +       switch (uf_info->container) {
34248 +       case UF_CONTAINER_EXTENTS:
34249 +               /* find_file_state might open or join an atom */
34250 +               reiser4_txn_restart_current();
34251 +               result =
34252 +                   /*
34253 +                    * when we are called by
34254 +                    * filemap_fdatawrite->
34255 +                    *    do_writepages()->
34256 +                    *       reiser4_writepages()
34257 +                    *
34258 +                    * inode->i_mapping->dirty_pages are spliced into
34259 +                    * ->io_pages, leaving ->dirty_pages dirty.
34260 +                    *
34261 +                    * When we are called from
34262 +                    * reiser4_fsync()->sync_unix_file(), we have to
34263 +                    * commit atoms of all pages on the ->dirty_list.
34264 +                    *
34265 +                    * So for simplicity we just commit ->io_pages and
34266 +                    * ->dirty_pages.
34267 +                    */
34268 +                   sync_page_list(inode);
34269 +               break;
34270 +       case UF_CONTAINER_TAILS:
34271 +               /*
34272 +                * NOTE-NIKITA probably we can be smarter for tails. For now
34273 +                * just commit all existing atoms.
34274 +                */
34275 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
34276 +               break;
34277 +       case UF_CONTAINER_EMPTY:
34278 +               result = 0;
34279 +               break;
34280 +       case UF_CONTAINER_UNKNOWN:
34281 +       default:
34282 +               result = -EIO;
34283 +               break;
34284 +       }
34285 +
34286 +       /*
34287 +        * commit current transaction: there can be captured nodes from
34288 +        * find_file_state() and finish_conversion().
34289 +        */
34290 +       reiser4_txn_restart_current();
34291 +       return result;
34292 +}
34293 +
34294 +/**
34295 + * writepages_unix_file - writepages of struct address_space_operations
34296 + * @mapping: address space whose anonymous pages are to be captured
34297 + * @wbc: writeback control; ->range_start, ->sync_mode, ->nr_to_write are used
34298 + *
34299 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34300 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
34301 + * created by reiser4_writepage.
34302 + */
34303 +int writepages_unix_file(struct address_space *mapping,
34304 +                    struct writeback_control *wbc)
34305 +{
34306 +       int result;
34307 +       struct unix_file_info *uf_info;
34308 +       pgoff_t pindex, jindex, nr_pages;
34309 +       long to_capture;
34310 +       struct inode *inode;
34311 +
34312 +       inode = mapping->host;
34313 +       if (!has_anonymous_pages(inode)) {
34314 +               result = 0;
34315 +               goto end;
34316 +       }
34317 +       jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34318 +       result = 0;
34319 +       nr_pages = size_in_pages(i_size_read(inode));
34320 +
34321 +       uf_info = unix_file_inode_data(inode);
34322 +
34323 +       do {
34324 +               reiser4_context *ctx;
34325 +
34326 +               if (wbc->sync_mode != WB_SYNC_ALL)
34327 +                       to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34328 +               else
34329 +                       to_capture = CAPTURE_APAGE_BURST;
34330 +
34331 +               ctx = reiser4_init_context(inode->i_sb);
34332 +               if (IS_ERR(ctx)) {
34333 +                       result = PTR_ERR(ctx);
34334 +                       break;
34335 +               }
34336 +               /* avoid recursive calls to ->sync_inodes */
34337 +               ctx->nobalance = 1;
34338 +               assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34339 +               assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
34340 +               assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
34341 +
34342 +               reiser4_txn_restart_current();
34343 +
34344 +               /* we have to get nonexclusive access to the file */
34345 +               if (get_current_context()->entd) {
34346 +                       /*
34347 +                        * use nonblocking version of nonexclusive_access to
34348 +                        * avoid deadlock which might look like the following:
34349 +                        * process P1 holds NEA on file F1 and called entd to
34350 +                        * reclaim some memory. Entd works for P1 and is going
34351 +                        * to capture pages of file F2. To do that entd has to
34352 +                        * get NEA to F2. F2 is held by process P2 which also
34353 +                        * called entd. But entd is serving P1 at the moment
34354 +                        * and P2 has to wait. Process P3 is trying to get EA
34355 +                        * to file F2. A pending EA request to file F2 makes
34356 +                        * it impossible for entd to get NEA to file F2. None
34357 +                        * of these processes can continue. Using nonblocking
34358 +                        * version of getting NEA is supposed to avoid this
34359 +                        * deadlock.
34360 +                        */
34361 +                       if (try_to_get_nonexclusive_access(uf_info) == 0) {
34362 +                               result = RETERR(-EBUSY);
34363 +                               reiser4_exit_context(ctx);
34364 +                               break;
34365 +                       }
34366 +               } else
34367 +                       get_nonexclusive_access(uf_info);
34368 +
34369 +               while (to_capture > 0) {
34370 +                       pgoff_t start;
34371 +
34372 +                       assert("vs-1727", jindex <= pindex);
34373 +                       if (pindex == jindex) {
34374 +                               start = pindex;
34375 +                               result =
34376 +                                   capture_anonymous_pages(inode->i_mapping,
34377 +                                                           &pindex,
34378 +                                                           to_capture);
34379 +                               if (result <= 0)
34380 +                                       break;
34381 +                               to_capture -= result;
34382 +                               wbc->nr_to_write -= result;
34383 +                               if (start + result == pindex) {
34384 +                                       jindex = pindex;
34385 +                                       continue;
34386 +                               }
34387 +                               if (to_capture <= 0)
34388 +                                       break;
34389 +                       }
34390 +                       /* deal with anonymous jnodes between jindex and pindex */
34391 +                       result =
34392 +                           capture_anonymous_jnodes(inode->i_mapping, &jindex,
34393 +                                                    pindex, to_capture);
34394 +                       if (result < 0)
34395 +                               break;
34396 +                       to_capture -= result;
34397 +                       get_current_context()->nr_captured += result;
34398 +
34399 +                       if (jindex == (pgoff_t) - 1) {
34400 +                               assert("vs-1728", pindex == (pgoff_t) - 1);
34401 +                               break;
34402 +                       }
34403 +               }
34404 +               if (to_capture <= 0)
34405 +                       /* there may be left more pages */
34406 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
34407 +
34408 +               drop_nonexclusive_access(uf_info);
34409 +               if (result < 0) {
34410 +                       /* error happened */
34411 +                       reiser4_exit_context(ctx);
34412 +                       return result;
34413 +               }
34414 +               if (wbc->sync_mode != WB_SYNC_ALL) {
34415 +                       reiser4_exit_context(ctx);
34416 +                       return 0;
34417 +               }
34418 +               result = commit_file_atoms(inode);
34419 +               reiser4_exit_context(ctx);
34420 +               if (pindex >= nr_pages && jindex == pindex)
34421 +                       break;
34422 +       } while (1);
34423 +
34424 +      end:
34425 +       if (is_in_reiser4_context()) {
34426 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34427 +                       /*
34428 +                        * there are already pages to flush, flush them out, do
34429 +                        * not delay until end of reiser4_sync_inodes
34430 +                        */
34431 +                       reiser4_writeout(inode->i_sb, wbc);
34432 +                       get_current_context()->nr_captured = 0;
34433 +               }
34434 +       }
34435 +       return result;
34436 +}
34437 +
34438 +/**
34439 + * readpage_unix_file - readpage of struct address_space_operations
34440 + * @file: file used to load and save the lookup hint
34441 + * @page: locked, not yet up-to-date page to read
34442 + *
34443 + * Compose a key and search for item containing information about @page
34444 + * data. If item is found - its readpage method is called.
34445 + */
34446 +int readpage_unix_file(struct file *file, struct page *page)
34447 +{
34448 +       reiser4_context *ctx;
34449 +       int result;
34450 +       struct inode *inode;
34451 +       reiser4_key key;
34452 +       item_plugin *iplug;
34453 +       hint_t *hint;
34454 +       lock_handle *lh;
34455 +       coord_t *coord;
34456 +
34457 +       assert("vs-1062", PageLocked(page));
34458 +       assert("vs-976", !PageUptodate(page));
34459 +       assert("vs-1061", page->mapping && page->mapping->host);
34460 +
34461 +       if (page->mapping->host->i_size <= page_offset(page)) {
34462 +               /* page is out of file */
34463 +               zero_user(page, 0, PAGE_CACHE_SIZE);
34464 +               SetPageUptodate(page);
34465 +               unlock_page(page);
34466 +               return 0;
34467 +       }
34468 +
34469 +       inode = page->mapping->host;
34470 +       ctx = reiser4_init_context(inode->i_sb);
34471 +       if (IS_ERR(ctx)) {
34472 +               unlock_page(page);
34473 +               return PTR_ERR(ctx);
34474 +       }
34475 +
34476 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34477 +       if (hint == NULL) {
34478 +               unlock_page(page);
34479 +               reiser4_exit_context(ctx);
34480 +               return RETERR(-ENOMEM);
34481 +       }
34482 +
34483 +       result = load_file_hint(file, hint);
34484 +       if (result) {
34485 +               kfree(hint);
34486 +               unlock_page(page);
34487 +               reiser4_exit_context(ctx);
34488 +               return result;
34489 +       }
34490 +       lh = &hint->lh;
34491 +
34492 +       /* get key of first byte of the page */
34493 +       key_by_inode_and_offset_common(inode, page_offset(page), &key);
34494 +
34495 +       /* look for file metadata corresponding to first byte of page */
34496 +       page_cache_get(page);
34497 +       unlock_page(page);
34498 +       result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34499 +       lock_page(page);
34500 +       page_cache_release(page);
34501 +
34502 +       if (page->mapping == NULL) {
34503 +               /*
34504 +                * readpage allows truncate to run concurrently. Page was
34505 +                * truncated while it was not locked
34506 +                */
34507 +               done_lh(lh);
34508 +               kfree(hint);
34509 +               unlock_page(page);
34510 +               reiser4_txn_restart(ctx);
34511 +               reiser4_exit_context(ctx);
34512 +               return -EINVAL;
34513 +       }
34514 +
34515 +       if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34516 +               if (result == CBK_COORD_FOUND &&
34517 +                   hint->ext_coord.coord.between != AT_UNIT)
34518 +                       /* file is truncated */
34519 +                       result = -EINVAL;
34520 +               done_lh(lh);
34521 +               kfree(hint);
34522 +               unlock_page(page);
34523 +               reiser4_txn_restart(ctx);
34524 +               reiser4_exit_context(ctx);
34525 +               return result;
34526 +       }
34527 +
34528 +       /*
34529 +        * item corresponding to page is found. It can not be removed because
34530 +        * znode lock is held
34531 +        */
34532 +       if (PageUptodate(page)) {
34533 +               done_lh(lh);
34534 +               kfree(hint);
34535 +               unlock_page(page);
34536 +               reiser4_txn_restart(ctx);
34537 +               reiser4_exit_context(ctx);
34538 +               return 0;
34539 +       }
34540 +
34541 +       coord = &hint->ext_coord.coord;
34542 +       result = zload(coord->node);
34543 +       if (result) {
34544 +               done_lh(lh);
34545 +               kfree(hint);
34546 +               unlock_page(page);
34547 +               reiser4_txn_restart(ctx);
34548 +               reiser4_exit_context(ctx);
34549 +               return result;
34550 +       }
34551 +
34552 +       validate_extended_coord(&hint->ext_coord, page_offset(page));
34553 +
34554 +       if (!coord_is_existing_unit(coord)) {
34555 +               /* this indicates corruption */
34556 +               warning("vs-280",
34557 +                       "Looking for page %lu of file %llu (size %lli). "
34558 +                       "No file items found (%d). File is corrupted?\n",
34559 +                       page->index, (unsigned long long)get_inode_oid(inode),
34560 +                       inode->i_size, result);
34561 +               zrelse(coord->node);
34562 +               done_lh(lh);
34563 +               kfree(hint);
34564 +               unlock_page(page);
34565 +               reiser4_txn_restart(ctx);
34566 +               reiser4_exit_context(ctx);
34567 +               return RETERR(-EIO);
34568 +       }
34569 +
34570 +       /*
34571 +        * get plugin of found item and call its readpage method; fail with
34572 +        * -EINVAL if the item plugin has no readpage method
34573 +        */
34574 +       iplug = item_plugin_by_coord(coord);
34575 +       if (iplug->s.file.readpage)
34576 +               result = iplug->s.file.readpage(coord, page);
34577 +       else
34578 +               result = RETERR(-EINVAL);
34579 +
34580 +       if (!result) {
34581 +               set_key_offset(&key,
34582 +                              (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34583 +               /* FIXME should call reiser4_set_hint() */
34584 +               reiser4_unset_hint(hint);
34585 +       } else {
34586 +               unlock_page(page);
34587 +               reiser4_unset_hint(hint);
34588 +       }
34589 +       assert("vs-979",
34590 +              ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34591 +       assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34592 +
34593 +       zrelse(coord->node);
34594 +       done_lh(lh);
34595 +
34596 +       save_file_hint(file, hint);
34597 +       kfree(hint);
34598 +
34599 +       /*
34600 +        * FIXME: explain why it is needed. HINT: page allocation in write can
34601 +        * not be done when atom is not NULL because reiser4_writepage can not
34602 +        * kick entd and have to eflush
34603 +        */
34604 +       reiser4_txn_restart(ctx);
34605 +       reiser4_exit_context(ctx);
34606 +       return result;
34607 +}
34608 +
34609 +struct uf_readpages_context {
34610 +       lock_handle lh; /* lock kept between filler calls */
34611 +       coord_t coord;  /* coord set by the filler's tree search */
34612 +};
34613 +
34614 +/* A callback function for readpages_unix_file/read_cache_pages.
34615 + * If the file is built of tails, then return error (-ENOENT).
34616 + *
34617 + * @data -- a pointer to struct uf_readpages_context object,
34618 + *            to save the twig lock and the coord between
34619 + *            read_cache_pages iterations.
34620 + * @page -- page to start read.
34621 + */
34622 +static int uf_readpages_filler(void * data, struct page * page)
34623 +{
34624 +       struct uf_readpages_context *rc = data;
34625 +       jnode * node;
34626 +       int ret = 0;
34627 +       reiser4_extent *ext;
34628 +       __u64 ext_index;
34629 +       int cbk_done = 0;
34630 +       struct address_space * mapping = page->mapping;
34631 +
34632 +       if (PageUptodate(page)) {
34633 +               unlock_page(page);
34634 +               return 0;
34635 +       }
34636 +       page_cache_get(page);
34637 +
34638 +       if (rc->lh.node == 0) {
34639 +               /* no twig lock  - have to do tree search. */
34640 +               reiser4_key key;
34641 +       repeat:
34642 +               unlock_page(page);
34643 +               key_by_inode_and_offset_common(
34644 +                       mapping->host, page_offset(page), &key);
34645 +               ret = coord_by_key(
34646 +                       &get_super_private(mapping->host->i_sb)->tree,
34647 +                       &key, &rc->coord, &rc->lh,
34648 +                       ZNODE_READ_LOCK, FIND_EXACT,
34649 +                       TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34650 +               if (unlikely(ret))
34651 +                       goto exit;
34652 +               lock_page(page);
34653 +               if (PageUptodate(page))
34654 +                       goto unlock;
34655 +               cbk_done = 1;
34656 +       }
34657 +       ret = zload(rc->coord.node);
34658 +       if (unlikely(ret))
34659 +               goto unlock;
34660 +       if (!coord_is_existing_item(&rc->coord) ||
34661 +           !item_is_extent(&rc->coord)) {
34662 +               zrelse(rc->coord.node);
34663 +               ret = RETERR(-EIO);
34664 +               goto unlock;
34665 +       }
34666 +       ext = extent_by_coord(&rc->coord);
34667 +       ext_index = extent_unit_index(&rc->coord);
34668 +       if (page->index < ext_index ||
34669 +           page->index >= ext_index + extent_get_width(ext)) {
34670 +               /* the page index doesn't belong to the extent unit
34671 +                  which the coord points to - release the lock and
34672 +                  repeat with tree search. */
34673 +               zrelse(rc->coord.node);
34674 +               done_lh(&rc->lh);
34675 +               /* we can be here after a CBK call only in case of
34676 +                  corruption of the tree or the tree lookup algorithm bug. */
34677 +               if (unlikely(cbk_done)) {
34678 +                       ret = RETERR(-EIO);
34679 +                       goto unlock;
34680 +               }
34681 +               goto repeat;
34682 +       }
34683 +       node = jnode_of_page(page);
34684 +       if (unlikely(IS_ERR(node))) {
34685 +               zrelse(rc->coord.node);
34686 +               ret = PTR_ERR(node);
34687 +               goto unlock;
34688 +       }
34689 +       ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34690 +       jput(node);
34691 +       zrelse(rc->coord.node);
34692 +       if (likely(!ret))
34693 +               goto exit;
34694 + unlock:
34695 +       unlock_page(page);
34696 + exit:
34697 +       page_cache_release(page);
34698 +       return ret;
34699 +}
34700 +
34701 +/**
34702 + * readpages_unix_file - called by the readahead code, starts reading for each
34703 + * page of the list @pages via read_cache_pages() and uf_readpages_filler()
34704 + */
34705 +int readpages_unix_file(
34706 +       struct file *file, struct address_space *mapping,
34707 +       struct list_head *pages, unsigned nr_pages)
34708 +{
34709 +       reiser4_context *ctx;
34710 +       struct uf_readpages_context rc;
34711 +       int ret;
34712 +
34713 +       ctx = reiser4_init_context(mapping->host->i_sb);
34714 +       if (IS_ERR(ctx)) {
34715 +               put_pages_list(pages);
34716 +               return PTR_ERR(ctx);
34717 +       }
34718 +       init_lh(&rc.lh);
34719 +       ret = read_cache_pages(mapping, pages,  uf_readpages_filler, &rc);
34720 +       done_lh(&rc.lh);
34721 +       context_set_commit_async(ctx);
34722 +       /* close the transaction to protect further page allocation from deadlocks */
34723 +       reiser4_txn_restart(ctx);
34724 +       reiser4_exit_context(ctx);
34725 +       return ret;
34726 +}
34727 +
34728 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34729 +                                               loff_t count UNUSED_ARG)
34730 +{
34731 +       /* A read may update the stat data item, so return the estimate of
34732 +          that update (estimate_update_common); @count is not used */
34733 +       assert("vs-1249",
34734 +              inode_file_plugin(inode)->estimate.update ==
34735 +              estimate_update_common);
34736 +       return estimate_update_common(inode);
34737 +}
34738 +
34739 +/* this is called with nonexclusive access obtained, file's container can not change */
34740 +static ssize_t read_file(hint_t *hint, struct file *file,      /* file to read from */
34741 +                        char __user *buf,      /* address of user-space buffer */
34742 +                        size_t count,  /* number of bytes to read */
34743 +                        loff_t *off)   /* position in file to read from */
34744 +{
34745 +       int result;
34746 +       struct inode *inode;
34747 +       flow_t flow;
34748 +       int (*read_f) (struct file *, flow_t *, hint_t *);
34749 +       coord_t *coord;
34750 +       znode *loaded;
34751 +
34752 +       inode = file->f_dentry->d_inode;
34753 +
34754 +       /* build flow */
34755 +       assert("vs-1250",
34756 +              inode_file_plugin(inode)->flow_by_inode ==
34757 +              flow_by_inode_unix_file);
34758 +       result =
34759 +           flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34760 +                                   *off, READ_OP, &flow);
34761 +       if (unlikely(result))
34762 +               return result;
34763 +
34764 +       /* get seal and coord sealed with it from reiser4 private data
34765 +          of struct file.  The coord will tell us where our last read
34766 +          of this file finished, and the seal will help to determine
34767 +          if that location is still valid.
34768 +        */
34769 +       coord = &hint->ext_coord.coord;
34770 +       while (flow.length && result == 0) {
34771 +               result =
34772 +                       find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34773 +               if (cbk_errored(result))
34774 +                       /* error happened */
34775 +                       break;
34776 +
34777 +               if (coord->between != AT_UNIT) {
34778 +                       /* there were no items corresponding to given offset */
34779 +                       done_lh(hint->ext_coord.lh);
34780 +                       break;
34781 +               }
34782 +
34783 +               loaded = coord->node;
34784 +               result = zload(loaded);
34785 +               if (unlikely(result)) {
34786 +                       done_lh(hint->ext_coord.lh);
34787 +                       break;
34788 +               }
34789 +
34790 +               if (hint->ext_coord.valid == 0)
34791 +                       validate_extended_coord(&hint->ext_coord,
34792 +                                               get_key_offset(&flow.key));
34793 +
34794 +               assert("vs-4", hint->ext_coord.valid == 1);
34795 +               assert("vs-33", hint->ext_coord.lh == &hint->lh);
34796 +               /* call item's read method */
34797 +               read_f = item_plugin_by_coord(coord)->s.file.read;
34798 +               result = read_f(file, &flow, hint);
34799 +               zrelse(loaded);
34800 +               done_lh(hint->ext_coord.lh);
34801 +       }
34802 +
34803 +       return (count - flow.length) ? (count - flow.length) : result;
34804 +}
34805 +
34806 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34807 +
34808 +/**
34809 + * read_unix_file - read of struct file_operations
34810 + * @file: file to read from
34811 + * @buf: address of user-space buffer
34812 + * @read_amount: number of bytes to read
34813 + * @off: position in file to read from
34814 + *
34815 + * Implementation of vfs's read method of struct file_operations for the unix
34816 + * file plugin; dispatches on the file's container (extents, tails, or empty).
34817 + */
34818 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34819 +                      loff_t *off)
34820 +{
34821 +       reiser4_context *ctx;
34822 +       ssize_t result;
34823 +       struct inode *inode;
34824 +       struct unix_file_info *uf_info;
34825 +
34826 +       if (unlikely(read_amount == 0))
34827 +               return 0;
34828 +
34829 +       assert("umka-072", file != NULL);
34830 +       assert("umka-074", off != NULL);
34831 +       inode = file->f_dentry->d_inode;
34832 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34833 +
34834 +       ctx = reiser4_init_context(inode->i_sb);
34835 +       if (IS_ERR(ctx))
34836 +               return PTR_ERR(ctx);
34837 +       uf_info = unix_file_inode_data(inode);
34838 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34839 +               get_exclusive_access(uf_info);
34840 +               result = find_file_state(inode, uf_info);
34841 +               if (unlikely(result != 0))
34842 +                       goto out;
34843 +       } else
34844 +               get_nonexclusive_access(uf_info);
34845 +       result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
34846 +                                         BA_CAN_COMMIT);
34847 +       if (unlikely(result != 0))
34848 +               goto out;
34849 +       if (uf_info->container == UF_CONTAINER_EXTENTS){
34850 +               result = do_sync_read(file, buf, read_amount, off);
34851 +       } else if (uf_info->container == UF_CONTAINER_TAILS ||
34852 +                  reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
34853 +                  reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34854 +               result = read_unix_file_container_tails(file, buf, read_amount, off);
34855 +       } else {
34856 +               assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
34857 +               result = 0;
34858 +       }
34859 +out:
34860 +       drop_access(uf_info);
34861 +       context_set_commit_async(ctx);
34862 +       reiser4_exit_context(ctx);
34863 +       return result;
34864 +}
34865 +
34866 +/* Read a file built of tail items (or being converted) into user buffer @buf,
34867 +   at most one page per read_file() call, advancing *@off. Returns number of
34868 +   bytes read, or an error code if nothing was read. */
34869 +static ssize_t read_unix_file_container_tails(
34870 +       struct file *file, char __user *buf, size_t read_amount, loff_t *off)
34871 +{
34872 +       int result;
34873 +       struct inode *inode;
34874 +       hint_t *hint;
34875 +       size_t count, left, chunk;
34876 +       ssize_t read; /* signed, so that read_file() errors are noticed */
34877 +       loff_t size;
34878 +
34879 +       assert("umka-072", file != NULL);
34880 +       assert("umka-074", off != NULL);
34881 +       inode = file->f_dentry->d_inode;
34882 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34883 +
34884 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34885 +       if (hint == NULL)
34886 +               return RETERR(-ENOMEM);
34887 +
34888 +       result = load_file_hint(file, hint);
34889 +       if (result) {
34890 +               kfree(hint);
34891 +               return result;
34892 +       }
34893 +
34894 +       left = read_amount;
34895 +       count = 0;
34896 +       while (left > 0) {
34897 +               reiser4_txn_restart_current();
34898 +               size = i_size_read(inode);
34899 +               if (*off >= size)
34900 +                       /* position to read from is past the end of file */
34901 +                       break;
34902 +               if (*off + left > size)
34903 +                       left = size - *off;
34904 +               chunk = left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left;
34905 +               /* faultin user page; on failure leave through the cleanup */
34906 +               if (fault_in_pages_writeable(buf, chunk)) {
34907 +                       result = RETERR(-EFAULT);
34908 +                       break;
34909 +               }
34910 +
34911 +               read = read_file(hint, file, buf, chunk, off);
34912 +               if (read < 0) {
34913 +                       result = read;
34914 +                       break;
34915 +               }
34916 +               /* advance buffer, position in file and total of read bytes */
34917 +               left -= read;
34918 +               buf += read;
34919 +               *off += read;
34920 +               count += read;
34921 +       }
34922 +       done_lh(&hint->lh);
34923 +       save_file_hint(file, hint);
34924 +       kfree(hint);
34925 +       if (count)
34926 +               file_accessed(file);
34927 +       /* return number of read bytes or error code if nothing is read */
34928 +       return count ? count : result;
34929 +}
34930 +
34931 +/* This function takes care about @file's pages. It throws out all pages of
34932 +   the file from the page cache (reiser4_invalidate_pages() is given 0 and
34933 +   the number of pages needed to hold i_size bytes) and then calls unpack()
34934 +   with forever == 0, which converts the file from tails to extents if it
34935 +   is built of tails. Pages are thrown out in order to not manage two copies
34936 +   of the data (first in page cache and second one in tails themselves).
34937 +
34938 +   NOTE(review): no read-only check is done here; mmap_unix_file() tests
34939 +   IS_RDONLY() before calling. Returns what unpack() returns: 0 or error
34940 +   code. */
34941 +static int check_pages_unix_file(struct file *file, struct inode *inode)
34942 +{
34943 +       reiser4_invalidate_pages(inode->i_mapping, 0,
34944 +                                (inode->i_size + PAGE_CACHE_SIZE -
34945 +                                 1) >> PAGE_CACHE_SHIFT, 0);
34946 +       return unpack(file, inode, 0 /* not forever */ );
34947 +}
34948 +
34949 +/**
34950 + * mmap_unix_file - mmap of struct file_operations
34951 + * @file: file to mmap
34952 + * @vma: memory area to map; handed on to generic_file_mmap()
34953 + *
34954 + * Implementation of vfs's mmap method of struct file_operations for unix file
34955 + * plugin. Converts a tail file to extents if inode is not read-only and @vma
34956 + * has VM_MAYWRITE or VM_SHARED. Sets REISER4_HAS_MMAP if mmap succeeded.
34957 + */
34958 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
34959 +{
34960 +       reiser4_context *ctx;
34961 +       int result;
34962 +       struct inode *inode;
34963 +       struct unix_file_info *uf_info;
34964 +       reiser4_block_nr needed;
34965 +
34966 +       inode = file->f_dentry->d_inode;
34967 +       ctx = reiser4_init_context(inode->i_sb);
34968 +       if (IS_ERR(ctx))
34969 +               return PTR_ERR(ctx);
34970 +
34971 +       uf_info = unix_file_inode_data(inode);
34972 +
34973 +       get_exclusive_access_careful(uf_info, inode);
34974 +
34975 +       if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
34976 +               /*
34977 +                * we need file built of extent items. If it is still built of
34978 +                * tail items we have to convert it. Find what items the file
34979 +                * is built of
34980 +                */
34981 +               result = find_file_state(inode, uf_info);
34982 +               if (result != 0) {
34983 +                       drop_exclusive_access(uf_info);
34984 +                       reiser4_exit_context(ctx);
34985 +                       return result;
34986 +               }
34987 +
34988 +               assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
34989 +                                  uf_info->container == UF_CONTAINER_EXTENTS ||
34990 +                                  uf_info->container == UF_CONTAINER_EMPTY));
34991 +               if (uf_info->container == UF_CONTAINER_TAILS) {
34992 +                       /*
34993 +                        * invalidate all pages and convert file from tails to
34994 +                        * extents
34995 +                        */
34996 +                       result = check_pages_unix_file(file, inode);
34997 +                       if (result) {
34998 +                               drop_exclusive_access(uf_info);
34999 +                               reiser4_exit_context(ctx);
35000 +                               return result;
35001 +                       }
35002 +               }
35003 +       }
35004 +
35005 +       /*
35006 +        * generic_file_mmap will do update_atime. Grab space for stat data
35007 +        * update.
35008 +        */
35009 +       needed = inode_file_plugin(inode)->estimate.update(inode);
35010 +       result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
35011 +       if (result) {
35012 +               drop_exclusive_access(uf_info);
35013 +               reiser4_exit_context(ctx);
35014 +               return result;
35015 +       }
35016 +
35017 +       result = generic_file_mmap(file, vma);
35018 +       if (result == 0) {
35019 +               /* mark file as having mapping. */
35020 +               reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
35021 +       }
35022 +
35023 +       drop_exclusive_access(uf_info);
35024 +       reiser4_exit_context(ctx);
35025 +       return result;
35026 +}
35027 +
35028 +/**
35029 + * find_first_item - look up the item responsible for first byte of a file
35030 + * @inode: inode of the file whose first item is wanted
35031 + *
35032 + * Returns EXTENT_POINTER_ID, FORMATTING_ID, -EIO or the lookup/zload result.
35033 + */
35034 +static int find_first_item(struct inode *inode)
35035 +{
35036 +       coord_t coord;
35037 +       lock_handle lh;
35038 +       reiser4_key key;
35039 +       int result;
35040 +
35041 +       coord_init_zero(&coord);
35042 +       init_lh(&lh);
35043 +       inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
35044 +       result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
35045 +                                      inode);
35046 +       if (result == CBK_COORD_FOUND) {
35047 +               if (coord.between == AT_UNIT) {
35048 +                       result = zload(coord.node);
35049 +                       if (result == 0) {
35050 +                               result = item_id_by_coord(&coord);
35051 +                               zrelse(coord.node);
35052 +                               if (result != EXTENT_POINTER_ID &&
35053 +                                   result != FORMATTING_ID)
35054 +                                       result = RETERR(-EIO);
35055 +                       }
35056 +               } else
35057 +                       result = RETERR(-EIO);
35058 +       }
35059 +       done_lh(&lh);
35060 +       return result;
35061 +}
35062 +
35063 +/**
35064 + * open_unix_file - complete interrupted tail conversion when file is opened
35065 + * @inode: inode of the file being opened
35066 + * @file: file being opened; passed to extent2tail() if that is resumed
35067 + *
35068 + * If filesystem is not readonly - complete uncompleted tail conversion if
35069 + * there was one (REISER4_PART_MIXED is set). Returns 0 or error code.
35070 + */
35071 +int open_unix_file(struct inode *inode, struct file *file)
35072 +{
35073 +       int result;
35074 +       reiser4_context *ctx;
35075 +       struct unix_file_info *uf_info;
35076 +
35077 +       if (IS_RDONLY(inode))
35078 +               return 0;
35079 +
35080 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
35081 +               return 0;
35082 +
35083 +       ctx = reiser4_init_context(inode->i_sb);
35084 +       if (IS_ERR(ctx))
35085 +               return PTR_ERR(ctx);
35086 +
35087 +       uf_info = unix_file_inode_data(inode);
35088 +
35089 +       get_exclusive_access_careful(uf_info, inode);
35090 +
35091 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
35092 +               /*
35093 +                * other process completed the conversion
35094 +                */
35095 +               drop_exclusive_access(uf_info);
35096 +               reiser4_exit_context(ctx);
35097 +               return 0;
35098 +       }
35099 +
35100 +       /*
35101 +        * file left in semi converted state after unclean shutdown or another
35102 +        * thread is doing conversion and dropped exclusive access while doing
35103 +        * balance dirty pages. Complete the conversion
35104 +        */
35105 +       result = find_first_item(inode);
35106 +       if (result == EXTENT_POINTER_ID)
35107 +               /*
35108 +                * first item is extent, therefore there was incomplete
35109 +                * tail2extent conversion. Complete it
35110 +                */
35111 +               result = tail2extent(unix_file_inode_data(inode));
35112 +       else if (result == FORMATTING_ID)
35113 +               /*
35114 +                * first item is formatting item, therefore there was
35115 +                * incomplete extent2tail conversion. Complete it
35116 +                */
35117 +               result = extent2tail(file, unix_file_inode_data(inode));
35118 +       else
35119 +               result = -EIO;
35120 +
35121 +       assert("vs-1712",
35122 +              ergo(result == 0,
35123 +                   (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
35124 +                    !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
35125 +       drop_exclusive_access(uf_info);
35126 +       reiser4_exit_context(ctx);
35127 +       return result;
35128 +}
35129 +
35130 +#define NEITHER_OBTAINED 0 /* no access to the file is held */
35131 +#define EA_OBTAINED 1 /* exclusive access is held */
35132 +#define NEA_OBTAINED 2 /* nonexclusive access is held */
35133 +/* drop access to the file in the mode which ->exclusive_use tells about */
35134 +static void drop_access(struct unix_file_info *uf_info)
35135 +{
35136 +       if (!uf_info->exclusive_use)
35137 +               drop_nonexclusive_access(uf_info);
35138 +       else
35139 +               drop_exclusive_access(uf_info);
35140 +}
35141 +/* printk() prefixed with file name, line number and function name */
35142 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
35143 +                             __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
35144 +
35145 +/**
35146 + * write_unix_file - private ->write() method of unix_file plugin.
35147 + * @file: file to write to
35148 + * @buf: address of user-space buffer
35149 + * @count: number of bytes to write
35150 + * @pos: position in file to write to; advanced by the number of bytes written
35151 + * @cont: unused, as no plugin conversion is done under unix_file plugin
35152 + *
35153 + * Returns number of bytes written, or error code if nothing was written.
35154 + */
35155 +ssize_t write_unix_file(struct file *file,
35156 +                       const char __user *buf,
35157 +                       size_t count, loff_t *pos,
35158 +                       struct dispatch_context *cont)
35159 +{
35160 +       int result;
35161 +       reiser4_context *ctx;
35162 +       struct inode *inode;
35163 +       struct unix_file_info *uf_info;
35164 +       ssize_t written;
35165 +       int try_free_space;
35166 +       int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
35167 +       size_t left;
35168 +       ssize_t (*write_op)(struct file *, struct inode *,
35169 +                           const char __user *, size_t,
35170 +                           loff_t *pos);
35171 +       int ea;
35172 +       loff_t new_size;
35173 +
35174 +       ctx = get_current_context(); /* presumably entered by the caller */
35175 +       inode = file->f_dentry->d_inode;
35176 +
35177 +       assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
35178 +       assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
35179 +
35180 +       /* check amount of bytes to write and writing position */
35181 +       result = generic_write_checks(file, pos, &count, 0);
35182 +       if (result) {
35183 +               context_set_commit_async(ctx);
35184 +               return result;
35185 +       }
35186 +
35187 +       result = file_remove_suid(file);
35188 +       if (result) {
35189 +               context_set_commit_async(ctx);
35190 +               return result;
35191 +       }
35192 +       /* remove_suid might create a transaction */
35193 +       reiser4_txn_restart(ctx);
35194 +
35195 +       uf_info = unix_file_inode_data(inode);
35196 +
35197 +       current->backing_dev_info = inode->i_mapping->backing_dev_info;
35198 +       written = 0;
35199 +       try_free_space = 0; /* NOTE(review): never set to non-zero below */
35200 +       left = count;
35201 +       ea = NEITHER_OBTAINED;
35202 +
35203 +       new_size = i_size_read(inode);
35204 +       if (*pos + count > new_size)
35205 +               new_size = *pos + count;
35206 +
35207 +       while (left) {
35208 +               if (left < to_write)
35209 +                       to_write = left;
35210 +
35211 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
35212 +                       get_exclusive_access(uf_info);
35213 +                       ea = EA_OBTAINED;
35214 +                       if (uf_info->container != UF_CONTAINER_EMPTY) {
35215 +                               /* file is made not empty by another process */
35216 +                               drop_exclusive_access(uf_info);
35217 +                               ea = NEITHER_OBTAINED;
35218 +                               continue;
35219 +                       }
35220 +               } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35221 +                       /*
35222 +                        * get exclusive access directly just to not have to
35223 +                        * re-obtain it if file will appear empty
35224 +                        */
35225 +                       get_exclusive_access(uf_info);
35226 +                       ea = EA_OBTAINED;
35227 +                       result = find_file_state(inode, uf_info);
35228 +                       if (result) {
35229 +                               drop_exclusive_access(uf_info);
35230 +                               ea = NEITHER_OBTAINED;
35231 +                               break;
35232 +                       }
35233 +               } else {
35234 +                       get_nonexclusive_access(uf_info);
35235 +                       ea = NEA_OBTAINED;
35236 +               }
35237 +
35238 +               /* either EA or NEA is obtained. Choose item write method */
35239 +               if (uf_info->container == UF_CONTAINER_EXTENTS) {
35240 +                       /* file is built of extent items */
35241 +                       write_op = reiser4_write_extent;
35242 +               } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35243 +                       /* file is empty */
35244 +                       if (should_have_notail(uf_info, new_size))
35245 +                               write_op = reiser4_write_extent;
35246 +                       else
35247 +                               write_op = reiser4_write_tail;
35248 +               } else {
35249 +                       /* file is built of tail items */
35250 +                       if (should_have_notail(uf_info, new_size)) {
35251 +                               if (ea == NEA_OBTAINED) {
35252 +                                       drop_nonexclusive_access(uf_info);
35253 +                                       get_exclusive_access(uf_info);
35254 +                                       ea = EA_OBTAINED;
35255 +                               }
35256 +                               if (uf_info->container == UF_CONTAINER_TAILS) {
35257 +                                       /*
35258 +                                        * if file is being converted by another
35259 +                                        * process - wait until it completes
35260 +                                        */
35261 +                                       while (1) {
35262 +                                               if (reiser4_inode_get_flag(inode,
35263 +                                                                          REISER4_PART_IN_CONV)) {
35264 +                                                       drop_exclusive_access(uf_info);
35265 +                                                       schedule();
35266 +                                                       get_exclusive_access(uf_info);
35267 +                                                       continue;
35268 +                                               }
35269 +                                               break;
35270 +                                       }
35271 +                                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
35272 +                                               result = tail2extent(uf_info);
35273 +                                               if (result) {
35274 +                                                       drop_exclusive_access(uf_info);
35275 +                                                       context_set_commit_async(ctx);
35276 +                                                       break;
35277 +                                               }
35278 +                                       }
35279 +                               }
35280 +                               drop_exclusive_access(uf_info);
35281 +                               ea = NEITHER_OBTAINED;
35282 +                               continue;
35283 +                       }
35284 +                       write_op = reiser4_write_tail;
35285 +               }
35286 +
35287 +               written = write_op(file, inode, buf, to_write, pos);
35288 +               if (written == -ENOSPC && try_free_space) {
35289 +                       drop_access(uf_info);
35290 +                       txnmgr_force_commit_all(inode->i_sb, 0);
35291 +                       try_free_space = 0;
35292 +                       continue;
35293 +               }
35294 +               if (written < 0) {
35295 +                       drop_access(uf_info);
35296 +                       result = written;
35297 +                       break;
35298 +               }
35299 +               /* something is written. */
35300 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
35301 +                       assert("edward-1553", ea == EA_OBTAINED);
35302 +                       uf_info->container =
35303 +                               (write_op == reiser4_write_extent) ?
35304 +                               UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35305 +               } else {
35306 +                       assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35307 +                                       write_op == reiser4_write_extent));
35308 +                       assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS,
35309 +                                       write_op == reiser4_write_tail));
35310 +               }
35311 +               if (*pos + written > inode->i_size)
35312 +                       INODE_SET_FIELD(inode, i_size, *pos + written);
35313 +               file_update_time(file);
35314 +               result = reiser4_update_sd(inode);
35315 +               if (result) {
35316 +                       current->backing_dev_info = NULL;
35317 +                       drop_access(uf_info);
35318 +                       context_set_commit_async(ctx);
35319 +                       break;
35320 +               }
35321 +               drop_access(uf_info);
35322 +               ea = NEITHER_OBTAINED;
35323 +
35324 +               /*
35325 +                * tell VM how many pages were dirtied. Maybe number of pages
35326 +                * which were dirty already should not be counted
35327 +                */
35328 +               reiser4_throttle_write(inode,
35329 +                            (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35330 +               left -= written;
35331 +               buf += written;
35332 +               *pos += written;
35333 +       }
35334 +       if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35335 +               reiser4_txn_restart_current();
35336 +               grab_space_enable();
35337 +               result = reiser4_sync_file_common(file,
35338 +                                                 0 /* data and stat data */);
35339 +               if (result)
35340 +                       warning("reiser4-7", "failed to sync file %llu",
35341 +                               (unsigned long long)get_inode_oid(inode));
35342 +       }
35343 +
35344 +       current->backing_dev_info = NULL;
35345 +
35346 +       /*
35347 +        * return number of written bytes or error code if nothing is
35348 +        * written. Note, that it does not work correctly in case when
35349 +        * sync_unix_file returns error
35350 +        */
35351 +       return (count - left) ? (count - left) : result;
35352 +}
35353 +
35354 +/**
35355 + * release_unix_file - release of struct file_operations
35356 + * @inode: inode of released file
35357 + * @file: file to release
35358 + *
35359 + * Implementation of release method of struct file_operations for unix file
35360 + * plugin. If last reference to inode is released - convert all extent items
35361 + * into tail items if necessary. Frees reiser4 specific file data.
35362 + */
35363 +int release_unix_file(struct inode *inode, struct file *file)
35364 +{
35365 +       reiser4_context *ctx;
35366 +       struct unix_file_info *uf_info;
35367 +       int result;
35368 +       int in_reiser4;
35369 +
35370 +       in_reiser4 = is_in_reiser4_context(); /* sampled before init below */
35371 +
35372 +       ctx = reiser4_init_context(inode->i_sb);
35373 +       if (IS_ERR(ctx))
35374 +               return PTR_ERR(ctx);
35375 +
35376 +       result = 0;
35377 +       if (in_reiser4 == 0) {
35378 +               uf_info = unix_file_inode_data(inode);
35379 +
35380 +               get_exclusive_access_careful(uf_info, inode);
35381 +               if (atomic_read(&file->f_dentry->d_count) == 1 &&
35382 +                   uf_info->container == UF_CONTAINER_EXTENTS &&
35383 +                   !should_have_notail(uf_info, inode->i_size) &&
35384 +                   !rofs_inode(inode)) {
35385 +                       result = extent2tail(file, uf_info);
35386 +                       if (result != 0) {
35387 +                               context_set_commit_async(ctx);
35388 +                               warning("nikita-3233",
35389 +                                       "Failed (%d) to convert in %s (%llu)",
35390 +                                       result, __FUNCTION__,
35391 +                                       (unsigned long long)
35392 +                                       get_inode_oid(inode));
35393 +                       }
35394 +               }
35395 +               drop_exclusive_access(uf_info);
35396 +       } else {
35397 +               /*
35398 +                  we are within reiser4 context already. How latter is
35399 +                  possible? Simple:
35400 +
35401 +                  (gdb) bt
35402 +                  #0  get_exclusive_access ()
35403 +                  #2  0xc01e56d3 in release_unix_file ()
35404 +                  #3  0xc01c3643 in reiser4_release ()
35405 +                  #4  0xc014cae0 in __fput ()
35406 +                  #5  0xc013ffc3 in remove_vm_struct ()
35407 +                  #6  0xc0141786 in exit_mmap ()
35408 +                  #7  0xc0118480 in mmput ()
35409 +                  #8  0xc0133205 in oom_kill ()
35410 +                  #9  0xc01332d1 in out_of_memory ()
35411 +                  #10 0xc013bc1d in try_to_free_pages ()
35412 +                  #11 0xc013427b in __alloc_pages ()
35413 +                  #12 0xc013f058 in do_anonymous_page ()
35414 +                  #13 0xc013f19d in do_no_page ()
35415 +                  #14 0xc013f60e in handle_mm_fault ()
35416 +                  #15 0xc01131e5 in do_page_fault ()
35417 +                  #16 0xc0104935 in error_code ()
35418 +                  #17 0xc025c0c6 in __copy_to_user_ll ()
35419 +                  #18 0xc01d496f in reiser4_read_tail ()
35420 +                  #19 0xc01e4def in read_unix_file ()
35421 +                  #20 0xc01c3504 in reiser4_read ()
35422 +                  #21 0xc014bd4f in vfs_read ()
35423 +                  #22 0xc014bf66 in sys_read ()
35424 +                */
35425 +               warning("vs-44", "out of memory?");
35426 +       }
35427 +
35428 +       reiser4_free_file_fsdata(file);
35429 +
35430 +       reiser4_exit_context(ctx);
35431 +       return result;
35432 +}
35433 +
35434 +/* force formatting plugin NEVER_TAILS_FORMATTING_ID into plugin set of
35435 +   @inode */
35436 +static void set_file_notail(struct inode *inode)
35437 +{
35438 +       formatting_plugin *tplug;
35439 +
35440 +       tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35441 +       force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35442 +}
35443 +
35444 +/* if file is built of tails - convert it to extents. Caller must hold
35445 +   exclusive access to the file. If @forever is set, NEVER_TAILS formatting
35446 +   plugin is also forced on @inode and stat data is updated to record it.
35447 +   @filp is not used. Returns 0 or error code. */
35448 +static int unpack(struct file *filp, struct inode *inode, int forever)
35449 +{
35450 +       int result = 0;
35451 +       struct unix_file_info *uf_info;
35452 +
35453 +       uf_info = unix_file_inode_data(inode);
35454 +       assert("vs-1628", ea_obtained(uf_info));
35455 +
35456 +       result = find_file_state(inode, uf_info);
35457 +       if (result)
35458 +               return result;
35459 +       assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35460 +
35461 +       if (uf_info->container == UF_CONTAINER_TAILS) {
35462 +               /*
35463 +                * if file is being converted by another process - wait until
35464 +                * it completes
35465 +                */
35466 +               while (reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) {
35467 +                       drop_exclusive_access(uf_info);
35468 +                       schedule();
35469 +                       get_exclusive_access(uf_info);
35470 +               }
35471 +               if (uf_info->container == UF_CONTAINER_TAILS) {
35472 +                       result = tail2extent(uf_info);
35473 +                       if (result)
35474 +                               return result;
35475 +               }
35476 +       }
35477 +       if (forever) {
35478 +               /* save new formatting plugin in stat data */
35479 +               __u64 tograb;
35480 +
35481 +               set_file_notail(inode);
35482 +
35483 +               grab_space_enable();
35484 +               tograb = inode_file_plugin(inode)->estimate.update(inode);
35485 +               result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35486 +               /* update stat data only if space for that was grabbed */
35487 +               if (result == 0)
35488 +                       result = reiser4_update_sd(inode);
35489 +       }
35490 +
35491 +       return result;
35492 +}
35493 +
35494 +/* implementation of vfs' ioctl method of struct file_operations for unix
35495 +   file plugin. Only REISER4_IOC_UNPACK is handled; other @cmd get -ENOSYS.
35496 +*/
35497 +int
35498 +ioctl_unix_file(struct inode *inode, struct file *filp,
35499 +               unsigned int cmd, unsigned long arg UNUSED_ARG)
35500 +{
35501 +       reiser4_context *ctx;
35502 +       int result;
35503 +
35504 +       ctx = reiser4_init_context(inode->i_sb);
35505 +       if (IS_ERR(ctx))
35506 +               return PTR_ERR(ctx);
35507 +
35508 +       switch (cmd) {
35509 +       case REISER4_IOC_UNPACK:
35510 +               get_exclusive_access(unix_file_inode_data(inode));
35511 +               result = unpack(filp, inode, 1 /* forever */ );
35512 +               drop_exclusive_access(unix_file_inode_data(inode));
35513 +               break;
35514 +
35515 +       default:
35516 +               result = RETERR(-ENOSYS);
35517 +               break;
35518 +       }
35519 +       reiser4_exit_context(ctx);
35520 +       return result;
35521 +}
35522 +
35523 +/* implementation of vfs' bmap method of struct address_space_operations for
35524 +   unix file plugin. Returns the block number which get_block method of the
35525 +   found item reports for logical block @lblock, or 0 on any failure.
35526 +   sector_t is unsigned, so a negative error code returned through it would
35527 +   look like a (huge) valid block number to the caller */
35528 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35529 +{
35530 +       reiser4_context *ctx;
35531 +       int result;
35532 +       reiser4_key key;
35533 +       coord_t coord;
35534 +       lock_handle lh;
35535 +       struct inode *inode;
35536 +       item_plugin *iplug;
35537 +       sector_t block;
35538 +
35539 +       inode = mapping->host;
35540 +
35541 +       ctx = reiser4_init_context(inode->i_sb);
35542 +       if (IS_ERR(ctx))
35543 +               return 0;
35544 +       key_by_inode_and_offset_common(inode,
35545 +                                      (loff_t) lblock * current_blocksize,
35546 +                                      &key);
35547 +
35548 +       /* 0 is returned unless block number is successfully found out */
35549 +       block = 0;
35550 +       init_lh(&lh);
35551 +       result =
35552 +           find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35553 +       if (cbk_errored(result))
35554 +               goto out;
35555 +
35556 +       result = zload(coord.node);
35557 +       if (result)
35558 +               goto out;
35559 +
35560 +       iplug = item_plugin_by_coord(&coord);
35561 +       if (iplug->s.file.get_block) {
35562 +               result = iplug->s.file.get_block(&coord, lblock, &block);
35563 +               if (result != 0)
35564 +                       /* do not trust @block if mapping failed */
35565 +                       block = 0;
35566 +       }
35567 +       /* else: item plugin can not map blocks, 0 is returned */
35568 +
35569 +       zrelse(coord.node);
35570 +out:
35571 +       done_lh(&lh);
35572 +       reiser4_exit_context(ctx);
35573 +       return block;
35574 +}
35575 +
35576 +/**
35577 + * flow_by_inode_unix_file - initialize structure flow
35578 + * @inode: inode of file for which read or write is about to be done
35579 + * @buf: buffer to perform read to or write from
35580 + * @user: flag showing whether @buf is user space or kernel space
35581 + * @size: size of buffer @buf
35582 + * @off: start offset for read or write
35583 + * @op: READ or WRITE
35584 + * @flow: flow structure to fill in
35585 + *
35586 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35587 + */
35588 +int flow_by_inode_unix_file(struct inode *inode,
35589 +                           const char __user *buf, int user,
35590 +                           loff_t size, loff_t off,
35591 +                           rw_op op, flow_t *flow)
35592 +{
35593 +       assert("nikita-1100", inode != NULL);
35594 +
35595 +       flow->length = size;
35596 +       memcpy(&flow->data, &buf, sizeof(buf)); /* stores the pointer itself */
35597 +       flow->user = user;
35598 +       flow->op = op;
35599 +       assert("nikita-1931", inode_file_plugin(inode) != NULL);
35600 +       assert("nikita-1932",
35601 +              inode_file_plugin(inode)->key_by_inode ==
35602 +              key_by_inode_and_offset_common);
35603 +       /* calculate key of write position and insert it into flow->key */
35604 +       return key_by_inode_and_offset_common(inode, off, &flow->key);
35605 +}
35606 +
35607 +/* plugin->u.file.set_plug_in_sd = NULL
35608 +   plugin->u.file.set_plug_in_inode = NULL
35609 +   plugin->u.file.create_blank_sd = NULL */
35610 +/* plugin->u.file.delete */
35611 +/*
35612 +   plugin->u.file.add_link = reiser4_add_link_common
35613 +   plugin->u.file.rem_link = NULL */
35614 +
35615 +/* plugin->u.file.owns_item
35616 +   this is owns_item_common plus an item-group check and an assertion */
35617 +/* Audited by: green(2002.06.15) */
35618 +int
35619 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35620 +                   const coord_t * coord /* coord to check */ )
35621 +{
35622 +       int result;
35623 +
35624 +       result = owns_item_common(inode, coord);
35625 +       if (!result)
35626 +               return 0;
35627 +       if (!plugin_of_group(item_plugin_by_coord(coord),
35628 +                            UNIX_FILE_METADATA_ITEM_TYPE))
35629 +               return 0; /* item is not of the unix-file metadata group */
35630 +       assert("vs-547",
35631 +              item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35632 +              item_id_by_coord(coord) == FORMATTING_ID);
35633 +       return 1;
35634 +}
35635 +
35636 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
35637 +{ /* truncate the file body per @attr while a SAFE_TRUNCATE safelink exists */
35638 +       int result;
35639 +       int s_result;
35640 +       loff_t old_size;
35641 +       reiser4_tree *tree;
35642 +
35643 +       inode_check_scale(inode, inode->i_size, attr->ia_size);
35644 +
35645 +       old_size = inode->i_size; /* remembered only for the warning below */
35646 +       tree = reiser4_tree_by_inode(inode);
35647 +
35648 +       result = safe_link_grab(tree, BA_CAN_COMMIT);
35649 +       if (result == 0)
35650 +               result = safe_link_add(inode, SAFE_TRUNCATE);
35651 +       if (result == 0)
35652 +               result = truncate_file_body(inode, attr);
35653 +       if (result)
35654 +               warning("vs-1588", "truncate_file failed: oid %lli, "
35655 +                       "old size %lld, new size %lld, retval %d",
35656 +                       (unsigned long long)get_inode_oid(inode),
35657 +                       old_size, attr->ia_size, result);
35658 +
35659 +       s_result = safe_link_grab(tree, BA_CAN_COMMIT); /* attempted even if the steps above failed */
35660 +       if (s_result == 0)
35661 +               s_result =
35662 +                   safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35663 +       if (s_result != 0) {
35664 +               warning("nikita-3417", "Cannot kill safelink %lli: %i",
35665 +                       (unsigned long long)get_inode_oid(inode), s_result);
35666 +       }
35667 +       safe_link_release(tree);
35668 +       return result; /* s_result is only logged, never returned */
35669 +}
35670 +
35671 +/* plugin->u.file.setattr method */
35672 +/* Size changes (ATTR_SIZE) go through setattr_truncate() under exclusive
35673 +   access; everything else is handed to reiser4_setattr_common() */
35674 +int setattr_unix_file(struct dentry *dentry,   /* Object to change attributes */
35675 +                     struct iattr *attr /* change description */ )
35676 +{
35677 +       int result;
35678 +
35679 +       if (attr->ia_valid & ATTR_SIZE) {
35680 +               reiser4_context *ctx;
35681 +               struct unix_file_info *uf_info;
35682 +
35683 +               /* truncate does reservation itself and requires exclusive
35684 +                  access to be held */
35685 +               ctx = reiser4_init_context(dentry->d_inode->i_sb);
35686 +               if (IS_ERR(ctx))
35687 +                       return PTR_ERR(ctx);
35688 +
35689 +               uf_info = unix_file_inode_data(dentry->d_inode);
35690 +               get_exclusive_access_careful(uf_info, dentry->d_inode);
35691 +               result = setattr_truncate(dentry->d_inode, attr);
35692 +               drop_exclusive_access(uf_info);
35693 +               context_set_commit_async(ctx);
35694 +               reiser4_exit_context(ctx);
35695 +       } else
35696 +               result = reiser4_setattr_common(dentry, attr);
35697 +
35698 +       return result;
35699 +}
35700 +
35701 +/* plugin->u.file.init_inode_data: set up the unix-file part of @inode */
35702 +void
35703 +init_inode_data_unix_file(struct inode *inode,
35704 +                         reiser4_object_create_data * crd, int create)
35705 +{
35706 +       struct unix_file_info *data;
35707 +
35708 +       data = unix_file_inode_data(inode);
35709 +       data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN; /* unknown unless just created */
35710 +       init_rwsem(&data->latch);
35711 +       data->tplug = inode_formatting_plugin(inode);
35712 +       data->exclusive_use = 0;
35713 +
35714 +#if REISER4_DEBUG
35715 +       data->ea_owner = NULL;
35716 +       atomic_set(&data->nr_neas, 0);
35717 +#endif
35718 +       init_inode_ordering(inode, crd, create);
35719 +}
35720 +
35721 +/**
35722 + * delete_object_unix_file - delete_object of file_plugin
35723 + * @inode: inode to be deleted
35724 + *
35725 + * Truncates file to length 0, removes stat data and safe link.
35726 + */
35727 +int delete_object_unix_file(struct inode *inode)
35728 +{
35729 +       struct unix_file_info *uf_info;
35730 +       int result;
35731 +
35732 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35733 +               return 0;
35734 +
35735 +       /* truncate file body first */
35736 +       uf_info = unix_file_inode_data(inode);
35737 +       get_exclusive_access(uf_info);
35738 +       result = shorten_file(inode, 0 /* size */ );
35739 +       drop_exclusive_access(uf_info);
35740 +
35741 +       if (result) /* failure is only logged; removal below still proceeds */
35742 +               warning("edward-1556",
35743 +                       "failed to truncate file (%llu) on removal: %d",
35744 +                       get_inode_oid(inode), result);
35745 +
35746 +       /* remove stat data and safe link */
35747 +       return reiser4_delete_object_common(inode);
35748 +}
35749 +
35750 +/* plugin->write_begin(): call do_prepare_write() under exclusive access */
35751 +int write_begin_unix_file(struct file *file, struct page *page,
35752 +                         unsigned from, unsigned to)
35753 +{
35754 +       int ret;
35755 +       struct unix_file_info *info;
35756 +
35757 +       info = unix_file_inode_data(file->f_dentry->d_inode);
35758 +       get_exclusive_access(info);
35759 +       ret = find_file_state(file->f_dentry->d_inode, info);
35760 +       if (likely(ret == 0)) {
35761 +               if (info->container == UF_CONTAINER_TAILS)
35762 +                       ret = -EINVAL; /* files whose container is TAILS are rejected here */
35763 +               else
35764 +                       ret = do_prepare_write(file, page, from, to);
35765 +       }
35766 +       drop_exclusive_access(info);
35767 +       return ret;
35768 +}
35769 +
35770 +/*
35771 + * Local variables:
35772 + * c-indentation-style: "K&R"
35773 + * mode-name: "LC"
35774 + * c-basic-offset: 8
35775 + * tab-width: 8
35776 + * fill-column: 79
35777 + * scroll-step: 1
35778 + * End:
35779 + */
35780 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.35/fs/reiser4/plugin/file/file_conversion.c
35781 --- linux-2.6.35.orig/fs/reiser4/plugin/file/file_conversion.c  1970-01-01 01:00:00.000000000 +0100
35782 +++ linux-2.6.35/fs/reiser4/plugin/file/file_conversion.c       2010-08-04 15:44:57.000000000 +0200
35783 @@ -0,0 +1,747 @@
35784 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
35785 +   licensing governed by reiser4/README */
35786 +
35787 +/**
35788 + * This file contains dispatching hooks, and conversion methods, which
35789 + * implement transitions in the FILE interface.
35790 + *
35791 + * A dispatching hook decides (at a dispatching point) which plugin is
35792 + * the most reasonable one. The decision is made in accordance with some
35793 + * O(1)-heuristic.
35794 + *
35795 + * We implement a transition CRYPTCOMPRESS -> UNIX_FILE for files with
35796 + * incompressible data. Current heuristic to estimate compressibility is
35797 + * very simple: if first complete logical cluster (64K by default) of a
35798 + * file is incompressible, then we make a decision, that the whole file
35799 + * is incompressible.
35800 + *
35801 + * To enable dispatching we install a special "magic" compression mode
35802 + * plugin CONVX_COMPRESSION_MODE_ID at file creation time.
35803 + *
35804 + * Note that we don't perform back conversion (UNIX_FILE->CRYPTCOMPRESS)
35805 + * for compatibility reasons.
35806 + *
35807 + * At conversion time we protect CS, the conversion set (file's (meta)data
35808 + * and plugin table (pset)) via special per-inode rw-semaphore (conv_sem).
35809 + * The methods which implement conversion are CS writers. The methods of FS
35810 + * interface (file_operations, inode_operations, address_space_operations)
35811 + * are CS readers.
35812 + */
35813 +
35814 +#include "../../inode.h"
35815 +#include "../cluster.h"
35816 +#include "file.h"
35817 +
35818 +#define conversion_enabled(inode)                                      \
35819 +        (inode_compression_mode_plugin(inode) ==                      \
35820 +         compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35821 +
35822 +/**
35823 + * Located sections (readers and writers of @pset) are not permanently
35824 + * critical: cryptcompress file can be converted only if the conversion
35825 + * is enabled (see the macro above). Also we don't perform back
35826 + * conversion. The following helper macro is a sanity check to decide
35827 + * if we need the protection (locks are always additional overheads).
35828 + */
35829 +#define should_protect(inode)                                          \
35830 +       (inode_file_plugin(inode) ==                                    \
35831 +        file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) &&             \
35832 +        conversion_enabled(inode))
35833 +/**
35834 + * To avoid confusion with read/write file operations, we'll speak about
35835 + * "passive" protection for FCS readers and "active" protection for FCS
35836 + * writers. All methods with active or passive protection have suffix
35837 + * "careful".
35838 + */
35839 +/**
35840 + * Macros for passive protection.
35841 + *
35842 + * Construct invariant operation to be supplied to VFS.
35843 + * The macro accepts the following lexemes:
35844 + * @type - type of the value represented by the compound statement;
35845 + * @method - name of an operation to be supplied to VFS (reiser4 file
35846 + * plugin also should contain a method with such name).
35847 + */
35848 +#define PROT_PASSIVE(type, method, args)                               \
35849 +({                                                                     \
35850 +       type _result;                                                   \
35851 +       struct rw_semaphore * guard =                                   \
35852 +               &reiser4_inode_data(inode)->conv_sem;                   \
35853 +                                                                       \
35854 +       if (should_protect(inode)) {                                    \
35855 +               down_read(guard);                                       \
35856 +               if (!should_protect(inode))                             \
35857 +                       up_read(guard);                                 \
35858 +       }                                                               \
35859 +       _result = inode_file_plugin(inode)->method args;                \
35860 +       if (should_protect(inode))                                      \
35861 +               up_read(guard);                                         \
35862 +       _result;                                                        \
35863 +})
35864 +
35865 +#define PROT_PASSIVE_VOID(method, args)                                        \
35866 +({                                                                     \
35867 +       struct rw_semaphore * guard =                                   \
35868 +               &reiser4_inode_data(inode)->conv_sem;                   \
35869 +                                                                       \
35870 +       if (should_protect(inode)) {                                    \
35871 +               down_read(guard);                                       \
35872 +               if (!should_protect(inode))                             \
35873 +                       up_read(guard);                                 \
35874 +       }                                                               \
35875 +       inode_file_plugin(inode)->method args;                          \
35876 +                                                                       \
35877 +       if (should_protect(inode))                                      \
35878 +               up_read(guard);                                         \
35879 +})
35880 +
35881 +/* Pass management to the unix-file plugin with "notail" policy */
35882 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
35883 +{
35884 +       int result;
35885 +       reiser4_inode *info;
35886 +       struct unix_file_info * uf;
35887 +       info = reiser4_inode_data(inode);
35888 +
35889 +       result = aset_set_unsafe(&info->pset,
35890 +                           PSET_FILE,
35891 +                           (reiser4_plugin *)
35892 +                           file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
35893 +       if (result)
35894 +               return result;
35895 +       result = aset_set_unsafe(&info->pset,
35896 +                           PSET_FORMATTING,
35897 +                           (reiser4_plugin *)
35898 +                           formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
35899 +       if (result)
35900 +               return result;
35901 +       /* get rid of non-standard plugins */
35902 +       info->plugin_mask &= ~cryptcompress_mask;
35903 +       /* get rid of plugin stat-data extension */
35904 +       info->extmask &= ~(1 << PLUGIN_STAT);
35905 +
35906 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
35907 +
35908 +       /* FIXME use init_inode_data_unix_file() instead,
35909 +          but avoid init_inode_ordering() */
35910 +       /* Init unix-file specific part of inode */
35911 +       uf = unix_file_inode_data(inode);
35912 +       uf->container = UF_CONTAINER_UNKNOWN;
35913 +       init_rwsem(&uf->latch);
35914 +       uf->tplug = inode_formatting_plugin(inode);
35915 +       uf->exclusive_use = 0;
35916 +#if REISER4_DEBUG
35917 +       uf->ea_owner = NULL;
35918 +       atomic_set(&uf->nr_neas, 0);
35919 +#endif
35920 +       /**
35921 +        * we were careful to keep file_ops, inode_ops and as_ops
35922 +        * invariant under plugin conversion, so there is
35923 +        * no need to update the ones already installed in
35924 +        * the VFS.
35925 +        */
35926 +       return 0;
35927 +}
35928 +
35929 +#if REISER4_DEBUG
35930 +static int disabled_conversion_inode_ok(struct inode * inode)
35931 +{ /* debug-only check used by assertion "edward-1500" in disable_conversion() */
35932 +       __u64 extmask = reiser4_inode_data(inode)->extmask;
35933 +       __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
35934 +
35935 +       return ((extmask & (1 << LIGHT_WEIGHT_STAT)) && /* all listed bits must be set */
35936 +               (extmask & (1 << UNIX_STAT)) &&
35937 +               (extmask & (1 << LARGE_TIMES_STAT)) &&
35938 +               (extmask & (1 << PLUGIN_STAT)) &&
35939 +               (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
35940 +}
35941 +#endif
35942 +
35943 +/**
35944 + * Disable future attempts to schedule/convert file plugin.
35945 + * This function is called by plugin schedule hooks.
35946 + *
35947 + * To disable conversion we assign a compression mode plugin id different
35948 + * from CONVX_COMPRESSION_MODE_ID (LATTD_COMPRESSION_MODE_ID is used here).
35949 + */
35950 +static int disable_conversion(struct inode * inode)
35951 +{
35952 +       int result;
35953 +       result =
35954 +              force_plugin_pset(inode,
35955 +                                PSET_COMPRESSION_MODE,
35956 +                                (reiser4_plugin *)compression_mode_plugin_by_id
35957 +                                (LATTD_COMPRESSION_MODE_ID));
35958 +       assert("edward-1500",
35959 +              ergo(!result, disabled_conversion_inode_ok(inode)));
35960 +       return result;
35961 +}
35962 +
35963 +/**
35964 + * Check whether we have really reached the plugin scheduling (dispatch) point
35965 + */
35966 +static int check_dispatch_point(struct inode * inode,
35967 +                               loff_t pos /* position in the
35968 +                                             file to write from */,
35969 +                               struct cluster_handle * clust,
35970 +                               struct dispatch_context * cont)
35971 +{
35972 +       assert("edward-1505", conversion_enabled(inode));
35973 +       /*
35974 +        * if file size is more than cluster size, then compressibility
35975 +        * status must already be figured out (i.e. compression was disabled,
35976 +        * or file plugin was converted to unix_file)
35977 +        */
35978 +       assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
35979 +
35980 +       if (pos > inode->i_size)
35981 +               /* first logical cluster will contain a (partial) hole */
35982 +               return disable_conversion(inode);
35983 +       if (pos < inode_cluster_size(inode))
35984 +               /* writing to the first logical cluster */
35985 +               return 0;
35986 +       /*
35987 +        * here we have:
35988 +        * cluster_size <= pos <= i_size <= cluster_size,
35989 +        * and, hence,  pos == i_size == cluster_size
35990 +        */
35991 +       assert("edward-1498",
35992 +              pos == inode->i_size &&
35993 +              pos == inode_cluster_size(inode));
35994 +       assert("edward-1539", cont != NULL);
35995 +       assert("edward-1540", cont->state == DISPATCH_INVAL_STATE);
35996 +
35997 +       cont->state = DISPATCH_POINT; /* tells the caller to run the compressibility check */
35998 +       return 0;
35999 +}
36000 +
36001 +static void start_check_compressibility(struct inode * inode,
36002 +                                       struct cluster_handle * clust,
36003 +                                       hint_t * hint)
36004 +{ /* point @clust at logical cluster #0 and at the zeroed, caller-supplied @hint */
36005 +       assert("edward-1507", clust->index == 1);
36006 +       assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
36007 +       assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
36008 +
36009 +       hint_init_zero(hint);
36010 +       clust->hint = hint;
36011 +       clust->index --; /* 1 -> 0, see assertion edward-1507 */
36012 +       clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
36013 +
36014 +       /* first logical cluster (of index #0) must be complete */
36015 +       assert("edward-1510", lbytes(clust->index, inode) ==
36016 +              inode_cluster_size(inode));
36017 +}
36018 +
36019 +static void finish_check_compressibility(struct inode * inode,
36020 +                                        struct cluster_handle * clust,
36021 +                                        hint_t * hint)
36022 +{ /* undo start_check_compressibility(): unset current hint, install @hint, restore index */
36023 +       reiser4_unset_hint(clust->hint);
36024 +       clust->hint = hint;
36025 +       clust->index ++;
36026 +}
36027 +
36028 +#if REISER4_DEBUG
36029 +static int prepped_dclust_ok(hint_t * hint)
36030 +{ /* debug-only check used by assertion "edward-1512" */
36031 +       reiser4_key key;
36032 +       coord_t * coord = &hint->ext_coord.coord;
36033 +
36034 +       item_key_by_coord(coord, &key);
36035 +       return (item_id_by_coord(coord) == CTAIL_ID &&
36036 +               !coord_is_unprepped_ctail(coord) &&
36037 +               (get_key_offset(&key) + nr_units_ctail(coord) == /* item's end offset must equal the dsize */
36038 +                dclust_get_extension_dsize(hint)));
36039 +}
36040 +#endif
36041 +
36042 +#define fifty_persent(size) (size >> 1)
36043 +/* data is considered compressible if its output is under half the input size */
36044 +#define data_is_compressible(osize, isize)             \
36045 +       (osize < fifty_persent(isize))
36046 +
36047 +/**
36048 + * A simple O(1)-heuristic for compressibility.
36049 + * This is called at most once in a file's life.
36050 + * Read first logical cluster (of index #0) and estimate its compressibility.
36051 + * Save estimation result in @cont.
36052 + */
36053 +static int read_check_compressibility(struct inode * inode,
36054 +                                     struct cluster_handle * clust,
36055 +                                     struct dispatch_context * cont)
36056 +{
36057 +       int i;
36058 +       int result;
36059 +       size_t dst_len;
36060 +       hint_t tmp_hint;
36061 +       hint_t * cur_hint = clust->hint; /* restored by finish_check_compressibility() */
36062 +       assert("edward-1541", cont->state == DISPATCH_POINT);
36063 +
36064 +       start_check_compressibility(inode, clust, &tmp_hint);
36065 +
36066 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
36067 +       result = grab_page_cluster(inode, clust, READ_OP);
36068 +       if (result)
36069 +               return result;
36070 +       /* Read page cluster here */
36071 +       for (i = 0; i < clust->nr_pages; i++) {
36072 +               struct page *page = clust->pages[i];
36073 +               lock_page(page);
36074 +               result = do_readpage_ctail(inode, clust, page,
36075 +                                          ZNODE_READ_LOCK);
36076 +               unlock_page(page);
36077 +               if (result)
36078 +                       goto error;
36079 +       }
36080 +       tfm_cluster_clr_uptodate(&clust->tc);
36081 +
36082 +       cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
36083 +
36084 +       if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
36085 +               /* length of compressed data is known, no need to compress */
36086 +               assert("edward-1511",
36087 +                      znode_is_any_locked(tmp_hint.lh.node));
36088 +               assert("edward-1512",
36089 +                      WITH_DATA(tmp_hint.ext_coord.coord.node,
36090 +                                prepped_dclust_ok(&tmp_hint)));
36091 +               dst_len = dclust_get_extension_dsize(&tmp_hint);
36092 +       }
36093 +       else {
36094 +               struct tfm_cluster * tc = &clust->tc;
36095 +               compression_plugin * cplug = inode_compression_plugin(inode);
36096 +               result = grab_tfm_stream(inode, tc, INPUT_STREAM);
36097 +               if (result)
36098 +                       goto error;
36099 +               for (i = 0; i < clust->nr_pages; i++) { /* copy the page cluster into the input stream */
36100 +                       char *data;
36101 +                       lock_page(clust->pages[i]);
36102 +                       BUG_ON(!PageUptodate(clust->pages[i]));
36103 +                       data = kmap(clust->pages[i]);
36104 +                       memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
36105 +                              data, PAGE_CACHE_SIZE);
36106 +                       kunmap(clust->pages[i]);
36107 +                       unlock_page(clust->pages[i]);
36108 +               }
36109 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
36110 +               if (result)
36111 +                       goto error;
36112 +               result = grab_coa(tc, cplug);
36113 +               if (result)
36114 +                       goto error;
36115 +               tc->len = tc->lsize = lbytes(clust->index, inode);
36116 +               assert("edward-1513", tc->len == inode_cluster_size(inode));
36117 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
36118 +               cplug->compress(get_coa(tc, cplug->h.id, tc->act),
36119 +                               tfm_input_data(clust), tc->len,
36120 +                               tfm_output_data(clust), &dst_len);
36121 +               assert("edward-1514",
36122 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
36123 +       }
36124 +       finish_check_compressibility(inode, clust, cur_hint);
36125 +       cont->state =
36126 +               (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
36127 +                DISPATCH_REMAINS_OLD :
36128 +                DISPATCH_ASSIGNED_NEW);
36129 +       return 0;
36130 + error: /* NOTE(review): error paths (and the early return above) skip finish_check_compressibility(), */
36131 +       put_page_cluster(clust, inode, READ_OP); /* so clust->hint still points at local tmp_hint - confirm callers cope */
36132 +       return result;
36133 +}
36134 +
36135 +/* Cut disk cluster of index @idx; returns the result of reiser4_cut_tree() */
36136 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
36137 +{
36138 +       reiser4_key from, to;
36139 +       assert("edward-1515", inode_file_plugin(inode) ==
36140 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
36141 +       key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
36142 +       to = from;
36143 +       set_key_offset(&to, /* @to is @from advanced by cluster size - 1 */
36144 +                      get_key_offset(&from) + inode_cluster_size(inode) - 1);
36145 +       return reiser4_cut_tree(reiser4_tree_by_inode(inode),
36146 +                               &from, &to, inode, 0);
36147 +}
36148 +
36149 +static int reserve_cryptcompress2unixfile(struct inode *inode)
36150 +{ /* grab disk space for the conversion; returns the result of reiser4_grab_space() */
36151 +       reiser4_block_nr unformatted_nodes;
36152 +       reiser4_tree *tree;
36153 +
36154 +       tree = reiser4_tree_by_inode(inode);
36155 +
36156 +       /* number of unformatted nodes which will be created */
36157 +       unformatted_nodes = cluster_nrpages(inode); /* N */
36158 +
36159 +       /*
36160 +        * space required for converting one logical cluster:
36161 +        *
36162 +        *     1. kill ctail items
36163 +        *
36164 +        *     2. insert N unformatted nodes
36165 +        *
36166 +        *     3. insert N (worst-case single-block
36167 +        *     extents) extent units.
36168 +        *
36169 +        *     4. drilling to the leaf level by coord_by_key()
36170 +        *
36171 +        *     5. possible update of stat-data
36172 +        *
36173 +        */
36174 +       grab_space_enable();
36175 +       return reiser4_grab_space
36176 +               (2 * tree->height +
36177 +                unformatted_nodes  +
36178 +                unformatted_nodes * estimate_one_insert_into_item(tree) +
36179 +                1 + estimate_one_insert_item(tree) +
36180 +                inode_file_plugin(inode)->estimate.update(inode),
36181 +                BA_CAN_COMMIT);
36182 +}
36183 +
36184 +/**
36185 + * Convert cryptcompress file plugin to unix_file plugin.
36186 + */
36187 +static int cryptcompress2unixfile(struct file *file, struct inode *inode,
36188 +                                 struct dispatch_context *cont)
36189 +{
36190 +       int i;
36191 +       int result = 0;
36192 +       struct cryptcompress_info *cr_info;
36193 +       struct unix_file_info *uf_info;
36194 +       assert("edward-1516", cont->pages[0]->index == 0);
36195 +
36196 +       /* release all cryptcompress-specific resources */
36197 +       cr_info = cryptcompress_inode_data(inode); /* NOTE(review): assigned but not used below */
36198 +       result = reserve_cryptcompress2unixfile(inode);
36199 +       if (result)
36200 +               goto out;
36201 +       /* tell kill_hook to not truncate pages */
36202 +       reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36203 +       result = cut_disk_cluster(inode, 0);
36204 +       if (result)
36205 +               goto out;
36206 +       /* captured jnode of cluster and associated resources (pages,
36207 +          reserved disk space) were released by ->kill_hook() method
36208 +          of the item plugin */
36209 +
36210 +       result = __cryptcompress2unixfile(file, inode);
36211 +       if (result)
36212 +               goto out;
36213 +       /* At this point file is managed by unix file plugin */
36214 +
36215 +       uf_info = unix_file_inode_data(inode);
36216 +
36217 +       assert("edward-1518",
36218 +              ergo(jprivate(cont->pages[0]),
36219 +                   !jnode_is_cluster_page(jprivate(cont->pages[0]))));
36220 +       for(i = 0; i < cont->nr_pages; i++) { /* one find_or_create_extent() call per saved page */
36221 +               assert("edward-1519", cont->pages[i]);
36222 +               assert("edward-1520", PageUptodate(cont->pages[i]));
36223 +
36224 +               result = find_or_create_extent(cont->pages[i]);
36225 +               if (result)
36226 +                       break;
36227 +       }
36228 +       if (unlikely(result))
36229 +               goto out;
36230 +       uf_info->container = UF_CONTAINER_EXTENTS;
36231 +       result = reiser4_update_sd(inode);
36232 + out:
36233 +       all_grabbed2free(); /* reached on success and on every failure path */
36234 +       return result;
36235 +}
36236 +
36237 +#define convert_file_plugin cryptcompress2unixfile
36238 +
36239 +/**
36240 + * This is called by ->write() method of a cryptcompress file plugin.
36241 + * Make a decision about the most reasonable file plugin id to manage
36242 + * the file. The decision is left in @cont->state; returns 0 or an error.
36243 + */
36244 +int write_dispatch_hook(struct file *file, struct inode *inode,
36245 +                       loff_t pos, struct cluster_handle *clust,
36246 +                       struct dispatch_context *cont)
36247 +{
36248 +       int result;
36249 +       if (!conversion_enabled(inode))
36250 +               return 0;
36251 +       result = check_dispatch_point(inode, pos, clust, cont);
36252 +       if (result || cont->state != DISPATCH_POINT)
36253 +               return result;
36254 +       result = read_check_compressibility(inode, clust, cont);
36255 +       if (result)
36256 +               return result;
36257 +       if (cont->state == DISPATCH_REMAINS_OLD) { /* data is compressible: keep the current plugin */
36258 +               put_page_cluster(clust, inode, READ_OP);
36259 +               return disable_conversion(inode);
36260 +       }
36261 +       assert("edward-1543", cont->state == DISPATCH_ASSIGNED_NEW);
36262 +       /*
36263 +        * page cluster is grabbed and uptodate. It will be
36264 +        * released with a pgset after plugin conversion is
36265 +        * finished, see put_dispatch_context().
36266 +        */
36267 +       reiser4_unset_hint(clust->hint);
36268 +       move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
36269 +       return 0;
36270 +}
36271 +
36272 +/**
36273 + * Called by ->setattr() of the cryptcompress file plugin: disables conversion.
36274 + */
36275 +int setattr_dispatch_hook(struct inode * inode)
36276 +{
36277 +       if (conversion_enabled(inode))
36278 +               return disable_conversion(inode);
36279 +       return 0; /* conversion is not enabled: nothing to do */
36280 +}
36281 +
36282 +static inline void init_dispatch_context(struct dispatch_context * cont)
36283 +{ /* zero the whole context, including ->state, ->pages and ->nr_pages */
36284 +       memset(cont, 0, sizeof(*cont));
36285 +}
36286 +
36287 +static inline void done_dispatch_context(struct dispatch_context * cont,
36288 +                                        struct inode * inode)
36289 +{ /* if @cont holds a page array, pass it to __put_page_cluster() and free it */
36290 +       if (cont->pages) {
36291 +               __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
36292 +               kfree(cont->pages);
36293 +       }
36294 +}
36295 +/**
36296 + * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36297 + * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
36298 + * which is not aware of plugin conversion performed by Reiser4.
36299 + */
36300 +
36301 +/*
36302 + * Wrappers with active protection for:
36303 + *
36304 + * ->write();
36305 + */
36306 +
36307 +/*
36308 + * ->write() file operation supplied to VFS.  Write a file in 3 steps (some
36309 + * can be optional); return bytes written, or the error if none was written.
36310 + */
36311 +ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36312 +                             size_t count, loff_t *off)
36313 +{
36314 +       int result;
36315 +       reiser4_context *ctx;
36316 +       ssize_t written_old = 0; /* bytes written with initial plugin */
36317 +       ssize_t written_new = 0; /* bytes written with new plugin */
36318 +       struct dispatch_context cont;
36319 +       struct inode * inode = file->f_dentry->d_inode;
36320 +
36321 +       ctx = reiser4_init_context(inode->i_sb);
36322 +       if (IS_ERR(ctx))
36323 +               return PTR_ERR(ctx);
36324 +       init_dispatch_context(&cont);
36325 +       mutex_lock(&inode->i_mutex);
36326 +       /**
36327 +        * First step.
36328 +        * Start write with initial file plugin.
36329 +        * Keep a plugin schedule status at @cont (if any).
36330 +        */
36331 +       written_old = inode_file_plugin(inode)->write(file,
36332 +                                                     buf,
36333 +                                                     count,
36334 +                                                     off,
36335 +                                                     &cont);
36336 +       if (cont.state != DISPATCH_ASSIGNED_NEW || written_old < 0)
36337 +               goto exit;
36338 +       /**
36339 +        * Second step.
36340 +        * New file plugin has been scheduled.
36341 +        * Perform conversion to the new plugin.
36342 +        */
36343 +       down_read(&reiser4_inode_data(inode)->conv_sem);
36344 +       result = convert_file_plugin(file, inode, &cont);
36345 +       up_read(&reiser4_inode_data(inode)->conv_sem);
36346 +       if (result) {
36347 +               warning("edward-1544",
36348 +                       "Inode %llu: file plugin conversion failed (%d)",
36349 +                       (unsigned long long)get_inode_oid(inode),
36350 +                       result);
36351 +               context_set_commit_async(ctx);
36352 +               goto exit;
36353 +       }
36354 +       reiser4_txn_restart(ctx);
36355 +       /**
36356 +        * Third step:
36357 +        * Finish write with the new file plugin.
36358 +        */
36359 +       assert("edward-1536",
36360 +              inode_file_plugin(inode) ==
36361 +              file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36362 +
36363 +       written_new = inode_file_plugin(inode)->write(file,
36364 +                                                     buf + written_old,
36365 +                                                     count - written_old,
36366 +                                                     off,
36367 +                                                     NULL);
36368 + exit:
36369 +       mutex_unlock(&inode->i_mutex);
36370 +       done_dispatch_context(&cont, inode);
36371 +       reiser4_exit_context(ctx);
36372 +       return (written_new < 0 && written_old == 0) ? written_new :
36373 +               written_old + (written_new < 0 ? 0 : written_new);
36374 +}
36375 +
36376 +/* Wrappers with passive protection for ->open(), ->read(), ->ioctl(),
36377 + * ->mmap(), ->release() and ->bmap().
36378 + *
36379 + * Each one returns the value of a PROT_PASSIVE() expression built from the
36380 + * result type, the method name and the argument list.
36381 + * NOTE(review): wrappers without an inode argument declare a local "inode"
36382 + * that is not used explicitly; presumably the PROT_PASSIVE() macro refers
36383 + * to it by name - confirm before removing it.
36384 + */
36385 +
36386 +int reiser4_open_careful(struct inode *inode, struct file *file)
36387 +{
36388 +       return PROT_PASSIVE(int, open, (inode, file));
36389 +}
36390 +
36391 +ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36392 +                            size_t size, loff_t * off)
36393 +{
36394 +       struct inode * inode = file->f_dentry->d_inode;
36395 +       return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36396 +}
36397 +
36398 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36399 +                         unsigned int cmd, unsigned long arg)
36400 +{
36401 +       return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36402 +}
36403 +
36404 +int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36405 +{
36406 +       struct inode *inode = file->f_dentry->d_inode;
36407 +       return PROT_PASSIVE(int, mmap, (file, vma));
36408 +}
36409 +
36410 +int reiser4_release_careful(struct inode *inode, struct file *file)
36411 +{
36412 +       return PROT_PASSIVE(int, release, (inode, file));
36413 +}
36414 +
36415 +sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36416 +{
36417 +       struct inode *inode = mapping->host;
36418 +       return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36419 +}
36420 +
36421 +/**
36422 + * NOTE: The following two methods are used only for loopback functionality.
36423 + * reiser4_write_end() can not cope with short writes for now.
36424 + * This one grabs the page cache page covering @pos, returns it via @pagep
36425 + * and runs write_begin via PROT_PASSIVE; on failure the page is released.
36426 + */
36427 +int reiser4_write_begin_careful(struct file *file,
36428 +                               struct address_space *mapping,
36429 +                               loff_t pos,
36430 +                               unsigned len,
36431 +                               unsigned flags,
36432 +                               struct page **pagep,
36433 +                               void **fsdata)
36434 +{
36435 +       int ret = 0;
36436 +       unsigned start, end;
36437 +       struct page *page;
36438 +       pgoff_t index;
36439 +       reiser4_context *ctx;
36440 +       struct inode * inode = file->f_dentry->d_inode;
36441 +
36442 +       index = pos >> PAGE_CACHE_SHIFT; /* page index of @pos */
36443 +       start = pos & (PAGE_CACHE_SIZE - 1); /* offset within that page */
36444 +       end = start + len;
36445 +
36446 +       page = grab_cache_page_write_begin(mapping, index,
36447 +                                          flags & AOP_FLAG_NOFS);
36448 +       *pagep = page;
36449 +       if (!page)
36450 +               return -ENOMEM;
36451 +
36452 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
36453 +       if (IS_ERR(ctx)) {
36454 +               ret = PTR_ERR(ctx);
36455 +               goto out;
36456 +       }
36457 +       ret = PROT_PASSIVE(int, write_begin, (file, page, start, end));
36458 +
36459 +       /* don't commit transaction under inode semaphore */
36460 +       context_set_commit_async(ctx);
36461 +       reiser4_exit_context(ctx);
36462 + out:
36463 +       if (unlikely(ret)) {
36464 +               unlock_page(page);
36465 +               page_cache_release(page);
36466 +       }
36467 +       return ret;
36468 +}
36469 +
36470 +int reiser4_write_end_careful(struct file *file,
36471 +                             struct address_space *mapping,
36472 +                             loff_t pos,
36473 +                             unsigned len,
36474 +                             unsigned copied,
36475 +                             struct page *page,
36476 +                             void *fsdata)
36477 +{
36478 +       int ret;
36479 +       reiser4_context *ctx;
36480 +       unsigned start, end;
36481 +       struct inode *inode = page->mapping->host;
36482 +
36483 +       assert("umka-3101", file != NULL);
36484 +       assert("umka-3102", page != NULL);
36485 +       assert("umka-3093", PageLocked(page));
36486 +
36487 +       start = pos & (PAGE_CACHE_SIZE - 1); /* offset within the page */
36488 +       end = start + len;
36489 +
36490 +       flush_dcache_page(page);
36491 +       SetPageUptodate(page); /* set regardless of @copied */
36492 +
36493 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
36494 +       if (IS_ERR(ctx)){
36495 +               unlock_page(page);
36496 +               ret = PTR_ERR(ctx);
36497 +               goto out;
36498 +       }
36499 +       ret = PROT_PASSIVE(int, write_end, (file, page, start, end));
36500 +
36501 +       /* don't commit transaction under inode semaphore */
36502 +       context_set_commit_async(ctx);
36503 +       reiser4_exit_context(ctx);
36504 + out:
36505 +       page_cache_release(page); /* reference is dropped on every path */
36506 +       if (!ret)
36507 +               ret = copied; /* success: report @copied */
36508 +       return ret;
36509 +}
36510 +
36511 +/*
36512 + * Wrapper without protection for ->setattr():
36513 + * calls the ->setattr() method of the file plugin
36514 + * returned by inode_file_plugin() for @dentry's inode.
36515 + */
36516 +int reiser4_setattr(struct dentry *dentry, struct iattr *attr)
36517 +{
36518 +       return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
36519 +}
36520 +
36521 +/*
36522 +  Local variables:
36523 +  c-indentation-style: "K&R"
36524 +  mode-name: "LC"
36525 +  c-basic-offset: 8
36526 +  tab-width: 8
36527 +  fill-column: 80
36528 +  scroll-step: 1
36529 +  End:
36530 +*/
36531 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/file.h linux-2.6.35/fs/reiser4/plugin/file/file.h
36532 --- linux-2.6.35.orig/fs/reiser4/plugin/file/file.h     1970-01-01 01:00:00.000000000 +0100
36533 +++ linux-2.6.35/fs/reiser4/plugin/file/file.h  2010-08-04 20:23:15.000000000 +0200
36534 @@ -0,0 +1,336 @@
36535 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36536 + * reiser4/README */
36537 +
36538 +/* this file contains declarations of methods implementing
36539 +   file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36540 +   and SYMLINK_FILE_PLUGIN_ID) */
36541 +
36542 +#if !defined( __REISER4_FILE_H__ )
36543 +#define __REISER4_FILE_H__
36544 +
36545 +/* possible states in dispatching process */
36546 +typedef enum {
36547 +       DISPATCH_INVAL_STATE,  /* invalid state */
36548 +       DISPATCH_POINT,        /* dispatching point has been achieved */
36549 +       DISPATCH_REMAINS_OLD,  /* made a decision to manage by old plugin */
36550 +       DISPATCH_ASSIGNED_NEW  /* a new plugin has been assigned */
36551 +} dispatch_state;
36552 +
36553 +struct dispatch_context {
36554 +       int nr_pages;         /* passed to __put_page_cluster() with @pages */
36555 +       struct page **pages;  /* kfree()d by done_dispatch_context() */
36556 +       dispatch_state state; /* one of the dispatch states above */
36557 +};
36558 +
36559 +/**
36560 + * Declarations of common/careful/generic methods.
36561 + * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
36562 + * Then common reiser4 method for foo looks like reiser4_foo_common;
36563 + * careful method looks like reiser4_foo_careful;
36564 + * generic method looks like reiser4_foo.
36565 + *
36566 + * Common method is a simple instruction set eligible for more
36567 + * than one plugin id.
36568 + *
36569 + * Generic method looks at the plugin installed in inode's
36570 + * plugin set and calls its appropriate method.
36571 + *
36572 + * Careful method looks like generic method with protected pset
36573 + * (see plugin/file/file_conversion.c for details).
36574 + */
36575 +
36576 +/* inode operations */
36577 +int reiser4_setattr(struct dentry *, struct iattr *);
36578 +
36579 +/* file operations */
36580 +ssize_t reiser4_read_careful(struct file *, char __user *buf,
36581 +                            size_t count, loff_t *off);
36582 +ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36583 +                             size_t count, loff_t * off);
36584 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36585 +                         unsigned int cmd, unsigned long arg);
36586 +int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36587 +int reiser4_open_careful(struct inode *inode, struct file *file);
36588 +int reiser4_release_careful(struct inode *, struct file *);
36589 +int reiser4_sync_file_common(struct file *, int datasync);
36590 +
36591 +/* address space operations */
36592 +int reiser4_readpage(struct file *, struct page *);
36593 +int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36594 +                     unsigned);
36595 +int reiser4_writepages(struct address_space *, struct writeback_control *);
36596 +int reiser4_write_begin_careful(struct file *file,
36597 +                               struct address_space *mapping,
36598 +                               loff_t pos, unsigned len, unsigned flags,
36599 +                               struct page **pagep, void **fsdata);
36600 +int reiser4_write_end_careful(struct file *file,
36601 +                             struct address_space *mapping,
36602 +                             loff_t pos, unsigned len, unsigned copied,
36603 +                             struct page *page, void *fsdata);
36604 +sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36605 +
36606 +/*
36607 + * Private methods of unix-file plugin
36608 + * (UNIX_FILE_PLUGIN_ID)
36609 + */
36610 +
36611 +/* private inode operations */
36612 +int setattr_unix_file(struct dentry *, struct iattr *);
36613 +
36614 +/* private file operations */
36615 +
36616 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36617 +                      loff_t *off);
36618 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36619 +                       loff_t * off, struct dispatch_context * cont);
36620 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36621 +                   unsigned long arg);
36622 +int mmap_unix_file(struct file *, struct vm_area_struct *);
36623 +int open_unix_file(struct inode *, struct file *);
36624 +int release_unix_file(struct inode *, struct file *);
36625 +
36626 +/* private address space operations */
36627 +int readpage_unix_file(struct file *, struct page *);
36628 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*,
36629 +                       unsigned);
36630 +int writepages_unix_file(struct address_space *, struct writeback_control *);
36631 +int write_begin_unix_file(struct file *file, struct page *page,
36632 +                         unsigned from, unsigned to);
36633 +int write_end_unix_file(struct file *file, struct page *page,
36634 +                       unsigned from, unsigned to);
36635 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36636 +
36637 +/* other private methods */
36638 +int delete_object_unix_file(struct inode *);
36639 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36640 +                           int user, loff_t, loff_t, rw_op, flow_t *);
36641 +int owns_item_unix_file(const struct inode *, const coord_t *);
36642 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36643 +                              int create);
36644 +
36645 +/*
36646 + * Private methods of cryptcompress file plugin
36647 + * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36648 + */
36649 +
36650 +/* private inode operations */
36651 +int setattr_cryptcompress(struct dentry *, struct iattr *);
36652 +
36653 +/* private file operations */
36654 +ssize_t read_cryptcompress(struct file *, char __user *buf,
36655 +                          size_t count, loff_t *off);
36656 +ssize_t write_cryptcompress(struct file *, const char __user *buf,
36657 +                           size_t count, loff_t * off,
36658 +                           struct dispatch_context *cont);
36659 +int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36660 +                       unsigned long arg);
36661 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36662 +int open_cryptcompress(struct inode *, struct file *);
36663 +int release_cryptcompress(struct inode *, struct file *);
36664 +
36665 +/* private address space operations */
36666 +int readpage_cryptcompress(struct file *, struct page *);
36667 +int readpages_cryptcompress(struct file*, struct address_space*,
36668 +                           struct list_head*, unsigned);
36669 +int writepages_cryptcompress(struct address_space *,
36670 +                            struct writeback_control *);
36671 +int write_begin_cryptcompress(struct file *file, struct page *page,
36672 +                             unsigned from, unsigned to);
36673 +int write_end_cryptcompress(struct file *file, struct page *page,
36674 +                           unsigned from, unsigned to);
36675 +sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36676 +
36677 +/* other private methods */
36678 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36679 +                               int user, loff_t, loff_t, rw_op, flow_t *);
36680 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36681 +int create_object_cryptcompress(struct inode *, struct inode *,
36682 +                               reiser4_object_create_data *);
36683 +int delete_object_cryptcompress(struct inode *);
36684 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36685 +                                  int create);
36686 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36687 +                                 const reiser4_key * to_key,
36688 +                                 reiser4_key * smallest_removed,
36689 +                                 struct inode *object, int truncate,
36690 +                                 int *progress);
36691 +void destroy_inode_cryptcompress(struct inode *);
36692 +
36693 +/*
36694 + * Private methods of symlink file plugin
36695 + * (SYMLINK_FILE_PLUGIN_ID)
36696 + */
36697 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36698 +                          reiser4_object_create_data *);
36699 +void destroy_inode_symlink(struct inode *);
36700 +
36701 +/*
36702 + * all the write into unix file is performed by item write method. Write method
36703 + * of unix file plugin only decides which item plugin (extent or tail) and in
36704 + * which mode (one from the enum below) to call
36705 + */
36706 +typedef enum {
36707 +       FIRST_ITEM = 1,
36708 +       APPEND_ITEM = 2,
36709 +       OVERWRITE_ITEM = 3
36710 +} write_mode_t;
36711 +
36712 +/* unix file may be in one of the following states */
36713 +typedef enum {
36714 +       UF_CONTAINER_UNKNOWN = 0,
36715 +       UF_CONTAINER_TAILS = 1,
36716 +       UF_CONTAINER_EXTENTS = 2,
36717 +       UF_CONTAINER_EMPTY = 3
36718 +} file_container_t;
36719 +
36720 +struct formatting_plugin;
36721 +struct inode;
36722 +
36723 +/* unix file plugin specific part of reiser4 inode */
36724 +struct unix_file_info {
36725 +       /*
36726 +        * this read-write lock protects file containerization change. Accesses
36727 +        * which do not change file containerization (see file_container_t)
36728 +        * (read, readpage, writepage, write (until tail conversion is
36729 +        * involved)) take read-lock. Accesses which modify file
36730 +        * containerization (truncate, conversion from tail to extent and back)
36731 +        * take write-lock.
36732 +        */
36733 +       struct rw_semaphore latch;
36734 +       /* this enum specifies which items are used to build the file */
36735 +       file_container_t container;
36736 +       /*
36737 +        * plugin which controls when file is to be converted to extents and
36738 +        * back to tail
36739 +        */
36740 +       struct formatting_plugin *tplug;
36741 +       /* if this is set, file is in exclusive use */
36742 +       int exclusive_use;
36743 +#if REISER4_DEBUG
36744 +       /* pointer to task struct of thread owning exclusive access to file */
36745 +       void *ea_owner;
36746 +       atomic_t nr_neas;
36747 +       void *last_reader;
36748 +#endif
36749 +};
36750 +
36751 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36752 +void get_exclusive_access(struct unix_file_info *);
36753 +void drop_exclusive_access(struct unix_file_info *);
36754 +void get_nonexclusive_access(struct unix_file_info *);
36755 +void drop_nonexclusive_access(struct unix_file_info *);
36756 +int try_to_get_nonexclusive_access(struct unix_file_info *);
36757 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36758 +                  struct inode *);
36759 +int find_file_item_nohint(coord_t *, lock_handle *,
36760 +                         const reiser4_key *, znode_lock_mode,
36761 +                         struct inode *);
36762 +
36763 +int load_file_hint(struct file *, hint_t *);
36764 +void save_file_hint(struct file *, const hint_t *);
36765 +
36766 +#include "../item/extent.h"
36767 +#include "../item/tail.h"
36768 +#include "../item/ctail.h"
36769 +
36770 +struct uf_coord {
36771 +       coord_t coord;
36772 +       lock_handle *lh;
36773 +       int valid;
36774 +       union {
36775 +               struct extent_coord_extension extent;
36776 +               struct tail_coord_extension tail;
36777 +               struct ctail_coord_extension ctail;
36778 +       } extension;
36779 +};
36780 +
36781 +#include "../../forward.h"
36782 +#include "../../seal.h"
36783 +#include "../../lock.h"
36784 +
36785 +/*
36786 + * This structure is used to speed up file operations (reads and writes).  A
36787 + * hint is a suggestion about where a key resolved to last time.  A seal
36788 + * indicates whether a node has been modified since a hint was last recorded.
36789 + * You check the seal, and if the seal is still valid, you can use the hint
36790 + * without traversing the tree again.
36791 + */
36792 +struct hint {
36793 +       seal_t seal; /* a seal over last file item accessed */
36794 +       uf_coord_t ext_coord;
36795 +       loff_t offset;
36796 +       znode_lock_mode mode;
36797 +       lock_handle lh;
36798 +};
36799 +
36800 +static inline int hint_is_valid(hint_t * hint)
36801 +{
36802 +       return hint->ext_coord.valid;
36803 +}
36804 +
36805 +static inline void hint_set_valid(hint_t * hint)
36806 +{
36807 +       hint->ext_coord.valid = 1;
36808 +}
36809 +
36810 +static inline void hint_clr_valid(hint_t * hint)
36811 +{
36812 +       hint->ext_coord.valid = 0;
36813 +}
36814 +
36815 +int load_file_hint(struct file *, hint_t *);
36816 +void save_file_hint(struct file *, const hint_t *);
36817 +void hint_init_zero(hint_t *);
36818 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36819 +int hint_is_set(const hint_t *);
36820 +void reiser4_unset_hint(hint_t *);
36821 +
36822 +int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
36823 +int cut_file_items(struct inode *, loff_t new_size,
36824 +                  int update_sd, loff_t cur_size,
36825 +                  int (*update_actor) (struct inode *, loff_t, int));
36826 +#if REISER4_DEBUG
36827 +
36828 +/* return 1 if exclusive access is obtained, 0 - otherwise */
36829 +static inline int ea_obtained(struct unix_file_info * uf_info)
36830 +{
36831 +       int ret;
36832 +
36833 +       ret = down_read_trylock(&uf_info->latch); /* probe the latch */
36834 +       if (ret)
36835 +               up_read(&uf_info->latch); /* got it for read: undo */
36836 +       return !ret; /* 1 when the read lock could not be taken */
36837 +}
36838 +
36839 +#endif
36840 +
36841 +#define WRITE_GRANULARITY 32
36842 +
36843 +int tail2extent(struct unix_file_info *);
36844 +int extent2tail(struct file *, struct unix_file_info *);
36845 +
36846 +int goto_right_neighbor(coord_t *, lock_handle *);
36847 +int find_or_create_extent(struct page *);
36848 +int equal_to_ldk(znode *, const reiser4_key *);
36849 +
36850 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
36851 +
36852 +static inline int cbk_errored(int cbk_result)
36853 +{
36854 +       return (cbk_result != CBK_COORD_NOTFOUND
36855 +               && cbk_result != CBK_COORD_FOUND);
36856 +}
36857 +
36858 +/* __REISER4_FILE_H__ */
36859 +#endif
36860 +
36861 +/*
36862 + * Local variables:
36863 + * c-indentation-style: "K&R"
36864 + * mode-name: "LC"
36865 + * c-basic-offset: 8
36866 + * tab-width: 8
36867 + * fill-column: 79
36868 + * scroll-step: 1
36869 + * End:
36870 +*/
36871 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/Makefile linux-2.6.35/fs/reiser4/plugin/file/Makefile
36872 --- linux-2.6.35.orig/fs/reiser4/plugin/file/Makefile   1970-01-01 01:00:00.000000000 +0100
36873 +++ linux-2.6.35/fs/reiser4/plugin/file/Makefile        2010-08-04 15:44:57.000000000 +0200
36874 @@ -0,0 +1,7 @@
36875 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
36876 +
36877 +file_plugins-objs :=           \
36878 +       file.o                  \
36879 +       tail_conversion.o       \
36880 +       symlink.o               \
36881 +       cryptcompress.o
36882 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.35/fs/reiser4/plugin/file/symfile.c
36883 --- linux-2.6.35.orig/fs/reiser4/plugin/file/symfile.c  1970-01-01 01:00:00.000000000 +0100
36884 +++ linux-2.6.35/fs/reiser4/plugin/file/symfile.c       2010-08-04 15:44:57.000000000 +0200
36885 @@ -0,0 +1,87 @@
36886 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36887 +
36888 +/* Symfiles are a generalization of Unix symlinks.
36889 +
36890 +   A symfile when read behaves as though you took its contents and
36891 +   substituted them into the reiser4 naming system as the right hand side
36892 +   of an assignment, and then read that which you had assigned to it.
36893 +
36894 +   A key issue for symfiles is how to implement writes through to
36895 +   subfiles.  In general, one must have some method of determining which
36896 +   part of what is written to the symfile goes to which subfile.
36897 +   This can be done by use of custom plugin methods written by users, or
36898 +   by using a few general methods we provide for those willing to endure
36899 +   the insertion of delimiters into what is read.
36900 +
36901 +   Writing to symfiles without delimiters to denote what is written to
36902 +   what subfile is not supported by any plugins we provide in this
36903 +   release.  Our most sophisticated support for writes is that embodied
36904 +   by the invert plugin (see invert.c).
36905 +
36906 +   A read only version of the /etc/passwd file might be
36907 +   constructed as a symfile whose contents are as follows:
36908 +
36909 +   /etc/passwd/userlines/*
36910 +
36911 +   or
36912 +
36913 +   /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
36914 +
36915 +   or
36916 +
36917 +   /etc/passwd/userlines/(demidov+edward+reiser+root)
36918 +
36919 +   A symfile with contents
36920 +
36921 +   /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
36922 +
36923 +   will return when read
36924 +
36925 +   The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
36926 +
36927 +   and write of what has been read will not be possible to implement as
36928 +   an identity operation because there are no delimiters denoting the
36929 +   boundaries of what is to be written to what subfile.
36930 +
36931 +   Note that one could make this a read/write symfile if one specified
36932 +   delimiters, and the write method understood those delimiters delimited
36933 +   what was written to subfiles.
36934 +
36935 +   So, specifying the symfile in a manner that allows writes:
36936 +
36937 +   /etc/passwd/userlines/demidov+"(
36938 +   )+/etc/passwd/userlines/edward+"(
36939 +   )+/etc/passwd/userlines/reiser+"(
36940 +   )+/etc/passwd/userlines/root+"(
36941 +   )
36942 +
36943 +   or
36944 +
36945 +   /etc/passwd/userlines/(demidov+"(
36946 +   )+edward+"(
36947 +   )+reiser+"(
36948 +   )+root+"(
36949 +   ))
36950 +
36951 +   and the file demidov might be specified as:
36952 +
36953 +   /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
36954 +
36955 +   or
36956 +
36957 +   /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
36958 +
36959 +   Notice that if the file demidov has a carriage return in it, the
36960 +   parsing fails, but then if you put carriage returns in the wrong place
36961 +   in a normal /etc/passwd file it breaks things also.
36962 +
36963 +   Note that it is forbidden to have no text between two interpolations
36964 +   if one wants to be able to define what parts of a write go to what
36965 +   subfiles referenced in an interpolation.
36966 +
36967 +   If one wants to be able to add new lines by writing to the file, one
36968 +   must either write a custom plugin for /etc/passwd that knows how to
36969 +   name an added line, or one must use an invert, or one must use a more
36970 +   sophisticated symfile syntax that we are not planning to write for
36971 +   version 4.0.
36972 +*/
36973 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.35/fs/reiser4/plugin/file/symlink.c
36974 --- linux-2.6.35.orig/fs/reiser4/plugin/file/symlink.c  1970-01-01 01:00:00.000000000 +0100
36975 +++ linux-2.6.35/fs/reiser4/plugin/file/symlink.c       2010-08-04 15:44:57.000000000 +0200
36976 @@ -0,0 +1,95 @@
36977 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
36978 +
36979 +#include "../../inode.h"
36980 +
36981 +#include <linux/types.h>
36982 +#include <linux/fs.h>
36983 +
36984 +/* file plugin methods specific for symlink files
36985 +   (SYMLINK_FILE_PLUGIN_ID) */
36986 +
36987 +/* this is implementation of create_object method of file plugin for
36988 +   SYMLINK_FILE_PLUGIN_ID
36989 + */
36990 +
36991 +/**
36992 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
36993 + * @symlink: inode of symlink object
36994 + * @dir: inode of parent directory
36995 + * @data: parameters of new object; @data->name is the symlink target
36996 + *
36997 + * Inserts stat data with symlink extension into the tree (0 on success).
36998 + */
36999 +int reiser4_create_symlink(struct inode *symlink,
37000 +                          struct inode *dir UNUSED_ARG,
37001 +                          reiser4_object_create_data *data /* info passed to us
37002 +                                                            * this is filled by
37003 +                                                            * reiser4() syscall
37004 +                                                            * in particular */)
37005 +{
37006 +       int result;
37007 +
37008 +       assert("nikita-680", symlink != NULL);
37009 +       assert("nikita-681", S_ISLNK(symlink->i_mode));
37010 +       assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
37011 +       assert("nikita-682", dir != NULL);
37012 +       assert("nikita-684", data != NULL);
37013 +       assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
37014 +
37015 +       /*
37016 +        * stat data of symlink has symlink extension in which we store
37017 +        * symlink content, that is, path symlink is pointing to.
37018 +        */
37019 +       reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
37020 +
37021 +       assert("vs-838", symlink->i_private == NULL);
37022 +       symlink->i_private = (void *)data->name;
37023 +
37024 +       assert("vs-843", symlink->i_size == 0);
37025 +       INODE_SET_FIELD(symlink, i_size, strlen(data->name));
37026 +
37027 +       /* insert stat data appended with data->name */
37028 +       result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
37029 +       if (result) {
37030 +               /* FIXME-VS: Make sure that symlink->i_private is not attached
37031 +                  to kmalloced data */
37032 +               INODE_SET_FIELD(symlink, i_size, 0);
37033 +       } else {
37034 +               assert("vs-849", symlink->i_private
37035 +                      && reiser4_inode_get_flag(symlink,
37036 +                                                REISER4_GENERIC_PTR_USED));
37037 +               assert("vs-850",
37038 +                      !memcmp((char *)symlink->i_private, data->name,
37039 +                              (size_t) symlink->i_size + 1));
37040 +       }
37041 +       return result;
37042 +}
37043 +
37044 +/* this is implementation of destroy_inode method of file plugin for
37045 +   SYMLINK_FILE_PLUGIN_ID: frees ->i_private, clears REISER4_GENERIC_PTR_USED
37046 + */
37047 +void destroy_inode_symlink(struct inode *inode)
37048 +{
37049 +       assert("edward-799",
37050 +              inode_file_plugin(inode) ==
37051 +              file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
37052 +       assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
37053 +       assert("edward-801", reiser4_inode_get_flag(inode,
37054 +                                                   REISER4_GENERIC_PTR_USED));
37055 +       assert("vs-839", S_ISLNK(inode->i_mode));
37056 +
37057 +       kfree(inode->i_private);
37058 +       inode->i_private = NULL; /* do not leave a dangling pointer */
37059 +       reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
37060 +}
37061 +
37062 +/*
37063 +  Local variables:
37064 +  c-indentation-style: "K&R"
37065 +  mode-name: "LC"
37066 +  c-basic-offset: 8
37067 +  tab-width: 8
37068 +  fill-column: 80
37069 +  scroll-step: 1
37070 +  End:
37071 +*/
37072 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.35/fs/reiser4/plugin/file/tail_conversion.c
37073 --- linux-2.6.35.orig/fs/reiser4/plugin/file/tail_conversion.c  1970-01-01 01:00:00.000000000 +0100
37074 +++ linux-2.6.35/fs/reiser4/plugin/file/tail_conversion.c       2010-08-04 15:44:57.000000000 +0200
37075 @@ -0,0 +1,743 @@
37076 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
37077 +
37078 +#include "../../inode.h"
37079 +#include "../../super.h"
37080 +#include "../../page_cache.h"
37081 +#include "../../carry.h"
37082 +#include "../../safe_link.h"
37083 +#include "../../vfs_ops.h"
37084 +
37085 +#include <linux/writeback.h>
37086 +
37087 +/* this file contains:
37088 +   tail2extent and extent2tail */
37089 +
37090 +/* take exclusive access to a file (uf_info->latch is taken for write). It is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
37091 +void get_exclusive_access(struct unix_file_info * uf_info)
37092 +{
37093 +       assert("nikita-3028", reiser4_schedulable());
37094 +       assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
37095 +       assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
37096 +       /*
37097 +        * "deadlock avoidance": sometimes we commit a transaction under
37098 +        * rw-semaphore on a file. Such commit can deadlock with another
37099 +        * thread that captured some block (hence preventing atom from being
37100 +        * committed) and waits on rw-semaphore.
37101 +        */
37102 +       reiser4_txn_restart_current();
37103 +       LOCK_CNT_INC(inode_sem_w);
37104 +       down_write(&uf_info->latch);
37105 +       uf_info->exclusive_use = 1;
37106 +       assert("vs-1713", uf_info->ea_owner == NULL);
37107 +       assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
37108 +       ON_DEBUG(uf_info->ea_owner = current);
37109 +}
37110 +
37111 +void drop_exclusive_access(struct unix_file_info * uf_info)
37112 +{
37113 +       assert("vs-1714", uf_info->ea_owner == current);
37114 +       assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
37115 +       ON_DEBUG(uf_info->ea_owner = NULL);
37116 +       uf_info->exclusive_use = 0;
37117 +       up_write(&uf_info->latch);      /* write-locked in get_exclusive_access() */
37118 +       assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
37119 +       assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
37120 +       LOCK_CNT_DEC(inode_sem_w);
37121 +       reiser4_txn_restart_current();
37122 +}
37123 +
37124 +/**
37125 + * nea_grabbed - do something when file semaphore is down_read-ed
37126 + * @uf_info: unix file specific part of inode
37127 + *
37128 + * This is called when nonexclusive access is obtained on file. All it does is
37129 + * debugging bookkeeping, compiled in only when REISER4_DEBUG is set.
37130 + */
37131 +static void nea_grabbed(struct unix_file_info *uf_info)
37132 +{
37133 +#if REISER4_DEBUG
37134 +       LOCK_CNT_INC(inode_sem_r);
37135 +       assert("vs-1716", uf_info->ea_owner == NULL);
37136 +       atomic_inc(&uf_info->nr_neas);
37137 +       uf_info->last_reader = current;
37138 +#endif
37139 +}
37140 +
37141 +/**
37142 + * get_nonexclusive_access - get nonexclusive access to a file
37143 + * @uf_info: unix file specific part of inode to obtain access to
37144 + *
37145 + * Takes uf_info->latch for read. Obtained before read, write, readpage.
37146 + */
37147 +void get_nonexclusive_access(struct unix_file_info *uf_info)
37148 +{
37149 +       assert("nikita-3029", reiser4_schedulable());
37150 +       assert("nikita-3361", get_current_context()->trans->atom == NULL);
37151 +
37152 +       down_read(&uf_info->latch);
37153 +       nea_grabbed(uf_info);
37154 +}
37155 +
37156 +/**
37157 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
37158 + * @uf_info: unix file specific part of inode to obtain access to
37159 + *
37160 + * Non-blocking version. Returns result of down_read_trylock(): nonzero if got.
37161 + */
37162 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
37163 +{
37164 +       int result;
37165 +
37166 +       result = down_read_trylock(&uf_info->latch);
37167 +       if (result)
37168 +               nea_grabbed(uf_info);
37169 +       return result;
37170 +}
37171 +
37172 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
37173 +{
37174 +       assert("vs-1718", uf_info->ea_owner == NULL);
37175 +       assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
37176 +       ON_DEBUG(atomic_dec(&uf_info->nr_neas));
37177 +
37178 +       up_read(&uf_info->latch);       /* release latch taken for read */
37179 +
37180 +       LOCK_CNT_DEC(inode_sem_r);
37181 +       reiser4_txn_restart_current();
37182 +}
37183 +
37184 +/* part of tail2extent. Cut all items of @inode covering @count bytes starting
37185 +   from @offset. Returns the result of reiser4_cut_tree() */
37186 +/* Audited by: green(2002.06.15) */
37187 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
37188 +{
37189 +       reiser4_key from, to;
37190 +
37191 +       /* AUDIT: How about putting an assertion here, what would check
37192 +          all provided range is covered by tail items only? */
37193 +       /* key of first byte in the range to be cut  */
37194 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37195 +
37196 +       /* key of last byte in that range */
37197 +       to = from;
37198 +       set_key_offset(&to, (__u64) (offset + count - 1));
37199 +
37200 +       /* cut everything between those keys */
37201 +       return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
37202 +                               inode, 0);
37203 +}
37204 +
37205 +static void release_all_pages(struct page **pages, unsigned nr_pages)
37206 +{
37207 +       unsigned i;
37208 +
37209 +       for (i = 0; i < nr_pages; i++) {
37210 +               if (pages[i] == NULL) {
37211 +#if REISER4_DEBUG
37212 +                       unsigned j;
37213 +                       for (j = i + 1; j < nr_pages; j++)
37214 +                               assert("vs-1620", pages[j] == NULL);
37215 +#endif
37216 +                       break;  /* no pages are expected past the first NULL slot */
37217 +               }
37218 +               page_cache_release(pages[i]);   /* drop one page reference */
37219 +               pages[i] = NULL;
37220 +       }
37221 +}
37222 +
37223 +/* part of tail2extent. replace tail items with extent one. Content of tail
37224 +   items (@count bytes) being cut are copied already into
37225 +   pages. find_or_create_extent is called to create extents corresponding to
37226 +   those pages */
37227 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
37228 +{
37229 +       int result;
37230 +       unsigned i;
37231 +       STORE_COUNTERS;
37232 +
37233 +       if (nr_pages == 0)
37234 +               return 0;
37235 +
37236 +       assert("vs-596", pages[0]);
37237 +
37238 +       /* cut copied items */
37239 +       result = cut_formatting_items(inode, page_offset(pages[0]), count);
37240 +       if (result)
37241 +               return result;
37242 +
37243 +       CHECK_COUNTERS;
37244 +
37245 +       /* put into tree replacement for just removed items: extent item, namely */
37246 +       for (i = 0; i < nr_pages; i++) {
37247 +               result = add_to_page_cache_lru(pages[i], inode->i_mapping,
37248 +                                              pages[i]->index,
37249 +                                              mapping_gfp_mask(inode->
37250 +                                                               i_mapping));
37251 +               if (result)
37252 +                       break;
37253 +               unlock_page(pages[i]);
37254 +               result = find_or_create_extent(pages[i]);
37255 +               if (result)
37256 +                       break;
37257 +               SetPageUptodate(pages[i]);
37258 +       }
37259 +       return result;
37260 +}
37261 +
37262 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37263 +                                * items */
37264 +
37265 +static int reserve_tail2extent_iteration(struct inode *inode)
37266 +{
37267 +       reiser4_block_nr unformatted_nodes;
37268 +       reiser4_tree *tree;
37269 +
37270 +       tree = reiser4_tree_by_inode(inode);
37271 +
37272 +       /* number of unformatted nodes which will be created */
37273 +       unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
37274 +
37275 +       /*
37276 +        * space required for one iteration of tail->extent conversion:
37277 +        *
37278 +        *     1. kill N tail items
37279 +        *
37280 +        *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37281 +        *
37282 +        *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37283 +        *     extents) extent units.
37284 +        *
37285 +        *     4. drilling to the leaf level by coord_by_key()
37286 +        *
37287 +        *     5. possible update of stat-data
37288 +        *
37289 +        */
37290 +       grab_space_enable();
37291 +       return reiser4_grab_space
37292 +           (2 * tree->height +
37293 +            TAIL2EXTENT_PAGE_NUM +
37294 +            TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37295 +            1 + estimate_one_insert_item(tree) +
37296 +            inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37297 +}
37298 +
37299 +/* clear the REISER4_PART_MIXED flag of inode and update stat data; a failure is only warned about, 0 is always returned */
37300 +static int complete_conversion(struct inode *inode)
37301 +{
37302 +       int result;
37303 +
37304 +       grab_space_enable();
37305 +       result =
37306 +           reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37307 +                              BA_CAN_COMMIT);
37308 +       if (result == 0) {
37309 +               reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37310 +               result = reiser4_update_sd(inode);
37311 +       }
37312 +       if (result)
37313 +               warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37314 +                       (unsigned long long)get_inode_oid(inode), result);
37315 +       return 0;
37316 +}
37317 +
37318 +/**
37319 + * find_start - find the first item of the given type at or after an offset
37320 + * @inode: file to search items of
37321 + * @id: id of item plugin to look for
37322 + * @offset: in: offset to start from; out: offset of key the search stopped at
37323 + *
37324 + * this is used by tail2extent and extent2tail to detect where previous
37325 + * uncompleted conversion stopped
37326 + */
37327 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37328 +{
37329 +       int result;
37330 +       lock_handle lh;
37331 +       coord_t coord;
37332 +       struct unix_file_info *ufo;
37333 +       int found;
37334 +       reiser4_key key;
37335 +
37336 +       ufo = unix_file_inode_data(inode);
37337 +       init_lh(&lh);
37338 +       result = 0;
37339 +       found = 0;
37340 +       inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37341 +       do {
37342 +               init_lh(&lh);
37343 +               result = find_file_item_nohint(&coord, &lh, &key,
37344 +                                              ZNODE_READ_LOCK, inode);
37345 +
37346 +               if (result == CBK_COORD_FOUND) {
37347 +                       if (coord.between == AT_UNIT) {
37348 +                               /*coord_clear_iplug(&coord); */
37349 +                               result = zload(coord.node);
37350 +                               if (result == 0) {
37351 +                                       if (item_id_by_coord(&coord) == id)
37352 +                                               found = 1;
37353 +                                       else
37354 +                                               item_plugin_by_coord(&coord)->s.
37355 +                                                   file.append_key(&coord,
37356 +                                                                   &key);
37357 +                                       zrelse(coord.node);
37358 +                               }
37359 +                       } else
37360 +                               result = RETERR(-ENOENT);
37361 +               }
37362 +               done_lh(&lh);
37363 +       } while (result == 0 && !found);
37364 +       *offset = get_key_offset(&key);
37365 +       return result;
37366 +}
37367 +
37368 +/**
37369 + * tail2extent - convert file body from tail (formatting) items to extents
37370 + * @uf_info: unix file specific part of inode; exclusive access must be held
37371 + *
37372 + * Returns 0 on success, error code if conversion was not completed.
37373 + */
37374 +int tail2extent(struct unix_file_info *uf_info)
37375 +{
37376 +       int result;
37377 +       reiser4_key key;        /* key of next byte to be moved to page */
37378 +       char *p_data;           /* data of page */
37379 +       unsigned page_off = 0,  /* offset within the page where to copy data */
37380 +           count;              /* number of bytes of item which can be
37381 +                                * copied to page */
37382 +       struct page *pages[TAIL2EXTENT_PAGE_NUM];
37383 +       struct page *page;
37384 +       int done;               /* set to 1 when all file is read */
37385 +       char *item;
37386 +       int i;
37387 +       struct inode *inode;
37388 +       int first_iteration;
37389 +       int bytes;
37390 +       __u64 offset;
37391 +
37392 +       assert("nikita-3362", ea_obtained(uf_info));
37393 +       inode = unix_file_info_to_inode(uf_info);
37394 +       assert("nikita-3412", !IS_RDONLY(inode));
37395 +       assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37396 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37397 +
37398 +       offset = 0;
37399 +       first_iteration = 1;
37400 +       result = 0;
37401 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37402 +               /*
37403 +                * file is marked on disk as there was a conversion which did
37404 +                * not complete due to either crash or some error. Find which
37405 +                * offset tail conversion stopped at
37406 +                */
37407 +               result = find_start(inode, FORMATTING_ID, &offset);
37408 +               if (result == -ENOENT) {
37409 +                       /* no tail items found, everything is converted */
37410 +                       uf_info->container = UF_CONTAINER_EXTENTS;
37411 +                       complete_conversion(inode);
37412 +                       return 0;
37413 +               } else if (result != 0)
37414 +                       /* some other error */
37415 +                       return result;
37416 +               first_iteration = 0;
37417 +       }
37418 +
37419 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37420 +
37421 +       /* get key of the byte conversion starts from (offset) */
37422 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37423 +
37424 +       done = 0;
37425 +       while (done == 0) {
37426 +               memset(pages, 0, sizeof(pages));
37427 +               result = reserve_tail2extent_iteration(inode);
37428 +               if (result != 0) {
37429 +                       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37430 +                       goto out;
37431 +               }
37432 +               if (first_iteration) {
37433 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37434 +                       reiser4_update_sd(inode);
37435 +                       first_iteration = 0;
37436 +               }
37437 +               bytes = 0;
37438 +               for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37439 +                       assert("vs-598",
37440 +                              (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37441 +                       page = alloc_page(reiser4_ctx_gfp_mask_get());
37442 +                       if (!page) {
37443 +                               result = RETERR(-ENOMEM);
37444 +                               goto error;
37445 +                       }
37446 +
37447 +                       page->index =
37448 +                           (unsigned long)(get_key_offset(&key) >>
37449 +                                           PAGE_CACHE_SHIFT);
37450 +                       /*
37451 +                        * usually when one is going to longterm lock znode (as
37452 +                        * find_file_item does, for instance) he must not hold
37453 +                        * locked pages. However, there is an exception for
37454 +                        * case tail2extent. Pages appearing here are not
37455 +                        * reachable to everyone else, they are clean, they do
37456 +                        * not have jnodes attached so keeping them locked does
37457 +                        * not risk deadlock appearance
37458 +                        */
37459 +                       assert("vs-983", !PagePrivate(page));
37460 +                       reiser4_invalidate_pages(inode->i_mapping, page->index,
37461 +                                                1, 0);
37462 +
37463 +                       for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37464 +                               coord_t coord;
37465 +                               lock_handle lh;
37466 +
37467 +                               /* get next item */
37468 +                               /* FIXME: we might want to readahead here */
37469 +                               init_lh(&lh);
37470 +                               result =
37471 +                                   find_file_item_nohint(&coord, &lh, &key,
37472 +                                                         ZNODE_READ_LOCK,
37473 +                                                         inode);
37474 +                               if (result != CBK_COORD_FOUND) {
37475 +                                       /*
37476 +                                        * error happened or no items of file
37477 +                                        * were found
37478 +                                        */
37479 +                                       done_lh(&lh);
37480 +                                       page_cache_release(page);
37481 +                                       goto error;
37482 +                               }
37483 +
37484 +                               if (coord.between == AFTER_UNIT) {
37485 +                                       /*
37486 +                                        * end of file is reached. Pad page
37487 +                                        * with zeros
37488 +                                        */
37489 +                                       done_lh(&lh);
37490 +                                       done = 1;
37491 +                                       p_data = kmap_atomic(page, KM_USER0);
37492 +                                       memset(p_data + page_off, 0,
37493 +                                              PAGE_CACHE_SIZE - page_off);
37494 +                                       kunmap_atomic(p_data, KM_USER0);
37495 +                                       break;
37496 +                               }
37497 +
37498 +                               result = zload(coord.node);
37499 +                               if (result) {
37500 +                                       page_cache_release(page);
37501 +                                       done_lh(&lh);
37502 +                                       goto error;
37503 +                               }
37504 +                               assert("vs-856", coord.between == AT_UNIT);
37505 +                               item = ((char *)item_body_by_coord(&coord)) +
37506 +                                       coord.unit_pos;
37507 +
37508 +                               /* how many bytes to copy */
37509 +                               count =
37510 +                                   item_length_by_coord(&coord) -
37511 +                                   coord.unit_pos;
37512 +                               /* limit length of copy to end of page */
37513 +                               if (count > PAGE_CACHE_SIZE - page_off)
37514 +                                       count = PAGE_CACHE_SIZE - page_off;
37515 +
37516 +                               /*
37517 +                                * copy item (as much as will fit starting from
37518 +                                * the beginning of the item) into the page
37519 +                                */
37520 +                               p_data = kmap_atomic(page, KM_USER0);
37521 +                               memcpy(p_data + page_off, item, count);
37522 +                               kunmap_atomic(p_data, KM_USER0);
37523 +
37524 +                               page_off += count;
37525 +                               bytes += count;
37526 +                               set_key_offset(&key,
37527 +                                              get_key_offset(&key) + count);
37528 +
37529 +                               zrelse(coord.node);
37530 +                               done_lh(&lh);
37531 +                       } /* end of loop which fills one page by content of
37532 +                          * formatting items */
37533 +
37534 +                       if (page_off) {
37535 +                               /* something was copied into page */
37536 +                               pages[i] = page;
37537 +                       } else {
37538 +                               page_cache_release(page);
37539 +                               assert("vs-1648", done == 1);
37540 +                               break;
37541 +                       }
37542 +               } /* end of loop through pages of one conversion iteration */
37543 +
37544 +               if (i > 0) {
37545 +                       result = replace(inode, pages, i, bytes);
37546 +                       release_all_pages(pages, sizeof_array(pages));
37547 +                       if (result)
37548 +                               goto error;
37549 +                       /*
37550 +                        * We have to drop exclusive access to avoid deadlock
37551 +                        * which may happen because called by reiser4_writepages
37552 +                        * capture_unix_file requires to get non-exclusive
37553 +                        * access to a file. It is safe to drop EA in the middle
37554 +                        * of tail2extent conversion because write_unix_file,
37555 +                        * setattr_unix_file(truncate), mmap_unix_file,
37556 +                        * release_unix_file(extent2tail) checks if conversion
37557 +                        * is not in progress (see comments before
37558 +                        * get_exclusive_access_careful()).
37559 +                        * Other processes that acquire non-exclusive access
37560 +                        * (read_unix_file, reiser4_writepages, etc) should work
37561 +                        * on partially converted files.
37562 +                        */
37563 +                       drop_exclusive_access(uf_info);
37564 +                       /* throttle the conversion
37565 +                          FIXME-EDWARD: Pass the precise number of pages
37566 +                          that was dirtied */
37567 +                       reiser4_throttle_write(inode, 1);
37568 +                       get_exclusive_access(uf_info);
37569 +
37570 +                       /*
37571 +                        * nobody is allowed to complete conversion but a
37572 +                        * process which started it
37573 +                        */
37574 +                       assert("", reiser4_inode_get_flag(inode,
37575 +                                                         REISER4_PART_MIXED));
37576 +               }
37577 +       }
37578 +       if (result == 0) {
37579 +               /* file is converted to extent items */
37580 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37581 +               assert("vs-1697", reiser4_inode_get_flag(inode,
37582 +                                                        REISER4_PART_MIXED));
37583 +
37584 +               uf_info->container = UF_CONTAINER_EXTENTS;
37585 +               complete_conversion(inode);
37586 +       } else {
37587 +               /*
37588 +                * conversion is not complete. Inode was already marked as
37589 +                * REISER4_PART_MIXED and stat-data were updated at the first
37590 +                * iteration of the loop above.
37591 +                */
37592 +       error:
37593 +               release_all_pages(pages, sizeof_array(pages));
37594 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37595 +               warning("edward-1548", "Partial conversion of %llu: %i",
37596 +                       (unsigned long long)get_inode_oid(inode), result);
37597 +       }
37598 +
37599 + out:
37600 +       /* this flag should be cleared, otherwise get_exclusive_access_careful()
37601 +          will fall into infinite loop */
37602 +       assert("edward-1549", !reiser4_inode_get_flag(inode,
37603 +                                                     REISER4_PART_IN_CONV));
37604 +       return result;
37605 +}
37606 +
37607 +static int reserve_extent2tail_iteration(struct inode *inode)
37608 +{
37609 +       reiser4_tree *tree;
37610 +
37611 +       tree = reiser4_tree_by_inode(inode);
37612 +       /*
37613 +        * reserve blocks for one extent->tail iteration (in this order):
37614 +        *
37615 +        *     1. removal of extent item
37616 +        *
37617 +        *     2. insertion of tail by insert_flow()
37618 +        *
37619 +        *     3. drilling to the leaf level by coord_by_key()
37620 +        *
37621 +        *     4. possible update of stat-data
37622 +        */
37623 +       grab_space_enable();
37624 +       return reiser4_grab_space
37625 +           (estimate_one_item_removal(tree) +
37626 +            estimate_insert_flow(tree->height) +
37627 +            1 + estimate_one_insert_item(tree) +
37628 +            inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37629 +}
37630 +
37631 +/* for every page of file: read page, cut part of extent pointing to this page,
37632 +   put data of page tree by tail item */
37633 +int extent2tail(struct file * file, struct unix_file_info *uf_info)
37634 +{
37635 +       int result;
37636 +       struct inode *inode;
37637 +       struct page *page;
37638 +       unsigned long num_pages, i;
37639 +       unsigned long start_page;
37640 +       reiser4_key from;
37641 +       reiser4_key to;
37642 +       unsigned count;
37643 +       __u64 offset;
37644 +
37645 +       assert("nikita-3362", ea_obtained(uf_info));
37646 +       inode = unix_file_info_to_inode(uf_info);
37647 +       assert("nikita-3412", !IS_RDONLY(inode));
37648 +       assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37649 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37650 +
37651 +       offset = 0;
37652 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37653 +               /*
37654 +                * file is marked on disk as there was a conversion which did
37655 +                * not complete due to either crash or some error. Find which
37656 +                * offset tail conversion stopped at
37657 +                */
37658 +               result = find_start(inode, EXTENT_POINTER_ID, &offset);
37659 +               if (result == -ENOENT) {
37660 +                       /* no extent found, everything is converted */
37661 +                       uf_info->container = UF_CONTAINER_TAILS;
37662 +                       complete_conversion(inode);
37663 +                       return 0;
37664 +               } else if (result != 0)
37665 +                       /* some other error */
37666 +                       return result;
37667 +       }
37668 +
37669 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37670 +
37671 +       /* number of pages in the file */
37672 +       num_pages =
37673 +           (inode->i_size + - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37674 +       start_page = offset >> PAGE_CACHE_SHIFT;
37675 +
37676 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37677 +       to = from;
37678 +
37679 +       result = 0;
37680 +       for (i = 0; i < num_pages; i++) {
37681 +               __u64 start_byte;
37682 +
37683 +               result = reserve_extent2tail_iteration(inode);
37684 +               if (result != 0)
37685 +                       break;
37686 +               if (i == 0 && offset == 0) {
37687 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37688 +                       reiser4_update_sd(inode);
37689 +               }
37690 +
37691 +               page = read_mapping_page(inode->i_mapping,
37692 +                                        (unsigned)(i + start_page), NULL);
37693 +               if (IS_ERR(page)) {
37694 +                       result = PTR_ERR(page);
37695 +                       break;
37696 +               }
37697 +
37698 +               wait_on_page_locked(page);
37699 +
37700 +               if (!PageUptodate(page)) {
37701 +                       page_cache_release(page);
37702 +                       result = RETERR(-EIO);
37703 +                       break;
37704 +               }
37705 +
37706 +               /* cut part of file we have read */
37707 +               start_byte = (__u64) ((i + start_page) << PAGE_CACHE_SHIFT);
37708 +               set_key_offset(&from, start_byte);
37709 +               set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37710 +               /*
37711 +                * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37712 +                * commits during over-long truncates. But
37713 +                * extent->tail conversion should be performed in one
37714 +                * transaction.
37715 +                */
37716 +               result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37717 +                                         &to, inode, 0);
37718 +
37719 +               if (result) {
37720 +                       page_cache_release(page);
37721 +                       break;
37722 +               }
37723 +
37724 +               /* put page data into tree via tail_write */
37725 +               count = PAGE_CACHE_SIZE;
37726 +               if ((i == (num_pages - 1)) &&
37727 +                   (inode->i_size & ~PAGE_CACHE_MASK))
37728 +                       /* last page can be incompleted */
37729 +                       count = (inode->i_size & ~PAGE_CACHE_MASK);
37730 +               while (count) {
37731 +                       loff_t pos = start_byte;
37732 +
37733 +                       assert("edward-1537",
37734 +                              file != NULL && file->f_dentry != NULL);
37735 +                       assert("edward-1538",
37736 +                              file->f_dentry->d_inode == inode);
37737 +
37738 +                       result = reiser4_write_tail(file, inode,
37739 +                                                   (char __user *)kmap(page),
37740 +                                                   count, &pos);
37741 +                       reiser4_free_file_fsdata(file);
37742 +                       if (result <= 0) {
37743 +                               warning("", "reiser4_write_tail failed");
37744 +                               page_cache_release(page);
37745 +                               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37746 +                               return result;
37747 +                       }
37748 +                       count -= result;
37749 +               }
37750 +
37751 +               /* release page */
37752 +               lock_page(page);
37753 +               /* page is already detached from jnode and mapping. */
37754 +               assert("vs-1086", page->mapping == NULL);
37755 +               assert("nikita-2690",
37756 +                      (!PagePrivate(page) && jprivate(page) == 0));
37757 +               /* waiting for writeback completion with page lock held is
37758 +                * perfectly valid. */
37759 +               wait_on_page_writeback(page);
37760 +               reiser4_drop_page(page);
37761 +               /* release reference taken by read_cache_page() above */
37762 +               page_cache_release(page);
37763 +
37764 +               drop_exclusive_access(uf_info);
37765 +               /*
37766 +                * throttle the conversion.
37767 +                * FIXME-EDWARD: Calculate and pass the precise number
37768 +                * of pages that was dirtied
37769 +                */
37770 +               reiser4_throttle_write(inode, 1);
37771 +               get_exclusive_access(uf_info);
37772 +               /*
37773 +                * nobody is allowed to complete conversion but a process which
37774 +                * started it
37775 +                */
37776 +               assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37777 +       }
37778 +
37779 +       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37780 +
37781 +       if (i == num_pages) {
37782 +               /* file is converted to formatted items */
37783 +               assert("vs-1698", reiser4_inode_get_flag(inode,
37784 +                                                        REISER4_PART_MIXED));
37785 +               assert("vs-1260",
37786 +                      inode_has_no_jnodes(reiser4_inode_data(inode)));
37787 +
37788 +               uf_info->container = UF_CONTAINER_TAILS;
37789 +               complete_conversion(inode);
37790 +               return 0;
37791 +       }
37792 +       /*
37793 +        * conversion is not complete. Inode was already marked as
37794 +        * REISER4_PART_MIXED and stat-data were updated at the first
37795 +        * iteration of the loop above.
37796 +        */
37797 +       warning("nikita-2282",
37798 +               "Partial conversion of %llu: %lu of %lu: %i",
37799 +               (unsigned long long)get_inode_oid(inode), i,
37800 +               num_pages, result);
37801 +
37802 +       /* this flag should be cleared, otherwise get_exclusive_access_careful()
37803 +          will fall into infinite loop */
37804 +       assert("edward-1550", !reiser4_inode_get_flag(inode,
37805 +                                                     REISER4_PART_IN_CONV));
37806 +       return result;
37807 +}
37808 +
37809 +/*
37810 + * Local variables:
37811 + * c-indentation-style: "K&R"
37812 + * mode-name: "LC"
37813 + * c-basic-offset: 8
37814 + * tab-width: 8
37815 + * fill-column: 79
37816 + * scroll-step: 1
37817 + * End:
37818 + */
37819 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file_ops.c linux-2.6.35/fs/reiser4/plugin/file_ops.c
37820 --- linux-2.6.35.orig/fs/reiser4/plugin/file_ops.c      1970-01-01 01:00:00.000000000 +0100
37821 +++ linux-2.6.35/fs/reiser4/plugin/file_ops.c   2010-08-04 18:11:07.000000000 +0200
37822 @@ -0,0 +1,163 @@
37823 +/* Copyright 2005 by Hans Reiser, licensing governed by
37824 +   reiser4/README */
37825 +
37826 +/* this file contains typical implementations for some of methods of
37827 +   struct file_operations and of struct address_space_operations
37828 +*/
37829 +
37830 +#include "../inode.h"
37831 +#include "object.h"
37832 +
37833 +/* file operations */
37834 +
37835 +/* implementation of vfs's llseek method of struct file_operations for
37836 +   typical directory can be found in readdir_common.c
37837 +*/
37838 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
37839 +
37840 +/* implementation of vfs's readdir method of struct file_operations for
37841 +   typical directory can be found in readdir_common.c
37842 +*/
37843 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
37844 +
37845 +/**
37846 + * reiser4_release_dir_common - release of struct file_operations
37847 + * @inode: inode of released file
37848 + * @file: file to release
37849 + *
37850 + * Implementation of release method of struct file_operations for typical
37851 + * directory. All it does is free the reiser4-specific file data.
37852 +*/
37853 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
37854 +{
37855 +       reiser4_context *ctx;
37856 +
37857 +       ctx = reiser4_init_context(inode->i_sb);
37858 +       if (IS_ERR(ctx))
37859 +               return PTR_ERR(ctx);
37860 +       reiser4_free_file_fsdata(file);
37861 +       reiser4_exit_context(ctx);
37862 +       return 0;
37863 +}
37864 +
37865 +/* this is common implementation of vfs's fsync method of struct
37866 +   file_operations
37867 +*/
37868 +int reiser4_sync_common(struct file *file, int datasync)
37869 +{
37870 +       reiser4_context *ctx;
37871 +       int result;
37872 +       struct dentry *dentry = file->f_path.dentry;
37873 +
37874 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
37875 +       if (IS_ERR(ctx))
37876 +               return PTR_ERR(ctx);
37877 +       result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
37878 +
37879 +       context_set_commit_async(ctx);
37880 +       reiser4_exit_context(ctx);
37881 +       return result;
37882 +}
37883 +
37884 +/*
37885 + * common sync method for regular files.
37886 + *
37887 + * We are trying to be smart here. Instead of committing all atoms (original
37888 + * solution), we scan dirty pages of this file and commit all atoms they are
37889 + * part of.
37890 + *
37891 + * Situation is complicated by anonymous pages: i.e., extent-less pages
37892 + * dirtied through mmap. Fortunately sys_fsync() first calls
37893 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
37894 + * all missing extents and capture anonymous pages.
37895 + */
37896 +int reiser4_sync_file_common(struct file *file, int datasync)
37897 +{
37898 +       reiser4_context *ctx;
37899 +       txn_atom *atom;
37900 +       reiser4_block_nr reserve;
37901 +       struct dentry *dentry = file->f_path.dentry;
37902 +
37903 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
37904 +       if (IS_ERR(ctx))
37905 +               return PTR_ERR(ctx);
37906 +
37907 +       reserve = estimate_update_common(dentry->d_inode);
37908 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37909 +               reiser4_exit_context(ctx);
37910 +               return RETERR(-ENOSPC);
37911 +       }
37912 +       write_sd_by_inode_common(dentry->d_inode);
37913 +
37914 +       atom = get_current_atom_locked();
37915 +       spin_lock_txnh(ctx->trans);
37916 +       force_commit_atom(ctx->trans);
37917 +       reiser4_exit_context(ctx);
37918 +       return 0;
37919 +}
37920 +
37921 +
37922 +/* address space operations */
37923 +
37924 +
37925 +/* this is helper for plugin->write_begin() */
37926 +int do_prepare_write(struct file *file, struct page *page, unsigned from,
37927 +                unsigned to)
37928 +{
37929 +       int result;
37930 +       file_plugin *fplug;
37931 +       struct inode *inode;
37932 +
37933 +       assert("umka-3099", file != NULL);
37934 +       assert("umka-3100", page != NULL);
37935 +       assert("umka-3095", PageLocked(page));
37936 +
37937 +       if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
37938 +               return 0;
37939 +
37940 +       inode = page->mapping->host;
37941 +       fplug = inode_file_plugin(inode);
37942 +
37943 +       if (page->mapping->a_ops->readpage == NULL)
37944 +               return RETERR(-EINVAL);
37945 +
37946 +       result = page->mapping->a_ops->readpage(file, page);
37947 +       if (result != 0) {
37948 +               SetPageError(page);
37949 +               ClearPageUptodate(page);
37950 +               /* All reiser4 readpage() implementations should return the
37951 +                * page locked in case of error. */
37952 +               assert("nikita-3472", PageLocked(page));
37953 +       } else {
37954 +               /*
37955 +                * ->readpage() either:
37956 +                *
37957 +                *     1. starts IO against @page. @page is locked for IO in
37958 +                *     this case.
37959 +                *
37960 +                *     2. doesn't start IO. @page is unlocked.
37961 +                *
37962 +                * In either case, page should be locked.
37963 +                */
37964 +               lock_page(page);
37965 +               /*
37966 +                * IO (if any) is completed at this point. Check for IO
37967 +                * errors.
37968 +                */
37969 +               if (!PageUptodate(page))
37970 +                       result = RETERR(-EIO);
37971 +       }
37972 +       assert("umka-3098", PageLocked(page));
37973 +       return result;
37974 +}
37975 +
37976 +/*
37977 + * Local variables:
37978 + * c-indentation-style: "K&R"
37979 + * mode-name: "LC"
37980 + * c-basic-offset: 8
37981 + * tab-width: 8
37982 + * fill-column: 79
37983 + * scroll-step: 1
37984 + * End:
37985 + */
37986 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.35/fs/reiser4/plugin/file_ops_readdir.c
37987 --- linux-2.6.35.orig/fs/reiser4/plugin/file_ops_readdir.c      1970-01-01 01:00:00.000000000 +0100
37988 +++ linux-2.6.35/fs/reiser4/plugin/file_ops_readdir.c   2010-08-04 15:44:57.000000000 +0200
37989 @@ -0,0 +1,658 @@
37990 +/* Copyright 2005 by Hans Reiser, licensing governed by
37991 + * reiser4/README */
37992 +
37993 +#include "../inode.h"
37994 +
37995 +/* return true, iff @coord points to the valid directory item that is part of
37996 + * @inode directory. */
37997 +static int is_valid_dir_coord(struct inode *inode, coord_t *coord)
37998 +{
37999 +       return plugin_of_group(item_plugin_by_coord(coord),
38000 +                              DIR_ENTRY_ITEM_TYPE) &&
38001 +              inode_file_plugin(inode)->owns_item(inode, coord);
38002 +}
38003 +
38004 +/* compare two logical positions within the same directory */
38005 +static cmp_t dir_pos_cmp(const struct dir_pos *p1, const struct dir_pos *p2)
38006 +{
38007 +       cmp_t result;
38008 +
38009 +       assert("nikita-2534", p1 != NULL);
38010 +       assert("nikita-2535", p2 != NULL);
38011 +
38012 +       result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
38013 +       if (result == EQUAL_TO) {
38014 +               int diff;
38015 +
38016 +               diff = p1->pos - p2->pos;
38017 +               result =
38018 +                   (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
38019 +       }
38020 +       return result;
38021 +}
38022 +
38023 +/* see comment before reiser4_readdir_common() for overview of why "adjustment"
38024 + * is necessary. */
38025 +static void
38026 +adjust_dir_pos(struct file *dir, struct readdir_pos *readdir_spot,
38027 +              const struct dir_pos *mod_point, int adj)
38028 +{
38029 +       struct dir_pos *pos;
38030 +
38031 +       /*
38032 +        * new directory entry was added (adj == +1) or removed (adj == -1) at
38033 +        * the @mod_point. Directory file descriptor @dir is doing readdir and
38034 +        * is currently positioned at @readdir_spot. Latter has to be updated
38035 +        * to maintain stable readdir.
38036 +        */
38037 +       /* directory is positioned to the beginning. */
38038 +       if (readdir_spot->entry_no == 0)
38039 +               return;
38040 +
38041 +       pos = &readdir_spot->position;
38042 +       switch (dir_pos_cmp(mod_point, pos)) {
38043 +       case LESS_THAN:
38044 +               /* @mod_point is _before_ @readdir_spot, that is, entry was
38045 +                * added/removed on the left (in key order) of current
38046 +                * position. */
38047 +               /* logical number of directory entry readdir is "looking" at
38048 +                * changes */
38049 +               readdir_spot->entry_no += adj;
38050 +               assert("nikita-2577",
38051 +                      ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
38052 +               if (de_id_cmp(&pos->dir_entry_key,
38053 +                             &mod_point->dir_entry_key) == EQUAL_TO) {
38054 +                       assert("nikita-2575", mod_point->pos < pos->pos);
38055 +                       /*
38056 +                        * if entry added/removed has the same key as current
38057 +                        * for readdir, update counter of duplicate keys in
38058 +                        * @readdir_spot.
38059 +                        */
38060 +                       pos->pos += adj;
38061 +               }
38062 +               break;
38063 +       case GREATER_THAN:
38064 +               /* directory is modified after @pos: nothing to do. */
38065 +               break;
38066 +       case EQUAL_TO:
38067 +               /* cannot insert an entry readdir is looking at, because it
38068 +                  already exists. */
38069 +               assert("nikita-2576", adj < 0);
38070 +               /* directory entry to which @pos points to is being
38071 +                  removed.
38072 +
38073 +                  NOTE-NIKITA: Right thing to do is to update @pos to point
38074 +                  to the next entry. This is complex (we are under spin-lock
38075 +                  for one thing). Just rewind it to the beginning. Next
38076 +                  readdir will have to scan the beginning of
38077 +                  directory. Proper solution is to use semaphore in
38078 +                  spin lock's stead and use rewind_right() here.
38079 +
38080 +                  NOTE-NIKITA: now, semaphore is used, so...
38081 +                */
38082 +               memset(readdir_spot, 0, sizeof *readdir_spot);
38083 +       }
38084 +}
38085 +
38086 +/* scan all file-descriptors for this directory and adjust their
38087 +   positions respectively. Should be used by implementations of
38088 +   add_entry and rem_entry of dir plugin */
38089 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
38090 +                            int offset, int adj)
38091 +{
38092 +       reiser4_file_fsdata *scan;
38093 +       struct dir_pos mod_point;
38094 +
38095 +       assert("nikita-2536", dir != NULL);
38096 +       assert("nikita-2538", de != NULL);
38097 +       assert("nikita-2539", adj != 0);
38098 +
38099 +       build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
38100 +       mod_point.pos = offset;
38101 +
38102 +       spin_lock_inode(dir);
38103 +
38104 +       /*
38105 +        * new entry was added/removed in directory @dir. Scan all file
38106 +        * descriptors for @dir that are currently involved into @readdir and
38107 +        * update them.
38108 +        */
38109 +
38110 +       list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
38111 +               adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
38112 +
38113 +       spin_unlock_inode(dir);
38114 +}
38115 +
38116 +/*
38117 + * traverse tree to start/continue readdir from the readdir position @pos.
38118 + */
38119 +static int dir_go_to(struct file *dir, struct readdir_pos *pos, tap_t *tap)
38120 +{
38121 +       reiser4_key key;
38122 +       int result;
38123 +       struct inode *inode;
38124 +
38125 +       assert("nikita-2554", pos != NULL);
38126 +
38127 +       inode = dir->f_dentry->d_inode;
38128 +       result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
38129 +       if (result != 0)
38130 +               return result;
38131 +       result = reiser4_object_lookup(inode,
38132 +                                      &key,
38133 +                                      tap->coord,
38134 +                                      tap->lh,
38135 +                                      tap->mode,
38136 +                                      FIND_EXACT,
38137 +                                      LEAF_LEVEL, LEAF_LEVEL,
38138 +                                      0, &tap->ra_info);
38139 +       if (result == CBK_COORD_FOUND)
38140 +               result = rewind_right(tap, (int)pos->position.pos);
38141 +       else {
38142 +               tap->coord->node = NULL;
38143 +               done_lh(tap->lh);
38144 +               result = RETERR(-EIO);
38145 +       }
38146 +       return result;
38147 +}
38148 +
38149 +/*
38150 + * handling of non-unique keys: calculate at what ordinal position within
38151 + * sequence of directory items with identical keys @pos is.
38152 + */
38153 +static int set_pos(struct inode *inode, struct readdir_pos *pos, tap_t *tap)
38154 +{
38155 +       int result;
38156 +       coord_t coord;
38157 +       lock_handle lh;
38158 +       tap_t scan;
38159 +       de_id *did;
38160 +       reiser4_key de_key;
38161 +
38162 +       coord_init_zero(&coord);
38163 +       init_lh(&lh);
38164 +       reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
38165 +       reiser4_tap_copy(&scan, tap);
38166 +       reiser4_tap_load(&scan);
38167 +       pos->position.pos = 0;
38168 +
38169 +       did = &pos->position.dir_entry_key;
38170 +
38171 +       if (is_valid_dir_coord(inode, scan.coord)) {
38172 +
38173 +               build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
38174 +
38175 +               while (1) {
38176 +
38177 +                       result = go_prev_unit(&scan);
38178 +                       if (result != 0)
38179 +                               break;
38180 +
38181 +                       if (!is_valid_dir_coord(inode, scan.coord)) {
38182 +                               result = -EINVAL;
38183 +                               break;
38184 +                       }
38185 +
38186 +                       /* get key of directory entry */
38187 +                       unit_key_by_coord(scan.coord, &de_key);
38188 +                       if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
38189 +                               /* duplicate-sequence is over */
38190 +                               break;
38191 +                       }
38192 +                       pos->position.pos++;
38193 +               }
38194 +       } else
38195 +               result = RETERR(-ENOENT);
38196 +       reiser4_tap_relse(&scan);
38197 +       reiser4_tap_done(&scan);
38198 +       return result;
38199 +}
38200 +
38201 +/*
38202 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
38203 + */
38204 +static int dir_rewind(struct file *dir, struct readdir_pos *pos, tap_t *tap)
38205 +{
38206 +       __u64 destination;
38207 +       __s64 shift;
38208 +       int result;
38209 +       struct inode *inode;
38210 +       loff_t dirpos;
38211 +
38212 +       assert("nikita-2553", dir != NULL);
38213 +       assert("nikita-2548", pos != NULL);
38214 +       assert("nikita-2551", tap->coord != NULL);
38215 +       assert("nikita-2552", tap->lh != NULL);
38216 +
38217 +       dirpos = reiser4_get_dir_fpos(dir);
38218 +       shift = dirpos - pos->fpos;
38219 +       /* this is logical directory entry within @dir which we are rewinding
38220 +        * to */
38221 +       destination = pos->entry_no + shift;
38222 +
38223 +       inode = dir->f_dentry->d_inode;
38224 +       if (dirpos < 0)
38225 +               return RETERR(-EINVAL);
38226 +       else if (destination == 0ll || dirpos == 0) {
38227 +               /* rewind to the beginning of directory */
38228 +               memset(pos, 0, sizeof *pos);
38229 +               return dir_go_to(dir, pos, tap);
38230 +       } else if (destination >= inode->i_size)
38231 +               return RETERR(-ENOENT);
38232 +
38233 +       if (shift < 0) {
38234 +               /* I am afraid of negative numbers */
38235 +               shift = -shift;
38236 +               /* rewinding to the left */
38237 +               if (shift <= (int)pos->position.pos) {
38238 +                       /* destination is within sequence of entries with
38239 +                          duplicate keys. */
38240 +                       result = dir_go_to(dir, pos, tap);
38241 +               } else {
38242 +                       shift -= pos->position.pos;
38243 +                       while (1) {
38244 +                               /* repetitions: deadlock is possible when
38245 +                                  going to the left. */
38246 +                               result = dir_go_to(dir, pos, tap);
38247 +                               if (result == 0) {
38248 +                                       result = rewind_left(tap, shift);
38249 +                                       if (result == -E_DEADLOCK) {
38250 +                                               reiser4_tap_done(tap);
38251 +                                               continue;
38252 +                                       }
38253 +                               }
38254 +                               break;
38255 +                       }
38256 +               }
38257 +       } else {
38258 +               /* rewinding to the right */
38259 +               result = dir_go_to(dir, pos, tap);
38260 +               if (result == 0)
38261 +                       result = rewind_right(tap, shift);
38262 +       }
38263 +       if (result == 0) {
38264 +               result = set_pos(inode, pos, tap);
38265 +               if (result == 0) {
38266 +                       /* update pos->position.pos */
38267 +                       pos->entry_no = destination;
38268 +                       pos->fpos = dirpos;
38269 +               }
38270 +       }
38271 +       return result;
38272 +}
38273 +
38274 +/*
38275 + * Function that is called by common_readdir() on each directory entry while
38276 + * doing readdir. ->filldir callback may block, so we had to release long term
38277 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
38278 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
38279 + *
38280 + * Whether node is unlocked in case of any other error is undefined. It is
38281 + * guaranteed to be still locked if success (0) is returned.
38282 + *
38283 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
38284 + * unlocked.
38285 + */
38286 +static int
38287 +feed_entry(struct file *f, struct readdir_pos *pos, tap_t *tap,
38288 +          filldir_t filldir, void *dirent)
38289 +{
38290 +       item_plugin *iplug;
38291 +       char *name;
38292 +       reiser4_key sd_key;
38293 +       int result;
38294 +       char buf[DE_NAME_BUF_LEN];
38295 +       char name_buf[32];
38296 +       char *local_name;
38297 +       unsigned file_type;
38298 +       seal_t seal;
38299 +       coord_t *coord;
38300 +       reiser4_key entry_key;
38301 +
38302 +       coord = tap->coord;
38303 +       iplug = item_plugin_by_coord(coord);
38304 +
38305 +       /* pointer to name within the node */
38306 +       name = iplug->s.dir.extract_name(coord, buf);
38307 +       assert("nikita-1371", name != NULL);
38308 +
38309 +       /* key of object the entry points to */
38310 +       if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38311 +               return RETERR(-EIO);
38312 +
38313 +       /* we must release longterm znode lock before calling filldir to avoid
38314 +          deadlock which may happen if filldir causes page fault. @name may
38315 +          point into the node, so filldir is given a private copy of it */
38316 +       if (strlen(name) + 1 > sizeof(name_buf)) {
38317 +               local_name = kmalloc(strlen(name) + 1,
38318 +                                    reiser4_ctx_gfp_mask_get());
38319 +               if (local_name == NULL)
38320 +                       return RETERR(-ENOMEM);
38321 +       } else
38322 +               local_name = name_buf;
38323 +
38324 +       strcpy(local_name, name);
38325 +       file_type = iplug->s.dir.extract_file_type(coord);
38326 +
38327 +       unit_key_by_coord(coord, &entry_key);
38328 +       reiser4_seal_init(&seal, coord, &entry_key);
38329 +
38330 +       longterm_unlock_znode(tap->lh);
38331 +
38332 +       /*
38333 +        * send information about directory entry to the ->filldir() filler
38334 +        * supplied to us by caller (VFS).
38335 +        *
38336 +        * ->filldir is entitled to do weird things. For example, ->filldir
38337 +        * supplied by knfsd re-enters file system. Make sure no locks are
38338 +        * held.
38339 +        */
38340 +       assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38341 +
38342 +       reiser4_txn_restart_current();
38343 +       result = filldir(dirent, local_name, (int)strlen(local_name),
38344 +                        /* offset of this entry */
38345 +                        f->f_pos,
38346 +                        /* inode number of object bounden by this entry */
38347 +                        oid_to_uino(get_key_objectid(&sd_key)), file_type);
38348 +       if (local_name != name_buf)
38349 +               kfree(local_name);
38350 +       if (result < 0)
38351 +               /* ->filldir() is satisfied. (no space in buffer, IOW) */
38352 +               result = 1;
38353 +       else
38354 +               result = reiser4_seal_validate(&seal, coord, &entry_key,
38355 +                                              tap->lh, tap->mode,
38356 +                                              ZNODE_LOCK_HIPRI);
38357 +       return result;
38358 +}
38359 +
38360 +static void move_entry(struct readdir_pos *pos, coord_t *coord)
38361 +{
38362 +       reiser4_key de_key;
38363 +       de_id *did;
38364 +
38365 +       /* update @pos */
38366 +       ++pos->entry_no;
38367 +       did = &pos->position.dir_entry_key;
38368 +
38369 +       /* get key of directory entry */
38370 +       unit_key_by_coord(coord, &de_key);
38371 +
38372 +       if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38373 +               /* we are within sequence of directory entries
38374 +                  with duplicate keys. */
38375 +               ++pos->position.pos;
38376 +       else {
38377 +               pos->position.pos = 0;
38378 +               build_de_id_by_key(&de_key, did);
38379 +       }
38380 +       ++pos->fpos;
38381 +}
38382 +
38383 +/*
38384 + *     STATELESS READDIR
38385 + *
38386 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
38387 + * into reiser4_file_fsdata on each directory modification (name insertion and
38388 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
38389 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38390 + * across client READDIR requests for the same directory.
38391 + *
38392 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
38393 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38394 + * find detached reiser4_file_fsdata corresponding to previous readdir
38395 + * request. In other words, additional state is maintained on the
38396 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
38397 + *
38398 + * To efficiently detect when our ->readdir() method is called by NFS server,
38399 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38400 + * file_is_stateless() function).
38401 + *
38402 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
38403 + * bits of NFS readdir cookie: when first readdir request comes to the given
38404 + * directory from the given client, cookie is set to 0. This situation is
38405 + * detected, global cid_counter is incremented, and stored in highest bits of
38406 + * all direntry offsets returned to the client, including last one. As the
38407 + * only valid readdir cookie is one obtained as direntry->offset, we are
38408 + * guaranteed that next readdir request (continuing current one) will have
38409 + * current cid in the highest bits of starting readdir cookie. All d_cursors
38410 + * are hashed into per-super-block hash table by (oid, cid) key.
38411 + *
38412 + * In addition d_cursors are placed into per-super-block radix tree where they
38413 + * are keyed by oid alone. This is necessary to efficiently remove them during
38414 + * rmdir.
38415 + *
38416 + * At last, currently unused d_cursors are linked into special list. This list
38417 + * is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
38418 + *
38419 + */
38420 +
38421 +/*
38422 + * prepare for readdir.
38423 + */
38424 +static int dir_readdir_init(struct file *f, tap_t *tap,
38425 +                           struct readdir_pos **pos)
38426 +{
38427 +       struct inode *inode;
38428 +       reiser4_file_fsdata *fsdata;
38429 +       int result;
38430 +
38431 +       assert("nikita-1359", f != NULL);
38432 +       inode = f->f_dentry->d_inode;
38433 +       assert("nikita-1360", inode != NULL);
38434 +
38435 +       if (!S_ISDIR(inode->i_mode))
38436 +               return RETERR(-ENOTDIR);
38437 +
38438 +       /* try to find detached readdir state */
38439 +       result = reiser4_attach_fsdata(f, inode);
38440 +       if (result != 0)
38441 +               return result;
38442 +
38443 +       fsdata = reiser4_get_file_fsdata(f);
38444 +       assert("nikita-2571", fsdata != NULL);
38445 +       if (IS_ERR(fsdata))
38446 +               return PTR_ERR(fsdata);
38447 +
38448 +       /* add file descriptor to the readdir list hanging off directory
38449 +        * inode. This list is used to scan "readdirs-in-progress" while
38450 +        * inserting or removing names in the directory. */
38451 +       spin_lock_inode(inode);
38452 +       if (list_empty_careful(&fsdata->dir.linkage))
38453 +               list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38454 +       *pos = &fsdata->dir.readdir;
38455 +       spin_unlock_inode(inode);
38456 +
38457 +       /* move @tap to the current position */
38458 +       return dir_rewind(f, *pos, tap);
38459 +}
38460 +
38461 +/* this is implementation of vfs's llseek method of struct file_operations for
38462 +   typical directory
38463 +   See comment before reiser4_readdir_common() for explanation.
38464 +*/
38465 +loff_t reiser4_llseek_dir_common(struct file *file, loff_t off, int origin)
38466 +{
38467 +       reiser4_context *ctx;
38468 +       loff_t result;
38469 +       struct inode *inode;
38470 +
38471 +       inode = file->f_dentry->d_inode;
38472 +
38473 +       ctx = reiser4_init_context(inode->i_sb);
38474 +       if (IS_ERR(ctx))
38475 +               return PTR_ERR(ctx);
38476 +
38477 +       mutex_lock(&inode->i_mutex);
38478 +
38479 +       /* update ->f_pos */
38480 +       result = default_llseek(file, off, origin);
38481 +       if (result >= 0) {
38482 +               int ff;
38483 +               coord_t coord;
38484 +               lock_handle lh;
38485 +               tap_t tap;
38486 +               struct readdir_pos *pos;
38487 +
38488 +               coord_init_zero(&coord);
38489 +               init_lh(&lh);
38490 +               reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38491 +
38492 +               ff = dir_readdir_init(file, &tap, &pos);
38493 +               reiser4_detach_fsdata(file);
38494 +               if (ff != 0)
38495 +                       result = (loff_t) ff;
38496 +               reiser4_tap_done(&tap);
38497 +       }
38498 +       reiser4_detach_fsdata(file);
38499 +       mutex_unlock(&inode->i_mutex);
38500 +
38501 +       reiser4_exit_context(ctx);
38502 +       return result;
38503 +}
38504 +
38505 +/* this is common implementation of vfs's readdir method of struct
38506 +   file_operations
38507 +
38508 +   readdir problems:
38509 +
38510 +   readdir(2)/getdents(2) interface is based on implicit assumption that
38511 +   readdir can be restarted from any particular point by supplying file system
38512 +   with off_t-full of data. That is, file system fills ->d_off field in struct
38513 +   dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38514 +   implemented by glibc as lseek(2) on directory.
38515 +
38516 +   Reiser4 cannot restart readdir from 64 bits of data, because the last two
38517 +   components of the key of a directory entry are unknown, which gives 128
38518 +   bits: locality and type fields in the key of a directory entry are always
38519 +   known, but to start readdir() from a given point the objectid and offset
38520 +   fields have to be filled.
38521 +
38522 +   Traditional UNIX API for scanning through directory
38523 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the
38524 +   assumption that directory is structured very much like regular file, in
38525 +   particular, it is implied that each name within given directory (directory
38526 +   entry) can be uniquely identified by scalar offset and that such offset is
38527 +   stable across the life-time of the name it identifies.
38528 +
38529 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
38530 +   identifier for the directory entry is its key that doesn't fit into
38531 +   seekdir/telldir API.
38532 +
38533 +   solution:
38534 +
38535 +   Within each file descriptor participating in readdir-ing of directory
38536 +   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
38537 +   the "current" directory entry that file descriptor looks at. It contains a
38538 +   key of directory entry (plus some additional info to deal with non-unique
38539 +   keys that we won't dwell on here) and a logical position of this
38540 +   directory entry starting from the beginning of the directory, that is
38541 +   ordinal number of this entry in the readdir order.
38542 +
38543 +   Obviously this logical position is not stable in the face of directory
38544 +   modifications. To work around this, on each addition or removal of directory
38545 +   entry all file descriptors for directory inode are scanned and their
38546 +   readdir_pos are updated accordingly (adjust_dir_pos()).
38547 +*/
38548 +int reiser4_readdir_common(struct file *f /* directory file being read */,
38549 +                          void *dirent /* opaque data passed to us by VFS */,
38550 +                          filldir_t filld /* filler function passed to us
38551 +                                           * by VFS */)
38552 +{
38553 +       reiser4_context *ctx;
38554 +       int result;
38555 +       struct inode *inode;
38556 +       coord_t coord;
38557 +       lock_handle lh;
38558 +       tap_t tap;
38559 +       struct readdir_pos *pos;
38560 +
38561 +       assert("nikita-1359", f != NULL);
38562 +       inode = f->f_dentry->d_inode;
38563 +       assert("nikita-1360", inode != NULL);
38564 +
38565 +       if (!S_ISDIR(inode->i_mode))
38566 +               return RETERR(-ENOTDIR);
38567 +
38568 +       ctx = reiser4_init_context(inode->i_sb);
38569 +       if (IS_ERR(ctx))
38570 +               return PTR_ERR(ctx);
38571 +
38572 +       coord_init_zero(&coord);
38573 +       init_lh(&lh);
38574 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38575 +
38576 +       reiser4_readdir_readahead_init(inode, &tap);
38577 +       /* re-entered from the loop when feed_entry() returns -E_REPEAT */
38578 +repeat:
38579 +       result = dir_readdir_init(f, &tap, &pos);
38580 +       if (result == 0) {
38581 +               result = reiser4_tap_load(&tap);
38582 +               /* scan entries one by one feeding them to @filld */
38583 +               while (result == 0) {
38584 +                       coord_t *coord;
38585 +                       /* shadows the outer coord_t; aliases tap.coord */
38586 +                       coord = tap.coord;
38587 +                       assert("nikita-2572", coord_is_existing_unit(coord));
38588 +                       assert("nikita-3227", is_valid_dir_coord(inode, coord));
38589 +                       /* > 0 ends the scan, 0 advances, -E_REPEAT restarts */
38590 +                       result = feed_entry(f, pos, &tap, filld, dirent);
38591 +                       if (result > 0) {
38592 +                               break;
38593 +                       } else if (result == 0) {
38594 +                               ++f->f_pos;
38595 +                               result = go_next_unit(&tap);
38596 +                               if (result == -E_NO_NEIGHBOR ||
38597 +                                   result == -ENOENT) {
38598 +                                       result = 0;
38599 +                                       break;
38600 +                               } else if (result == 0) {
38601 +                                       if (is_valid_dir_coord(inode, coord))
38602 +                                               move_entry(pos, coord);
38603 +                                       else
38604 +                                               break;
38605 +                               }
38606 +                       } else if (result == -E_REPEAT) {
38607 +                               /* feed_entry() had to restart. */
38608 +                               ++f->f_pos;
38609 +                               reiser4_tap_relse(&tap);
38610 +                               goto repeat;
38611 +                       } else
38612 +                               warning("vs-1617",
38613 +                                       "reiser4_readdir_common: unexpected error %d",
38614 +                                       result);
38615 +               }
38616 +               reiser4_tap_relse(&tap);
38617 +               /* remember which ->i_version this scan has seen */
38618 +               if (result >= 0)
38619 +                       f->f_version = inode->i_version;
38620 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38621 +               result = 0;
38622 +       reiser4_tap_done(&tap);
38623 +       reiser4_detach_fsdata(f);
38624 +
38625 +       /* try to update directory's atime, whatever the scan result was */
38626 +       if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38627 +                              BA_CAN_COMMIT) != 0)
38628 +               warning("", "failed to update atime on readdir: %llu",
38629 +                       get_inode_oid(inode));
38630 +       else
38631 +               file_accessed(f);
38632 +
38633 +       context_set_commit_async(ctx);
38634 +       reiser4_exit_context(ctx);
38635 +       /* a positive result is never returned: it is reported as 0 */
38636 +       return (result <= 0) ? result : 0;
38637 +}
38638 +
38639 +/*
38640 + * Local variables:
38641 + * c-indentation-style: "K&R"
38642 + * mode-name: "LC"
38643 + * c-basic-offset: 8
38644 + * tab-width: 8
38645 + * fill-column: 79
38646 + * End:
38647 + */
38648 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.35/fs/reiser4/plugin/file_plugin_common.c
38649 --- linux-2.6.35.orig/fs/reiser4/plugin/file_plugin_common.c    1970-01-01 01:00:00.000000000 +0100
38650 +++ linux-2.6.35/fs/reiser4/plugin/file_plugin_common.c 2010-08-04 15:44:57.000000000 +0200
38651 @@ -0,0 +1,1008 @@
38652 +/* Copyright 2005 by Hans Reiser, licensing governed by
38653 +   reiser4/README */
38654 +
38655 +/* this file contains typical implementations for most of methods of
38656 +   file plugin
38657 +*/
38658 +
38659 +#include "../inode.h"
38660 +#include "object.h"
38661 +#include "../safe_link.h"
38662 +
38663 +#include <linux/quotaops.h>
38664 +
38665 +static int insert_new_sd(struct inode *inode);
38666 +static int update_sd(struct inode *inode);
38667 +
38668 +/* this is common implementation of write_sd_by_inode method of file plugin
38669 +   either insert stat data or update it
38670 + */
38671 +int write_sd_by_inode_common(struct inode *inode/* object to save */)
38672 +{
38673 +       int ret;
38674 +
38675 +       assert("nikita-730", inode != NULL);
38676 +
38677 +       /* object without stat-data gets a new one, otherwise update it */
38678 +       ret = reiser4_inode_get_flag(inode, REISER4_NO_SD) ?
38679 +               insert_new_sd(inode) : update_sd(inode);
38680 +
38681 +       /* "name is too long" and out-of-memory failures are not reported */
38682 +       if (ret == 0 || ret == -ENAMETOOLONG || ret == -ENOMEM)
38683 +               return ret;
38684 +       warning("nikita-2221", "Failed to save sd for %llu: %i",
38685 +               (unsigned long long)get_inode_oid(inode), ret);
38686 +       return ret;
38687 +}
38688 +
38689 +/* common implementation of key_by_inode method of file plugin: builds in @key
38690 +   the KEY_BODY_MINOR key of @inode's data at offset @off; always returns 0 */
38691 +int
38692 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38693 +                              reiser4_key * key)
38694 +{
38695 +       reiser4_key_init(key);
38696 +       set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38697 +       set_key_ordering(key, get_inode_ordering(inode));
38698 +       set_key_objectid(key, get_inode_oid(inode));    /*FIXME: inode->i_ino */
38699 +       set_key_type(key, KEY_BODY_MINOR);
38700 +       set_key_offset(key, (__u64) off);
38701 +       return 0;
38702 +}
38703 +
38704 +/* this is common implementation of set_plug_in_inode method of file plugin:
38705 +   fills mode, owner, times, nlink and extension mask of @object; returns 0 */
38706 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38707 +                            struct inode *parent /* parent object */ ,
38708 +                            reiser4_object_create_data * data  /* creational
38709 +                                                                * data */ )
38710 +{
38711 +       __u64 mask;
38712 +
38713 +       object->i_mode = data->mode;
38714 +       /* this should be plugin decision */
38715 +       object->i_uid = current->cred->fsuid;
38716 +       object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38717 +
38718 +       /* support for BSD style group-id assignment. See mount's manual page
38719 +          description of bsdgroups ext2 mount options for more details */
38720 +       if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38721 +               object->i_gid = parent->i_gid;
38722 +       else if (parent->i_mode & S_ISGID) {
38723 +               /* parent directory has setgid (S_ISGID) bit */
38724 +               object->i_gid = parent->i_gid;
38725 +               if (S_ISDIR(object->i_mode))
38726 +                       /* setgid bit is inherited by sub-directories */
38727 +                       object->i_mode |= S_ISGID;
38728 +       } else
38729 +               object->i_gid = current->cred->fsgid;
38730 +
38731 +       /* this object doesn't have stat-data yet */
38732 +       reiser4_inode_set_flag(object, REISER4_NO_SD);
38733 +#if 0
38734 +       /* this is now called after all inode plugins are initialized:
38735 +          do_create_vfs_child after adjust_to_parent */
38736 +       /* setup inode and file-operations for this inode */
38737 +       setup_inode_ops(object, data);
38738 +#endif
38739 +       object->i_nlink = 0;
38740 +       reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38741 +       mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38742 +       if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38743 +               mask |= (1 << LARGE_TIMES_STAT);
38744 +
38745 +       reiser4_inode_data(object)->extmask = mask;
38746 +       return 0;
38747 +}
38748 +
38749 +/* common implementation of adjust_to_parent method of file plugin for regular
38750 +   files; returns 0 or the error of the first plugin that failed to inherit */
38751 +int adjust_to_parent_common(struct inode *object /* new object */ ,
38752 +                           struct inode *parent /* parent directory */ ,
38753 +                           struct inode *root/* root directory */)
38754 +{
38755 +       int result;
38756 +
38757 +       assert("nikita-2165", object != NULL);
38758 +       if (parent == NULL)
38759 +               parent = root;
38760 +       assert("nikita-2069", parent != NULL);
38761 +       /* inherit missing plugins from parent, stopping at the first error */
38762 +       result = grab_plugin_pset(object, parent, PSET_FILE);
38763 +       if (result == 0)
38764 +               result = grab_plugin_pset(object, parent, PSET_SD);
38765 +       if (result == 0)
38766 +               result = grab_plugin_pset(object, parent, PSET_FORMATTING);
38767 +       if (result == 0)
38768 +               result = grab_plugin_pset(object, parent, PSET_PERM);
38769 +       return result;
38770 +}
38771 +
38772 +/* common implementation of adjust_to_parent method of file plugin for typical
38773 +   directories: inherits missing plugins for all pset members from @parent
38774 +   (@root when @parent is NULL); returns 0 or the first error met */
38775 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38776 +                               struct inode *parent /* parent directory */ ,
38777 +                               struct inode *root/* root directory */)
38778 +{
38779 +       pset_member slot;
38780 +       int ret;
38781 +
38782 +       assert("nikita-2166", object != NULL);
38783 +       if (parent == NULL)
38784 +               parent = root;
38785 +       assert("nikita-2167", parent != NULL);
38786 +
38787 +       /* walk the plugin set members in order; a failed grab ends the walk */
38788 +       ret = 0;
38789 +       slot = 0;
38790 +       while (ret == 0 && slot < PSET_LAST) {
38791 +               ret = grab_plugin_pset(object, parent, slot);
38792 +               ++slot;
38793 +       }
38794 +
38795 +       return ret;
38796 +}
38797 +
38798 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38799 +                                  struct inode *parent /* parent directory */,
38800 +                                  struct inode *root/* root directory */)
38801 +{
38802 +       int result;
38803 +       result = adjust_to_parent_common(object, parent, root);
38804 +       if (result)
38805 +               return result;
38806 +       assert("edward-1416", parent != NULL);
38807 +       /* inherit cryptcompress plugins; the first failure ends the chain */
38808 +       if ((result = grab_plugin_pset(object, parent, PSET_CLUSTER)) == 0 &&
38809 +           (result = grab_plugin_pset(object, parent, PSET_CIPHER)) == 0 &&
38810 +           (result = grab_plugin_pset(object, parent, PSET_DIGEST)) == 0 &&
38811 +           (result = grab_plugin_pset(object, parent, PSET_COMPRESSION)) == 0)
38812 +               result = grab_plugin_pset(object, parent,
38813 +                                         PSET_COMPRESSION_MODE);
38814 +       return result;
38815 +}
38816 +
38817 +/* common implementation of create_object method of file plugin: grabs space
38818 +   estimated by estimate_create_common() and saves stat-data of @object */
38819 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
38820 +                                reiser4_object_create_data * data)
38821 +{
38822 +       assert("nikita-744", object != NULL);
38823 +       assert("nikita-745", parent != NULL);
38824 +       assert("nikita-747", data != NULL);
38825 +       assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38826 +
38827 +       /* whatever reiser4_grab_space() fails with is reported as -ENOSPC */
38828 +       if (reiser4_grab_space(estimate_create_common(object), BA_CAN_COMMIT))
38829 +               return RETERR(-ENOSPC);
38830 +
38831 +       return write_sd_by_inode_common(object);
38832 +}
38833 +
38834 +static int common_object_delete_no_reserve(struct inode *inode);
38835 +
38836 +/**
38837 + * reiser4_delete_object_common - delete_object of file_plugin
38838 + * @inode: inode to be deleted
38839 + *
38840 + * Common implementation of delete_object method of file_plugin. It applies to
38841 + * objects whose deletion consists of removing two items - stat data and
38842 + * safe-link.
38843 + *
38844 + * Returns 0 right away if @inode has no stat data (REISER4_NO_SD), -ENOSPC if
38845 + * space for the removal cannot be grabbed, and the result of
38846 + * common_object_delete_no_reserve() otherwise.
38847 + */
38848 +int reiser4_delete_object_common(struct inode *inode)
38849 +{
38850 +       reiser4_block_nr reserve;
38851 +
38852 +       assert("nikita-1477", inode != NULL);
38853 +       /* FIXME: if file body deletion failed (i/o error, for instance),
38854 +          inode->i_size can be != 0 here */
38855 +       assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
38856 +       assert("nikita-3421", inode->i_nlink == 0);
38857 +
38858 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38859 +               return 0;
38860 +
38861 +       /* grab space which is needed to remove 2 items from the tree:
38862 +          stat data and safe-link */
38863 +       reserve = 2 * estimate_one_item_removal(reiser4_tree_by_inode(inode));
38864 +       if (reiser4_grab_space_force(reserve, BA_RESERVED | BA_CAN_COMMIT))
38865 +               return RETERR(-ENOSPC);
38866 +
38867 +       return common_object_delete_no_reserve(inode);
38868 +}
38869 +
38870 +/**
38871 + * reiser4_delete_dir_common - delete_object of file_plugin
38872 + * @inode: inode to be deleted
38873 + *
38874 + * This is common implementation of delete_object method of file_plugin for
38875 + * typical directory. It calls done method of dir_plugin to remove "." and
38876 + * removes stat data and safe-link. Returns -ENOSPC if space was not grabbed.
38877 + */
38878 +int reiser4_delete_dir_common(struct inode *inode)
38879 +{
38880 +       int result;
38881 +       dir_plugin *dplug;
38882 +
38883 +       assert("", (get_current_context() &&
38884 +                   get_current_context()->trans->atom == NULL));
38885 +
38886 +       dplug = inode_dir_plugin(inode);
38887 +       assert("vs-1101", dplug && dplug->done);
38888 +
38889 +       /* kill cursors which might be attached to inode */
38890 +       reiser4_kill_cursors(inode);
38891 +
38892 +       /* grab space enough for removing two items */
38893 +       if (reiser4_grab_space
38894 +           (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
38895 +            BA_RESERVED | BA_CAN_COMMIT))
38896 +               return RETERR(-ENOSPC);
38897 +       /* "." goes first; stat data and safe-link only if that succeeded */
38898 +       result = dplug->done(inode);
38899 +       if (!result)
38900 +               result = common_object_delete_no_reserve(inode);
38901 +       return result;
38902 +}
38903 +
38904 +/* this is common implementation of add_link method of file plugin; @parent is
38905 +   not used, the result is always 0 */
38906 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
38907 +{
38908 +       /*
38909 +        * increment ->i_nlink and update ->i_ctime
38910 +        */
38911 +
38912 +       INODE_INC_FIELD(object, i_nlink);
38913 +       object->i_ctime = CURRENT_TIME;
38914 +       return 0;
38915 +}
38916 +
38917 +/* this is common implementation of rem_link method of file plugin; @parent is
38918 +   not used, the result is always 0 */
38919 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
38920 +{
38921 +       assert("nikita-2021", object != NULL);
38922 +       assert("nikita-2163", object->i_nlink > 0);
38923 +
38924 +       /*
38925 +        * decrement ->i_nlink and update ->i_ctime
38926 +        */
38927 +
38928 +       INODE_DEC_FIELD(object, i_nlink);
38929 +       object->i_ctime = CURRENT_TIME;
38930 +       return 0;
38931 +}
38932 +
38933 +/* this is common implementation of rem_link method of file plugin for typical
38934 +   directory; always returns 0
38935 +*/
38936 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
38937 +{
38938 +       assert("nikita-20211", object != NULL);
38939 +       assert("nikita-21631", object->i_nlink > 0);
38940 +
38941 +       /* decrement ->i_nlink and update ->i_ctime. A count that drops to 1 is
38942 +        * decremented once more (presumably the last link is the directory's
38943 +        * own "." - confirm) */
38944 +       INODE_DEC_FIELD(object, i_nlink);
38945 +       if (object->i_nlink == 1)
38946 +               INODE_DEC_FIELD(object, i_nlink);
38947 +       object->i_ctime = CURRENT_TIME;
38948 +       return 0;
38949 +}
38950 +
38951 +/* common implementation of owns_item method of file plugin: an existing item
38952 +   at @coord is owned by @inode when their keys carry the same objectid */
38953 +int owns_item_common(const struct inode *inode,        /* object to check
38954 +                                                * against */
38955 +                    const coord_t *coord/* coord to check */)
38956 +{
38957 +       reiser4_key item_key, file_key;
38958 +
38959 +       assert("nikita-760", inode != NULL);
38960 +       assert("nikita-761", coord != NULL);
38961 +
38962 +       if (!coord_is_existing_item(coord))
38963 +               return 0;
38964 +       return get_key_objectid(build_sd_key(inode, &file_key)) ==
38965 +              get_key_objectid(item_key_by_coord(coord, &item_key));
38966 +}
38967 +
38968 +/* common implementation of owns_item method of file plugin for typical
38969 +   directory: directory entry items are matched by key locality, all other
38970 +   items by owns_item_common() */
38971 +int owns_item_common_dir(const struct inode *inode,/* object to check against */
38972 +                        const coord_t *coord/* coord of item to check */)
38973 +{
38974 +       reiser4_key ikey;
38975 +
38976 +       assert("nikita-1335", inode != NULL);
38977 +       assert("nikita-1334", coord != NULL);
38978 +
38979 +       /* a directory entry is ours if its key locality is our oid */
38980 +       if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
38981 +               return get_key_locality(item_key_by_coord(coord, &ikey)) ==
38982 +                      get_inode_oid(inode);
38983 +       return owns_item_common(inode, coord);
38984 +}
38985 +
38986 +/* this is common implementation of can_add_link method of file plugin
38987 +   checks whether yet another hard link to this object can be added
38988 +*/
38989 +int can_add_link_common(const struct inode *object/* object to check */)
38990 +{
38991 +       assert("nikita-732", object != NULL);
38992 +
38993 +       /* inode->i_nlink is unsigned int, so just check for integer
38994 +          overflow: 0 is returned only if one more link would wrap it */
38995 +       return object->i_nlink + 1 != 0;
38996 +}
38997 +
38998 +/* this is common implementation of can_rem_link method of file plugin for
38999 +   typical directory
39000 +*/
39001 +int can_rem_link_common_dir(const struct inode *inode)
39002 +{
39003 +       /* is_dir_empty() returns 0 if dir is empty */
39004 +       return !is_dir_empty(inode);
39005 +}
39006 +
39007 +/* this is common implementation of detach method of file plugin for typical
39008 +   directory: forwards to detach method of directory plugin of @child
39009 +*/
39010 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
39011 +{
39012 +       dir_plugin *dplug;
39013 +
39014 +       dplug = inode_dir_plugin(child);
39015 +       assert("nikita-2883", dplug != NULL);
39016 +       assert("nikita-2884", dplug->detach != NULL);
39017 +       return dplug->detach(child, parent);
39018 +}
39019 +
39020 +/* this is common implementation of bind method of file plugin for typical
39021 +   directory: forwards to attach method of directory plugin of @child
39022 +*/
39023 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
39024 +{
39025 +       dir_plugin *dplug;
39026 +
39027 +       dplug = inode_dir_plugin(child);
39028 +       assert("nikita-2646", dplug != NULL);
39029 +       return dplug->attach(child, parent);
39030 +}
39031 +
39032 +static int process_truncate(struct inode *, __u64 size);
39033 +
39034 +/* common implementation of safelink method of file plugin: SAFE_UNLINK needs
39035 +   no work, SAFE_TRUNCATE hands @value to process_truncate(), else -EIO */
39036 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
39037 +{
39038 +       assert("vs-1705", get_current_context()->trans->atom == NULL);
39039 +
39040 +       switch (link) {
39041 +       case SAFE_UNLINK:
39042 +               /* nothing to do. iput() in the caller (process_safelink) will
39043 +                * finish with file */
39044 +               return 0;
39045 +       case SAFE_TRUNCATE:
39046 +               return process_truncate(object, value);
39047 +       default:
39048 +               break;
39049 +       }
39050 +       warning("nikita-3438", "Unrecognized safe-link type: %i", link);
39051 +       return RETERR(-EIO);
39052 +}
39053 +
39054 +/* this is common implementation of estimate.create method of file plugin.
39055 +   Can be used when object creation involves insertion of one item (usually
39056 +   stat data) into tree: returns estimate_one_insert_item() for @object's tree
39057 +*/
39058 +reiser4_block_nr estimate_create_common(const struct inode *object)
39059 +{
39060 +       return estimate_one_insert_item(reiser4_tree_by_inode(object));
39061 +}
39062 +
39063 +/* this is common implementation of estimate.create method of file plugin for
39064 +   typical directory.
39065 +   Can be used when directory creation involves insertion of two items (usually
39066 +   stat data and item containing "." and ".."): twice the one-item estimate
39067 +*/
39068 +reiser4_block_nr estimate_create_common_dir(const struct inode *object)
39069 +{
39070 +       return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
39071 +}
39072 +
39073 +/* this is common implementation of estimate.update method of file plugin.
39074 +   Can be used when stat data update does not do more than inserting a unit
39075 +   into a stat data item, which is probably true for most cases
39076 +*/
39077 +reiser4_block_nr estimate_update_common(const struct inode *inode)
39078 +{
39079 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
39080 +}
39081 +
39082 +/* this is common implementation of estimate.unlink method of file plugin:
39083 +   both arguments are ignored and the estimate is always 0 */
39084 +reiser4_block_nr
39085 +estimate_unlink_common(const struct inode *object UNUSED_ARG,
39086 +                      const struct inode *parent UNUSED_ARG)
39087 +{
39088 +       return 0;
39089 +}
39090 +
39091 +/* this is common implementation of estimate.unlink method of file plugin for
39092 +   typical directory: forwards to estimate.unlink of @object's directory plugin
39093 +*/
39094 +reiser4_block_nr
39095 +estimate_unlink_common_dir(const struct inode *object,
39096 +                          const struct inode *parent)
39097 +{
39098 +       dir_plugin *dplug;
39099 +
39100 +       dplug = inode_dir_plugin(object);
39101 +       assert("nikita-2888", dplug != NULL);
39102 +       assert("nikita-2887", dplug->estimate.unlink != NULL);
39103 +       return dplug->estimate.unlink(object, parent);
39104 +}
39105 +
39106 +char *wire_write_common(struct inode *inode, char *start)
39107 +{
39108 +       return build_inode_onwire(inode, start); /* result passed through */
39109 +}
39110 +
39111 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
39112 +{
39113 +       if (obj != NULL)        /* with @obj: extract its key id */
39114 +               return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
39115 +       return locate_obj_key_id_onwire(addr);  /* without @obj: only locate */
39116 +}
39117 +
39118 +struct dentry *wire_get_common(struct super_block *sb,
39119 +                              reiser4_object_on_wire * obj)
39120 +{
39121 +       struct inode *inode;
39122 +       struct dentry *dentry;
39123 +       reiser4_key key;
39124 +       /* look inode up by the key id kept in @obj; dentry or ERR_PTR result */
39125 +       extract_key_from_id(&obj->u.std.key_id, &key);
39126 +       inode = reiser4_iget(sb, &key, 1);
39127 +       if (!IS_ERR(inode)) {
39128 +               reiser4_iget_complete(inode);
39129 +               dentry = d_obtain_alias(inode);
39130 +               if (!IS_ERR(dentry))
39131 +                       dentry->d_op = &get_super_private(sb)->ops.dentry;
39132 +       } else if (PTR_ERR(inode) == -ENOENT)
39133 +               /*
39134 +                * inode wasn't found at the key encoded in the file
39135 +                * handle. Hence, file handle is stale.
39136 +                */
39137 +               dentry = ERR_PTR(RETERR(-ESTALE));
39138 +       else
39139 +               dentry = (void *)inode; /* other errors are passed on as is */
39140 +       return dentry;
39141 +}
39142 +
39143 +int wire_size_common(struct inode *inode)
39144 +{
39145 +       return inode_onwire_size(inode); /* result passed through */
39146 +}
39147 +
39148 +void wire_done_common(reiser4_object_on_wire * obj)
39149 +{
39150 +       /* nothing to do; @obj is left untouched */
39151 +}
39152 +
39153 +/* helper function to print errors; -ENOMEM is not reported, @inode is unused */
39154 +static void key_warning(const reiser4_key * key /* key to print */ ,
39155 +                       const struct inode *inode,
39156 +                       int code/* error code to print */)
39157 +{
39158 +       assert("nikita-716", key != NULL);
39159 +
39160 +       if (code != -ENOMEM) {
39161 +               warning("nikita-717", "Error for inode %llu (%i)",
39162 +                       (unsigned long long)get_key_objectid(key), code);
39163 +               reiser4_print_key("for key", key);
39164 +       }
39165 +}
39166 +
39167 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
39168 +#if REISER4_DEBUG
39169 +static void
39170 +check_inode_seal(const struct inode *inode,
39171 +                const coord_t *coord, const reiser4_key * key)
39172 +{
39173 +       reiser4_key unit_key;
39174 +       /* debugging check: unit at @coord has @key, @key has oid of @inode */
39175 +       unit_key_by_coord(coord, &unit_key);
39176 +       assert("nikita-2752",
39177 +              WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
39178 +       assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
39179 +}
39180 +
39181 +static void check_sd_coord(coord_t *coord, const reiser4_key * key)
39182 +{
39183 +       reiser4_key ukey;
39184 +       /* debugging check: @coord must be a leaf stat-data unit with @key */
39185 +       coord_clear_iplug(coord);
39186 +       if (zload(coord->node))
39187 +               return;
39188 +       /* any mismatch is reported and then hits impossible() */
39189 +       if (!coord_is_existing_unit(coord) ||
39190 +           !item_plugin_by_coord(coord) ||
39191 +           !keyeq(unit_key_by_coord(coord, &ukey), key) ||
39192 +           (znode_get_level(coord->node) != LEAF_LEVEL) ||
39193 +           !item_is_statdata(coord)) {
39194 +               warning("nikita-1901", "Conspicuous seal");
39195 +               reiser4_print_key("key", key);
39196 +               print_coord("coord", coord, 1);
39197 +               impossible("nikita-2877", "no way");
39198 +       }
39199 +       zrelse(coord->node);
39200 +}
39201 +/* without REISER4_DEBUG both checks expand to noop */
39202 +#else
39203 +#define check_inode_seal(inode, coord, key) noop
39204 +#define check_sd_coord(coord, key) noop
39205 +#endif
39206 +
39207 +/* insert new stat-data into tree. Called with inode state
39208 +    locked. Return inode state locked. */
39209 +static int insert_new_sd(struct inode *inode/* inode to create sd for */)
39210 +{
39211 +       int result;
39212 +       reiser4_key key;
39213 +       coord_t coord;
39214 +       reiser4_item_data data;
39215 +       char *area;
39216 +       reiser4_inode *ref;
39217 +       lock_handle lh;
39218 +       oid_t oid;
39219 +
39220 +       assert("nikita-723", inode != NULL);
39221 +       assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
39222 +
39223 +       ref = reiser4_inode_data(inode);
39224 +       spin_lock_inode(inode);
39225 +
39226 +       if (ref->plugin_mask != 0)
39227 +               /* inode has non-standard plugins */
39228 +               inode_set_extension(inode, PLUGIN_STAT);
39229 +       /*
39230 +        * prepare specification of new item to be inserted
39231 +        */
39232 +
39233 +       data.iplug = inode_sd_plugin(inode);
39234 +       data.length = data.iplug->s.sd.save_len(inode);
39235 +       spin_unlock_inode(inode);
39236 +
39237 +       data.data = NULL;
39238 +       data.user = 0;
39239 +/* could be optimized for case where there is only one node format in
39240 + * use in the filesystem, probably there are lots of such
39241 + * places we could optimize for only one node layout.... -Hans */
39242 +       if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()) {
39243 +               /* This is silly check, but we don't know actual node where
39244 +                  insertion will go into. */
39245 +               return RETERR(-ENAMETOOLONG);
39246 +       }
39247 +       oid = oid_allocate(inode->i_sb);
39248 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be
39249 + * encapsulated into oid_allocate? */
39250 +       if (oid == ABSOLUTE_MAX_OID)
39251 +               return RETERR(-EOVERFLOW);
39252 +
39253 +       set_inode_oid(inode, oid);
39254 +
39255 +       coord_init_zero(&coord);
39256 +       init_lh(&lh);
39257 +
39258 +       result = insert_by_key(reiser4_tree_by_inode(inode),
39259 +                              build_sd_key(inode, &key), &data, &coord, &lh,
39260 +                              /* stat data lives on a leaf level */
39261 +                              LEAF_LEVEL, CBK_UNIQUE);
39262 +
39263 +       /* we don't want to re-check that somebody didn't insert
39264 +          stat-data while we were doing io, because if it did,
39265 +          insert_by_key() returned error. */
39266 +       /* but what _is_ possible is that plugin for inode's stat-data,
39267 +          list of non-standard plugins or their state would change
39268 +          during io, so that stat-data wouldn't fit into sd. To avoid
39269 +          this race we keep inode_state lock. This lock has to be
39270 +          taken each time you access inode in a way that would cause
39271 +          changes in sd size: changing plugins etc.
39272 +        */
39273 +
39274 +       if (result == IBK_INSERT_OK) {
39275 +               coord_clear_iplug(&coord);
39276 +               result = zload(coord.node);
39277 +               if (result == 0) {
39278 +                       /* have we really inserted stat data? */
39279 +                       assert("nikita-725", item_is_statdata(&coord));
39280 +
39281 +                       /* inode was just created. It is inserted into hash
39282 +                          table, but no directory entry was yet inserted into
39283 +                          parent. So, inode is inaccessible through
39284 +                          ->lookup(). All places that directly grab inode
39285 +                          from hash-table (like old knfsd), should check
39286 +                          IMMUTABLE flag that is set by common_create_child.
39287 +                        */
39288 +                       assert("nikita-3240", data.iplug != NULL);
39289 +                       assert("nikita-3241", data.iplug->s.sd.save != NULL);
39290 +                       area = item_body_by_coord(&coord);
39291 +                       result = data.iplug->s.sd.save(inode, &area);
39292 +                       znode_make_dirty(coord.node);
39293 +                       if (result == 0) {
39294 +                               /* object has stat-data now */
39295 +                               reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39296 +                               reiser4_inode_set_flag(inode,
39297 +                                                      REISER4_SDLEN_KNOWN);
39298 +                               /* initialise stat-data seal */
39299 +                               reiser4_seal_init(&ref->sd_seal, &coord, &key);
39300 +                               ref->sd_coord = coord;
39301 +                               check_inode_seal(inode, &coord, &key);
39302 +                       } else if (result != -ENOMEM)
39303 +                               /*
39304 +                                * convert any other error code to -EIO to
39305 +                                * avoid confusing user level with unexpected
39306 +                                * errors.
39307 +                                */
39308 +                               result = RETERR(-EIO);
39309 +                       zrelse(coord.node);
39310 +               }
39311 +       }
39312 +       done_lh(&lh);
39313 +
39314 +       if (result != 0)
39315 +               key_warning(&key, inode, result);
39316 +       else
39317 +               oid_count_allocated();
39318 +
39319 +       return result;
39320 +}
39321 +
39322 +/* find stat-data of @inode in the tree by exact @key; warn on failure unless @silent */
39323 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39324 +             znode_lock_mode lock_mode /* lock mode */ ,
39325 +             coord_t *coord /* resulting coord */ ,
39326 +             lock_handle * lh /* resulting lock handle */ ,
39327 +             const reiser4_key * key /* stat-data key to look for */ ,
39328 +             int silent /* non-zero: do not call key_warning() on failure */)
39329 +{
39330 +       int result;
39331 +       __u32 flags;
39332 +
39333 +       assert("nikita-1692", inode != NULL);
39334 +       assert("nikita-1693", coord != NULL);
39335 +       assert("nikita-1694", key != NULL);
39336 +
39337 +       /* look for the object's stat data in a tree.
39338 +          The lookup fills in @coord and @lh. NOTE(review): presumably
39339 +          both are only meaningful when 0 is returned -- confirm against
39340 +          coord_by_key(). */
39341 +       flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39342 +       flags |= CBK_UNIQUE;
39343 +       /*
39344 +        * traverse tree to find stat data. We cannot use vroot here, because
39345 +        * it only covers _body_ of the file, and stat data don't belong
39346 +        * there.
39347 +        */
39348 +       result = coord_by_key(reiser4_tree_by_inode(inode),
39349 +                             key,
39350 +                             coord,
39351 +                             lh,
39352 +                             lock_mode,
39353 +                             FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39354 +       if (REISER4_DEBUG && result == 0)
39355 +               check_sd_coord(coord, key);
39356 +
39357 +       if (result != 0 && !silent)
39358 +               key_warning(key, inode, result);
39359 +       return result;
39360 +}
39361 +
39362 +static int
39363 +locate_inode_sd(struct inode *inode,
39364 +               reiser4_key * key, coord_t *coord, lock_handle * lh)
39365 +{
39366 +       reiser4_inode *state;
39367 +       seal_t seal;
39368 +       int result;
39369 +       /* locate and write-lock @inode's stat-data; @key is built here (output) */
39370 +       assert("nikita-3483", inode != NULL);
39371 +       /* snapshot the cached stat-data coord and seal under the inode lock */
39372 +       state = reiser4_inode_data(inode);
39373 +       spin_lock_inode(inode);
39374 +       *coord = state->sd_coord;
39375 +       coord_clear_iplug(coord);
39376 +       seal = state->sd_seal;
39377 +       spin_unlock_inode(inode);
39378 +
39379 +       build_sd_key(inode, key);
39380 +       if (reiser4_seal_is_set(&seal)) {
39381 +               /* first, try to re-validate the cached coord with the seal */
39382 +               result = reiser4_seal_validate(&seal,
39383 +                                              coord,
39384 +                                              key,
39385 +                                              lh, ZNODE_WRITE_LOCK,
39386 +                                              ZNODE_LOCK_LOPRI);
39387 +               if (result == 0)
39388 +                       check_sd_coord(coord, key);
39389 +       } else
39390 +               result = -E_REPEAT;     /* no seal: force the lookup below */
39391 +
39392 +       if (result != 0) {      /* seal unset or validation failed: search by key */
39393 +               coord_init_zero(coord);
39394 +               result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39395 +       }
39396 +       return result;
39397 +}
39398 +
39399 +#if REISER4_DEBUG
39400 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39401 +{      /* non-zero iff locality, type, band, ordering and objectid all match */
39402 +       return (get_key_locality(k1) == get_key_locality(k2) &&
39403 +               get_key_type(k1) == get_key_type(k2) &&
39404 +               get_key_band(k1) == get_key_band(k2) &&
39405 +               get_key_ordering(k1) == get_key_ordering(k2) &&
39406 +               get_key_objectid(k1) == get_key_objectid(k2));
39407 +}
39408 +
39409 +#include "../tree_walk.h"
39410 +
39411 +/* debug-only sanity check run before and after a stat-data resize */
39412 +static int check_sd_resize(struct inode *inode, coord_t *coord,
39413 +                          int length, int progress/* 1 means after resize; unused */)
39414 +{
39415 +       int ret = 0;
39416 +       lock_handle left_lock;
39417 +       coord_t left_coord;
39418 +       reiser4_key left_key;
39419 +       reiser4_key key;
39420 +       /* only cryptcompress files with a real resize at item 0 are checked */
39421 +       if (inode_file_plugin(inode) !=
39422 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39423 +               return 0;
39424 +       if (!length)
39425 +               return 0;
39426 +       if (coord->item_pos != 0)
39427 +               return 0;
39428 +
39429 +       init_lh(&left_lock);
39430 +       ret = reiser4_get_left_neighbor(&left_lock,
39431 +                                       coord->node,
39432 +                                       ZNODE_WRITE_LOCK,
39433 +                                       GN_CAN_USE_UPPER_LEVELS);
39434 +       if (ret != 0) {
39435 +               /* expected or not, after any failure left_lock.node cannot
39436 +                * be trusted: skip the check instead of zload()ing it */
39437 +               ret = 0;
39438 +               goto exit;
39439 +       }
39440 +       ret = zload(left_lock.node);
39441 +       if (ret)
39442 +               goto exit;
39443 +       coord_init_last_unit(&left_coord, left_lock.node);
39444 +       item_key_by_coord(&left_coord, &left_key);
39445 +       item_key_by_coord(coord, &key);
39446 +
39447 +       if (all_but_offset_key_eq(&key, &left_key))
39448 +               /* corruption occurred: left item differs only in key offset */
39449 +               ret = 1;
39450 +       zrelse(left_lock.node);
39451 + exit:
39452 +       done_lh(&left_lock);
39453 +       return ret;
39454 +}
39455 +#endif
39456 +
39457 +/* write @inode's stat-data into the item at @coord, resizing the item first if needed */
39458 +static int
39459 +update_sd_at(struct inode *inode, coord_t *coord, reiser4_key * key,
39460 +            lock_handle * lh)
39461 +{
39462 +       int result;
39463 +       reiser4_item_data data;
39464 +       char *area;
39465 +       reiser4_inode *state;
39466 +       znode *loaded;  /* node currently held by zload() */
39467 +
39468 +       state = reiser4_inode_data(inode);
39469 +
39470 +       coord_clear_iplug(coord);
39471 +       result = zload(coord->node);
39472 +       if (result != 0)
39473 +               return result;
39474 +       loaded = coord->node;
39475 +
39476 +       spin_lock_inode(inode);
39477 +       assert("nikita-728", inode_sd_plugin(inode) != NULL);
39478 +       data.iplug = inode_sd_plugin(inode);
39479 +
39480 +       /* if inode has non-standard plugins, add appropriate stat data
39481 +        * extension */
39482 +       if (state->extmask & (1 << PLUGIN_STAT)) {
39483 +               if (state->plugin_mask == 0)
39484 +                       inode_clr_extension(inode, PLUGIN_STAT);
39485 +       } else if (state->plugin_mask != 0)
39486 +               inode_set_extension(inode, PLUGIN_STAT);
39487 +
39488 +       if (state->extmask & (1 << HEIR_STAT)) {
39489 +               if (state->heir_mask == 0)
39490 +                       inode_clr_extension(inode, HEIR_STAT);
39491 +       } else if (state->heir_mask != 0)
39492 +                       inode_set_extension(inode, HEIR_STAT);
39493 +
39494 +       /* data.length is how much space to add to (or remove
39495 +          from if negative) sd */
39496 +       if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39497 +               /* recalculate stat-data length (delta against the item) */
39498 +               data.length =
39499 +                   data.iplug->s.sd.save_len(inode) -
39500 +                   item_length_by_coord(coord);
39501 +               reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39502 +       } else
39503 +               data.length = 0;
39504 +       spin_unlock_inode(inode);
39505 +
39506 +       /* if on-disk stat data is of different length than required
39507 +          for this inode, resize it */
39508 +       /* NOTE(review): REISER4_SDLEN_KNOWN is already set if the resize fails */
39509 +       if (data.length != 0) {
39510 +               data.data = NULL;
39511 +               data.user = 0;
39512 +
39513 +               assert("edward-1441",
39514 +                      !check_sd_resize(inode, coord,
39515 +                                       data.length, 0/* before resize */));
39516 +
39517 +               /* insertion code requires that insertion point (coord) was
39518 +                * between units. */
39519 +               coord->between = AFTER_UNIT;
39520 +               result = reiser4_resize_item(coord, &data, key, lh,
39521 +                                            COPI_DONT_SHIFT_LEFT);
39522 +               if (result != 0) {
39523 +                       key_warning(key, inode, result);
39524 +                       zrelse(loaded);
39525 +                       return result;
39526 +               }
39527 +               if (loaded != coord->node) {
39528 +                 /* reiser4_resize_item moved coord to another node.
39529 +                    Zload it */
39530 +                       zrelse(loaded);
39531 +                       coord_clear_iplug(coord);
39532 +                       result = zload(coord->node);
39533 +                       if (result != 0)
39534 +                               return result;  /* old node was released above */
39535 +                       loaded = coord->node;
39536 +               }
39537 +               assert("edward-1442",
39538 +                      !check_sd_resize(inode, coord,
39539 +                                       data.length, 1/* after resize */));
39540 +       }
39541 +       area = item_body_by_coord(coord);
39542 +       spin_lock_inode(inode);
39543 +       result = data.iplug->s.sd.save(inode, &area);
39544 +       znode_make_dirty(coord->node);
39545 +
39546 +       /* re-initialise stat-data seal */
39547 +
39548 +       /*
39549 +        * coord.between was possibly skewed from AT_UNIT when stat-data size
39550 +        * was changed and new extensions were pasted into item.
39551 +        */
39552 +       coord->between = AT_UNIT;
39553 +       reiser4_seal_init(&state->sd_seal, coord, key);
39554 +       state->sd_coord = *coord;
39555 +       spin_unlock_inode(inode);
39556 +       check_inode_seal(inode, coord, key);
39557 +       zrelse(loaded);
39558 +       return result;  /* status of ->s.sd.save() */
39559 +}
39560 +
39561 +/* Update existing stat-data in a tree. NOTE(review): "called and returns with
39562 +   inode state locked" looks stale: callees take spin_lock_inode() themselves. */
39563 +static int update_sd(struct inode *inode/* inode to update sd for */)
39564 +{
39565 +       int result;
39566 +       reiser4_key key;
39567 +       coord_t coord;
39568 +       lock_handle lh;
39569 +
39570 +       assert("nikita-726", inode != NULL);
39571 +
39572 +       /* no stat-data, nothing to update?! */
39573 +       assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39574 +
39575 +       init_lh(&lh);
39576 +       /* find and write-lock the stat-data item, then rewrite it in place */
39577 +       result = locate_inode_sd(inode, &key, &coord, &lh);
39578 +       if (result == 0)
39579 +               result = update_sd_at(inode, &coord, &key, &lh);
39580 +       done_lh(&lh);
39581 +
39582 +       return result;
39583 +}
39584 +
39585 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39586 +   Removes the object's stat data, releases its oid and deletes its unlink
39587 +   safe-link. Space for that must be reserved by the caller beforehand. */
39588 +static int
39589 +common_object_delete_no_reserve(struct inode *inode/* object to remove */)
39590 +{
39591 +       int result;
39592 +
39593 +       assert("nikita-1477", inode != NULL);
39594 +
39595 +       if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39596 +               reiser4_key sd_key;
39597 +
39598 +               dquot_free_inode(inode);
39599 +               dquot_drop(inode);
39600 +               /* cut with identical from/to keys: the stat-data key itself */
39601 +               build_sd_key(inode, &sd_key);
39602 +               result =
39603 +                   reiser4_cut_tree(reiser4_tree_by_inode(inode),
39604 +                                    &sd_key, &sd_key, NULL, 0);
39605 +               if (result == 0) {
39606 +                       reiser4_inode_set_flag(inode, REISER4_NO_SD);
39607 +                       result = oid_release(inode->i_sb, get_inode_oid(inode));
39608 +                       if (result == 0) {
39609 +                               oid_count_released();
39610 +
39611 +                               result = safe_link_del(reiser4_tree_by_inode(inode),
39612 +                                                      get_inode_oid(inode),
39613 +                                                      SAFE_UNLINK);
39614 +                       }
39615 +               }
39616 +       } else
39617 +               result = 0;     /* REISER4_NO_SD set: nothing to remove */
39618 +       return result;
39619 +}
39620 +
39621 +/* helper for safelink_common: set @inode's size to @size via its ->setattr() */
39622 +static int process_truncate(struct inode *inode, __u64 size)
39623 +{
39624 +       int result;
39625 +       struct iattr attr;
39626 +       file_plugin *fplug;     /* assigned below but never used */
39627 +       reiser4_context *ctx;
39628 +       struct dentry dentry;   /* on-stack stub: only d_inode is filled in */
39629 +
39630 +       assert("vs-21", is_in_reiser4_context());
39631 +       ctx = reiser4_init_context(inode->i_sb);
39632 +       assert("vs-22", !IS_ERR(ctx));
39633 +       /* NOTE(review): with assertions off an ERR_PTR ctx would be used unchecked */
39634 +       attr.ia_size = size;
39635 +       attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39636 +       fplug = inode_file_plugin(inode);
39637 +
39638 +       mutex_lock(&inode->i_mutex);
39639 +       assert("vs-1704", get_current_context()->trans->atom == NULL);
39640 +       dentry.d_inode = inode;
39641 +       result = inode->i_op->setattr(&dentry, &attr);
39642 +       mutex_unlock(&inode->i_mutex);
39643 +
39644 +       context_set_commit_async(ctx);
39645 +       reiser4_exit_context(ctx);
39646 +
39647 +       return result;
39648 +}
39649 +
39650 +/*
39651 +  Local variables:
39652 +  c-indentation-style: "K&R"
39653 +  mode-name: "LC"
39654 +  c-basic-offset: 8
39655 +  tab-width: 8
39656 +  fill-column: 80
39657 +  scroll-step: 1
39658 +  End:
39659 +*/
39660 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/hash.c linux-2.6.35/fs/reiser4/plugin/hash.c
39661 --- linux-2.6.35.orig/fs/reiser4/plugin/hash.c  1970-01-01 01:00:00.000000000 +0100
39662 +++ linux-2.6.35/fs/reiser4/plugin/hash.c       2010-08-04 15:44:57.000000000 +0200
39663 @@ -0,0 +1,352 @@
39664 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39665 + * reiser4/README */
39666 +
39667 +/* Hash functions */
39668 +
39669 +#include "../debug.h"
39670 +#include "plugin_header.h"
39671 +#include "plugin.h"
39672 +#include "../super.h"
39673 +#include "../inode.h"
39674 +
39675 +#include <linux/types.h>
39676 +
39677 +/* old rupasov (yura) hash; treats @name as decimal digits (each byte minus 48) */
39678 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39679 +                         int len/* @name's length */)
39680 +{
39681 +       int i;
39682 +       int j;
39683 +       int pow;
39684 +       __u64 a;
39685 +       __u64 c;
39686 +
39687 +       assert("nikita-672", name != NULL);
39688 +       assert("nikita-673", len >= 0);
39689 +       /* pow = 10^(len - 1); NOTE(review): pow is int, overflows for len > 10 */
39690 +       for (pow = 1, i = 1; i < len; ++i)
39691 +               pow = pow * 10;
39692 +
39693 +       if (len == 1)
39694 +               a = name[0] - 48;
39695 +       else
39696 +               a = (name[0] - 48) * pow;
39697 +
39698 +       for (i = 1; i < len; ++i) {
39699 +               c = name[i] - 48;
39700 +               for (pow = 1, j = i; j < len - 1; ++j)
39701 +                       pow = pow * 10;
39702 +               a = a + c * pow;
39703 +       }
39704 +       for (; i < 40; ++i) {
39705 +               c = '0' - 48;   /* '0' is 48 in ASCII, so c == 0 here */
39706 +               for (pow = 1, j = i; j < len - 1; ++j)
39707 +                       pow = pow * 10;
39708 +               a = a + c * pow;
39709 +       }
39710 +
39711 +       for (; i < 256; ++i) {
39712 +               c = i;
39713 +               for (pow = 1, j = i; j < len - 1; ++j)
39714 +                       pow = pow * 10;
39715 +               a = a + c * pow;
39716 +       }
39717 +
39718 +       a = a << 7;
39719 +       return a;
39720 +}
39721 +
39722 +/* r5 hash; walks @name up to its NUL terminator, @len is only asserted */
39723 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39724 +                    int len UNUSED_ARG/* @name's length */)
39725 +{
39726 +       __u64 a = 0;
39727 +
39728 +       assert("nikita-674", name != NULL);
39729 +       assert("nikita-675", len >= 0);
39730 +       /* per byte: add it shifted left by 4 and right by 4, then multiply by 11 */
39731 +       while (*name) {
39732 +               a += *name << 4;
39733 +               a += *name >> 4;
39734 +               a *= 11;
39735 +               name++;
39736 +       }
39737 +       return a;
39738 +}
39739 +
39740 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function
39741 +     H0 = Key
39742 +     Hi = E Mi(Hi-1) + Hi-1
39743 +
39744 +   (see Applied Cryptography, 2nd edition, p448).
39745 +
39746 +   Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39747 +
39748 +   Jeremy has agreed to the contents of reiserfs/README. -Hans
39749 +
39750 +   This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39751 +*/
39752 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39753 +                     int len/* @name's length */)
39754 +{
39755 +       __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39756 +
39757 +       __u64 h0 = k[0], h1 = k[1];
39758 +       __u64 a, b, c, d;
39759 +       __u64 pad;
39760 +       int i;
39761 +
39762 +       assert("nikita-676", name != NULL);
39763 +       assert("nikita-677", len >= 0);
39764 +
39765 +#define DELTA 0x9E3779B9u
39766 +#define FULLROUNDS 10          /* 32 is overkill, 16 is strong crypto */
39767 +#define PARTROUNDS 6           /* 6 gets complete mixing */
39768 +
39769 +/* a, b, c, d - data; h0, h1 - accumulated hash */
39770 +#define TEACORE(rounds)                                                        \
39771 +       do {                                                            \
39772 +               __u64 sum = 0;                                          \
39773 +               int n = rounds;                                         \
39774 +               __u64 b0, b1;                                           \
39775 +                                                                       \
39776 +               b0 = h0;                                                \
39777 +               b1 = h1;                                                \
39778 +                                                                       \
39779 +               do {                                                    \
39780 +                       sum += DELTA;                                   \
39781 +                       b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39782 +                       b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39783 +               } while (--n);                                          \
39784 +                                                                       \
39785 +               h0 += b0;                                               \
39786 +               h1 += b1;                                               \
39787 +       } while (0)
39788 +       /* pad: len or'ed with itself shifted left by 8, 16 and 24 bits */
39789 +       pad = (__u64) len | ((__u64) len << 8);
39790 +       pad |= pad << 16;
39791 +
39792 +       while (len >= 16) {
39793 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39794 +                   16 | (__u64) name[3] << 24;
39795 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39796 +                   16 | (__u64) name[7] << 24;
39797 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39798 +                   16 | (__u64) name[11] << 24;
39799 +               d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39800 +                   << 16 | (__u64) name[15] << 24;
39801 +
39802 +               TEACORE(PARTROUNDS);
39803 +
39804 +               len -= 16;
39805 +               name += 16;
39806 +       }
39807 +
39808 +       if (len >= 12) {
39809 +               /* assert(len < 16); holds because the loop above exits with len < 16 */
39810 +               if (len >= 16)
39811 +                       *(int *)0 = 0;  /* deliberate null store; never reached */
39812 +
39813 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39814 +                   16 | (__u64) name[3] << 24;
39815 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39816 +                   16 | (__u64) name[7] << 24;
39817 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39818 +                   16 | (__u64) name[11] << 24;
39819 +
39820 +               d = pad;
39821 +               for (i = 12; i < len; i++) {
39822 +                       d <<= 8;
39823 +                       d |= name[i];
39824 +               }
39825 +       } else if (len >= 8) {
39826 +               /* assert(len < 12); holds because the previous branch took len >= 12 */
39827 +               if (len >= 12)
39828 +                       *(int *)0 = 0;
39829 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39830 +                   16 | (__u64) name[3] << 24;
39831 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39832 +                   16 | (__u64) name[7] << 24;
39833 +
39834 +               c = d = pad;
39835 +               for (i = 8; i < len; i++) {
39836 +                       c <<= 8;
39837 +                       c |= name[i];
39838 +               }
39839 +       } else if (len >= 4) {
39840 +               /* assert(len < 8); holds because the previous branch took len >= 8 */
39841 +               if (len >= 8)
39842 +                       *(int *)0 = 0;
39843 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39844 +                   16 | (__u64) name[3] << 24;
39845 +
39846 +               b = c = d = pad;
39847 +               for (i = 4; i < len; i++) {
39848 +                       b <<= 8;
39849 +                       b |= name[i];
39850 +               }
39851 +       } else {
39852 +               /* assert(len < 4); holds because the previous branch took len >= 4 */
39853 +               if (len >= 4)
39854 +                       *(int *)0 = 0;
39855 +               a = b = c = d = pad;
39856 +               for (i = 0; i < len; i++) {
39857 +                       a <<= 8;
39858 +                       a |= name[i];
39859 +               }
39860 +       }
39861 +
39862 +       TEACORE(FULLROUNDS);
39863 +
39864 +/*     return 0;*/
39865 +       return h0 ^ h1;
39866 +
39867 +}
39868 +
39869 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
39870 +
39871 +   See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
39872 +
39873 +   Excerpts:
39874 +
39875 +     FNV hashes are designed to be fast while maintaining a low collision
39876 +     rate.
39877 +
39878 +     [This version also seems to preserve lexicographical order locally.]
39879 +
39880 +     FNV hash algorithms and source code have been released into the public
39881 +     domain.
39882 +
39883 +*/
39884 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
39885 +                      int len UNUSED_ARG/* @name's length */)
39886 +{
39887 +       unsigned long long a = 0xcbf29ce484222325ull;
39888 +       const unsigned long long fnv_64_prime = 0x100000001b3ull;
39889 +
39890 +       assert("nikita-678", name != NULL);
39891 +       assert("nikita-679", len >= 0);
39892 +
39893 +       /* FNV-1 hash each octet up to the NUL terminator (@len is not used) */
39894 +       for (; *name; ++name) {
39895 +               /* multiply by the 64 bit FNV magic prime mod 2^64 */
39896 +               a *= fnv_64_prime;
39897 +               /* xor the bottom with the current octet */
39898 +               a ^= (unsigned long long)(*name);
39899 +       }
39900 +       /* return our new hash value */
39901 +       return a;
39902 +}
39903 +
39904 +/* degenerate hash function: returns one constant for every name, used to
39905 +   simplify testing of non-unique key handling */
39906 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
39907 +                     int len UNUSED_ARG/* @name's length */)
39908 +{
39909 +       return 0xc0c0c0c010101010ull;
39910 +}
39911 +
39912 +static int change_hash(struct inode *inode,
39913 +                      reiser4_plugin * plugin,
39914 +                      pset_member memb /* unused: PSET_HASH is hard-coded below */)
39915 +{
39916 +       int result;
39917 +       /* ->change method of hash plugins: make @plugin the hash of @inode */
39918 +       assert("nikita-3503", inode != NULL);
39919 +       assert("nikita-3504", plugin != NULL);
39920 +
39921 +       assert("nikita-3505", is_reiser4_inode(inode));
39922 +       assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
39923 +       /* the hash can only be changed on directory objects */
39924 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
39925 +               return RETERR(-EINVAL);
39926 +
39927 +       result = 0;
39928 +       if (inode_hash_plugin(inode) == NULL ||
39929 +           inode_hash_plugin(inode)->h.id != plugin->h.id) {
39930 +               if (is_dir_empty(inode) == 0)
39931 +                       result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
39932 +                                                PSET_HASH, plugin);
39933 +               else
39934 +                       result = RETERR(-ENOTEMPTY);
39935 +
39936 +       }
39937 +       return result;
39938 +}
39939 +
39940 +static reiser4_plugin_ops hash_plugin_ops = {  /* shared by every entry of hash_plugins[] */
39941 +       .init = NULL,
39942 +       .load = NULL,
39943 +       .save_len = NULL,
39944 +       .save = NULL,
39945 +       .change = change_hash
39946 +};
39947 +
39948 +/* table of built-in hash plugins, indexed by hash plugin id */
39949 +hash_plugin hash_plugins[LAST_HASH_ID] = {
39950 +       [RUPASOV_HASH_ID] = {
39951 +               .h = {
39952 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39953 +                       .id = RUPASOV_HASH_ID,
39954 +                       .pops = &hash_plugin_ops,
39955 +                       .label = "rupasov",
39956 +                       .desc = "Original Yura's hash",
39957 +                       .linkage = {NULL, NULL}
39958 +               },
39959 +               .hash = hash_rupasov
39960 +       },
39961 +       [R5_HASH_ID] = {
39962 +               .h = {
39963 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39964 +                       .id = R5_HASH_ID,
39965 +                       .pops = &hash_plugin_ops,
39966 +                       .label = "r5",
39967 +                       .desc = "r5 hash",
39968 +                       .linkage = {NULL, NULL}
39969 +               },
39970 +               .hash = hash_r5
39971 +       },
39972 +       [TEA_HASH_ID] = {
39973 +               .h = {
39974 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39975 +                       .id = TEA_HASH_ID,
39976 +                       .pops = &hash_plugin_ops,
39977 +                       .label = "tea",
39978 +                       .desc = "tea hash",
39979 +                       .linkage = {NULL, NULL}
39980 +               },
39981 +               .hash = hash_tea
39982 +       },
39983 +       [FNV1_HASH_ID] = {
39984 +               .h = {
39985 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39986 +                       .id = FNV1_HASH_ID,
39987 +                       .pops = &hash_plugin_ops,
39988 +                       .label = "fnv1",
39989 +                       .desc = "fnv1 hash",
39990 +                       .linkage = {NULL, NULL}
39991 +               },
39992 +               .hash = hash_fnv1
39993 +       },
39994 +       [DEGENERATE_HASH_ID] = {
39995 +               .h = {
39996 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39997 +                       .id = DEGENERATE_HASH_ID,
39998 +                       .pops = &hash_plugin_ops,
39999 +                       .label = "degenerate hash",
40000 +                       .desc = "Degenerate hash: only for testing",
40001 +                       .linkage = {NULL, NULL}
40002 +               },
40003 +               .hash = hash_deg
40004 +       }
40005 +};
40006 +
40007 +/* Make Linus happy.
40008 +   Local variables:
40009 +   c-indentation-style: "K&R"
40010 +   mode-name: "LC"
40011 +   c-basic-offset: 8
40012 +   tab-width: 8
40013 +   fill-column: 120
40014 +   End:
40015 +*/
40016 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.35/fs/reiser4/plugin/inode_ops.c
40017 --- linux-2.6.35.orig/fs/reiser4/plugin/inode_ops.c     1970-01-01 01:00:00.000000000 +0100
40018 +++ linux-2.6.35/fs/reiser4/plugin/inode_ops.c  2010-08-04 15:44:57.000000000 +0200
40019 @@ -0,0 +1,916 @@
40020 +/*
40021 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
40022 + */
40023 +
40024 +/*
40025 + * this file contains typical implementations for most of methods of struct
40026 + * inode_operations
40027 + */
40028 +
40029 +#include "../inode.h"
40030 +#include "../safe_link.h"
40031 +
40032 +#include <linux/quotaops.h>
40033 +#include <linux/namei.h>
40034 +
40035 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
40036 +                     reiser4_object_create_data *data);
40037 +
40038 +/**
40039 + * reiser4_create_common - create method of inode operations
40040 + * @parent: inode of parent directory
40041 + * @dentry: dentry of new object to create
40042 + * @mode: the permissions to use
40043 + * @nameidata: not used by this implementation
40044 + *
40045 + * This is the common implementation of the VFS create method of struct
40046 + * inode_operations. Creates a regular file with the parent's child-create
40047 + * plugin, or its create plugin if none; -EIO if that is not a regular one.
40048 + */
40049 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
40050 +                         int mode, struct nameidata *nameidata)
40051 +{
40052 +       reiser4_object_create_data data;
40053 +       file_plugin *fplug;
40054 +
40055 +       memset(&data, 0, sizeof data);
40056 +       data.mode = S_IFREG | mode;
40057 +       fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
40058 +       if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
40059 +               warning("vpf-1900", "'%s' is not a regular file plugin.",
40060 +                       fplug->h.label);
40061 +               return RETERR(-EIO);
40062 +       }
40063 +       data.id = fplug->h.id;
40064 +       return create_vfs_object(parent, dentry, &data);
40065 +}
40066 +
40067 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
40068 +void check_light_weight(struct inode *inode, struct inode *parent);
40069 +
40070 +/**
40071 + * reiser4_lookup_common - lookup of inode operations
40072 + * @parent: inode of directory to lookup into
40073 + * @dentry: name to look for
40074 + * @nameidata: not referenced by this implementation
40075 + *
40076 + * This is common implementation of vfs's lookup method of struct
40077 + * inode_operations. Returns d_splice_alias() result, NULL or an ERR_PTR().
40078 + */
40079 +struct dentry *reiser4_lookup_common(struct inode *parent,
40080 +                                    struct dentry *dentry,
40081 +                                    struct nameidata *nameidata)
40082 +{
40083 +       reiser4_context *ctx;
40084 +       int result;
40085 +       struct dentry *new;
40086 +       struct inode *inode;
40087 +       reiser4_dir_entry_desc entry;
40088 +
40089 +       ctx = reiser4_init_context(parent->i_sb);
40090 +       if (IS_ERR(ctx))
40091 +               return (struct dentry *)ctx;
40092 +
40093 +       /* set up operations on dentry. */
40094 +       dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
40095 +
40096 +       result = reiser4_lookup_name(parent, dentry, &entry.key);
40097 +       if (result) {
40098 +               context_set_commit_async(ctx);
40099 +               reiser4_exit_context(ctx);
40100 +               if (result == -ENOENT) {
40101 +                       /* not found: d_add() NULL inode unless @parent is dead */
40102 +                       if (!IS_DEADDIR(parent))
40103 +                               d_add(dentry, NULL);
40104 +                       return NULL;
40105 +               }
40106 +               return ERR_PTR(result);
40107 +       }
40108 +
40109 +       inode = reiser4_iget(parent->i_sb, &entry.key, 0);
40110 +       if (IS_ERR(inode)) {
40111 +               context_set_commit_async(ctx);
40112 +               reiser4_exit_context(ctx);
40113 +               return ERR_PTR(PTR_ERR(inode));
40114 +       }
40115 +
40116 +       /* success: attach inode to the dentry, then finish the iget */
40117 +       check_light_weight(inode, parent);
40118 +       new = d_splice_alias(inode, dentry);
40119 +       reiser4_iget_complete(inode);
40120 +
40121 +       /* prevent balance_dirty_pages() from being called: we don't want to
40122 +        * do this under directory i_mutex. */
40123 +       context_set_commit_async(ctx);
40124 +       reiser4_exit_context(ctx);
40125 +       return new;
40126 +}
40127 +
40128 +static reiser4_block_nr common_estimate_link(struct inode *parent,
40129 +                                            struct inode *object);
40130 +int reiser4_update_dir(struct inode *);
40131 +
40132 +/**
40133 + * reiser4_link_common - link of inode operations
40134 + * @existing: dentry of object which is to get new name
40135 + * @parent: directory where new name is to be created
40136 + * @newname: new name
40137 + *
40138 + * This is common implementation of vfs's link method of struct
40139 + * inode_operations. Returns 0 on success or an error code.
40140 + */
40141 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
40142 +                       struct dentry *newname)
40143 +{
40144 +       reiser4_context *ctx;
40145 +       int result;
40146 +       struct inode *object;
40147 +       dir_plugin *parent_dplug;
40148 +       reiser4_dir_entry_desc entry;
40149 +       reiser4_object_create_data data;
40150 +       reiser4_block_nr reserve;
40151 +
40152 +       ctx = reiser4_init_context(parent->i_sb);
40153 +       if (IS_ERR(ctx))
40154 +               return PTR_ERR(ctx);
40155 +
40156 +       assert("nikita-1431", existing != NULL);
40157 +       assert("nikita-1432", parent != NULL);
40158 +       assert("nikita-1433", newname != NULL);
40159 +
40160 +       object = existing->d_inode;
40161 +       assert("nikita-1434", object != NULL);
40162 +
40163 +       /* check for race with create_object() */
40164 +       if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
40165 +               context_set_commit_async(ctx);
40166 +               reiser4_exit_context(ctx);
40167 +               return RETERR(-E_REPEAT);
40168 +       }
40169 +
40170 +       parent_dplug = inode_dir_plugin(parent);
40171 +
40172 +       memset(&entry, 0, sizeof entry);
40173 +       entry.obj = object;
40174 +       memset(&data, 0, sizeof data);  /* only mode and id are set below */
40175 +       data.mode = object->i_mode;
40176 +       data.id = inode_file_plugin(object)->h.id;
40177 +
40178 +       reserve = common_estimate_link(parent, existing->d_inode);
40179 +       if ((__s64) reserve < 0) {
40180 +               context_set_commit_async(ctx);
40181 +               reiser4_exit_context(ctx);
40182 +               return reserve;
40183 +       }
40184 +
40185 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40186 +               context_set_commit_async(ctx);
40187 +               reiser4_exit_context(ctx);
40188 +               return RETERR(-ENOSPC);
40189 +       }
40190 +
40191 +       /*
40192 +        * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
40193 +        * means that link(2) can race against unlink(2) or rename(2), and
40194 +        * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
40195 +        *
40196 +        * For such inode we have to undo special processing done in
40197 +        * reiser4_unlink() viz. creation of safe-link.
40198 +        */
40199 +       if (unlikely(object->i_nlink == 0)) {
40200 +               result = safe_link_del(reiser4_tree_by_inode(object),
40201 +                                      get_inode_oid(object), SAFE_UNLINK);
40202 +               if (result != 0) {
40203 +                       context_set_commit_async(ctx);
40204 +                       reiser4_exit_context(ctx);
40205 +                       return result;
40206 +               }
40207 +       }
40208 +
40209 +       /* increment nlink of @existing and update its stat data */
40210 +       result = reiser4_add_nlink(object, parent, 1);
40211 +       if (result == 0) {
40212 +               /* add entry to the parent */
40213 +               result =
40214 +                   parent_dplug->add_entry(parent, newname, &data, &entry);
40215 +               if (result != 0) {
40216 +                       /* failed to add entry to the parent, decrement nlink
40217 +                          of @existing */
40218 +                       reiser4_del_nlink(object, parent, 1);
40219 +                       /*
40220 +                        * now, if that failed, we have a file with too big
40221 +                        * nlink---space leak, much better than directory
40222 +                        * entry pointing to nowhere
40223 +                        */
40224 +               }
40225 +       }
40226 +       if (result == 0) {
40227 +               atomic_inc(&object->i_count);
40228 +               /*
40229 +                * Upon successful completion, link() shall mark for update
40230 +                * the st_ctime field of the file. Also, the st_ctime and
40231 +                * st_mtime fields of the directory that contains the new
40232 +                * entry shall be marked for update. --SUS
40233 +                */
40234 +               result = reiser4_update_dir(parent);
40235 +       }
40236 +       if (result == 0)
40237 +               d_instantiate(newname, existing->d_inode);
40238 +
40239 +       context_set_commit_async(ctx);
40240 +       reiser4_exit_context(ctx);
40241 +       return result;
40242 +}
40243 +
40244 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40245 +
40246 +/**
40247 + * reiser4_unlink_common - unlink of inode operations
40248 + * @parent: inode of directory to remove name from
40249 + * @victim: name to be removed
40250 + *
40251 + * This is common implementation of vfs's unlink method of struct
40252 + * inode_operations. Returns 0 once the name is removed, else an error code.
40253 + */
40254 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40255 +{
40256 +       reiser4_context *ctx;
40257 +       int result;
40258 +       struct inode *object;
40259 +       file_plugin *fplug;
40260 +
40261 +       ctx = reiser4_init_context(parent->i_sb);
40262 +       if (IS_ERR(ctx))
40263 +               return PTR_ERR(ctx);
40264 +
40265 +       object = victim->d_inode;
40266 +       fplug = inode_file_plugin(object);
40267 +       assert("nikita-2882", fplug->detach != NULL);
40268 +
40269 +       result = unlink_check_and_grab(parent, victim);
40270 +       if (result != 0) {
40271 +               context_set_commit_async(ctx);
40272 +               reiser4_exit_context(ctx);
40273 +               return result;
40274 +       }
40275 +
40276 +       result = fplug->detach(object, parent);
40277 +       if (result == 0) {
40278 +               dir_plugin *parent_dplug;
40279 +               reiser4_dir_entry_desc entry;
40280 +
40281 +               parent_dplug = inode_dir_plugin(parent);
40282 +               memset(&entry, 0, sizeof entry);
40283 +
40284 +               /* first, delete directory entry */
40285 +               result = parent_dplug->rem_entry(parent, victim, &entry);
40286 +               if (result == 0) {
40287 +                       /*
40288 +                        * if name was removed successfully, we _have_ to
40289 +                        * return 0 from this function, because upper level
40290 +                        * caller (vfs_{rmdir,unlink}) expect this.
40291 +                        *
40292 +                        * now that directory entry is removed, update
40293 +                        * stat-data (return values below are ignored)
40294 +                        */
40295 +                       reiser4_del_nlink(object, parent, 1);
40296 +                       /*
40297 +                        * Upon successful completion, unlink() shall mark for
40298 +                        * update the st_ctime and st_mtime fields of the
40299 +                        * parent directory. Also, if the file's link count is
40300 +                        * not 0, the st_ctime field of the file shall be
40301 +                        * marked for update. --SUS
40302 +                        */
40303 +                       reiser4_update_dir(parent);
40304 +                       /* add safe-link for this file */
40305 +                       if (object->i_nlink == 0)
40306 +                               safe_link_add(object, SAFE_UNLINK);
40307 +               }
40308 +       }
40309 +
40310 +       if (unlikely(result != 0)) {
40311 +               if (result != -ENOMEM)
40312 +                       warning("nikita-3398", "Cannot unlink %llu (%i)",
40313 +                               (unsigned long long)get_inode_oid(object),
40314 +                               result);
40315 +               /* if operation failed commit pending inode modifications to
40316 +                * the stat-data */
40317 +               reiser4_update_sd(object);
40318 +               reiser4_update_sd(parent);
40319 +       }
40320 +
40321 +       reiser4_release_reserved(object->i_sb);
40322 +
40323 +       /* NOTE(review): no ->rem_link() call is visible in this function; presumably reiser4_del_nlink() updated @object's i_ctime - confirm. */
40324 +
40325 +       /* @victim can be already removed from the disk by this time. Inode is
40326 +          then marked so that iput() wouldn't try to remove stat data. But
40327 +          inode itself is still there.
40328 +        */
40329 +
40330 +       /*
40331 +        * we cannot release directory semaphore here, because name has
40332 +        * already been deleted, but dentry (@victim) still exists.  Prevent
40333 +        * balance_dirty_pages() from being called on exiting this context: we
40334 +        * don't want to do this under directory i_mutex.
40335 +        */
40336 +       context_set_commit_async(ctx);
40337 +       reiser4_exit_context(ctx);
40338 +       return result;
40339 +}
40340 +
40341 +/**
40342 + * reiser4_symlink_common - symlink of inode operations
40343 + * @parent: inode of parent directory
40344 + * @dentry: dentry of object to be created
40345 + * @linkname: string symlink is to contain
40346 + *
40347 + * This is common implementation of vfs's symlink method of struct
40348 + * inode_operations. Returns whatever create_vfs_object() returns.
40349 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
40350 + */
40351 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40352 +                          const char *linkname)
40353 +{
40354 +       reiser4_object_create_data data;
40355 +
40356 +       memset(&data, 0, sizeof data);  /* fields not set below stay zero */
40357 +       data.name = linkname;
40358 +       data.id = SYMLINK_FILE_PLUGIN_ID;
40359 +       data.mode = S_IFLNK | S_IRWXUGO;        /* mode is fixed, not caller-supplied */
40360 +       return create_vfs_object(parent, dentry, &data);
40361 +}
40362 +
40363 +/**
40364 + * reiser4_mkdir_common - mkdir of inode operations
40365 + * @parent: inode of parent directory
40366 + * @dentry: dentry of object to be created
40367 + * @mode: the permissions to use
40368 + *
40369 + * This is common implementation of vfs's mkdir method of struct
40370 + * inode_operations. Returns whatever create_vfs_object() returns.
40371 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
40372 + */
40373 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40374 +{
40375 +       reiser4_object_create_data data;
40376 +
40377 +       memset(&data, 0, sizeof data);  /* fields not set below stay zero */
40378 +       data.mode = S_IFDIR | mode;     /* force the directory type bit */
40379 +       data.id = DIRECTORY_FILE_PLUGIN_ID;
40380 +       return create_vfs_object(parent, dentry, &data);
40381 +}
40382 +
40383 +/**
40384 + * reiser4_mknod_common - mknod of inode operations
40385 + * @parent: inode of parent directory
40386 + * @dentry: dentry of object to be created
40387 + * @mode: the permissions to use and file type
40388 + * @rdev: minor and major of new device file
40389 + *
40390 + * This is common implementation of vfs's mknod method of struct
40391 + * inode_operations. Returns whatever create_vfs_object() returns.
40392 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
40393 + */
40394 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40395 +                        int mode, dev_t rdev)
40396 +{
40397 +       reiser4_object_create_data data;
40398 +
40399 +       memset(&data, 0, sizeof data);  /* fields not set below stay zero */
40400 +       data.mode = mode;       /* used as is: no type bit is OR-ed in here */
40401 +       data.rdev = rdev;
40402 +       data.id = SPECIAL_FILE_PLUGIN_ID;
40403 +       return create_vfs_object(parent, dentry, &data);
40404 +}
40405 +
40406 +/*
40407 + * implementation of vfs's rename method of struct inode_operations for typical
40408 + * directory is in inode_ops_rename.c
40409 + */
40410 +
40411 +/**
40412 + * reiser4_follow_link_common - follow_link of inode operations
40413 + * @dentry: dentry of symlink
40414 + * @nd: nameidata handed to nd_set_link() together with the link content
40415 + *
40416 + * This is common implementation of vfs's followlink method of struct
40417 + * inode_operations. Returns NULL, or ERR_PTR(RETERR(-EINVAL)) if no content.
40418 + * Assumes that inode's i_private points to the content of symbolic link.
40419 + */
40420 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40421 +{
40422 +       assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40423 +
40424 +       if (!dentry->d_inode->i_private
40425 +           || !reiser4_inode_get_flag(dentry->d_inode,
40426 +                                      REISER4_GENERIC_PTR_USED))
40427 +               return ERR_PTR(RETERR(-EINVAL));
40428 +       nd_set_link(nd, dentry->d_inode->i_private);
40429 +       return NULL;
40430 +}
40431 +
40432 +/**
40433 + * reiser4_permission_common - permission of inode operations
40434 + * @inode: inode to check permissions for
40435 + * @mask: mode bits to check permissions for
40436 + *
40437 + * Delegates to generic_permission() with a NULL third argument and
40438 + * returns its result unchanged.
40439 + */
40440 +int reiser4_permission_common(struct inode *inode, int mask)
40441 +{
40442 +       return generic_permission(inode, mask, NULL);
40443 +}
40444 +
40445 +static int setattr_reserve(reiser4_tree *);
40446 +
40447 +/* this is common implementation of vfs's setattr method of struct
40448 +   inode_operations. Returns 0 on success or an error code.
40449 +*/
40450 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40451 +{
40452 +       reiser4_context *ctx;
40453 +       struct inode *inode;
40454 +       int result;
40455 +
40456 +       inode = dentry->d_inode;
40457 +       result = inode_change_ok(inode, attr);
40458 +       if (result)
40459 +               return result;
40460 +
40461 +       ctx = reiser4_init_context(inode->i_sb);
40462 +       if (IS_ERR(ctx))
40463 +               return PTR_ERR(ctx);
40464 +
40465 +       assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40466 +
40467 +       /*
40468 +        * grab disk space, transfer quota on owner change, copy attributes.
40469 +        */
40470 +       result = setattr_reserve(reiser4_tree_by_inode(inode));
40471 +       if (!result) {
40472 +               if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40473 +                   || (attr->ia_valid & ATTR_GID
40474 +                       && attr->ia_gid != inode->i_gid)) {
40475 +                       result = dquot_transfer(inode, attr) ? -EDQUOT : 0;
40476 +                       if (result) {
40477 +                               context_set_commit_async(ctx);
40478 +                               reiser4_exit_context(ctx);
40479 +                               return result;
40480 +                       }
40481 +               }
40482 +
40483 +               if ((attr->ia_valid & ATTR_SIZE) &&
40484 +                   attr->ia_size != i_size_read(inode))
40485 +                       result = vmtruncate(inode, attr->ia_size);
40486 +               if (result)     /* can only be set by vmtruncate() above */
40487 +                       goto result_error;
40488 +               setattr_copy(inode, attr);
40489 +               mark_inode_dirty(inode);
40490 +               result = 0;
40491 +
40492 +
40493 +result_error:
40494 +               if (!result) {
40495 +                       reiser4_update_sd(inode);
40496 +               }
40497 +       }
40498 +
40499 +       context_set_commit_async(ctx);
40500 +       reiser4_exit_context(ctx);
40501 +       return result;
40502 +}
40503 +
40504 +/* this is common implementation of vfs's getattr method of struct
40505 +   inode_operations: fills @stat from fields of dentry->d_inode, returns 0
40506 +*/
40507 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40508 +                          struct dentry *dentry, struct kstat *stat)
40509 +{
40510 +       struct inode *obj;
40511 +
40512 +       assert("nikita-2298", dentry != NULL);
40513 +       assert("nikita-2299", stat != NULL);
40514 +       assert("nikita-2300", dentry->d_inode != NULL);
40515 +
40516 +       obj = dentry->d_inode;
40517 +
40518 +       stat->dev = obj->i_sb->s_dev;
40519 +       stat->ino = oid_to_uino(get_inode_oid(obj));
40520 +       stat->mode = obj->i_mode;
40521 +       /* don't confuse userland with huge nlink: clamp to 0x7fff. This is not
40522 +        * entirely correct, because nlink_t is not necessarily 16 bit signed. */
40523 +       stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40524 +       stat->uid = obj->i_uid;
40525 +       stat->gid = obj->i_gid;
40526 +       stat->rdev = obj->i_rdev;
40527 +       stat->atime = obj->i_atime;
40528 +       stat->mtime = obj->i_mtime;
40529 +       stat->ctime = obj->i_ctime;
40530 +       stat->size = obj->i_size;
40531 +       stat->blocks =  /* presumably bytes rounded up to VFS_BLKSIZE units */
40532 +           (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40533 +       /* "preferred" blocksize for efficient file system I/O */
40534 +       stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40535 +
40536 +       return 0;
40537 +}
40538 +
40539 +/* Estimate the maximum amount of nodes which might be allocated or changed on
40540 +   typical new object creation. Typical creation consists of calling create
40541 +   method of file plugin, adding directory entry to parent and updating parent
40542 +   directory's stat data. Removal of the entry (undo on failure) is included.
40543 +*/
40544 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,
40545 +                                                  /* parent object */
40546 +                                                  struct inode *object
40547 +                                                  /* object */)
40548 +{
40549 +       assert("vpf-309", parent != NULL);
40550 +       assert("vpf-307", object != NULL);
40551 +
40552 +       return
40553 +           /* object creation estimation */
40554 +           inode_file_plugin(object)->estimate.create(object) +
40555 +           /* stat data of parent directory estimation */
40556 +           inode_file_plugin(parent)->estimate.update(parent) +
40557 +           /* adding entry estimation */
40558 +           inode_dir_plugin(parent)->estimate.add_entry(parent) +
40559 +           /* to undo in the case of failure */
40560 +           inode_dir_plugin(parent)->estimate.rem_entry(parent);
40561 +}
40562 +
40563 +/* Create child in directory.
40564 +
40565 +   . get object's plugin
40566 +   . get fresh inode
40567 +   . initialize inode
40568 +   . add object's stat-data
40569 +   . initialize object's directory
40570 +   . add entry to the parent
40571 +   . instantiate dentry
40572 +   Returns 0 or an error; *retobj is set once the inode exists, even on error.
40573 +*/
40574 +static int do_create_vfs_child(reiser4_object_create_data * data,/* parameters
40575 +                                                                   of new
40576 +                                                                   object */
40577 +                              struct inode **retobj)
40578 +{
40579 +       int result;
40580 +
40581 +       struct dentry *dentry;  /* new name */
40582 +       struct inode *parent;   /* parent object */
40583 +
40584 +       dir_plugin *par_dir;    /* directory plugin on the parent */
40585 +       dir_plugin *obj_dir;    /* directory plugin on the new object */
40586 +       file_plugin *obj_plug;  /* object plugin on the new object */
40587 +       struct inode *object;   /* new object */
40588 +       reiser4_block_nr reserve;
40589 +
40590 +       reiser4_dir_entry_desc entry;   /* new directory entry */
40591 +
40592 +       assert("nikita-1420", data != NULL);
40593 +       parent = data->parent;
40594 +       dentry = data->dentry;
40595 +
40596 +       assert("nikita-1418", parent != NULL);
40597 +       assert("nikita-1419", dentry != NULL);
40598 +
40599 +       /* check, that name is acceptable for parent */
40600 +       par_dir = inode_dir_plugin(parent);
40601 +       if (par_dir->is_name_acceptable &&
40602 +           !par_dir->is_name_acceptable(parent,
40603 +                                        dentry->d_name.name,
40604 +                                        (int)dentry->d_name.len))
40605 +               return RETERR(-ENAMETOOLONG);
40606 +
40607 +       result = 0;
40608 +       obj_plug = file_plugin_by_id((int)data->id);
40609 +       if (obj_plug == NULL) {
40610 +               warning("nikita-430", "Cannot find plugin %i", data->id);
40611 +               return RETERR(-ENOENT);
40612 +       }
40613 +       object = new_inode(parent->i_sb);
40614 +       if (object == NULL)
40615 +               return RETERR(-ENOMEM);
40616 +       /* we'll update i_nlink below */
40617 +       object->i_nlink = 0;
40618 +       /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
40619 +        * to simplify error handling: if some error occurs before i_ino is
40620 +        * initialized with oid, i_ino should already be set to some
40621 +        * distinguished value. */
40622 +       object->i_ino = 0;
40623 +
40624 +       /* So that on error iput will be called. */
40625 +       *retobj = object;
40626 +
40627 +       if (dquot_alloc_inode(object)) {
40628 +               dquot_drop(object);
40629 +               object->i_flags |= S_NOQUOTA;
40630 +               return RETERR(-EDQUOT);
40631 +       }
40632 +
40633 +       memset(&entry, 0, sizeof entry);
40634 +       entry.obj = object;
40635 +
40636 +       set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40637 +                  file_plugin_to_plugin(obj_plug));
40638 +       result = obj_plug->set_plug_in_inode(object, parent, data);
40639 +       if (result) {
40640 +               warning("nikita-431", "Cannot install plugin %i on %llx",
40641 +                       data->id, (unsigned long long)get_inode_oid(object));
40642 +               dquot_free_inode(object);
40643 +               object->i_flags |= S_NOQUOTA;
40644 +               return result;
40645 +       }
40646 +
40647 +       /* reget plugin after installation */
40648 +       obj_plug = inode_file_plugin(object);
40649 +
40650 +       if (obj_plug->create_object == NULL) {
40651 +               dquot_free_inode(object);
40652 +               object->i_flags |= S_NOQUOTA;
40653 +               return RETERR(-EPERM);
40654 +       }
40655 +
40656 +       /* if any of hash, tail, sd or permission plugins for newly created
40657 +          object are not set yet set them here inheriting them from parent
40658 +          directory
40659 +        */
40660 +       assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40661 +       result = obj_plug->adjust_to_parent(object,
40662 +                                           parent,
40663 +                                           object->i_sb->s_root->d_inode);
40664 +       if (result == 0)
40665 +               result = finish_pset(object);
40666 +       if (result != 0) {
40667 +               warning("nikita-432", "Cannot inherit from %llx to %llx",
40668 +                       (unsigned long long)get_inode_oid(parent),
40669 +                       (unsigned long long)get_inode_oid(object));
40670 +               dquot_free_inode(object);
40671 +               object->i_flags |= S_NOQUOTA;
40672 +               return result;
40673 +       }
40674 +
40675 +       /* setup inode and file-operations for this inode */
40676 +       setup_inode_ops(object, data);
40677 +
40678 +       /* call file plugin's method to initialize plugin specific part of
40679 +        * inode */
40680 +       if (obj_plug->init_inode_data)
40681 +               obj_plug->init_inode_data(object, data, 1/*create */);
40682 +
40683 +       /* obtain directory plugin (if any) for new object. */
40684 +       obj_dir = inode_dir_plugin(object);
40685 +       if (obj_dir != NULL && obj_dir->init == NULL) {
40686 +               dquot_free_inode(object);
40687 +               object->i_flags |= S_NOQUOTA;
40688 +               return RETERR(-EPERM);
40689 +       }
40690 +
40691 +       reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40692 +
40693 +       reserve = estimate_create_vfs_object(parent, object);
40694 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40695 +               dquot_free_inode(object);
40696 +               object->i_flags |= S_NOQUOTA;
40697 +               return RETERR(-ENOSPC);
40698 +       }
40699 +
40700 +       /* mark inode `immutable'. We disable changes to the file being
40701 +          created until valid directory entry for it is inserted. Otherwise,
40702 +          if file were expanded and insertion of directory entry fails, we
40703 +          have to remove file, but we only alloted enough space in
40704 +          transaction to remove _empty_ file. 3.x code used to remove stat
40705 +          data in different transaction thus possibly leaking disk space on
40706 +          crash. This all only matters if it's possible to access file
40707 +          without name, for example, by inode number
40708 +        */
40709 +       reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40710 +
40711 +       /* create empty object, this includes allocation of new objectid. For
40712 +          directories this implies creation of dot and dotdot  */
40713 +       assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40714 +
40715 +       /* mark inode as `loaded'. From this point onward
40716 +          reiser4_delete_inode() will try to remove its stat-data. */
40717 +       reiser4_inode_set_flag(object, REISER4_LOADED);
40718 +
40719 +       result = obj_plug->create_object(object, parent, data);
40720 +       if (result != 0) {
40721 +               reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40722 +               if (result != -ENAMETOOLONG && result != -ENOMEM)
40723 +                       warning("nikita-2219",
40724 +                               "Failed to create sd for %llu",
40725 +                               (unsigned long long)get_inode_oid(object));
40726 +               dquot_free_inode(object);
40727 +               object->i_flags |= S_NOQUOTA;
40728 +               return result;
40729 +       }
40730 +
40731 +       if (obj_dir != NULL)
40732 +               result = obj_dir->init(object, parent, data);
40733 +       if (result == 0) {
40734 +               assert("nikita-434", !reiser4_inode_get_flag(object,
40735 +                                                            REISER4_NO_SD));
40736 +               /* insert inode into VFS hash table */
40737 +               insert_inode_hash(object);
40738 +               /* create entry */
40739 +               result = par_dir->add_entry(parent, dentry, data, &entry);
40740 +               if (result == 0) {
40741 +                       result = reiser4_add_nlink(object, parent, 0);
40742 +                       /* If O_CREAT is set and the file did not previously
40743 +                          exist, upon successful completion, open() shall
40744 +                          mark for update the st_atime, st_ctime, and
40745 +                          st_mtime fields of the file and the st_ctime and
40746 +                          st_mtime fields of the parent directory. --SUS
40747 +                        */
40748 +                       /* @object times are already updated by
40749 +                          reiser4_add_nlink() */
40750 +                       if (result == 0)
40751 +                               reiser4_update_dir(parent);
40752 +                       if (result != 0)
40753 +                               /* cleanup failure to add nlink */
40754 +                               par_dir->rem_entry(parent, dentry, &entry);
40755 +               }
40756 +               if (result != 0)
40757 +                       /* cleanup failure to add entry */
40758 +                       obj_plug->detach(object, parent);
40759 +       } else if (result != -ENOMEM)
40760 +               warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40761 +                       (unsigned long long)get_inode_oid(object), result);
40762 +
40763 +       /*
40764 +        * update stat-data, committing all pending modifications to the inode
40765 +        * fields.
40766 +        */
40767 +       reiser4_update_sd(object);
40768 +       if (result != 0) {
40769 +               dquot_free_inode(object);
40770 +               object->i_flags |= S_NOQUOTA;
40771 +               /* if everything was ok (result == 0), parent stat-data is
40772 +                * already updated above (reiser4_update_dir()) */
40773 +               reiser4_update_sd(parent);
40774 +               /* failure to create entry, remove object */
40775 +               obj_plug->delete_object(object);
40776 +       }
40777 +
40778 +       /* file has name now, clear immutable flag */
40779 +       reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40780 +
40781 +       /* on error, iput() will call ->delete_inode(). We should keep track
40782 +          of the existence of stat-data for this inode and avoid attempt to
40783 +          remove it in reiser4_delete_inode(). This is accomplished through
40784 +          REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40785 +        */
40786 +       return result;
40787 +}
40788 +
40789 +/* Helper shared by the common implementations of reiser4_mkdir,
40790 +   reiser4_create, reiser4_mknod and reiser4_symlink: sets up a reiser4
40791 +   context, creates the child described by @data and instantiates @dentry. */
40792 +static int
40793 +create_vfs_object(struct inode *parent,
40794 +                 struct dentry *dentry, reiser4_object_create_data * data)
40795 +{
40796 +       reiser4_context *ctx;
40797 +       int result;
40798 +       struct inode *child;
40799 +
40800 +       ctx = reiser4_init_context(parent->i_sb);
40801 +       if (IS_ERR(ctx))
40802 +               return PTR_ERR(ctx);
40803 +       context_set_commit_async(ctx);
40804 +
40805 +       data->parent = parent;
40806 +       data->dentry = dentry;
40807 +       child = NULL;   /* lets the failure path see if an inode came back */
40808 +       result = do_create_vfs_child(data, &child);
40809 +       if (unlikely(result != 0)) {
40810 +               if (child != NULL) {    /* failed with an inode in hand */
40811 +                       reiser4_make_bad_inode(child);
40812 +                       iput(child);
40813 +               }
40814 +       } else
40815 +               d_instantiate(dentry, child);   /* bind new inode to name */
40816 +
40817 +       reiser4_exit_context(ctx);
40818 +       return result;
40819 +}
40820 +
40821 +/**
40822 + * Helper for link_common. Estimates disk space necessary to add a link
40823 + * from @parent to @object; returns the sum of the per-step estimates below.
40824 + */
40825 +static reiser4_block_nr common_estimate_link(struct inode *parent /* parent
40826 +                                                                  * directory
40827 +                                                                  */,
40828 +                                            struct inode *object /* object to
40829 +                                                                  * which new
40830 +                                                                  * link is
40831 +                                                                  * being
40832 +                                                                  * created */)
40833 +{
40834 +       reiser4_block_nr res = 0;
40835 +       file_plugin *fplug;
40836 +       dir_plugin *dplug;
40837 +
40838 +       assert("vpf-317", object != NULL);
40839 +       assert("vpf-318", parent != NULL);
40840 +
40841 +       fplug = inode_file_plugin(object);
40842 +       dplug = inode_dir_plugin(parent);
40843 +       /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice
40844 +        * instead of multiplying by 2? */
40845 +       /* reiser4_add_nlink(object) */
40846 +       res += fplug->estimate.update(object);
40847 +       /* add_entry(parent) */
40848 +       res += dplug->estimate.add_entry(parent);
40849 +       /* reiser4_del_nlink(object) - presumably the undo path; confirm */
40850 +       res += fplug->estimate.update(object);
40851 +       /* update_dir(parent) */
40852 +       res += inode_file_plugin(parent)->estimate.update(parent);
40853 +       /* safe-link */
40854 +       res += estimate_one_item_removal(reiser4_tree_by_inode(object));
40855 +
40856 +       return res;
40857 +}
40858 +
40859 +/* Estimate disk space necessary to remove a link between @parent and
40860 +   @object. Returns the sum of the per-step estimates below.
40861 +*/
40862 +static reiser4_block_nr estimate_unlink(struct inode *parent /* parent
40863 +                                                             * directory */,
40864 +                                       struct inode *object /* object whose
40865 +                                                             * link is being
40866 +                                                             * removed
40867 +                                                             */)
40868 +{
40869 +       reiser4_block_nr res = 0;
40870 +       file_plugin *fplug;
40871 +       dir_plugin *dplug;
40872 +
40873 +       assert("vpf-317", object != NULL);
40874 +       assert("vpf-318", parent != NULL);
40875 +
40876 +       fplug = inode_file_plugin(object);
40877 +       dplug = inode_dir_plugin(parent);
40878 +
40879 +       /* rem_entry(parent) */
40880 +       res += dplug->estimate.rem_entry(parent);
40881 +       /* reiser4_del_nlink(object) */
40882 +       res += fplug->estimate.update(object);
40883 +       /* update_dir(parent) */
40884 +       res += inode_file_plugin(parent)->estimate.update(parent);
40885 +       /* fplug->unlink */
40886 +       res += fplug->estimate.unlink(object, parent);
40887 +       /* safe-link */
40888 +       res += estimate_one_insert_item(reiser4_tree_by_inode(object));
40889 +
40890 +       return res;
40891 +}
40892 +
40893 +/* helper for reiser4_unlink_common. Check @victim, then grab unlink space. */
40894 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
40895 +{
40896 +       file_plugin *fplug;
40897 +       struct inode *child;
40898 +       int result;
40899 +
40900 +       result = 0;
40901 +       child = victim->d_inode;
40902 +       fplug = inode_file_plugin(child);
40903 +
40904 +       /* check for race with create_object() */
40905 +       if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
40906 +               return RETERR(-E_REPEAT);
40907 +       /* object being deleted should have stat data */
40908 +       assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
40909 +
40910 +       /* ask object plugin (if it cares) whether a link may be removed */
40911 +       if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
40912 +               return RETERR(-ENOTEMPTY);
40913 +
40914 +       result = (int)estimate_unlink(parent, child);
40915 +       if (result < 0) /* NOTE(review): presumably guards the int cast */
40916 +               return result;
40917 +
40918 +       return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
40919 +}
40920 +
40921 +/* helper for reiser4_setattr_common: grab space for one insert into item */
40922 +static int setattr_reserve(reiser4_tree * tree)
40923 +{
40924 +       assert("vs-1096", is_grab_enabled(get_current_context()));
40925 +       return reiser4_grab_space(estimate_one_insert_into_item(tree),
40926 +                                 BA_CAN_COMMIT);
40927 +}
40928 +
40929 +/* helper function. Standards require that many file-system operations update
40930 +   ctime and mtime of the parent directory on success: stamp @dir, write sd. */
40931 +int reiser4_update_dir(struct inode *dir)
40932 +{
40933 +       assert("nikita-2525", dir != NULL);
40934 +
40935 +       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40936 +       return reiser4_update_sd(dir);
40937 +}
40938 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.35/fs/reiser4/plugin/inode_ops_rename.c
40939 --- linux-2.6.35.orig/fs/reiser4/plugin/inode_ops_rename.c      1970-01-01 01:00:00.000000000 +0100
40940 +++ linux-2.6.35/fs/reiser4/plugin/inode_ops_rename.c   2010-08-04 15:44:57.000000000 +0200
40941 @@ -0,0 +1,925 @@
40942 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40943 + * reiser4/README */
40944 +
40945 +#include "../inode.h"
40946 +#include "../safe_link.h"
40947 +
40948 +static const char *possible_leak = "Possible disk space leak.";
40949 +
40950 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
40951 +
40952 +   Helper function called from reiser4_rename_common() */
40953 +static int replace_name(struct inode *to_inode,        /* inode where @from_coord is
40954 +                                                * to be re-targeted at */
40955 +                       struct inode *from_dir, /* directory where @from_coord
40956 +                                                * lives */
40957 +                       struct inode *from_inode, /* inode @from_coord
40958 +                                                  * originally pointed to */
40959 +                       coord_t *from_coord,    /* where directory entry is in
40960 +                                                * the tree */
40961 +                       lock_handle * from_lh/* lock handle on @from_coord */)
40962 +{
40963 +       item_plugin *from_item;
40964 +       int result;
40965 +       znode *node;
40966 +
40967 +       coord_clear_iplug(from_coord);
40968 +       node = from_coord->node;
40969 +       result = zload(node);   /* released by zrelse() on all later exits */
40970 +       if (result != 0)
40971 +               return result;
40972 +       from_item = item_plugin_by_coord(from_coord);
40973 +       if (plugin_of_group(item_plugin_by_coord(from_coord),
40974 +                           DIR_ENTRY_ITEM_TYPE)) {
40975 +               reiser4_key to_key;
40976 +
40977 +               build_sd_key(to_inode, &to_key);
40978 +
40979 +               /* everything is found and prepared to change directory entry
40980 +                  at @from_coord to point to @to_inode.
40981 +
40982 +                  @to_inode is just about to get new name, so bump its link
40983 +                  counter.
40984 +
40985 +                */
40986 +               result = reiser4_add_nlink(to_inode, from_dir, 0);
40987 +               if (result != 0) {
40988 +                       /* Don't issue warning: this may be plain -EMLINK */
40989 +                       zrelse(node);
40990 +                       return result;
40991 +               }
40992 +
40993 +               result =
40994 +                   from_item->s.dir.update_key(from_coord, &to_key, from_lh);
40995 +               if (result != 0) {
40996 +                       reiser4_del_nlink(to_inode, from_dir, 0); /* undo bump */
40997 +                       zrelse(node);
40998 +                       return result;
40999 +               }
41000 +
41001 +               /* @from_inode just lost its name, he-he.
41002 +
41003 +                  If @from_inode was directory, it contained dotdot pointing
41004 +                  to @from_dir. @from_dir i_nlink will be decreased when
41005 +                  iput() will be called on @from_inode.
41006 +
41007 +                  If file-system is not ADG (hard-links are
41008 +                  supported on directories), iput(from_inode) will not remove
41009 +                  @from_inode, and thus above is incorrect, but hard-links on
41010 +                  directories are problematic in many other respects.
41011 +                */
41012 +               result = reiser4_del_nlink(from_inode, from_dir, 0);
41013 +               if (result != 0) {
41014 +                       warning("nikita-2330",
41015 +                               "Cannot remove link from source: %i. %s",
41016 +                               result, possible_leak);
41017 +               }
41018 +               /* Has to return success, because entry is already
41019 +                * modified. */
41020 +               result = 0;
41021 +
41022 +               /* NOTE-NIKITA consider calling plugin method instead of
41023 +                  accessing inode fields directly. */
41024 +               from_dir->i_mtime = CURRENT_TIME;
41025 +       } else {
41026 +               warning("nikita-2326", "Unexpected item type");
41027 +               result = RETERR(-EIO);
41028 +       }
41029 +       zrelse(node);
41030 +       return result;
41031 +}
41032 +
41033 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
41034 +
41035 +   Helper function used by reiser4_rename_common(). Returns 0 on success. */
41036 +static int add_name(struct inode *inode,       /* inode the new entry is to
41037 +                                                * point to */
41038 +                   struct inode *dir,  /* directory where @coord lives */
41039 +                   struct dentry *name,        /* new name */
41040 +                   coord_t *coord,     /* where directory entry is in the tree
41041 +                                        */
41042 +                   lock_handle * lh,   /* lock handle on @coord */
41043 +                   int is_dir/* true, if @inode is directory */)
41044 +{
41045 +       int result;
41046 +       reiser4_dir_entry_desc entry;
41047 +
41048 +       assert("nikita-2333", lh->node == coord->node);
41049 +       assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
41050 +
41051 +       memset(&entry, 0, sizeof entry);
41052 +       entry.obj = inode;
41053 +       /* build key of directory entry description */
41054 +       inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
41055 +
41056 +       /* ext2 does this in different order: first inserts new entry,
41057 +          then increases directory nlink. We don't want to do this,
41058 +          because reiser4_add_nlink() calls ->add_link() plugin
41059 +          method that can fail for whatever reason, leaving us with
41060 +          cleanup problems. So bump the link count of @inode first and
41061 +          bail out on failure, while there is nothing to undo yet. */
41062 +       result = reiser4_add_nlink(inode, dir, 0);
41063 +       if (result != 0)
41064 +               return result;
41065 +       /* create entry @name in @dir pointing to @inode */
41066 +       result = WITH_COORD(coord,
41067 +                           inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
41068 +                                                                       coord,
41069 +                                                                       lh,
41070 +                                                                       name,
41071 +                                                                       &entry));
41072 +       if (result != 0) {
41073 +               int result2;
41074 +               result2 = reiser4_del_nlink(inode, dir, 0); /* undo the bump */
41075 +               if (result2 != 0) {
41076 +                       warning("nikita-2327",
41077 +                               "Cannot drop link on %lli %i. %s",
41078 +                               (unsigned long long)get_inode_oid(inode),
41079 +                               result2, possible_leak);
41080 +               }
41081 +       } else
41082 +               INODE_INC_FIELD(dir, i_size);   /* @dir gained one entry */
41083 +       return result;
41084 +}
41085 +
41086 +static reiser4_block_nr estimate_rename(struct inode *old_dir,  /* directory
41087 +                                                                * where @old is
41088 +                                                                * located */
41089 +                                       struct dentry *old_name,/* old name */
41090 +                                       struct inode *new_dir,  /* directory
41091 +                                                                * where @new is
41092 +                                                                * located */
41093 +                                       struct dentry *new_name /* new name */)
41094 +{
41095 +       reiser4_block_nr res1, res2;
41096 +       dir_plugin * p_parent_old, *p_parent_new;
41097 +       file_plugin * p_child_old, *p_child_new;
41098 +
41099 +       assert("vpf-311", old_dir != NULL);
41100 +       assert("vpf-312", new_dir != NULL);
41101 +       assert("vpf-313", old_name != NULL);
41102 +       assert("vpf-314", new_name != NULL);
41103 +
41104 +       p_parent_old = inode_dir_plugin(old_dir);
41105 +       p_parent_new = inode_dir_plugin(new_dir);
41106 +       p_child_old = inode_file_plugin(old_name->d_inode);
41107 +       if (new_name->d_inode)
41108 +               p_child_new = inode_file_plugin(new_name->d_inode);
41109 +       else
41110 +               p_child_new = NULL;     /* negative dentry: no target object */
41111 +
41112 +       /* find_entry - can insert one leaf. */
41113 +       res1 = res2 = 1;
41114 +
41115 +       /* replace_name (target exists) */
41116 +       {
41117 +               /* reiser4_add_nlink(p_child_old) and
41118 +                * reiser4_del_nlink(p_child_old) */
41119 +               res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
41120 +               /* update key */
41121 +               res1 += 1;
41122 +               /* reiser4_del_nlink(p_child_new) */
41123 +               if (p_child_new)
41124 +                       res1 += p_child_new->estimate.update(new_name->d_inode);
41125 +       }
41126 +
41127 +       /* else add_name (target does not exist) */
41128 +       {
41129 +               /* reiser4_add_nlink(p_parent_new) and
41130 +                * reiser4_del_nlink(p_parent_new) */
41131 +               res2 +=
41132 +                   2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
41133 +               /* reiser4_add_nlink(p_child_old) */
41134 +               res2 += p_child_old->estimate.update(old_name->d_inode);
41135 +               /* add_entry(p_parent_new) */
41136 +               res2 += p_parent_new->estimate.add_entry(new_dir);
41137 +               /* reiser4_del_nlink(p_child_old) */
41138 +               res2 += p_child_old->estimate.update(old_name->d_inode);
41139 +       }
41140 +
41141 +       res1 = res1 < res2 ? res2 : res1;       /* max of the two branches */
41142 +
41143 +       /* reiser4_write_sd(p_parent_new) */
41144 +       res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41145 +
41146 +       /* reiser4_write_sd(p_child_new) */
41147 +       if (p_child_new)
41148 +               res1 += p_child_new->estimate.update(new_name->d_inode);
41149 +
41150 +       /* hashed_rem_entry(p_parent_old) */
41151 +       res1 += p_parent_old->estimate.rem_entry(old_dir);
41152 +
41153 +       /* reiser4_del_nlink(p_child_old) */
41154 +       res1 += p_child_old->estimate.update(old_name->d_inode);
41155 +
41156 +       /* replace_name */
41157 +       {
41158 +               /* reiser4_add_nlink(p_parent_dir_new) */
41159 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41160 +               /* update_key */
41161 +               res1 += 1;
41162 +               /* reiser4_del_nlink(p_parent_new) */
41163 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
41164 +               /* reiser4_del_nlink(p_parent_old) */
41165 +               res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41166 +       }
41167 +
41168 +       /* reiser4_write_sd(p_parent_old) */
41169 +       res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
41170 +
41171 +       /* reiser4_write_sd(p_child_old) */
41172 +       res1 += p_child_old->estimate.update(old_name->d_inode);
41173 +
41174 +       return res1;
41175 +}
41176 +
41177 +static int hashed_rename_estimate_and_grab(struct inode *old_dir,  /* directory
41178 +                                                                   * where @old
41179 +                                                                   * is located
41180 +                                                                   */
41181 +                                          struct dentry *old_name,/* old name
41182 +                                                                   */
41183 +                                          struct inode *new_dir,  /* directory
41184 +                                                                   * where @new
41185 +                                                                   * is located
41186 +                                                                   */
41187 +                                          struct dentry *new_name /* new name
41188 +                                                                   */)
41189 +{
41190 +       reiser4_block_nr reserve;
41191 +
41192 +       reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
41193 +
41194 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41195 +               return RETERR(-ENOSPC); /* any grab failure maps to -ENOSPC */
41196 +
41197 +       return 0;
41198 +}
41199 +
41200 +/* check whether @old_inode may be moved into @new_dir, replacing @new_inode if
41201 + * any. Singles out renames of pseudo-files, e.g. @old_dir is not consulted. */
41202 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
41203 +                     struct inode *new_dir, struct inode *new_inode)
41204 +{
41205 +       file_plugin *fplug;
41206 +       dir_plugin *dplug;
41207 +
41208 +       assert("nikita-3370", old_inode != NULL);
41209 +
41210 +       dplug = inode_dir_plugin(new_dir);
41211 +       fplug = inode_file_plugin(old_inode);
41212 +
41213 +       if (dplug == NULL)
41214 +               return RETERR(-ENOTDIR);
41215 +       else if (new_dir->i_op->create == NULL)
41216 +               return RETERR(-EPERM);
41217 +       else if (!fplug->can_add_link(old_inode))
41218 +               return RETERR(-EMLINK);
41219 +       else if (new_inode != NULL) {
41220 +               fplug = inode_file_plugin(new_inode);   /* now the target's */
41221 +               if (fplug->can_rem_link != NULL &&
41222 +                   !fplug->can_rem_link(new_inode))
41223 +                       return RETERR(-EBUSY);
41224 +       }
41225 +       return 0;
41226 +}
41227 +
41228 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle * ,
41229 +              znode_lock_mode, reiser4_dir_entry_desc *);
41230 +int reiser4_update_dir(struct inode *);
41231 +
41232 +/* this is common implementation of vfs's rename method of struct
41233 +   inode_operations
41234 +   See comments in the body.
41235 +
41236 +   It is arguable that this function can be made generic so, that it
41237 +   will be applicable to any kind of directory plugin that deals with
41238 +   directories composed out of directory entries. The only obstacle
41239 +   here is that we don't have any data-type to represent directory
41240 +   entry. This should be re-considered when more than one different
41241 +   directory plugin will be implemented.
41242 +*/
41243 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41244 +                                                * is located */ ,
41245 +                         struct dentry *old_name /* old name */ ,
41246 +                         struct inode *new_dir /* directory where @new
41247 +                                                * is located */ ,
41248 +                         struct dentry *new_name/* new name */)
41249 +{
41250 +       /* From `The Open Group Base Specifications Issue 6'
41251 +
41252 +          If either the old or new argument names a symbolic link, rename()
41253 +          shall operate on the symbolic link itself, and shall not resolve
41254 +          the last component of the argument. If the old argument and the new
41255 +          argument resolve to the same existing file, rename() shall return
41256 +          successfully and perform no other action.
41257 +
41258 +          [this is done by VFS: vfs_rename()]
41259 +
41260 +          If the old argument points to the pathname of a file that is not a
41261 +          directory, the new argument shall not point to the pathname of a
41262 +          directory.
41263 +
41264 +          [checked by VFS: vfs_rename->may_delete()]
41265 +
41266 +          If the link named by the new argument exists, it shall
41267 +          be removed and old renamed to new. In this case, a link named new
41268 +          shall remain visible to other processes throughout the renaming
41269 +          operation and refer either to the file referred to by new or old
41270 +          before the operation began.
41271 +
41272 +          [we should assure this]
41273 +
41274 +          Write access permission is required for
41275 +          both the directory containing old and the directory containing new.
41276 +
41277 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
41278 +
41279 +          If the old argument points to the pathname of a directory, the new
41280 +          argument shall not point to the pathname of a file that is not a
41281 +          directory.
41282 +
41283 +          [checked by VFS: vfs_rename->may_delete()]
41284 +
41285 +          If the directory named by the new argument exists, it
41286 +          shall be removed and old renamed to new. In this case, a link named
41287 +          new shall exist throughout the renaming operation and shall refer
41288 +          either to the directory referred to by new or old before the
41289 +          operation began.
41290 +
41291 +          [we should assure this]
41292 +
41293 +          If new names an existing directory, it shall be
41294 +          required to be an empty directory.
41295 +
41296 +          [we should check this]
41297 +
41298 +          If the old argument points to a pathname of a symbolic link, the
41299 +          symbolic link shall be renamed. If the new argument points to a
41300 +          pathname of a symbolic link, the symbolic link shall be removed.
41301 +
41302 +          The new pathname shall not contain a path prefix that names
41303 +          old. Write access permission is required for the directory
41304 +          containing old and the directory containing new. If the old
41305 +          argument points to the pathname of a directory, write access
41306 +          permission may be required for the directory named by old, and, if
41307 +          it exists, the directory named by new.
41308 +
41309 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
41310 +
41311 +          If the link named by the new argument exists and the file's link
41312 +          count becomes 0 when it is removed and no process has the file
41313 +          open, the space occupied by the file shall be freed and the file
41314 +          shall no longer be accessible. If one or more processes have the
41315 +          file open when the last link is removed, the link shall be removed
41316 +          before rename() returns, but the removal of the file contents shall
41317 +          be postponed until all references to the file are closed.
41318 +
41319 +          [iput() handles this, but we can do this manually, a la
41320 +          reiser4_unlink()]
41321 +
41322 +          Upon successful completion, rename() shall mark for update the
41323 +          st_ctime and st_mtime fields of the parent directory of each file.
41324 +
41325 +          [N/A]
41326 +
41327 +        */
41328 +       reiser4_context *ctx;
41329 +       int result;
41330 +       int is_dir;             /* is @old_name directory */
41331 +
41332 +       struct inode *old_inode;
41333 +       struct inode *new_inode;
41334 +       coord_t *new_coord;
41335 +
41336 +       struct reiser4_dentry_fsdata *new_fsdata;
41337 +       dir_plugin *dplug;
41338 +       file_plugin *fplug;
41339 +
41340 +       reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41341 +       lock_handle * new_lh, *dotdot_lh;
41342 +       struct dentry *dotdot_name;
41343 +       struct reiser4_dentry_fsdata *dataonstack;
41344 +
41345 +       ctx = reiser4_init_context(old_dir->i_sb);
41346 +       if (IS_ERR(ctx))
41347 +               return PTR_ERR(ctx);
41348 +
41349 +       old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41350 +                           sizeof(*dotdot_name) + sizeof(*dataonstack),
41351 +                           reiser4_ctx_gfp_mask_get());
41352 +       if (!old_entry) {
41353 +               context_set_commit_async(ctx);
41354 +               reiser4_exit_context(ctx);
41355 +               return RETERR(-ENOMEM);
41356 +       }
41357 +
41358 +       new_entry = old_entry + 1;
41359 +       dotdot_entry = old_entry + 2;
41360 +       new_lh = (lock_handle *)(old_entry + 3);
41361 +       dotdot_lh = new_lh + 1;
41362 +       dotdot_name = (struct dentry *)(new_lh + 2);
41363 +       dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
41364 +
41365 +       assert("nikita-2318", old_dir != NULL);
41366 +       assert("nikita-2319", new_dir != NULL);
41367 +       assert("nikita-2320", old_name != NULL);
41368 +       assert("nikita-2321", new_name != NULL);
41369 +
41370 +       old_inode = old_name->d_inode;
41371 +       new_inode = new_name->d_inode;
41372 +
41373 +       dplug = inode_dir_plugin(old_dir);
41374 +       fplug = NULL;
41375 +
41376 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
41377 +       if (IS_ERR(new_fsdata)) {
41378 +               kfree(old_entry);
41379 +               context_set_commit_async(ctx);
41380 +               reiser4_exit_context(ctx);
41381 +               return PTR_ERR(new_fsdata);
41382 +       }
41383 +
41384 +       new_coord = &new_fsdata->dec.entry_coord;
41385 +       coord_clear_iplug(new_coord);
41386 +
41387 +       is_dir = S_ISDIR(old_inode->i_mode);
41388 +
41389 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41390 +
41391 +       /* if target is existing directory and it's not empty---return error.
41392 +
41393 +          This check is done specifically, because is_dir_empty() requires
41394 +          tree traversal and have to be done before locks are taken.
41395 +        */
41396 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41397 +               kfree(old_entry);
41398 +               context_set_commit_async(ctx);
41399 +               reiser4_exit_context(ctx);
41400 +               return RETERR(-ENOTEMPTY);
41401 +       }
41402 +
41403 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
41404 +       if (result != 0) {
41405 +               kfree(old_entry);
41406 +               context_set_commit_async(ctx);
41407 +               reiser4_exit_context(ctx);
41408 +               return result;
41409 +       }
41410 +
41411 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
41412 +                                                new_dir, new_name);
41413 +       if (result != 0) {
41414 +               kfree(old_entry);
41415 +               context_set_commit_async(ctx);
41416 +               reiser4_exit_context(ctx);
41417 +               return result;
41418 +       }
41419 +
41420 +       init_lh(new_lh);
41421 +
41422 +       /* find entry for @new_name */
41423 +       result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41424 +                                   new_entry);
41425 +
41426 +       if (IS_CBKERR(result)) {
41427 +               done_lh(new_lh);
41428 +               kfree(old_entry);
41429 +               context_set_commit_async(ctx);
41430 +               reiser4_exit_context(ctx);
41431 +               return result;
41432 +       }
41433 +
41434 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
41435 +
41436 +       /* add or replace name for @old_inode as @new_name */
41437 +       if (new_inode != NULL) {
41438 +               /* target (@new_name) exists. */
41439 +               /* Not clear what to do with objects that are
41440 +                  both directories and files at the same time. */
41441 +               if (result == CBK_COORD_FOUND) {
41442 +                       result = replace_name(old_inode,
41443 +                                             new_dir,
41444 +                                             new_inode, new_coord, new_lh);
41445 +                       if (result == 0)
41446 +                               fplug = inode_file_plugin(new_inode);
41447 +               } else if (result == CBK_COORD_NOTFOUND) {
41448 +                       /* VFS told us that @new_name is bound to existing
41449 +                          inode, but we failed to find directory entry. */
41450 +                       warning("nikita-2324", "Target not found");
41451 +                       result = RETERR(-ENOENT);
41452 +               }
41453 +       } else {
41454 +               /* target (@new_name) doesn't exists. */
41455 +               if (result == CBK_COORD_NOTFOUND)
41456 +                       result = add_name(old_inode,
41457 +                                         new_dir,
41458 +                                         new_name, new_coord, new_lh, is_dir);
41459 +               else if (result == CBK_COORD_FOUND) {
41460 +                       /* VFS told us that @new_name is "negative" dentry,
41461 +                          but we found directory entry. */
41462 +                       warning("nikita-2331", "Target found unexpectedly");
41463 +                       result = RETERR(-EIO);
41464 +               }
41465 +       }
41466 +
41467 +       assert("nikita-3462", ergo(result == 0,
41468 +                                  old_inode->i_nlink >= 2 + !!is_dir));
41469 +
41470 +       /* We are done with all modifications to the @new_dir, release lock on
41471 +          node. */
41472 +       done_lh(new_lh);
41473 +
41474 +       if (fplug != NULL) {
41475 +               /* detach @new_inode from name-space */
41476 +               result = fplug->detach(new_inode, new_dir);
41477 +               if (result != 0)
41478 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
41479 +                               (unsigned long long)get_inode_oid(new_inode),
41480 +                               result, possible_leak);
41481 +       }
41482 +
41483 +       if (new_inode != NULL)
41484 +               reiser4_update_sd(new_inode);
41485 +
41486 +       if (result == 0) {
41487 +               old_entry->obj = old_inode;
41488 +
41489 +               dplug->build_entry_key(old_dir,
41490 +                                      &old_name->d_name, &old_entry->key);
41491 +
41492 +               /* At this stage new name was introduced for
41493 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41494 +                  counters were updated.
41495 +
41496 +                  We want to remove @old_name now. If @old_inode wasn't
41497 +                  directory this is simple.
41498 +                */
41499 +               result = dplug->rem_entry(old_dir, old_name, old_entry);
41500 +               if (result != 0 && result != -ENOMEM) {
41501 +                       warning("nikita-2335",
41502 +                               "Cannot remove old name: %i", result);
41503 +               } else {
41504 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
41505 +                       if (result != 0 && result != -ENOMEM) {
41506 +                               warning("nikita-2337",
41507 +                                       "Cannot drop link on old: %i", result);
41508 +                       }
41509 +               }
41510 +
41511 +               if (result == 0 && is_dir) {
41512 +                       /* @old_inode is directory. We also have to update
41513 +                          dotdot entry. */
41514 +                       coord_t *dotdot_coord;
41515 +
41516 +                       memset(dataonstack, 0, sizeof dataonstack);
41517 +                       memset(dotdot_entry, 0, sizeof dotdot_entry);
41518 +                       dotdot_entry->obj = old_dir;
41519 +                       memset(dotdot_name, 0, sizeof dotdot_name);
41520 +                       dotdot_name->d_name.name = "..";
41521 +                       dotdot_name->d_name.len = 2;
41522 +                       /*
41523 +                        * allocate ->d_fsdata on the stack to avoid using
41524 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
41525 +                        * because dentry is private to the current thread.
41526 +                        */
41527 +                       dotdot_name->d_fsdata = dataonstack;
41528 +                       init_lh(dotdot_lh);
41529 +
41530 +                       dotdot_coord = &dataonstack->dec.entry_coord;
41531 +                       coord_clear_iplug(dotdot_coord);
41532 +
41533 +                       result = reiser4_find_entry(old_inode, dotdot_name,
41534 +                                                   dotdot_lh, ZNODE_WRITE_LOCK,
41535 +                                                   dotdot_entry);
41536 +                       if (result == 0) {
41537 +                               /* replace_name() decreases i_nlink on
41538 +                                * @old_dir */
41539 +                               result = replace_name(new_dir,
41540 +                                                     old_inode,
41541 +                                                     old_dir,
41542 +                                                     dotdot_coord, dotdot_lh);
41543 +                       } else
41544 +                               result = RETERR(-EIO);
41545 +                       done_lh(dotdot_lh);
41546 +               }
41547 +       }
41548 +       reiser4_update_dir(new_dir);
41549 +       reiser4_update_dir(old_dir);
41550 +       reiser4_update_sd(old_inode);
41551 +       if (result == 0) {
41552 +               file_plugin *fplug;
41553 +
41554 +               if (new_inode != NULL) {
41555 +                       /* add safe-link for target file (in case we removed
41556 +                        * last reference to the poor fellow */
41557 +                       fplug = inode_file_plugin(new_inode);
41558 +                       if (new_inode->i_nlink == 0)
41559 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
41560 +               }
41561 +       }
41562 +       kfree(old_entry);
41563 +       context_set_commit_async(ctx);
41564 +       reiser4_exit_context(ctx);
41565 +       return result;
41566 +}
41567 +
41568 +#if 0
41569 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41570 +                                                * is located */ ,
41571 +                         struct dentry *old_name /* old name */ ,
41572 +                         struct inode *new_dir /* directory where @new
41573 +                                                * is located */ ,
41574 +                         struct dentry *new_name/* new name */)
41575 +{
41576 +       /* rename() contract, from `The Open Group Base Specifications Issue 6'
41577 +
41578 +          If either the old or new argument names a symbolic link, rename()
41579 +          shall operate on the symbolic link itself, and shall not resolve
41580 +          the last component of the argument. If the old argument and the new
41581 +          argument resolve to the same existing file, rename() shall return
41582 +          successfully and perform no other action.
41583 +
41584 +          [this is done by VFS: vfs_rename()]
41585 +
41586 +          If the old argument points to the pathname of a file that is not a
41587 +          directory, the new argument shall not point to the pathname of a
41588 +          directory.
41589 +
41590 +          [checked by VFS: vfs_rename->may_delete()]
41591 +
41592 +          If the link named by the new argument exists, it shall
41593 +          be removed and old renamed to new. In this case, a link named new
41594 +          shall remain visible to other processes throughout the renaming
41595 +          operation and refer either to the file referred to by new or old
41596 +          before the operation began.
41597 +
41598 +          [we should assure this]
41599 +
41600 +          Write access permission is required for
41601 +          both the directory containing old and the directory containing new.
41602 +
41603 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
41604 +
41605 +          If the old argument points to the pathname of a directory, the new
41606 +          argument shall not point to the pathname of a file that is not a
41607 +          directory.
41608 +
41609 +          [checked by VFS: vfs_rename->may_delete()]
41610 +
41611 +          If the directory named by the new argument exists, it
41612 +          shall be removed and old renamed to new. In this case, a link named
41613 +          new shall exist throughout the renaming operation and shall refer
41614 +          either to the directory referred to by new or old before the
41615 +          operation began.
41616 +
41617 +          [we should assure this]
41618 +
41619 +          If new names an existing directory, it shall be
41620 +          required to be an empty directory.
41621 +
41622 +          [we should check this]
41623 +
41624 +          If the old argument points to a pathname of a symbolic link, the
41625 +          symbolic link shall be renamed. If the new argument points to a
41626 +          pathname of a symbolic link, the symbolic link shall be removed.
41627 +
41628 +          The new pathname shall not contain a path prefix that names
41629 +          old. Write access permission is required for the directory
41630 +          containing old and the directory containing new. If the old
41631 +          argument points to the pathname of a directory, write access
41632 +          permission may be required for the directory named by old, and, if
41633 +          it exists, the directory named by new.
41634 +
41635 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
41636 +
41637 +          If the link named by the new argument exists and the file's link
41638 +          count becomes 0 when it is removed and no process has the file
41639 +          open, the space occupied by the file shall be freed and the file
41640 +          shall no longer be accessible. If one or more processes have the
41641 +          file open when the last link is removed, the link shall be removed
41642 +          before rename() returns, but the removal of the file contents shall
41643 +          be postponed until all references to the file are closed.
41644 +
41645 +          [iput() handles this, but we can do this manually, a la
41646 +          reiser4_unlink()]
41647 +
41648 +          Upon successful completion, rename() shall mark for update the
41649 +          st_ctime and st_mtime fields of the parent directory of each file.
41650 +
41651 +          [N/A]
41652 +
41653 +        */
41654 +       reiser4_context *ctx;
41655 +       int result;
41656 +       int is_dir;             /* is @old_name directory */
41657 +       struct inode *old_inode;
41658 +       struct inode *new_inode;
41659 +       reiser4_dir_entry_desc old_entry;
41660 +       reiser4_dir_entry_desc new_entry;
41661 +       coord_t *new_coord;
41662 +       struct reiser4_dentry_fsdata *new_fsdata;
41663 +       lock_handle new_lh;
41664 +       dir_plugin *dplug;
41665 +       file_plugin *fplug;
41666 +
41667 +       ctx = reiser4_init_context(old_dir->i_sb);
41668 +       if (IS_ERR(ctx))
41669 +               return PTR_ERR(ctx);
41670 +
41671 +       assert("nikita-2318", old_dir != NULL);
41672 +       assert("nikita-2319", new_dir != NULL);
41673 +       assert("nikita-2320", old_name != NULL);
41674 +       assert("nikita-2321", new_name != NULL);
41675 +
41676 +       old_inode = old_name->d_inode;
41677 +       new_inode = new_name->d_inode;
41678 +
41679 +       dplug = inode_dir_plugin(old_dir);
41680 +       fplug = NULL;
41681 +
41682 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
41683 +       if (IS_ERR(new_fsdata)) {
41684 +               result = PTR_ERR(new_fsdata);
41685 +               goto exit;
41686 +       }
41687 +
41688 +       new_coord = &new_fsdata->dec.entry_coord;
41689 +       coord_clear_iplug(new_coord);
41690 +
41691 +       is_dir = S_ISDIR(old_inode->i_mode);
41692 +
41693 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41694 +
41695 +       /* if target is existing directory and it's not empty---fail with
41696 +          -ENOTEMPTY, leaving through "exit" so that @ctx is released.
41697 +          This check is done specifically, because is_dir_empty() requires
41698 +          tree traversal and has to be done before locks are taken.
41699 +        */
41700 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
41701 +               result = RETERR(-ENOTEMPTY);
41702 +       else
41703 +               result = can_rename(old_dir, old_inode, new_dir, new_inode);
41704 +       if (result != 0)
41705 +               goto exit;
41706 +
41707 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
41708 +                                                new_dir, new_name);
41709 +       if (result != 0)
41710 +               goto exit;
41711 +
41712 +       init_lh(&new_lh);
41713 +
41714 +       /* find entry for @new_name */
41715 +       result = reiser4_find_entry(new_dir, new_name, &new_lh,
41716 +                                   ZNODE_WRITE_LOCK, &new_entry);
41717 +
41718 +       if (IS_CBKERR(result)) {
41719 +               done_lh(&new_lh);
41720 +               goto exit;
41721 +       }
41722 +
41723 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
41724 +
41725 +       /* add or replace name for @old_inode as @new_name */
41726 +       if (new_inode != NULL) {
41727 +               /* target (@new_name) exists. */
41728 +               /* Not clear what to do with objects that are
41729 +                  both directories and files at the same time. */
41730 +               if (result == CBK_COORD_FOUND) {
41731 +                       result = replace_name(old_inode,
41732 +                                             new_dir,
41733 +                                             new_inode, new_coord, &new_lh);
41734 +                       if (result == 0)
41735 +                               fplug = inode_file_plugin(new_inode);
41736 +               } else if (result == CBK_COORD_NOTFOUND) {
41737 +                       /* VFS told us that @new_name is bound to existing
41738 +                          inode, but we failed to find directory entry. */
41739 +                       warning("nikita-2324", "Target not found");
41740 +                       result = RETERR(-ENOENT);
41741 +               }
41742 +       } else {
41743 +               /* target (@new_name) doesn't exist. */
41744 +               if (result == CBK_COORD_NOTFOUND)
41745 +                       result = add_name(old_inode,
41746 +                                         new_dir,
41747 +                                         new_name, new_coord, &new_lh, is_dir);
41748 +               else if (result == CBK_COORD_FOUND) {
41749 +                       /* VFS told us that @new_name is "negative" dentry,
41750 +                          but we found directory entry. */
41751 +                       warning("nikita-2331", "Target found unexpectedly");
41752 +                       result = RETERR(-EIO);
41753 +               }
41754 +       }
41755 +
41756 +       assert("nikita-3462", ergo(result == 0,
41757 +                                  old_inode->i_nlink >= 2 + !!is_dir));
41758 +
41759 +       /* We are done with all modifications to the @new_dir, release lock on
41760 +          node. */
41761 +       done_lh(&new_lh);
41762 +
41763 +       if (fplug != NULL) {
41764 +               /* detach @new_inode from name-space */
41765 +               result = fplug->detach(new_inode, new_dir);
41766 +               if (result != 0)
41767 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
41768 +                               (unsigned long long)get_inode_oid(new_inode),
41769 +                               result, possible_leak);
41770 +       }
41771 +
41772 +       if (new_inode != NULL)
41773 +               reiser4_update_sd(new_inode);
41774 +
41775 +       if (result == 0) {
41776 +               memset(&old_entry, 0, sizeof old_entry);
41777 +               old_entry.obj = old_inode;
41778 +
41779 +               dplug->build_entry_key(old_dir,
41780 +                                      &old_name->d_name, &old_entry.key);
41781 +
41782 +               /* At this stage new name was introduced for
41783 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41784 +                  counters were updated.
41785 +
41786 +                  We want to remove @old_name now. If @old_inode wasn't
41787 +                  directory this is simple.
41788 +                */
41789 +               result = dplug->rem_entry(old_dir, old_name, &old_entry);
41790 +               /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41791 +               if (result != 0 && result != -ENOMEM) {
41792 +                       warning("nikita-2335",
41793 +                               "Cannot remove old name: %i", result);
41794 +               } else {
41795 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
41796 +                       if (result != 0 && result != -ENOMEM) {
41797 +                               warning("nikita-2337",
41798 +                                       "Cannot drop link on old: %i", result);
41799 +                       }
41800 +               }
41801 +
41802 +               if (result == 0 && is_dir) {
41803 +                       /* @old_inode is directory. We also have to update
41804 +                          dotdot entry. */
41805 +                       coord_t *dotdot_coord;
41806 +                       lock_handle dotdot_lh;
41807 +                       struct dentry dotdot_name;
41808 +                       reiser4_dir_entry_desc dotdot_entry;
41809 +                       struct reiser4_dentry_fsdata dataonstack;
41810 +                       struct reiser4_dentry_fsdata *fsdata;
41811 +
41812 +                       memset(&dataonstack, 0, sizeof dataonstack);
41813 +                       memset(&dotdot_entry, 0, sizeof dotdot_entry);
41814 +                       dotdot_entry.obj = old_dir;
41815 +                       memset(&dotdot_name, 0, sizeof dotdot_name);
41816 +                       dotdot_name.d_name.name = "..";
41817 +                       dotdot_name.d_name.len = 2;
41818 +                       /*
41819 +                        * allocate ->d_fsdata on the stack to avoid using
41820 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
41821 +                        * because dentry is private to the current thread.
41822 +                        */
41823 +                       dotdot_name.d_fsdata = &dataonstack;
41824 +                       init_lh(&dotdot_lh);
41825 +
41826 +                       fsdata = &dataonstack;
41827 +                       dotdot_coord = &fsdata->dec.entry_coord;
41828 +                       coord_clear_iplug(dotdot_coord);
41829 +
41830 +                       result = reiser4_find_entry(old_inode,
41831 +                                                   &dotdot_name,
41832 +                                                   &dotdot_lh,
41833 +                                                   ZNODE_WRITE_LOCK,
41834 +                                                   &dotdot_entry);
41835 +                       if (result == 0) {
41836 +                               /* replace_name() decreases i_nlink on
41837 +                                * @old_dir */
41838 +                               result = replace_name(new_dir,
41839 +                                                     old_inode,
41840 +                                                     old_dir,
41841 +                                                     dotdot_coord, &dotdot_lh);
41842 +                       } else
41843 +                               result = RETERR(-EIO);
41844 +                       done_lh(&dotdot_lh);
41845 +               }
41846 +       }
41847 +       reiser4_update_dir(new_dir);
41848 +       reiser4_update_dir(old_dir);
41849 +       reiser4_update_sd(old_inode);
41850 +       if (result == 0) {
41851 +               file_plugin *fplug;
41852 +
41853 +               if (new_inode != NULL) {
41854 +                       /* add safe-link for target file (in case we removed
41855 +                        * last reference to the poor fellow */
41856 +                       fplug = inode_file_plugin(new_inode);
41857 +                       if (new_inode->i_nlink == 0)
41858 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
41859 +               }
41860 +       }
41861 +exit:
41862 +       context_set_commit_async(ctx);
41863 +       reiser4_exit_context(ctx);
41864 +       return result;
41865 +}
41866 +#endif
41867 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/acl.h linux-2.6.35/fs/reiser4/plugin/item/acl.h
41868 --- linux-2.6.35.orig/fs/reiser4/plugin/item/acl.h      1970-01-01 01:00:00.000000000 +0100
41869 +++ linux-2.6.35/fs/reiser4/plugin/item/acl.h   2010-08-04 15:44:57.000000000 +0200
41870 @@ -0,0 +1,66 @@
41871 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41872 +
41873 +/* Directory entry. */
41874 +
41875 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
41876 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
41877 +
41878 +#include "../../forward.h"
41879 +#include "../../dformat.h"
41880 +#include "../../kassign.h"
41881 +#include "../../key.h"
41882 +
41883 +#include <linux/fs.h>
41884 +#include <linux/dcache.h>      /* for struct dentry */
41885 +
41886 +typedef struct directory_entry_format {
41887 +       /* key of object stat-data. It's not necessary to store whole
41888 +          key here, because it's always key of stat-data, so minor
41889 +          packing locality and offset can be omitted here. But this
41890 +          relies on particular key allocation scheme for stat-data, so,
41891 +          for extensibility's sake, whole key can be stored here.
41892 +
41893 +          We store key as array of bytes, because we don't want 8-byte
41894 +          alignment of dir entries.
41895 +        */
41896 +       obj_key_id id;
41897 +       /* file name: NUL-terminated string, stored inline right after @id. */
41898 +       d8 name[0];
41899 +} directory_entry_format;
41900 +
41901 +void print_de(const char *prefix, coord_t * coord);
41902 +int extract_key_de(const coord_t * coord, reiser4_key * key);
41903 +int update_key_de(const coord_t * coord, const reiser4_key * key,
41904 +                 lock_handle * lh);
41905 +char *extract_name_de(const coord_t * coord, char *buf);
41906 +unsigned extract_file_type_de(const coord_t * coord);
41907 +int add_entry_de(struct inode *dir, coord_t * coord,
41908 +                lock_handle * lh, const struct dentry *name,
41909 +                reiser4_dir_entry_desc * entry);
41910 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
41911 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
41912 +int max_name_len_de(const struct inode *dir);
41913 +
41914 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
41915 +
41916 +char *extract_dent_name(const coord_t * coord,
41917 +                       directory_entry_format * dent, char *buf);
41918 +
41919 +#if REISER4_LARGE_KEY
41920 +#define DE_NAME_BUF_LEN (24)
41921 +#else
41922 +#define DE_NAME_BUF_LEN (16)
41923 +#endif
41924 +
41925 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
41926 +#endif
41927 +
41928 +/* Make Linus happy.
41929 +   Local variables:
41930 +   c-indentation-style: "K&R"
41931 +   mode-name: "LC"
41932 +   c-basic-offset: 8
41933 +   tab-width: 8
41934 +   fill-column: 120
41935 +   End:
41936 +*/
41937 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.35/fs/reiser4/plugin/item/blackbox.c
41938 --- linux-2.6.35.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 01:00:00.000000000 +0100
41939 +++ linux-2.6.35/fs/reiser4/plugin/item/blackbox.c      2010-08-04 15:44:57.000000000 +0200
41940 @@ -0,0 +1,142 @@
41941 +/* Copyright 2003 by Hans Reiser, licensing governed by
41942 + * reiser4/README */
41943 +
41944 +/* Black box item implementation */
41945 +
41946 +#include "../../forward.h"
41947 +#include "../../debug.h"
41948 +#include "../../dformat.h"
41949 +#include "../../kassign.h"
41950 +#include "../../coord.h"
41951 +#include "../../tree.h"
41952 +#include "../../lock.h"
41953 +
41954 +#include "blackbox.h"
41955 +#include "item.h"
41956 +#include "../plugin.h"
41957 +/* Insert a black box item holding @length bytes of @data under @key. */
41958 +int
41959 +store_black_box(reiser4_tree * tree,
41960 +               const reiser4_key * key, void *data, int length)
41961 +{
41962 +       int result;
41963 +       reiser4_item_data idata;
41964 +       coord_t coord;
41965 +       lock_handle lh;
41966 +
41967 +       memset(&idata, 0, sizeof idata);
41968 +       /* describe the item: @length bytes of @data, BLACK_BOX_ID item plugin */
41969 +       idata.data = data;
41970 +       idata.user = 0;
41971 +       idata.length = length;
41972 +       idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
41973 +
41974 +       init_lh(&lh);
41975 +       result = insert_by_key(tree, key,
41976 +                              &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
41977 +       /* debug check: when result is 0, item at @coord is @length bytes long */
41978 +       assert("nikita-3413",
41979 +              ergo(result == 0,
41980 +                   WITH_COORD(&coord,
41981 +                              item_length_by_coord(&coord) == length)));
41982 +       /* drop the lock handle; result of insert_by_key() is returned as is */
41983 +       done_lh(&lh);
41984 +       return result;
41985 +}
41986 +/* Read black box item located by @key into @data (at most @length bytes). */
41987 +int
41988 +load_black_box(reiser4_tree * tree,
41989 +              reiser4_key * key, void *data, int length, int exact)
41990 +{
41991 +       int result;
41992 +       coord_t coord;
41993 +       lock_handle lh;
41994 +
41995 +       init_lh(&lh);
41996 +       result = coord_by_key(tree, key,
41997 +                             &coord, &lh, ZNODE_READ_LOCK,
41998 +                             exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
41999 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
42000 +       /* item found: if it fits in @data copy it and store its unit key in @key */
42001 +       if (result == 0) {
42002 +               int ilen;
42003 +
42004 +               result = zload(coord.node);
42005 +               if (result == 0) {
42006 +                       ilen = item_length_by_coord(&coord);
42007 +                       if (ilen <= length) {
42008 +                               memcpy(data, item_body_by_coord(&coord), ilen);
42009 +                               unit_key_by_coord(&coord, key);
42010 +                       } else if (exact) {
42011 +                               /*
42012 +                                * item is larger than buffer provided by the
42013 +                                * user. Only issue a warning if @exact is
42014 +                                * set. If @exact is false, we are iterating
42015 +                                * over all safe-links and here we are reaching
42016 +                                * the end of the iteration.
42017 +                                */
42018 +                               warning("nikita-3415",
42019 +                                       "Wrong black box length: %i > %i",
42020 +                                       ilen, length);
42021 +                               result = RETERR(-EIO);
42022 +                       }
42023 +                       zrelse(coord.node);
42024 +               }
42025 +       }
42026 +
42027 +       done_lh(&lh);
42028 +       return result;
42029 +
42030 +}
42031 +/* Overwrite first @length bytes of the black box item stored under @key. */
42032 +int
42033 +update_black_box(reiser4_tree * tree,
42034 +                const reiser4_key * key, void *data, int length)
42035 +{
42036 +       int result;
42037 +       coord_t coord;
42038 +       lock_handle lh;
42039 +
42040 +       init_lh(&lh);
42041 +       result = coord_by_key(tree, key,
42042 +                             &coord, &lh, ZNODE_READ_LOCK,
42043 +                             FIND_EXACT,
42044 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
42045 +       if (result == 0) {
42046 +               int ilen;
42047 +
42048 +               result = zload(coord.node);
42049 +               if (result == 0) {
42050 +                       ilen = item_length_by_coord(&coord);
42051 +                       if (length <= ilen) {
42052 +                               memcpy(item_body_by_coord(&coord), data,
42053 +                                      length);
42054 +                       } else {
42055 +                               warning("nikita-3437",
42056 +                                       "Wrong black box length: %i < %i",
42057 +                                       ilen, length);
42058 +                               result = RETERR(-EIO);
42059 +                       }
42060 +                       zrelse(coord.node);
42061 +               }
42062 +       }
42063 +       /* NOTE(review): memcpy() above ran under ZNODE_READ_LOCK -- confirm OK */
42064 +       done_lh(&lh);
42065 +       return result;
42066 +
42067 +}
42068 +/* Delete black box item: reiser4_cut_tree() over the range [@key, @key]. */
42069 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
42070 +{
42071 +       return reiser4_cut_tree(tree, key, key, NULL, 1);
42072 +}
42073 +
42074 +/* Make Linus happy.
42075 +   Local variables:
42076 +   c-indentation-style: "K&R"
42077 +   mode-name: "LC"
42078 +   c-basic-offset: 8
42079 +   tab-width: 8
42080 +   fill-column: 120
42081 +   End:
42082 +*/
42083 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.35/fs/reiser4/plugin/item/blackbox.h
42084 --- linux-2.6.35.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 01:00:00.000000000 +0100
42085 +++ linux-2.6.35/fs/reiser4/plugin/item/blackbox.h      2010-08-04 15:44:57.000000000 +0200
42086 @@ -0,0 +1,33 @@
42087 +/* Copyright 2003 by Hans Reiser, licensing governed by
42088 + * reiser4/README */
42089 +
42090 +/* "Black box" entry to contain fixed-width user supplied data */
42091 +
42092 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
42093 +#define __FS_REISER4_BLACK_BOX_H__
42094 +
42095 +#include "../../forward.h"
42096 +#include "../../dformat.h"
42097 +#include "../../kassign.h"
42098 +#include "../../key.h"
42099 +
42100 +extern int store_black_box(reiser4_tree * tree,
42101 +                          const reiser4_key * key, void *data, int length);
42102 +extern int load_black_box(reiser4_tree * tree,
42103 +                         reiser4_key * key, void *data, int length, int exact);
42104 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
42105 +extern int update_black_box(reiser4_tree * tree,
42106 +                           const reiser4_key * key, void *data, int length);
42107 +
42108 +/* __FS_REISER4_BLACK_BOX_H__ */
42109 +#endif
42110 +
42111 +/* Make Linus happy.
42112 +   Local variables:
42113 +   c-indentation-style: "K&R"
42114 +   mode-name: "LC"
42115 +   c-basic-offset: 8
42116 +   tab-width: 8
42117 +   fill-column: 120
42118 +   End:
42119 +*/
42120 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/cde.c linux-2.6.35/fs/reiser4/plugin/item/cde.c
42121 --- linux-2.6.35.orig/fs/reiser4/plugin/item/cde.c      1970-01-01 01:00:00.000000000 +0100
42122 +++ linux-2.6.35/fs/reiser4/plugin/item/cde.c   2010-08-04 15:44:57.000000000 +0200
42123 @@ -0,0 +1,1008 @@
42124 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42125 +
42126 +/* Directory entry implementation */
42127 +
42128 +/* DESCRIPTION:
42129 +
42130 +   This is "compound" directory item plugin implementation. This directory
42131 +   item type is compound (as opposed to the "simple directory item" in
42132 +   fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
42133 +   entries.
42134 +
42135 +   The reason behind this decision is disk space efficiency: all directory
42136 +   entries inside the same directory have identical fragment in their
42137 +   keys. This, of course, depends on key assignment policy. In our default key
42138 +   assignment policy, all directory entries have the same locality which is
42139 +   equal to the object id of their directory.
42140 +
42141 +   Composing directory item out of several directory entries for the same
42142 +   directory allows us to store said key fragment only once. That is, this is
42143 +   some ad hoc form of key compression (stem compression) that is implemented
42144 +   here, because general key compression is not supposed to be implemented in
42145 +   v4.0.
42146 +
42147 +   Another decision that was made regarding all directory item plugins, is
42148 +   that they will store entry keys unaligned. This is for the sake of disk
42149 +   space efficiency again.
42150 +
42151 +   It should be noted that storing keys unaligned increases CPU consumption,
42152 +   at least on some architectures.
42153 +
42154 +   Internal on-disk structure of the compound directory item is the following:
42155 +
42156 +        HEADER          cde_item_format.        Here number of entries is stored.
42157 +        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
42158 +        ENTRY_HEADER_1                          offset of entry body are stored.
42159 +        ENTRY_HEADER_2                         (basically two last parts of key)
42160 +        ...
42161 +        ENTRY_HEADER_N
42162 +        ENTRY_BODY_0    directory_entry_format. Here part of stat data key and
42163 +        ENTRY_BODY_1                            NUL-terminated name are stored.
42164 +        ENTRY_BODY_2                           (part of stat data key in the
42165 +                                                sense that since all SDs have
42166 +                                                zero offset, this offset is not
42167 +                                                stored on disk).
42168 +        ...
42169 +        ENTRY_BODY_N
42170 +
42171 +   When it comes to the balancing, each directory entry in compound directory
42172 +   item is unit, that is, something that can be cut from one item and pasted
42173 +   into another item of the same type. Handling of unit cut and paste is major
42174 +   reason for the complexity of code below.
42175 +
42176 +*/
42177 +
42178 +#include "../../forward.h"
42179 +#include "../../debug.h"
42180 +#include "../../dformat.h"
42181 +#include "../../kassign.h"
42182 +#include "../../key.h"
42183 +#include "../../coord.h"
42184 +#include "sde.h"
42185 +#include "cde.h"
42186 +#include "item.h"
42187 +#include "../node/node.h"
42188 +#include "../plugin.h"
42189 +#include "../../znode.h"
42190 +#include "../../carry.h"
42191 +#include "../../tree.h"
42192 +#include "../../inode.h"
42193 +
42194 +#include <linux/fs.h>          /* for struct inode */
42195 +#include <linux/dcache.h>      /* for struct dentry */
42196 +#include <linux/quotaops.h>
42197 +
42198 +#if 0
42199 +#define CHECKME(coord)                                         \
42200 +({                                                             \
42201 +       const char *message;                                    \
42202 +       coord_t dup;                                            \
42203 +                                                               \
42204 +       coord_dup_nocheck(&dup, (coord));                       \
42205 +       dup.unit_pos = 0;                                       \
42206 +       assert("nikita-2871", cde_check(&dup, &message) == 0);  \
42207 +})
42208 +#else
42209 +#define CHECKME(coord) noop
42210 +#endif
42211 +
42212 +/* return body of compound directory item at @coord */
42213 +static inline cde_item_format *formatted_at(const coord_t * coord)
42214 +{
42215 +       assert("nikita-1282", coord != NULL);
42216 +       return item_body_by_coord(coord);
42217 +}
42218 +
42219 +/* return entry header at @coord */
42220 +static inline cde_unit_header *header_at(const coord_t *
42221 +                                        coord /* coord of item */ ,
42222 +                                        int idx /* index of unit */ )
42223 +{
42224 +       assert("nikita-1283", coord != NULL);
42225 +       return &formatted_at(coord)->entry[idx];
42226 +}
42227 +
42228 +/* return number of units in compound directory item at @coord */
42229 +static int units(const coord_t * coord /* coord of item */ )
42230 +{
42231 +       return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
42232 +}
42233 +
42234 +/* return offset of the body of @idx-th entry in @coord */
42235 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
42236 +                             int idx /* index of unit */ )
42237 +{
42238 +       if (idx < units(coord))
42239 +               return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
42240 +       else if (idx == units(coord))
42241 +               return item_length_by_coord(coord);
42242 +       else
42243 +               impossible("nikita-1308", "Wrong idx");
42244 +       return 0;
42245 +}
42246 +
42247 +/* set offset of the body of @idx-th entry in @coord */
42248 +static void set_offset(const coord_t * coord /* coord of item */ ,
42249 +                      int idx /* index of unit */ ,
42250 +                      unsigned int offset /* new offset */ )
42251 +{
42252 +       put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
42253 +}
42254 +
42255 +static void adj_offset(const coord_t * coord /* coord of item */ ,
42256 +                      int idx /* index of unit */ ,
42257 +                      int delta /* offset change */ )
42258 +{
42259 +       d16 *doffset;
42260 +       __u16 offset;
42261 +
42262 +       doffset = &header_at(coord, idx)->offset;
42263 +       offset = le16_to_cpu(get_unaligned(doffset));
42264 +       offset += delta;
42265 +       put_unaligned(cpu_to_le16((__u16) offset), doffset);
42266 +}
42267 +
42268 +/* return pointer to @offset-th byte from the beginning of @coord */
42269 +static char *address(const coord_t * coord /* coord of item */ ,
42270 +                    int offset)
42271 +{
42272 +       return ((char *)item_body_by_coord(coord)) + offset;
42273 +}
42274 +
42275 +/* return pointer to the body of @idx-th entry in @coord */
42276 +static directory_entry_format *entry_at(const coord_t * coord  /* coord of
42277 +                                                                * item */ ,
42278 +                                       int idx /* index of unit */ )
42279 +{
42280 +       return (directory_entry_format *) address(coord,
42281 +                                                 (int)offset_of(coord, idx));
42282 +}
42283 +
42284 +/* return number of unit referenced by @coord */
42285 +static int idx_of(const coord_t * coord /* coord of item */ )
42286 +{
42287 +       assert("nikita-1285", coord != NULL);
42288 +       return coord->unit_pos;
42289 +}
42290 +
42291 +/* find position where entry with @entry_key would be inserted into @coord */
42292 +static int find(const coord_t * coord /* coord of item */ ,
42293 +               const reiser4_key * entry_key /* key to look for */ ,
42294 +               cmp_t * last /* result of last comparison */ )
42295 +{
42296 +       int entries;
42297 +
42298 +       int left;
42299 +       int right;
42300 +
42301 +       cde_unit_header *header;
42302 +
42303 +       assert("nikita-1295", coord != NULL);
42304 +       assert("nikita-1296", entry_key != NULL);
42305 +       assert("nikita-1297", last != NULL);
42306 +
42307 +       entries = units(coord);
42308 +       left = 0;
42309 +       right = entries - 1;
42310 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42311 +               int median;
42312 +
42313 +               median = (left + right) >> 1;
42314 +
42315 +               header = header_at(coord, median);
42316 +               *last = de_id_key_cmp(&header->hash, entry_key);
42317 +               switch (*last) {
42318 +               case LESS_THAN:
42319 +                       left = median;
42320 +                       break;
42321 +               case GREATER_THAN:
42322 +                       right = median;
42323 +                       break;
42324 +               case EQUAL_TO:{
42325 +                               do {
42326 +                                       median--;
42327 +                                       header--;
42328 +                               } while (median >= 0 &&
42329 +                                        de_id_key_cmp(&header->hash,
42330 +                                                      entry_key) == EQUAL_TO);
42331 +                               return median + 1;
42332 +                       }
42333 +               }
42334 +       }
42335 +       header = header_at(coord, left);
42336 +       for (; left < entries; ++left, ++header) {
42337 +               prefetch(header + 1);
42338 +               *last = de_id_key_cmp(&header->hash, entry_key);
42339 +               if (*last != LESS_THAN)
42340 +                       break;
42341 +       }
42342 +       if (left < entries)
42343 +               return left;
42344 +       else
42345 +               return RETERR(-ENOENT);
42346 +
42347 +}
42348 +
42349 +/* expand @coord as to accommodate for insertion of @no new entries starting
42350 +   from @pos, with total bodies size @size. */
42351 +static int expand_item(const coord_t * coord /* coord of item */ ,
42352 +                      int pos /* unit position */ , int no     /* number of new
42353 +                                                                * units*/ ,
42354 +                      int size /* total size of new units' data */ ,
42355 +                      unsigned int data_size   /* free space already reserved
42356 +                                                * in the item for insertion */ )
42357 +{
42358 +       int entries;
42359 +       cde_unit_header *header;
42360 +       char *dent;
42361 +       int i;
42362 +
42363 +       assert("nikita-1310", coord != NULL);
42364 +       assert("nikita-1311", pos >= 0);
42365 +       assert("nikita-1312", no > 0);
42366 +       assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42367 +       assert("nikita-1343",
42368 +              item_length_by_coord(coord) >=
42369 +              (int)(size + data_size + no * sizeof *header));
42370 +
42371 +       entries = units(coord);
42372 +
42373 +       if (pos == entries)
42374 +               dent = address(coord, size);
42375 +       else
42376 +               dent = (char *)entry_at(coord, pos);
42377 +       /* place where new header will be in */
42378 +       header = header_at(coord, pos);
42379 +       /* free space for new entry headers */
42380 +       memmove(header + no, header,
42381 +               (unsigned)(address(coord, size) - (char *)header));
42382 +       /* if adding to the end initialise first new header */
42383 +       if (pos == entries) {
42384 +               set_offset(coord, pos, (unsigned)size);
42385 +       }
42386 +
42387 +       /* adjust entry pointer and size */
42388 +       dent = dent + no * sizeof *header;
42389 +       size += no * sizeof *header;
42390 +       /* free space for new entries */
42391 +       memmove(dent + data_size, dent,
42392 +               (unsigned)(address(coord, size) - dent));
42393 +
42394 +       /* increase counter */
42395 +       entries += no;
42396 +       put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42397 +
42398 +       /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42399 +          bytes.  */
42400 +       for (i = 0; i <= pos; ++i)
42401 +               adj_offset(coord, i, no * sizeof *header);
42402 +       /* [ pos + no ... +\infty ) entries were shifted by ( no *
42403 +          sizeof *header + data_size ) bytes */
42404 +       for (i = pos + no; i < entries; ++i)
42405 +               adj_offset(coord, i, no * sizeof *header + data_size);
42406 +       return 0;
42407 +}
42408 +
42409 +/* insert new @entry into item */
42410 +static int expand(const coord_t * coord /* coord of item */ ,
42411 +                 struct cde_entry * entry /* entry to insert */ ,
42412 +                 int len /* length of @entry data */ ,
42413 +                 int *pos /* position to insert */ ,
42414 +                 reiser4_dir_entry_desc * dir_entry    /* parameters for new
42415 +                                                        * entry */ )
42416 +{
42417 +       cmp_t cmp_res;
42418 +       int datasize;
42419 +
42420 +       *pos = find(coord, &dir_entry->key, &cmp_res);
42421 +       if (*pos < 0)
42422 +               *pos = units(coord);
42423 +
42424 +       datasize = sizeof(directory_entry_format);
42425 +       if (is_longname(entry->name->name, entry->name->len))
42426 +               datasize += entry->name->len + 1;
42427 +
42428 +       expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42429 +                   datasize);
42430 +       return 0;
42431 +}
42432 +
42433 +/* fill in header and body of the new entry at @pos; always returns 0 */
42434 +static int paste_entry(const coord_t * coord /* coord of item */ ,
42435 +                      struct cde_entry * entry /* new entry */ ,
42436 +                      int pos /* position to insert */ ,
42437 +                      reiser4_dir_entry_desc * dir_entry       /* parameters for
42438 +                                                                * new entry */ )
42439 +{
42440 +       cde_unit_header *header;
42441 +       directory_entry_format *dent;
42442 +       const char *name;
42443 +       int len;
42444 +
42445 +       header = header_at(coord, pos);
42446 +       dent = entry_at(coord, pos);
42447 +
42448 +       build_de_id_by_key(&dir_entry->key, &header->hash);
42449 +       build_inode_key_id(entry->obj, &dent->id);
42450 +       /* Only long names are stored in the entry body. expand() reserved
42451 +          exactly entry->name->len + 1 bytes for such a name, so copy
42452 +          exactly @len bytes and write the terminating NUL ourselves.
42453 +
42454 +          memcpy() is used instead of strcpy() so that the write is bounded
42455 +          by the reserved space even if @name carries no NUL at
42456 +          name[len]. */
42457 +       name = entry->name->name;
42458 +       len = entry->name->len;
42459 +       if (is_longname(name, len)) {
42460 +               memcpy((unsigned char *)dent->name, name, len);
42461 +               put_unaligned(0, &dent->name[len]);
42462 +       }
42463 +       return 0;
42464 +}
42465 +
42466 +/* estimate how much space is necessary in item to insert/paste set of entries
42467 +   described in @data. */
42468 +int estimate_cde(const coord_t * coord /* coord of item */ ,
42469 +                const reiser4_item_data * data /* parameters for new item */ )
42470 +{
42471 +       struct cde_entry_data *e;
42472 +       int result;
42473 +       int i;
42474 +
42475 +       e = (struct cde_entry_data *) data->data;
42476 +
42477 +       assert("nikita-1288", e != NULL);
42478 +       assert("nikita-1289", e->num_of_entries >= 0);
42479 +
42480 +       if (coord == NULL)
42481 +               /* insert */
42482 +               result = sizeof(cde_item_format);
42483 +       else
42484 +               /* paste */
42485 +               result = 0;
42486 +
42487 +       result += e->num_of_entries *
42488 +           (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42489 +       for (i = 0; i < e->num_of_entries; ++i) {
42490 +               const char *name;
42491 +               int len;
42492 +
42493 +               name = e->entry[i].name->name;
42494 +               len = e->entry[i].name->len;
42495 +               assert("nikita-2054", strlen(name) == len);
42496 +               if (is_longname(name, len))
42497 +                       result += len + 1;
42498 +       }
42499 +       ((reiser4_item_data *) data)->length = result;
42500 +       return result;
42501 +}
42502 +
42503 +/* ->nr_units() method for this item plugin. */
42504 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42505 +{
42506 +       return units(coord);
42507 +}
42508 +
42509 +/* ->unit_key() method for this item plugin. */
42510 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42511 +                         reiser4_key * key /* resulting key */ )
42512 +{
42513 +       assert("nikita-1452", coord != NULL);
42514 +       assert("nikita-1345", idx_of(coord) < units(coord));
42515 +       assert("nikita-1346", key != NULL);
42516 +
42517 +       item_key_by_coord(coord, key);
42518 +       extract_key_from_de_id(extract_dir_id_from_key(key),
42519 +                              &header_at(coord, idx_of(coord))->hash, key);
42520 +       return key;
42521 +}
42522 +
42523 +/* mergeable_cde(): implementation of ->mergeable() item method.
42524 +
42525 +   Two directory items are mergeable iff they are from the same
42526 +   directory. That simple.
42527 +
42528 +*/
42529 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42530 +                 const coord_t * p2 /* coord of second item */ )
42531 +{
42532 +       reiser4_key k1;
42533 +       reiser4_key k2;
42534 +
42535 +       assert("nikita-1339", p1 != NULL);
42536 +       assert("nikita-1340", p2 != NULL);
42537 +
42538 +       return
42539 +           (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42540 +           (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42541 +            extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42542 +
42543 +}
42544 +
42545 +/* ->max_key_inside() method for this item plugin. */
42546 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42547 +                               reiser4_key * result /* resulting key */ )
42548 +{
42549 +       assert("nikita-1342", coord != NULL);
42550 +
42551 +       item_key_by_coord(coord, result);
42552 +       set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42553 +       set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42554 +       set_key_offset(result, get_key_offset(reiser4_max_key()));
42555 +       return result;
42556 +}
42557 +
42558 +/* @data contains data which are to be put into tree */
42559 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42560 +                       const reiser4_key * key /* key to check */ ,
42561 +                       const reiser4_item_data * data  /* parameters of new
42562 +                                                        * item/unit being
42563 +                                                        * created */ )
42564 +{
42565 +       reiser4_key item_key;
42566 +
42567 +       /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42568 +          data->iplug is initialized */
42569 +       assert("vs-457", data && data->iplug);
42570 +/*     assert( "vs-553", data -> user == 0 );*/
42571 +       item_key_by_coord(coord, &item_key);
42572 +
42573 +       return (item_plugin_by_coord(coord) == data->iplug) &&
42574 +           (extract_dir_id_from_key(&item_key) ==
42575 +            extract_dir_id_from_key(key));
42576 +}
42577 +
42578 +#if REISER4_DEBUG
42579 +/* reiser4_check_cde(): ->check() method for compressed directory items
42580 +
42581 +   used for debugging, every item should have here the most complete
42582 +   possible check of the consistency of the item that the inventor can
42583 +   construct
42584 +*/
42585 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42586 +                     const char **error /* where to store error message */)
42587 +{
42588 +       int i;
42589 +       int result;
42590 +       char *item_start;
42591 +       char *item_end;
42592 +       reiser4_key key;
42593 +
42594 +       coord_t c;
42595 +
42596 +       assert("nikita-1357", coord != NULL);
42597 +       assert("nikita-1358", error != NULL);
42598 +
42599 +       if (!ergo(coord->item_pos != 0,
42600 +                 is_dot_key(item_key_by_coord(coord, &key)))) {
42601 +               *error = "CDE doesn't start with dot";
42602 +               return -1;
42603 +       }
42604 +       item_start = item_body_by_coord(coord);
42605 +       item_end = item_start + item_length_by_coord(coord);
42606 +
42607 +       coord_dup(&c, coord);
42608 +       result = 0;
42609 +       for (i = 0; i < units(coord); ++i) {
42610 +               directory_entry_format *entry;
42611 +
42612 +               if ((char *)(header_at(coord, i) + 1) >
42613 +                   item_end - units(coord) * sizeof *entry) {
42614 +                       *error = "CDE header is out of bounds";
42615 +                       result = -1;
42616 +                       break;
42617 +               }
42618 +               entry = entry_at(coord, i);
42619 +               if ((char *)entry < item_start + sizeof(cde_item_format)) {
42620 +                       *error = "CDE header is too low";
42621 +                       result = -1;
42622 +                       break;
42623 +               }
42624 +               if ((char *)(entry + 1) > item_end) {
42625 +                       *error = "CDE header is too high";
42626 +                       result = -1;
42627 +                       break;
42628 +               }
42629 +       }
42630 +
42631 +       return result;
42632 +}
42633 +#endif
42634 +
42635 +/* ->init() method for this item plugin. */
42636 +int init_cde(coord_t * coord /* coord of item */ ,
42637 +            coord_t * from UNUSED_ARG, reiser4_item_data * data        /* structure used for insertion */
42638 +            UNUSED_ARG)
42639 +{
42640 +       put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42641 +       return 0;
42642 +}
42643 +
42644 +/* ->lookup() method for this item plugin. */
42645 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42646 +                        lookup_bias bias /* search bias */ ,
42647 +                        coord_t * coord /* coord of item to lookup in */ )
42648 +{
42649 +       cmp_t last_comp;
42650 +       int pos;
42651 +
42652 +       reiser4_key utmost_key;
42653 +
42654 +       assert("nikita-1293", coord != NULL);
42655 +       assert("nikita-1294", key != NULL);
42656 +
42657 +       CHECKME(coord);
42658 +
42659 +       if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42660 +               coord->unit_pos = 0;
42661 +               coord->between = BEFORE_UNIT;
42662 +               return CBK_COORD_NOTFOUND;
42663 +       }
42664 +       pos = find(coord, key, &last_comp);
42665 +       if (pos >= 0) {
42666 +               coord->unit_pos = (int)pos;
42667 +               switch (last_comp) {
42668 +               case EQUAL_TO:
42669 +                       coord->between = AT_UNIT;
42670 +                       return CBK_COORD_FOUND;
42671 +               case GREATER_THAN:
42672 +                       coord->between = BEFORE_UNIT;
42673 +                       return RETERR(-ENOENT);
42674 +               case LESS_THAN:
42675 +               default:
42676 +                       impossible("nikita-1298", "Broken find");
42677 +                       return RETERR(-EIO);
42678 +               }
42679 +       } else {
42680 +               coord->unit_pos = units(coord) - 1;
42681 +               coord->between = AFTER_UNIT;
42682 +               return (bias ==
42683 +                       FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42684 +                   CBK_COORD_NOTFOUND;
42685 +       }
42686 +}
42687 +
42688 +/* ->paste() method for this item plugin. */
42689 +int paste_cde(coord_t * coord /* coord of item */ ,
42690 +             reiser4_item_data * data  /* parameters of new unit being
42691 +                                        * inserted */ ,
42692 +             carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42693 +{
42694 +       struct cde_entry_data *e;
42695 +       int result;
42696 +       int i;
42697 +
42698 +       CHECKME(coord);
42699 +       e = (struct cde_entry_data *) data->data;
42700 +
42701 +       result = 0;
42702 +       for (i = 0; i < e->num_of_entries; ++i) {
42703 +               int pos;
42704 +               int phantom_size;
42705 +
42706 +               phantom_size = data->length;
42707 +               if (units(coord) == 0)
42708 +                       phantom_size -= sizeof(cde_item_format);
42709 +
42710 +               result =
42711 +                   expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42712 +               if (result != 0)
42713 +                       break;
42714 +               result = paste_entry(coord, e->entry + i, pos, data->arg);
42715 +               if (result != 0)
42716 +                       break;
42717 +       }
42718 +       CHECKME(coord);
42719 +       return result;
42720 +}
42721 +
42722 +/* amount of space occupied by item header and by the first @idx + 1 entries
42723 +   (units 0 through @idx), both headers and bodies. */
42724 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42725 +                             int idx /* index of unit */ )
42726 +{
42727 +       assert("nikita-1299", coord != NULL);
42728 +       assert("nikita-1300", idx < (int)units(coord));
42729 +
42730 +       return sizeof(cde_item_format) +
42731 +           (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42732 +                                                           idx + 1) -
42733 +           offset_of(coord, 0);
42734 +}
42735 +
42736 +/* how many units of the item at @coord, but not more than @want, fit into
42737 +   @free_space bytes. If @pend == SHIFT_LEFT the first units of the item
42738 +   are taken, if @pend == SHIFT_RIGHT - the last ones. Size of
42739 +   cde_item_format is counted only when @target is NULL. Number of units is
42740 +   returned, their total size in bytes is stored in @size (0 when nothing
42741 +   fits). @want is clamped to the number of units in the item. */
42742 +int can_shift_cde(unsigned free_space /* free space in item */ ,
42743 +                 coord_t * coord /* coord of source item */ ,
42744 +                 znode * target /* target node */ ,
42745 +                 shift_direction pend /* shift direction */ ,
42746 +                 unsigned *size /* resulting number of shifted bytes */ ,
42747 +                 unsigned want /* maximal number of units to shift */ )
42748 +{
42749 +       int shift;
42750 +
42751 +       CHECKME(coord);
42752 +       if (want == 0) {
42753 +               *size = 0;
42754 +               return 0;
42755 +       }
42756 +
42757 +       /* never index past the last unit, even if @want exceeds units() */
42758 +       if (pend == SHIFT_LEFT) {
42759 +               for (shift = min((int)want - 1, units(coord) - 1); shift >= 0;
42760 +                    --shift) {
42761 +                       *size = part_size(coord, shift);
42762 +                       if (target != NULL)
42763 +                               *size -= sizeof(cde_item_format);
42764 +                       if (*size <= free_space)
42765 +                               break;
42766 +               }
42767 +               shift = shift + 1;
42768 +       } else {
42769 +               int total_size;
42770 +
42771 +               assert("nikita-1301", pend == SHIFT_RIGHT);
42772 +
42773 +               total_size = item_length_by_coord(coord);
42774 +               for (shift = units(coord) - min((int)want, units(coord)) - 1;
42775 +                    shift < units(coord) - 1; ++shift) {
42776 +                       *size = total_size - part_size(coord, shift);
42777 +                       if (target == NULL)
42778 +                               *size += sizeof(cde_item_format);
42779 +                       if (*size <= free_space)
42780 +                               break;
42781 +               }
42782 +               shift = units(coord) - shift - 1;
42783 +       }
42784 +       if (shift == 0)
42785 +               *size = 0;
42786 +       CHECKME(coord);
42787 +       return shift;
42788 +}
42789 +
42790 +/* ->copy_units() method for this item plugin. */
42791 +void copy_units_cde(coord_t * target /* coord of target item */ ,
42792 +                   coord_t * source /* coord of source item */ ,
42793 +                   unsigned from /* starting unit */ ,
42794 +                   unsigned count /* how many units to copy */ ,
42795 +                   shift_direction where_is_free_space /* shift direction */ ,
42796 +                   unsigned free_space /* free space in item */ )
42797 +{
42798 +       char *header_from;
42799 +       char *header_to;
42800 +
42801 +       char *entry_from;
42802 +       char *entry_to;
42803 +
42804 +       int pos_in_target;
42805 +       int data_size;
42806 +       int data_delta;
42807 +       int i;
42808 +
42809 +       assert("nikita-1303", target != NULL);
42810 +       assert("nikita-1304", source != NULL);
42811 +       assert("nikita-1305", (int)from < units(source));
42812 +       assert("nikita-1307", (int)(from + count) <= units(source));
42813 +
42814 +       if (where_is_free_space == SHIFT_LEFT) {
42815 +               assert("nikita-1453", from == 0);
42816 +               pos_in_target = units(target);
42817 +       } else {
42818 +               assert("nikita-1309", (int)(from + count) == units(source));
42819 +               pos_in_target = 0;
42820 +               memmove(item_body_by_coord(target),
42821 +                       (char *)item_body_by_coord(target) + free_space,
42822 +                       item_length_by_coord(target) - free_space);
42823 +       }
42824 +
42825 +       CHECKME(target);
42826 +       CHECKME(source);
42827 +
42828 +       /* expand @target */
42829 +       data_size =
42830 +           offset_of(source, (int)(from + count)) - offset_of(source,
42831 +                                                              (int)from);
42832 +
42833 +       if (units(target) == 0)
42834 +               free_space -= sizeof(cde_item_format);
42835 +
42836 +       expand_item(target, pos_in_target, (int)count,
42837 +                   (int)(item_length_by_coord(target) - free_space),
42838 +                   (unsigned)data_size);
42839 +
42840 +       /* copy first @count units of @source into @target */
42841 +       data_delta =
42842 +           offset_of(target, pos_in_target) - offset_of(source, (int)from);
42843 +
42844 +       /* copy entries */
42845 +       entry_from = (char *)entry_at(source, (int)from);
42846 +       entry_to = (char *)entry_at(source, (int)(from + count));
42847 +       memmove(entry_at(target, pos_in_target), entry_from,
42848 +               (unsigned)(entry_to - entry_from));
42849 +
42850 +       /* copy headers */
42851 +       header_from = (char *)header_at(source, (int)from);
42852 +       header_to = (char *)header_at(source, (int)(from + count));
42853 +       memmove(header_at(target, pos_in_target), header_from,
42854 +               (unsigned)(header_to - header_from));
42855 +
42856 +       /* update offsets */
42857 +       for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
42858 +               adj_offset(target, i, data_delta);
42859 +       CHECKME(target);
42860 +       CHECKME(source);
42861 +}
42862 +
42863 +/* ->cut_units() method for this item plugin. */
42864 +int cut_units_cde(coord_t * coord /* coord of item */ ,
42865 +                 pos_in_node_t from /* first unit to cut */ ,
42866 +                 pos_in_node_t to /* last unit to cut, inclusive */ ,
42867 +                 struct carry_cut_data *cdata UNUSED_ARG,
42868 +                 reiser4_key * smallest_removed, reiser4_key * new_first)
42869 +{
42870 +       char *header_from;
42871 +       char *header_to;
42872 +
42873 +       char *entry_from;
42874 +       char *entry_to;
42875 +
42876 +       int size;
42877 +       int entry_delta;
42878 +       int header_delta;
42879 +       int i;
42880 +
42881 +       unsigned count;
42882 +
42883 +       CHECKME(coord);
42884 +       /* cut units [from, to] out of the item at @coord and return the number of bytes freed */
42885 +       count = to - from + 1;
42886 +
42887 +       assert("nikita-1454", coord != NULL);
42888 +       assert("nikita-1455", (int)(from + count) <= units(coord));
42889 +       /* both key outputs are optional; this one is filled by unit_key_by_coord() for @coord itself */
42890 +       if (smallest_removed)
42891 +               unit_key_by_coord(coord, smallest_removed);
42892 +
42893 +       if (new_first) {
42894 +               coord_t next;
42895 +
42896 +               /* not everything is cut from item head */
42897 +               assert("vs-1527", from == 0);
42898 +               assert("vs-1528", to < units(coord) - 1);
42899 +               /* new_first is filled from a copy of @coord advanced by one unit position */
42900 +               coord_dup(&next, coord);
42901 +               next.unit_pos++;
42902 +               unit_key_by_coord(&next, new_first);
42903 +       }
42904 +
42905 +       size = item_length_by_coord(coord);
42906 +       if (count == (unsigned)units(coord)) {
42907 +               return size;    /* every unit is cut: report the whole item length, nothing to repack */
42908 +       }
42909 +
42910 +       header_from = (char *)header_at(coord, (int)from);
42911 +       header_to = (char *)header_at(coord, (int)(from + count));
42912 +
42913 +       entry_from = (char *)entry_at(coord, (int)from);
42914 +       entry_to = (char *)entry_at(coord, (int)(from + count));
42915 +
42916 +       /* move headers: everything from header_to up to address(coord, size) slides down onto header_from */
42917 +       memmove(header_from, header_to,
42918 +               (unsigned)(address(coord, size) - header_to));
42919 +
42920 +       header_delta = header_to - header_from;
42921 +       /* the entry bodies were moved by that memmove as well, so re-base the entry pointers and the size */
42922 +       entry_from -= header_delta;
42923 +       entry_to -= header_delta;
42924 +       size -= header_delta;
42925 +
42926 +       /* move entries: close the gap left by the bodies of the cut entries */
42927 +       memmove(entry_from, entry_to,
42928 +               (unsigned)(address(coord, size) - entry_to));
42929 +
42930 +       entry_delta = entry_to - entry_from;
42931 +       size -= entry_delta;
42932 +
42933 +       /* update offsets */
42934 +       /* units before @from are adjusted by the header bytes only, later units by header and entry bytes */
42935 +       for (i = 0; i < (int)from; ++i)
42936 +               adj_offset(coord, i, -header_delta);
42937 +
42938 +       for (i = from; i < units(coord) - (int)count; ++i)
42939 +               adj_offset(coord, i, -header_delta - entry_delta);
42940 +
42941 +       put_unaligned(cpu_to_le16((__u16) units(coord) - count),
42942 +                     &formatted_at(coord)->num_of_entries);
42943 +
42944 +       if (from == 0) {
42945 +               /* entries were removed from the head - move the remaining data to the right */
42946 +               memmove((char *)item_body_by_coord(coord) +
42947 +                       header_delta + entry_delta, item_body_by_coord(coord),
42948 +                       (unsigned)size);
42949 +               if (REISER4_DEBUG)
42950 +                       memset(item_body_by_coord(coord), 0,
42951 +                              (unsigned)header_delta + entry_delta);
42952 +       } else {
42953 +               /* freed space is already at the end of item */
42954 +               if (REISER4_DEBUG)
42955 +                       memset((char *)item_body_by_coord(coord) + size, 0,
42956 +                              (unsigned)header_delta + entry_delta);
42957 +       }
42958 +
42959 +       return header_delta + entry_delta;
42960 +}
42961 +/* kill_units for cde items: same as cut_units_cde(); @kdata is ignored and NULL is passed as cdata */
42962 +int kill_units_cde(coord_t * coord /* coord of item */ ,
42963 +                  pos_in_node_t from /* first unit to kill */ ,
42964 +                  pos_in_node_t to /* last unit to kill, inclusive */ ,
42965 +                  struct carry_kill_data *kdata UNUSED_ARG,
42966 +                  reiser4_key * smallest_removed, reiser4_key * new_first)
42967 +{
42968 +       return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
42969 +}
42970 +
42971 +/* ->s.dir.extract_key() method for this item plugin: build @key from the id stored in the entry at @coord. */
42972 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
42973 +                   reiser4_key * key /* resulting key */ )
42974 +{
42975 +       directory_entry_format *dent;
42976 +
42977 +       assert("nikita-1155", coord != NULL);
42978 +       assert("nikita-1156", key != NULL);
42979 +
42980 +       dent = entry_at(coord, idx_of(coord));
42981 +       return extract_key_from_id(&dent->id, key);     /* its return value is passed through unchanged */
42982 +}
42983 +/* ->s.dir.update_key(): store the object id built from @key in the entry at @coord and dirty the node */
42984 +int
42985 +update_key_cde(const coord_t * coord, const reiser4_key * key,
42986 +              lock_handle * lh UNUSED_ARG)
42987 +{
42988 +       directory_entry_format *dent;
42989 +       obj_key_id obj_id;
42990 +       int result;
42991 +
42992 +       assert("nikita-2344", coord != NULL);
42993 +       assert("nikita-2345", key != NULL);
42994 +
42995 +       dent = entry_at(coord, idx_of(coord));
42996 +       result = build_obj_key_id(key, &obj_id);
42997 +       if (result == 0) {
42998 +               dent->id = obj_id;
42999 +               znode_make_dirty(coord->node);
43000 +       }
43001 +       return result;  /* 0 on success; a build_obj_key_id() failure is propagated, entry left untouched */
43002 +}
43003 +
43004 +/* ->s.dir.extract_name() method for this item plugin: returns extract_dent_name() for the entry at @coord. */
43005 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
43006 +{
43007 +       directory_entry_format *dent;
43008 +
43009 +       assert("nikita-1157", coord != NULL);
43010 +
43011 +       dent = entry_at(coord, idx_of(coord));
43012 +       return extract_dent_name(coord, dent, buf);     /* @buf is handed to the helper as is */
43013 +}
43014 +/* byte count used for quota in add_entry_cde(): @data->length, less the item header unless @pasting is set */
43015 +static int cde_bytes(int pasting, const reiser4_item_data * data)
43016 +{
43017 +       int result;
43018 +
43019 +       result = data->length;
43020 +       if (!pasting)
43021 +               result -= sizeof(cde_item_format);
43022 +       return result;
43023 +}
43024 +
43025 +/* ->s.dir.add_entry() method for this item plugin: charge quota, then add a single entry for @name */
43026 +int add_entry_cde(struct inode *dir /* directory object */ ,
43027 +                 coord_t * coord /* coord of item */ ,
43028 +                 lock_handle * lh /* lock handle for insertion */ ,
43029 +                 const struct dentry *name /* name to insert */ ,
43030 +                 reiser4_dir_entry_desc * dir_entry    /* parameters of new
43031 +                                                        * directory entry */ )
43032 +{
43033 +       reiser4_item_data data;
43034 +       struct cde_entry entry;
43035 +       struct cde_entry_data edata;
43036 +       int result;
43037 +
43038 +       assert("nikita-1656", coord->node == lh->node);
43039 +       assert("nikita-1657", znode_is_write_locked(coord->node));
43040 +
43041 +       edata.num_of_entries = 1;
43042 +       edata.entry = &entry;
43043 +
43044 +       entry.dir = dir;
43045 +       entry.obj = dir_entry->obj;
43046 +       entry.name = &name->d_name;
43047 +
43048 +       data.data = (char *)&edata;
43049 +       data.user = 0;          /* &edata is not user space */
43050 +       data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
43051 +       data.arg = dir_entry;
43052 +       assert("nikita-1302", data.iplug != NULL);
43053 +       /* result first serves as a flag (is_dot_key() outcome) selecting the estimate and the call made below */
43054 +       result = is_dot_key(&dir_entry->key);
43055 +       data.length = estimate_cde(result ? coord : NULL, &data);
43056 +       /* NOTE(review): the space charged here is not released if the call below fails - confirm callers cope */
43057 +       /* NOTE-NIKITA quota plugin? */
43058 +       if (dquot_alloc_space_nodirty(dir, cde_bytes(result, &data)))
43059 +               return RETERR(-EDQUOT);
43060 +
43061 +       if (result)
43062 +               result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
43063 +       else
43064 +               result = reiser4_resize_item(coord, &data, &dir_entry->key,
43065 +                                            lh, 0);
43066 +       return result;
43067 +}
43068 +
43069 +/* ->s.dir.rem_entry(): kill the entry at @coord and, on success, release the quota charged for it */
43070 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
43071 +                 const struct qstr *name, coord_t * coord /* coord of item */ ,
43072 +                 lock_handle * lh UNUSED_ARG   /* lock handle for
43073 +                                                * removal */ ,
43074 +                 reiser4_dir_entry_desc * entry UNUSED_ARG     /* parameters of
43075 +                                                                * directory entry
43076 +                                                                * being removed */ )
43077 +{
43078 +       coord_t shadow;
43079 +       int result;
43080 +       int length;
43081 +       ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
43082 +
43083 +       assert("nikita-2870", strlen(name->name) == name->len);
43084 +       assert("nikita-2869",
43085 +              !strcmp(name->name, extract_name_cde(coord, buf)));
43086 +       /* bytes accounted to this entry: entry format plus unit header, plus name and NUL for long names */
43087 +       length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
43088 +       if (is_longname(name->name, name->len))
43089 +               length += name->len + 1;
43090 +       /* refuse to remove more bytes than the directory inode accounts for */
43091 +       if (inode_get_bytes(dir) < length) {
43092 +               warning("nikita-2628", "Dir is broke: %llu: %llu",
43093 +                       (unsigned long long)get_inode_oid(dir),
43094 +                       inode_get_bytes(dir));
43095 +
43096 +               return RETERR(-EIO);
43097 +       }
43098 +
43099 +       /* cut_node() is supposed to take pointers to _different_
43100 +          coords, because it will modify them without respect to
43101 +          possible aliasing. To work around this, create temporary copy
43102 +          of @coord. (The call made below is kill_node_content().)
43103 +        */
43104 +       coord_dup(&shadow, coord);
43105 +       result =
43106 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
43107 +       if (result == 0) {
43108 +               /* NOTE-NIKITA quota plugin? */
43109 +               dquot_free_space_nodirty(dir, length);
43110 +       }
43111 +       return result;
43112 +}
43113 +
43114 +/* ->s.dir.max_name_len(): node plugin's max item size less entry format, item and unit headers, and 2 bytes */
43115 +int max_name_len_cde(const struct inode *dir /* directory */ )
43116 +{
43117 +       return
43118 +               reiser4_tree_by_inode(dir)->nplug->max_item_size() -
43119 +               sizeof(directory_entry_format) - sizeof(cde_item_format) -
43120 +               sizeof(cde_unit_header) - 2;    /* NOTE(review): the reason for the extra 2 is not stated here */
43121 +}
43122 +
43123 +/* Make Linus happy.
43124 +   Local variables:
43125 +   c-indentation-style: "K&R"
43126 +   mode-name: "LC"
43127 +   c-basic-offset: 8
43128 +   tab-width: 8
43129 +   fill-column: 120
43130 +   End:
43131 +*/
43132 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/cde.h linux-2.6.35/fs/reiser4/plugin/item/cde.h
43133 --- linux-2.6.35.orig/fs/reiser4/plugin/item/cde.h      1970-01-01 01:00:00.000000000 +0100
43134 +++ linux-2.6.35/fs/reiser4/plugin/item/cde.h   2010-08-04 15:44:57.000000000 +0200
43135 @@ -0,0 +1,87 @@
43136 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43137 +
43138 +/* Compound directory item. See cde.c for description. */
43139 +
43140 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
43141 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
43142 +
43143 +#include "../../forward.h"
43144 +#include "../../kassign.h"
43145 +#include "../../dformat.h"
43146 +
43147 +#include <linux/fs.h>          /* for struct inode */
43148 +#include <linux/dcache.h>      /* for struct dentry, etc  */
43149 +/* per-unit header; cde_item_format carries an array of these after the entry count */
43150 +typedef struct cde_unit_header {
43151 +       de_id hash;
43152 +       d16 offset;     /* NOTE(review): presumably the entry's position inside the item - confirm in cde.c */
43153 +} cde_unit_header;
43154 +/* start of a cde item: the number of entries followed by one cde_unit_header per entry */
43155 +typedef struct cde_item_format {
43156 +       d16 num_of_entries;
43157 +       cde_unit_header entry[0];
43158 +} cde_item_format;
43159 +/* one entry to add; add_entry_cde() fills it with the directory, the target object and the dentry name */
43160 +struct cde_entry {
43161 +       const struct inode *dir;
43162 +       const struct inode *obj;
43163 +       const struct qstr *name;
43164 +};
43165 +/* payload passed as reiser4_item_data.data by add_entry_cde(), which sets one entry and a count of 1 */
43166 +struct cde_entry_data {
43167 +       int num_of_entries;
43168 +       struct cde_entry *entry;
43169 +};
43170 +
43171 +/* plugin->u.item.b.* */
43172 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
43173 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
43174 +                       const reiser4_item_data *);
43175 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
43176 +pos_in_node_t nr_units_cde(const coord_t * coord);
43177 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
43178 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
43179 +void print_cde(const char *prefix, coord_t * coord);
43180 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
43181 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
43182 +                        coord_t * coord);
43183 +int paste_cde(coord_t * coord, reiser4_item_data * data,
43184 +             carry_plugin_info * info UNUSED_ARG);
43185 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
43186 +                 shift_direction pend, unsigned *size, unsigned want);
43187 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
43188 +                   unsigned count, shift_direction where_is_free_space,
43189 +                   unsigned free_space);
43190 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43191 +                 struct carry_cut_data *, reiser4_key * smallest_removed,
43192 +                 reiser4_key * new_first);
43193 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43194 +                  struct carry_kill_data *, reiser4_key * smallest_removed,
43195 +                  reiser4_key * new_first);
43196 +void print_cde(const char *prefix, coord_t * coord);     /* NOTE(review): repeats the declaration above */
43197 +int reiser4_check_cde(const coord_t * coord, const char **error);
43198 +
43199 +/* plugin->u.item.s.dir.* */
43200 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
43201 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
43202 +                  lock_handle * lh);
43203 +char *extract_name_cde(const coord_t * coord, char *buf);
43204 +int add_entry_cde(struct inode *dir, coord_t * coord,
43205 +                 lock_handle * lh, const struct dentry *name,
43206 +                 reiser4_dir_entry_desc * entry);
43207 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
43208 +                 lock_handle * lh, reiser4_dir_entry_desc * entry);
43209 +int max_name_len_cde(const struct inode *dir);
43210 +
43211 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
43212 +#endif
43213 +
43214 +/* Make Linus happy.
43215 +   Local variables:
43216 +   c-indentation-style: "K&R"
43217 +   mode-name: "LC"
43218 +   c-basic-offset: 8
43219 +   tab-width: 8
43220 +   fill-column: 120
43221 +   End:
43222 +*/
43223 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.35/fs/reiser4/plugin/item/ctail.c
43224 --- linux-2.6.35.orig/fs/reiser4/plugin/item/ctail.c    1970-01-01 01:00:00.000000000 +0100
43225 +++ linux-2.6.35/fs/reiser4/plugin/item/ctail.c 2010-08-04 15:44:57.000000000 +0200
43226 @@ -0,0 +1,1613 @@
43227 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
43228 +
43229 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
43230 +
43231 +/* DESCRIPTION:
43232 +
43233 +Each cryptcompress object is stored on disk as a set of clusters sliced
43234 +into ctails.
43235 +
43236 +Internal on-disk structure:
43237 +
43238 +        HEADER   (1)  The disk cluster shift is stored here
43239 +       BODY
43240 +*/
43241 +
43242 +#include "../../forward.h"
43243 +#include "../../debug.h"
43244 +#include "../../dformat.h"
43245 +#include "../../kassign.h"
43246 +#include "../../key.h"
43247 +#include "../../coord.h"
43248 +#include "item.h"
43249 +#include "../node/node.h"
43250 +#include "../plugin.h"
43251 +#include "../object.h"
43252 +#include "../../znode.h"
43253 +#include "../../carry.h"
43254 +#include "../../tree.h"
43255 +#include "../../inode.h"
43256 +#include "../../super.h"
43257 +#include "../../context.h"
43258 +#include "../../page_cache.h"
43259 +#include "../cluster.h"
43260 +#include "../../flush.h"
43261 +#include "../../tree_walk.h"
43262 +
43263 +#include <linux/pagevec.h>
43264 +#include <linux/swap.h>
43265 +#include <linux/fs.h>
43266 +
43267 +/* return the body of the ctail item at @coord, typed as its ctail_item_format header */
43268 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43269 +{
43270 +       assert("edward-60", coord != NULL);
43271 +       return item_body_by_coord(coord);
43272 +}
43273 +/* read the cluster_shift field from the header of the ctail item at @coord (unaligned read) */
43274 +static int cluster_shift_by_coord(const coord_t * coord)
43275 +{
43276 +       return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43277 +}
43278 +/* copy the cluster shift of the ctail item @hint's coord points at into the hint's ctail extension */
43279 +static inline void dclust_set_extension_shift(hint_t * hint)
43280 +{
43281 +       assert("edward-1270",
43282 +              item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43283 +       hint->ext_coord.extension.ctail.shift =
43284 +           cluster_shift_by_coord(&hint->ext_coord.coord);
43285 +}
43286 +/* offset component of the key of the item at @coord */
43287 +static loff_t off_by_coord(const coord_t * coord)
43288 +{
43289 +       reiser4_key key;
43290 +       return get_key_offset(item_key_by_coord(coord, &key));
43291 +}
43292 +/* true if the ctail at @coord is "unprepped", i.e. its stored cluster shift equals UCTAIL_SHIFT */
43293 +int coord_is_unprepped_ctail(const coord_t * coord)
43294 +{
43295 +       assert("edward-1233", coord != NULL);
43296 +       assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43297 +       assert("edward-1235",
43298 +              ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43299 +                   nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43300 +
43301 +       return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43302 +}
43303 +/* item's key offset >> cluster shift; the shift comes from @inode when given, else from the item itself */
43304 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43305 +{
43306 +       int shift;
43307 +
43308 +       if (inode != NULL) {
43309 +               shift = inode_cluster_shift(inode);
43310 +               assert("edward-1236",
43311 +                      ergo(!coord_is_unprepped_ctail(coord),
43312 +                           shift == cluster_shift_by_coord(coord)));
43313 +       } else {
43314 +               assert("edward-1237", !coord_is_unprepped_ctail(coord));
43315 +               shift = cluster_shift_by_coord(coord);
43316 +       }
43317 +       return off_by_coord(coord) >> shift;
43318 +}
43319 +/* disk cluster size of the prepped ctail at @coord: 1 << its stored cluster shift */
43320 +static int disk_cluster_size(const coord_t * coord)
43321 +{
43322 +       assert("edward-1156",
43323 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43324 +       /* calculation of disk cluster size
43325 +          is meaningless if ctail is unprepped */
43326 +       assert("edward-1238", !coord_is_unprepped_ctail(coord));
43327 +
43328 +       return 1 << cluster_shift_by_coord(coord);
43329 +}
43330 +
43331 +/* true if @key is of the first disk cluster item: ctail unprepped, or key offset aligned to cluster size */
43332 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43333 +{
43334 +       assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43335 +
43336 +       return coord_is_unprepped_ctail(coord) ||
43337 +           ((get_key_offset(key) &
43338 +             ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43339 +}
43340 +/* address of the first unit of the ctail at @coord: item body plus sizeof(ctail_item_format) */
43341 +static char *first_unit(coord_t * coord)
43342 +{
43343 +       /* FIXME: warning: pointer of type `void *' used in arithmetic */
43344 +       return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43345 +}
43346 +
43347 +/* plugin->u.item.b.max_key_inside :
43348 +   tail_max_key_inside */
43349 +
43350 +/* plugin->u.item.b.can_contain_key: 1 if @key directly continues the item at @coord, 0 otherwise */
43351 +int
43352 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43353 +                     const reiser4_item_data * data)
43354 +{
43355 +       reiser4_key item_key;
43356 +
43357 +       if (item_plugin_by_coord(coord) != data->iplug)
43358 +               return 0;
43359 +
43360 +       item_key_by_coord(coord, &item_key);
43361 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
43362 +           get_key_objectid(key) != get_key_objectid(&item_key))
43363 +               return 0;
43364 +       if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43365 +           get_key_offset(key))
43366 +               return 0;       /* @key is not the offset right after the item's last unit */
43367 +       if (is_disk_cluster_key(key, coord))
43368 +               return 0;       /* @key is of the first item of a disk cluster */
43369 +       return 1;
43370 +}
43371 +
43372 +/* plugin->u.item.b.mergeable: 1 if ctail @p2 directly continues @p1 within the same disk cluster, else 0 */
43373 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43374 +{
43375 +       reiser4_key key1, key2;
43376 +
43377 +       assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43378 +       assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43379 +                                           UNIX_FILE_METADATA_ITEM_TYPE));
43380 +
43381 +       if (item_id_by_coord(p2) != CTAIL_ID) {
43382 +               /* second item is of another type */
43383 +               return 0;
43384 +       }
43385 +
43386 +       item_key_by_coord(p1, &key1);
43387 +       item_key_by_coord(p2, &key2);
43388 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
43389 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
43390 +           get_key_type(&key1) != get_key_type(&key2)) {
43391 +               /* items of different objects */
43392 +               return 0;
43393 +       }
43394 +       if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43395 +               /* not adjacent items */
43396 +               return 0;
43397 +       if (is_disk_cluster_key(&key2, p2))
43398 +               return 0;       /* @p2 is the first item of a disk cluster */
43399 +       return 1;
43400 +}
43401 +
43402 +/* plugin->u.item.b.nr_units: item length minus the size of the cluster_shift header field */
43403 +pos_in_node_t nr_units_ctail(const coord_t * coord)
43404 +{
43405 +       return (item_length_by_coord(coord) -
43406 +               sizeof(ctail_formatted_at(coord)->cluster_shift));
43407 +}
43408 +
43409 +/* plugin->u.item.b.estimate:
43410 +   estimate how much space is needed to insert/paste @data->length bytes
43411 +   into ctail at @coord; a NULL @coord means insertion, which also needs room for the item header */
43412 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
43413 +                  const reiser4_item_data *
43414 +                  data /* parameters for new item */ )
43415 +{
43416 +       if (coord == NULL)
43417 +               /* insert */
43418 +               return (sizeof(ctail_item_format) + data->length);
43419 +       else
43420 +               /* paste */
43421 +               return data->length;
43422 +}
43423 +
43424 +/* ->init() method for this item plugin: write the cluster shift, from @data->arg or from @from, into @to. */
43425 +int init_ctail(coord_t * to /* coord of item */ ,
43426 +              coord_t * from /* old_item */ ,
43427 +              reiser4_item_data * data /* structure used for insertion */ )
43428 +{
43429 +       int cluster_shift;      /* cpu value to convert */
43430 +
43431 +       if (data) {
43432 +               assert("edward-463", data->length > sizeof(ctail_item_format));
43433 +               cluster_shift = *((int *)(data->arg));
43434 +               data->length -= sizeof(ctail_item_format);      /* side effect: header size is taken off @data */
43435 +       } else {
43436 +               assert("edward-464", from != NULL);
43437 +               assert("edward-855", ctail_ok(from));
43438 +               cluster_shift = (int)(cluster_shift_by_coord(from));
43439 +       }
43440 +       put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43441 +       assert("edward-856", ctail_ok(to));
43442 +       return 0;
43443 +}
43444 +
43445 +/* plugin->u.item.b.lookup:
43446 +   NULL: We are looking for item keys only */
43447 +/* debug-only: ctail_ok() accepts an unprepped ctail or one whose stored shift passes cluster_shift_ok() */
43448 +#if REISER4_DEBUG
43449 +int ctail_ok(const coord_t * coord)
43450 +{
43451 +       return coord_is_unprepped_ctail(coord) ||
43452 +           cluster_shift_ok(cluster_shift_by_coord(coord));
43453 +}
43454 +
43455 +/* plugin->u.item.b.check: 1 (with *@error set, if @error is given) when ctail_ok() fails, else 0 */
43456 +int check_ctail(const coord_t * coord, const char **error)
43457 +{
43458 +       if (!ctail_ok(coord)) {
43459 +               if (error)
43460 +                       *error = "bad cluster shift in ctail";
43461 +               return 1;
43462 +       }
43463 +       return 0;
43464 +}
43465 +#endif
43466 +
43467 +/* plugin->u.item.b.paste: copy @data->length bytes from @data->data into the ctail at @coord */
43468 +int
43469 +paste_ctail(coord_t * coord, reiser4_item_data * data,
43470 +           carry_plugin_info * info UNUSED_ARG)
43471 +{
43472 +       unsigned old_nr_units;
43473 +
43474 +       assert("edward-268", data->data != NULL);
43475 +       /* copy only from kernel space */
43476 +       assert("edward-66", data->user == 0);
43477 +       /* the item length already includes the pasted bytes, so take them off to get the old unit count */
43478 +       old_nr_units =
43479 +           item_length_by_coord(coord) - sizeof(ctail_item_format) -
43480 +           data->length;
43481 +
43482 +       /* ctail items never get pasted in the middle */
43483 +
43484 +       if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43485 +
43486 +               /* paste at the beginning when a new item is created */
43487 +               assert("edward-450",
43488 +                      item_length_by_coord(coord) ==
43489 +                      data->length + sizeof(ctail_item_format));
43490 +               assert("edward-451", old_nr_units == 0);
43491 +       } else if (coord->unit_pos == old_nr_units - 1
43492 +                  && coord->between == AFTER_UNIT) {
43493 +
43494 +               /* paste at the end */
43495 +               coord->unit_pos++;
43496 +       } else
43497 +               impossible("edward-453", "bad paste position");
43498 +
43499 +       memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43500 +
43501 +       assert("edward-857", ctail_ok(coord));
43502 +
43503 +       return 0;
43504 +}
43505 +
43506 +/* plugin->u.item.b.fast_paste */
43507 +
43508 +/* plugin->u.item.b.can_shift
43509 +   number of units is returned via return value, number of bytes via @size. For
43510 +   ctail items they coincide, except that a new item (no @target) also spends bytes on its header */
43511 +int
43512 +can_shift_ctail(unsigned free_space, coord_t * source,
43513 +               znode * target, shift_direction direction UNUSED_ARG,
43514 +               unsigned *size /* number of bytes */ , unsigned want)
43515 +{
43516 +       /* make sure that we do not want to shift more than we have */
43517 +       assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43518 +
43519 +       *size = min(want, free_space);
43520 +
43521 +       if (!target) {
43522 +               /* new item will be created */
43523 +               if (*size <= sizeof(ctail_item_format)) {
43524 +                       *size = 0;
43525 +                       return 0;
43526 +               }
43527 +               return *size - sizeof(ctail_item_format);
43528 +       }
43529 +       return *size;
43530 +}
43531 +
43532 +/* plugin->u.item.b.copy_units: copy @count units of @source, starting at @from, into the expanded @target;
43533 +   cooperates with ->can_shift() */
43534 +void
43535 +copy_units_ctail(coord_t * target, coord_t * source,
43536 +                unsigned from, unsigned count /* units */ ,
43537 +                shift_direction where_is_free_space,
43538 +                unsigned free_space /* bytes */ )
43539 +{
43540 +       /* make sure that item @target is expanded already */
43541 +       assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43542 +       assert("edward-70", free_space == count || free_space == count + 1);
43543 +
43544 +       assert("edward-858", ctail_ok(source));
43545 +
43546 +       if (where_is_free_space == SHIFT_LEFT) {
43547 +               /* append item @target with @count first bytes of @source:
43548 +                  this restriction came from ordinary tails */
43549 +               assert("edward-71", from == 0);
43550 +               assert("edward-860", ctail_ok(target));
43551 +
43552 +               memcpy(first_unit(target) + nr_units_ctail(target) - count,
43553 +                      first_unit(source), count);
43554 +       } else {
43555 +               /* target item is moved to right already */
43556 +               reiser4_key key;
43557 +
43558 +               assert("edward-72", nr_units_ctail(source) == from + count);
43559 +
43560 +               if (free_space == count) {
43561 +                       init_ctail(target, source, NULL);
43562 +               } else {
43563 +                       /* new item has been created */
43564 +                       assert("edward-862", ctail_ok(target));
43565 +               }
43566 +               memcpy(first_unit(target), first_unit(source) + from, count);
43567 +
43568 +               assert("edward-863", ctail_ok(target));
43569 +
43570 +               /* new units are inserted before first unit in an item,
43571 +                  therefore, we have to update item key */
43572 +               item_key_by_coord(source, &key);
43573 +               set_key_offset(&key, get_key_offset(&key) + from);
43574 +
43575 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
43576 +                                                                  NULL /*info */);
43577 +       }
43578 +}
43579 +
43580 +/* plugin->u.item.b.create_hook: mark the node holding the new item as convertible; @arg is not used */
43581 +int create_hook_ctail(const coord_t * coord, void *arg)
43582 +{
43583 +       assert("edward-864", znode_is_loaded(coord->node));
43584 +
43585 +       znode_set_convertible(coord->node);
43586 +       return 0;
43587 +}
43588 +
43589 +/* plugin->u.item.b.kill_hook: per-inode bookkeeping when a ctail is killed from unit 0; always returns 0 */
43590 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43591 +                   pos_in_node_t count, carry_kill_data * kdata)
43592 +{
43593 +       struct inode *inode;
43594 +
43595 +       assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43596 +       assert("edward-291", znode_is_write_locked(coord->node));
43597 +
43598 +       inode = kdata->inode;
43599 +       if (inode) {
43600 +               reiser4_key key;
43601 +               struct cryptcompress_info * info;
43602 +               cloff_t index;
43603 +
43604 +               item_key_by_coord(coord, &key);
43605 +               info = cryptcompress_inode_data(inode);
43606 +               index = off_to_clust(get_key_offset(&key), inode);
43607 +
43608 +               if (from == 0) {
43609 +                       info->trunc_index = index;      /* remember the cluster index of the killed item */
43610 +                       if (is_disk_cluster_key(&key, coord)) {
43611 +                               /*
43612 +                                * first item of disk cluster is to be killed
43613 +                                */
43614 +                               truncate_complete_page_cluster(
43615 +                                       inode, index, kdata->params.truncate);
43616 +                               inode_sub_bytes(inode,
43617 +                                               inode_cluster_size(inode));
43618 +                       }
43619 +               }
43620 +       }
43621 +       return 0;
43622 +}
43623 +
43624 +/* for shift_hook_ctail(),
43625 +   return true if the ctail is unprepped or the jnode looked up for the item's offset is dirty
43626 +*/
43627 +static int ctail_convertible(const coord_t * coord)
43628 +{
43629 +       int result;
43630 +       reiser4_key key;
43631 +       jnode *child = NULL;
43632 +
43633 +       assert("edward-477", coord != NULL);
43634 +       assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43635 +
43636 +       if (coord_is_unprepped_ctail(coord))
43637 +               /* unprepped ctail should be converted */
43638 +               return 1;
43639 +
43640 +       item_key_by_coord(coord, &key);
43641 +       child = jlookup(current_tree,
43642 +                       get_key_objectid(&key),
43643 +                       off_to_pg(off_by_coord(coord)));
43644 +       if (!child)
43645 +               return 0;       /* no jnode found for that page index */
43646 +       result = JF_ISSET(child, JNODE_DIRTY);
43647 +       jput(child);
43648 +       return result;
43649 +}
43650 +
43651 +/* FIXME-EDWARD */
43652 +/* plugin->u.item.b.shift_hook: carry the convertible mark from @old_node over to the item's new node */
43653 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43654 +                    unsigned from UNUSED_ARG /* start unit */ ,
43655 +                    unsigned count UNUSED_ARG /* stop unit */ ,
43656 +                    znode * old_node /* old parent */ )
43657 +{
43658 +       assert("edward-479", item != NULL);
43659 +       assert("edward-480", item->node != old_node);
43660 +       /* nothing to do unless the old node was convertible and the new one is not marked yet */
43661 +       if (!znode_convertible(old_node) || znode_convertible(item->node))
43662 +               return 0;
43663 +       if (ctail_convertible(item))
43664 +               znode_set_convertible(item->node);
43665 +       return 0;
43666 +}
43667 +/* drop units [from, to] of the ctail at @coord; with @cut == 0 the kill hook runs first with @p as kill data */
43668 +static int
43669 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43670 +                       int cut, void *p, reiser4_key * smallest_removed,
43671 +                       reiser4_key * new_first)
43672 +{
43673 +       pos_in_node_t count;    /* number of units to cut */
43674 +       char *item;
43675 +
43676 +       count = to - from + 1;
43677 +       item = item_body_by_coord(coord);
43678 +
43679 +       assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43680 +
43681 +       if (smallest_removed) {
43682 +               /* store smallest key removed */
43683 +               item_key_by_coord(coord, smallest_removed);
43684 +               set_key_offset(smallest_removed,
43685 +                              get_key_offset(smallest_removed) + from);
43686 +       }
43687 +
43688 +       if (new_first) {
43689 +               assert("vs-1531", from == 0);
43690 +
43691 +               item_key_by_coord(coord, new_first);
43692 +               set_key_offset(new_first,
43693 +                              get_key_offset(new_first) + from + count);
43694 +       }
43695 +
43696 +       if (!cut)
43697 +               kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43698 +
43699 +       if (from == 0) {
43700 +               if (count != nr_units_ctail(coord)) {
43701 +                       /* part of item is removed, so move free space at the beginning
43702 +                          of the item and update item key */
43703 +                       reiser4_key key;
43704 +                       memcpy(item + to + 1, item, sizeof(ctail_item_format));
43705 +                       item_key_by_coord(coord, &key);
43706 +                       set_key_offset(&key, get_key_offset(&key) + count);
43707 +                       node_plugin_by_node(coord->node)->update_item_key(coord,
43708 +                                                                         &key,
43709 +                                                                         NULL);
43710 +               } else {
43711 +                       /* cut_units should not be called to cut everything */
43712 +                       assert("vs-1532", ergo(cut, 0));
43713 +                       /* whole item is cut, so more than the amount of space occupied
43714 +                          by units got freed */
43715 +                       count += sizeof(ctail_item_format);
43716 +               }
43717 +               if (REISER4_DEBUG)
43718 +                       memset(item, 0, count);
43719 +       } else if (REISER4_DEBUG)
43720 +               memset(item + sizeof(ctail_item_format) + from, 0, count);
43721 +       return count;   /* bytes freed: the cut units, plus the header when the whole item is removed */
43722 +}
43723 +
43724 +/* plugin->u.item.b.cut_units: cut variant of unit removal (no kill hook is run); @cdata is not used */
43725 +int
43726 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43727 +               carry_cut_data * cdata, reiser4_key * smallest_removed,
43728 +               reiser4_key * new_first)
43729 +{
43730 +       return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43731 +                                      smallest_removed, new_first);
43732 +}
43733 +
43734 +/* plugin->u.item.b.kill_units: kill variant of unit removal; @kdata is handed on to the kill hook */
43735 +int
43736 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43737 +                struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43738 +                reiser4_key * new_first)
43739 +{
43740 +       return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43741 +                                      smallest_removed, new_first);
43742 +}
43743 +
43744 +/* plugin->u.item.s.file.read: copies a whole ctail into flow @f and advances the flow; always returns 0 */
43745 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43746 +{
43747 +       uf_coord_t *uf_coord;
43748 +       coord_t *coord;
43749 +
43750 +       uf_coord = &hint->ext_coord;
43751 +       coord = &uf_coord->coord;
43752 +       assert("edward-127", f->user == 0);
43753 +       assert("edward-129", coord && coord->node);
43754 +       assert("edward-130", coord_is_existing_unit(coord));
43755 +       assert("edward-132", znode_is_loaded(coord->node));
43756 +
43757 +       /* start read only from the beginning of ctail */
43758 +       assert("edward-133", coord->unit_pos == 0);
43759 +       /* read only whole ctails */
43760 +       assert("edward-135", nr_units_ctail(coord) <= f->length);
43761 +
43762 +       assert("edward-136", reiser4_schedulable());
43763 +       assert("edward-886", ctail_ok(coord));
43764 +       /* a flow without a data buffer is only advanced, nothing is copied */
43765 +       if (f->data)
43766 +               memcpy(f->data, (char *)first_unit(coord),
43767 +                      (size_t) nr_units_ctail(coord));
43768 +
43769 +       dclust_set_extension_shift(hint);
43770 +       mark_page_accessed(znode_page(coord->node));
43771 +       move_flow_forward(f, nr_units_ctail(coord));
43772 +
43773 +       return 0;
43774 +}
43775 +
43776 +/**
43777 + * Prepare transform stream with plain text for @page. The page lock is
43778 + * dropped around the disk cluster lookup and re-taken on every return path.
43779 + * Returns 0, AOP_TRUNCATED_PAGE, or an error code of a failed helper. */
43780 +static int ctail_read_disk_cluster(struct cluster_handle * clust,
43781 +                                  struct inode * inode, struct page * page,
43782 +                                  znode_lock_mode mode)
43783 +{
43784 +       int result;
43785 +
43786 +       assert("edward-1450", mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
43787 +       assert("edward-671", clust->hint != NULL);
43788 +       assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43789 +       assert("edward-672", cryptcompress_inode_ok(inode));
43790 +       assert("edward-1527", PageLocked(page));
43791 +
43792 +       unlock_page(page);
43793 +
43794 +       /* set input stream */
43795 +       result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43796 +       if (result) {
43797 +               lock_page(page);        /* callers expect the page locked on return */
43798 +               return result;
43799 +       }
43800 +       result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43801 +       lock_page(page);
43802 +       if (result)
43803 +               return result;
43804 +       /*
43805 +        * at this point we have locked position in the tree
43806 +        */
43807 +       assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43808 +       /* the page was unlocked for a while: re-check its state before using it */
43809 +       if (page->mapping != inode->i_mapping) {
43810 +               /* page was truncated */
43811 +               reiser4_unset_hint(clust->hint);
43812 +               reset_cluster_params(clust);
43813 +               return AOP_TRUNCATED_PAGE;
43814 +       }
43815 +       if (PageUptodate(page)) {
43816 +               /* disk cluster can be obsolete, don't use it! */
43817 +               reiser4_unset_hint(clust->hint);
43818 +               reset_cluster_params(clust);
43819 +               return 0;
43820 +       }
43821 +       if (clust->dstat == FAKE_DISK_CLUSTER ||
43822 +           clust->dstat == UNPR_DISK_CLUSTER ||
43823 +           clust->dstat == TRNC_DISK_CLUSTER) {
43824 +               /*
43825 +                * this information about disk cluster will be valid
43826 +                * as long as we keep the position in the tree locked
43827 +                */
43828 +               tfm_cluster_set_uptodate(&clust->tc);
43829 +               return 0;
43830 +       }
43831 +       /* now prepare output stream.. */
43832 +       result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43833 +       if (result)
43834 +               return result;
43835 +       /* ..and fill this with plain text */
43836 +       result = reiser4_inflate_cluster(clust, inode);
43837 +       if (result)
43838 +               return result;
43839 +       /*
43840 +        * The stream is ready! It won't be obsolete as
43841 +        * long as we keep last disk cluster item locked.
43842 +        */
43843 +       tfm_cluster_set_uptodate(&clust->tc);
43844 +       return 0;
43845 +}
43846 +
43847 +/*
43848 + * Fill one locked page with plain text. Returns 0, AOP_TRUNCATED_PAGE, or
43849 + * the error reported by ctail_read_disk_cluster(). The page stays locked. */
43850 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43851 +                     struct page *page, znode_lock_mode mode)
43852 +{
43853 +       int ret;
43854 +       unsigned cloff;
43855 +       char *data;
43856 +       size_t to_page;
43857 +       struct tfm_cluster * tc = &clust->tc;
43858 +
43859 +       assert("edward-212", PageLocked(page));
43860 +       /* the page does not belong to this inode's mapping any more */
43861 +       if (unlikely(page->mapping != inode->i_mapping))
43862 +               return AOP_TRUNCATED_PAGE;
43863 +       if (PageUptodate(page))
43864 +               goto exit;
43865 +       to_page = pbytes(page_index(page), inode);
43866 +       if (to_page == 0) {     /* nothing to copy into this page: hand out zeroes */
43867 +               zero_user(page, 0, PAGE_CACHE_SIZE);
43868 +               SetPageUptodate(page);
43869 +               goto exit;
43870 +       }
43871 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
43872 +               clust->index = pg_to_clust(page->index, inode);
43873 +
43874 +               /* this will unlock/lock the page */
43875 +               ret = ctail_read_disk_cluster(clust, inode, page, mode);
43876 +               /* NOTE(review): the label "edward-212" is also used by the first assert above */
43877 +               assert("edward-212", PageLocked(page));
43878 +               if (ret)
43879 +                       return ret;
43880 +
43881 +               /* refresh bytes: the page was unlocked in between */
43882 +               to_page = pbytes(page_index(page), inode);
43883 +               if (to_page == 0) {
43884 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
43885 +                       SetPageUptodate(page);
43886 +                       goto exit;
43887 +               }
43888 +       }
43889 +       if (PageUptodate(page))
43890 +               /* somebody else filled it already */
43891 +               goto exit;
43892 +
43893 +       assert("edward-119", tfm_cluster_is_uptodate(tc));
43894 +       assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
43895 +
43896 +       switch (clust->dstat) {
43897 +       case UNPR_DISK_CLUSTER:
43898 +               BUG_ON(1);      /* not expected here; no break follows, so control would fall through */
43899 +       case TRNC_DISK_CLUSTER:
43900 +               /*
43901 +                * Race with truncate!
43902 +                * We resolve it in favour of the last one (the only way,
43903 +                * as in this case plain text is unrecoverable)
43904 +                * Fall through: the page is zeroed like for a fake cluster. */
43905 +       case FAKE_DISK_CLUSTER:
43906 +               /* fill the page by zeroes */
43907 +               zero_user(page, 0, PAGE_CACHE_SIZE);
43908 +               SetPageUptodate(page);
43909 +               break;
43910 +       case PREP_DISK_CLUSTER:
43911 +               /* fill page by transformed stream with plain text */
43912 +               assert("edward-1058", !PageUptodate(page));
43913 +               assert("edward-120", tc->len <= inode_cluster_size(inode));
43914 +
43915 +               /* page index in this logical cluster */
43916 +               cloff = pg_to_off_to_cloff(page->index, inode);
43917 +               /* copy @to_page bytes of plain text and zero the rest of the page */
43918 +               data = kmap(page);
43919 +               memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
43920 +               memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
43921 +               flush_dcache_page(page);
43922 +               kunmap(page);
43923 +               SetPageUptodate(page);
43924 +               break;
43925 +       default:
43926 +               impossible("edward-1169", "bad disk cluster state");
43927 +       }
43928 +      exit:
43929 +       return 0;
43930 +}
43931 +
43932 +/* plugin->u.item.s.file.readpage: @vp is a struct cluster_handle; @page is unlocked on every return path */
43933 +int readpage_ctail(void *vp, struct page *page)
43934 +{
43935 +       int result;
43936 +       hint_t * hint;
43937 +       struct cluster_handle * clust = vp;
43938 +
43939 +       assert("edward-114", clust != NULL);
43940 +       assert("edward-115", PageLocked(page));
43941 +       assert("edward-116", !PageUptodate(page));
43942 +       assert("edward-118", page->mapping && page->mapping->host);
43943 +       assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
43944 +       /* the hint lives only for the duration of this call */
43945 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43946 +       if (hint == NULL) {
43947 +               unlock_page(page);
43948 +               return RETERR(-ENOMEM);
43949 +       }
43950 +       clust->hint = hint;
43951 +       result = load_file_hint(clust->file, hint);
43952 +       if (result) {
43953 +               kfree(hint);    /* NOTE(review): clust->hint still points to the freed hint */
43954 +               unlock_page(page);
43955 +               return result;
43956 +       }
43957 +       assert("vs-25", hint->ext_coord.lh == &hint->lh);
43958 +
43959 +       result = do_readpage_ctail(page->mapping->host, clust, page,
43960 +                                  ZNODE_READ_LOCK);
43961 +       assert("edward-213", PageLocked(page));
43962 +       assert("edward-1163", ergo(!result, PageUptodate(page)));
43963 +       /* release the page and the lock handle, then store the invalidated hint */
43964 +       unlock_page(page);
43965 +       done_lh(&hint->lh);
43966 +       hint->ext_coord.valid = 0;
43967 +       save_file_hint(clust->file, hint);
43968 +       kfree(hint);
43969 +       tfm_cluster_clr_uptodate(&clust->tc);
43970 +
43971 +       return result;
43972 +}
43973 +
43974 +/* Helper function for ->readpages(): fills every page of the page cluster prepared for @clust */
43975 +static int ctail_read_page_cluster(struct cluster_handle * clust,
43976 +                                  struct inode *inode)
43977 +{
43978 +       int i;
43979 +       int result;
43980 +       assert("edward-779", clust != NULL);
43981 +       assert("edward-1059", clust->win == NULL);
43982 +       assert("edward-780", inode != NULL);
43983 +
43984 +       result = prepare_page_cluster(inode, clust, READ_OP);
43985 +       if (result)
43986 +               return result;
43987 +
43988 +       assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
43989 +       /* each page is locked only while it is being filled; stop at the first failure */
43990 +       for (i = 0; i < clust->nr_pages; i++) {
43991 +               struct page *page = clust->pages[i];
43992 +               lock_page(page);
43993 +               result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
43994 +               unlock_page(page);
43995 +               if (result)
43996 +                       break;
43997 +       }
43998 +       tfm_cluster_clr_uptodate(&clust->tc);
43999 +       put_page_cluster(clust, inode, READ_OP);
44000 +       return result;
44001 +}
44002 +
44003 +/* filler for read_cache_pages(); @data is the struct cluster_handle; @page is unlocked before returning */
44004 +static int ctail_readpages_filler(void * data, struct page * page)
44005 +{
44006 +       int ret = 0;
44007 +       struct cluster_handle * clust = data;
44008 +       struct inode * inode = clust->file->f_dentry->d_inode;
44009 +
44010 +       assert("edward-1525", page->mapping == inode->i_mapping);
44011 +       /* cheap cases first: already up to date, or nothing to copy into the page */
44012 +       if (PageUptodate(page)) {
44013 +               unlock_page(page);
44014 +               return 0;
44015 +       }
44016 +       if (pbytes(page_index(page), inode) == 0) {
44017 +               zero_user(page, 0, PAGE_CACHE_SIZE);
44018 +               SetPageUptodate(page);
44019 +               unlock_page(page);
44020 +               return 0;
44021 +       }
44022 +       move_cluster_forward(clust, inode, page->index);
44023 +       unlock_page(page);
44024 +       /*
44025 +        * read the whole page cluster
44026 +        */
44027 +       ret = ctail_read_page_cluster(clust, inode);
44028 +
44029 +       assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
44030 +       return ret;
44031 +}
44032 +
44033 +/*
44034 + * We populate a bit more than the upper readahead suggests:
44035 + * with each nominated page we read the whole page cluster
44036 + * this page belongs to. Returns 0 or the first error met.
44037 + */
44038 +int readpages_ctail(struct file *file, struct address_space *mapping,
44039 +                   struct list_head *pages)
44040 +{
44041 +       int ret = 0;
44042 +       hint_t *hint;
44043 +       struct cluster_handle clust;
44044 +       struct inode *inode = mapping->host;
44045 +
44046 +       assert("edward-1521", inode == file->f_dentry->d_inode);
44047 +
44048 +       cluster_init_read(&clust, NULL);
44049 +       clust.file = file;
44050 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
44051 +       if (hint == NULL) {
44052 +               warning("vs-28", "failed to allocate hint");
44053 +               ret = RETERR(-ENOMEM);
44054 +               goto exit1;
44055 +       }
44056 +       clust.hint = hint;
44057 +       ret = load_file_hint(clust.file, hint);
44058 +       if (ret) {
44059 +               warning("edward-1522", "failed to load hint");
44060 +               goto exit2;
44061 +       }
44062 +       assert("vs-26", hint->ext_coord.lh == &hint->lh);
44063 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
44064 +       if (ret) {
44065 +               warning("edward-1523", "failed to alloc pgset");
44066 +               goto exit3;
44067 +       }
44068 +       ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
44069 +
44070 +       assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
44071 + exit3:
44072 +       done_lh(&hint->lh);
44073 +       hint->ext_coord.valid = 0;      /* invalidate before saving, as readpage_ctail() does */
44074 +       save_file_hint(file, hint);
44075 + exit2:
44076 +       kfree(hint);
44077 + exit1:
44078 +       put_cluster_handle(&clust);
44079 +       return ret;
44080 +}
44081 +
44082 +/*
44083 +   plugin->u.item.s.file.append_key
44084 +   key of the first item of the next disk cluster; stored in @key, which is also returned
44085 +*/
44086 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
44087 +{
44088 +       assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
44089 +       assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
44090 +       /* take this item's key and set its offset to (cluster index + 1) << cluster shift */
44091 +       item_key_by_coord(coord, key);
44092 +       set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
44093 +                      << cluster_shift_by_coord(coord));
44094 +       return key;
44095 +}
44096 +/* Inserts a zero-filled ctail item of UCTAIL_NR_UNITS units at the coord kept in clust->hint. */
44097 +static int insert_unprepped_ctail(struct cluster_handle * clust,
44098 +                                 struct inode *inode)
44099 +{
44100 +       int result;
44101 +       char buf[UCTAIL_NR_UNITS];      /* item units, all zero */
44102 +       reiser4_item_data data;
44103 +       reiser4_key key;
44104 +       int shift = (int)UCTAIL_SHIFT;  /* passed to the item plugin through data.arg */
44105 +
44106 +       memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
44107 +       result = key_by_inode_cryptcompress(inode,
44108 +                                           clust_to_off(clust->index, inode),
44109 +                                           &key);
44110 +       if (result)
44111 +               return result;
44112 +       data.user = 0;
44113 +       data.iplug = item_plugin_by_id(CTAIL_ID);
44114 +       data.arg = &shift;
44115 +       data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
44116 +       data.data = buf;
44117 +       /* the length above covers the item header plus the units */
44118 +       result = insert_by_coord(&clust->hint->ext_coord.coord,
44119 +                                &data, &key, clust->hint->ext_coord.lh, 0);
44120 +       return result;
44121 +}
44122 +/* Posts and runs a COP_INSERT_FLOW carry operation inserting flow @f as ctail items at @coord; @lh is tracked. */
44123 +static int
44124 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
44125 +                         int cluster_shift)
44126 +{
44127 +       int result;
44128 +       carry_pool *pool;
44129 +       carry_level *lowest_level;
44130 +       reiser4_item_data *data;
44131 +       carry_op *op;
44132 +       /* one allocation holds the pool, three carry levels and the item data */
44133 +       pool =
44134 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
44135 +                           sizeof(*data));
44136 +       if (IS_ERR(pool))
44137 +               return PTR_ERR(pool);
44138 +       lowest_level = (carry_level *) (pool + 1);
44139 +       init_carry_level(lowest_level, pool);
44140 +       data = (reiser4_item_data *) (lowest_level + 3);
44141 +
44142 +       assert("edward-466", coord->between == AFTER_ITEM
44143 +              || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
44144 +              || coord->between == EMPTY_NODE
44145 +              || coord->between == BEFORE_UNIT);
44146 +       /* a position after a unit is turned into a position after the item */
44147 +       if (coord->between == AFTER_UNIT) {
44148 +               coord->unit_pos = 0;
44149 +               coord->between = AFTER_ITEM;
44150 +       }
44151 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
44152 +                               0 /* operate directly on coord -> node */);
44153 +       if (IS_ERR(op) || (op == NULL)) {
44154 +               done_carry_pool(pool);
44155 +               return RETERR(op ? PTR_ERR(op) : -EIO);
44156 +       }
44157 +       data->user = 0;
44158 +       data->iplug = item_plugin_by_id(CTAIL_ID);
44159 +       data->arg = &cluster_shift;
44160 +       /* no payload here: the bytes to insert come from the flow */
44161 +       data->length = 0;
44162 +       data->data = NULL;
44163 +
44164 +       op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
44165 +       op->u.insert_flow.insert_point = coord;
44166 +       op->u.insert_flow.flow = f;
44167 +       op->u.insert_flow.data = data;
44168 +       op->u.insert_flow.new_nodes = 0;
44169 +
44170 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
44171 +       lowest_level->tracked = lh;
44172 +
44173 +       result = reiser4_carry(lowest_level, NULL);
44174 +       done_carry_pool(pool);
44175 +
44176 +       return result;
44177 +}
44178 +
44179 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion: inserts flow @f after the item at @coord */
44180 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
44181 +                                             lock_handle * lh, flow_t * f,
44182 +                                             int cluster_shift)
44183 +{
44184 +       int ret;
44185 +       coord_t pos;            /* private copy, so that @coord is not modified */
44186 +       lock_handle lock;       /* private copy of @lh handed to the insertion */
44187 +
44188 +       assert("edward-484",
44189 +              coord->between == AT_UNIT || coord->between == AFTER_ITEM);
44190 +       assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
44191 +
44192 +       coord_dup(&pos, coord);
44193 +       pos.unit_pos = 0;
44194 +       pos.between = AFTER_ITEM;
44195 +
44196 +       init_lh(&lock);
44197 +       copy_lh(&lock, lh);
44198 +
44199 +       ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
44200 +       done_lh(&lock);
44201 +       assert("edward-1347", znode_is_write_locked(lh->node));
44202 +       assert("edward-1228", !ret);
44203 +       return ret;
44204 +}
44205 +
44206 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion: overwrites item units with flow data; always returns 0 */
44207 +static int overwrite_ctail(coord_t * coord, flow_t * f)
44208 +{
44209 +       unsigned count;         /* number of bytes to overwrite */
44210 +
44211 +       assert("edward-269", f->user == 0);
44212 +       assert("edward-270", f->data != NULL);
44213 +       assert("edward-271", f->length > 0);
44214 +       assert("edward-272", coord_is_existing_unit(coord));
44215 +       assert("edward-273", coord->unit_pos == 0);
44216 +       assert("edward-274", znode_is_write_locked(coord->node));
44217 +       assert("edward-275", reiser4_schedulable());
44218 +       assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
44219 +       assert("edward-1243", ctail_ok(coord));
44220 +
44221 +       count = nr_units_ctail(coord);
44222 +       /* never copy more than the flow still holds */
44223 +       if (count > f->length)
44224 +               count = f->length;
44225 +       memcpy(first_unit(coord), f->data, count);
44226 +       move_flow_forward(f, count);
44227 +       coord->unit_pos += count;       /* now points past the overwritten units */
44228 +       return 0;
44229 +}
44230 +
44231 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
44232 +   cut ctail (part or whole) from coord->unit_pos up to the last unit */
44233 +static int cut_ctail(coord_t * coord)
44234 +{
44235 +       coord_t stop;   /* end of the range to cut */
44236 +
44237 +       assert("edward-435", coord->between == AT_UNIT &&
44238 +              coord->item_pos < coord_num_items(coord) &&
44239 +              coord->unit_pos <= coord_num_units(coord));
44240 +
44241 +       if (coord->unit_pos == coord_num_units(coord))
44242 +               /* nothing to cut */
44243 +               return 0;
44244 +       coord_dup(&stop, coord);
44245 +       stop.unit_pos = coord_last_unit_pos(coord);
44246 +
44247 +       return cut_node_content(coord, &stop, NULL, NULL, NULL);
44248 +}
44249 +/* Takes a write lock at the position of the fake disk cluster @clust and inserts an unprepped ctail item there. */
44250 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
44251 +                                  struct inode * inode)
44252 +{
44253 +       int result;
44254 +       assert("edward-1244", inode != NULL);
44255 +       assert("edward-1245", clust->hint != NULL);
44256 +       assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44257 +       assert("edward-1247", clust->reserved == 1);
44258 +
44259 +       result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44260 +       if (cbk_errored(result))
44261 +               return result;
44262 +       assert("edward-1249", result == CBK_COORD_NOTFOUND);
44263 +       assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44264 +
44265 +       assert("edward-1295",
44266 +              clust->hint->ext_coord.lh->node ==
44267 +              clust->hint->ext_coord.coord.node);
44268 +
44269 +       coord_set_between_clusters(&clust->hint->ext_coord.coord);
44270 +
44271 +       result = insert_unprepped_ctail(clust, inode);
44272 +       all_grabbed2free();
44273 +       /* NOTE(review): debug builds assert success here, yet @result is still returned below */
44274 +       assert("edward-1251", !result);
44275 +       assert("edward-1252", cryptcompress_inode_ok(inode));
44276 +       assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44277 +       assert("edward-1254",
44278 +              reiser4_clustered_blocks(reiser4_get_current_sb()));
44279 +       assert("edward-1255",
44280 +              znode_convertible(clust->hint->ext_coord.coord.node));
44281 +
44282 +       return result;
44283 +}
44284 +/* Performs one conversion step in @mode on the item at pos->coord, using the flow kept in item_convert_data(pos). */
44285 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44286 +{
44287 +       int result = 0;
44288 +       struct convert_item_info * info;
44289 +
44290 +       assert("edward-468", pos != NULL);
44291 +       assert("edward-469", pos->sq != NULL);
44292 +       assert("edward-845", item_convert_data(pos) != NULL);
44293 +
44294 +       info = item_convert_data(pos);
44295 +       assert("edward-679", info->flow.data != NULL);
44296 +
44297 +       switch (mode) {
44298 +       case CRC_APPEND_ITEM:
44299 +               assert("edward-1229", info->flow.length != 0);
44300 +               assert("edward-1256",
44301 +                      cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44302 +               result =
44303 +                   insert_cryptcompress_flow_in_place(&pos->coord,
44304 +                                                      &pos->lock,
44305 +                                                      &info->flow,
44306 +                                                      info->cluster_shift);
44307 +               break;
44308 +       case CRC_OVERWRITE_ITEM:
44309 +               assert("edward-1230", info->flow.length != 0);
44310 +               overwrite_ctail(&pos->coord, &info->flow);      /* always returns 0 */
44311 +               if (info->flow.length != 0)
44312 +                       break;
44313 +       case CRC_CUT_ITEM:      /* also reached by fall-through once the flow is used up */
44314 +               assert("edward-1231", info->flow.length == 0);
44315 +               result = cut_ctail(&pos->coord);
44316 +               break;
44317 +       default:
44318 +               result = RETERR(-EIO);
44319 +               impossible("edward-244", "bad convert mode");
44320 +       }
44321 +       return result;
44322 +}
44323 +
44324 +/* plugin->u.item.f.scan: on a left scan, dirties the parent node and marks it convertible; returns 0 or -EAGAIN */
44325 +int scan_ctail(flush_scan * scan)
44326 +{
44327 +       int result = 0;
44328 +       struct page *page;
44329 +       struct inode *inode;
44330 +       jnode *node = scan->node;
44331 +
44332 +       assert("edward-227", scan->node != NULL);
44333 +       assert("edward-228", jnode_is_cluster_page(scan->node));
44334 +       assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44335 +
44336 +       page = jnode_page(node);
44337 +       inode = page->mapping->host;
44338 +       /* NOTE(review): @page and @inode are not used further in this function */
44339 +       if (!reiser4_scanning_left(scan))
44340 +               return result;
44341 +       if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44342 +               znode_make_dirty(scan->parent_lock.node);
44343 +       /* a dirty cluster page makes its parent convertible; a clean one ends the scan */
44344 +       if (!znode_convertible(scan->parent_lock.node)) {
44345 +               if (JF_ISSET(scan->node, JNODE_DIRTY))
44346 +                       znode_set_convertible(scan->parent_lock.node);
44347 +               else {
44348 +                       warning("edward-681",
44349 +                               "cluster page is already processed");
44350 +                       return -EAGAIN;
44351 +               }
44352 +       }
44353 +       return result;
44354 +}
44355 +
44356 +/* If true, this function attaches children: pos->child is left set; otherwise pos->child is NULL on return */
44357 +static int should_attach_convert_idata(flush_pos_t * pos)
44358 +{
44359 +       int result;
44360 +       assert("edward-431", pos != NULL);
44361 +       assert("edward-432", pos->child == NULL);
44362 +       assert("edward-619", znode_is_write_locked(pos->coord.node));
44363 +       assert("edward-470",
44364 +              item_plugin_by_coord(&pos->coord) ==
44365 +              item_plugin_by_id(CTAIL_ID));
44366 +
44367 +       /* check for leftmost child */
44368 +       utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44369 +
44370 +       if (!pos->child)
44371 +               return 0;
44372 +       spin_lock_jnode(pos->child);    /* flag and atom are examined under the jnode spin lock */
44373 +       result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44374 +                 pos->child->atom == ZJNODE(pos->coord.node)->atom);
44375 +       spin_unlock_jnode(pos->child);
44376 +       if (!result && pos->child) {
44377 +               /* existing child isn't to attach, clear up this one */
44378 +               jput(pos->child);
44379 +               pos->child = NULL;
44380 +       }
44381 +       return result;
44382 +}
44383 +
44384 +/**
44385 + * Collect all needed information about the object here,
44386 + * as in-memory inode can be evicted from memory before
44387 + * disk update completion. Always returns 0.
44388 + */
44389 +static int init_convert_data_ctail(struct convert_item_info * idata,
44390 +                                  struct inode *inode)
44391 +{
44392 +       assert("edward-813", idata != NULL);
44393 +       assert("edward-814", inode != NULL);
44394 +       /* only the cluster shift is taken from the inode */
44395 +       idata->cluster_shift = inode_cluster_shift(inode);
44396 +       idata->d_cur = DC_FIRST_ITEM;
44397 +       idata->d_next = DC_INVALID_STATE;
44398 +
44399 +       return 0;
44400 +}
44401 +/* Allocates sq->itm (not zeroed here); returns 0 or -ENOMEM. */
44402 +static int alloc_item_convert_data(struct convert_info * sq)
44403 +{
44404 +       assert("edward-816", sq != NULL);
44405 +       assert("edward-817", sq->itm == NULL);
44406 +
44407 +       sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44408 +       if (sq->itm == NULL)
44409 +               return RETERR(-ENOMEM);
44410 +       return 0;
44411 +}
44412 +/* Frees sq->itm and resets the pointer to NULL. */
44413 +static void free_item_convert_data(struct convert_info * sq)
44414 +{
44415 +       assert("edward-818", sq != NULL);
44416 +       assert("edward-819", sq->itm != NULL);
44417 +       assert("edward-820", sq->iplug != NULL);
44418 +
44419 +       kfree(sq->itm);
44420 +       sq->itm = NULL;
44421 +       return;
44422 +}
44423 +/* Allocates a zeroed pos->sq and initializes its cluster handle for writing; returns 0 or -ENOMEM. */
44424 +static int alloc_convert_data(flush_pos_t * pos)
44425 +{
44426 +       assert("edward-821", pos != NULL);
44427 +       assert("edward-822", pos->sq == NULL);
44428 +
44429 +       pos->sq = kzalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44430 +       if (!pos->sq)
44431 +               return RETERR(-ENOMEM);
44432 +       /* no memset() needed: kzalloc() returns zeroed memory */
44433 +       cluster_init_write(&pos->sq->clust, NULL);
44434 +       return 0;
44435 +}
44436 +/* Releases everything hanging off pos->sq (item data, cluster handle), frees pos->sq and resets it to NULL. */
44437 +void free_convert_data(flush_pos_t * pos)
44438 +{
44439 +       struct convert_info *sq;
44440 +
44441 +       assert("edward-823", pos != NULL);
44442 +       assert("edward-824", pos->sq != NULL);
44443 +
44444 +       sq = pos->sq;
44445 +       if (sq->itm)
44446 +               free_item_convert_data(sq);
44447 +       put_cluster_handle(&sq->clust);
44448 +       kfree(pos->sq);
44449 +       pos->sq = NULL;
44450 +       return;
44451 +}
44452 +/* Zeroes the already allocated pos->sq->itm and fills it in from @inode. */
44453 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44454 +{
44455 +       struct convert_info *sq;
44456 +
44457 +       assert("edward-825", pos != NULL);
44458 +       assert("edward-826", pos->sq != NULL);
44459 +       assert("edward-827", item_convert_data(pos) != NULL);
44460 +       assert("edward-828", inode != NULL);
44461 +
44462 +       sq = pos->sq;
44463 +
44464 +       memset(sq->itm, 0, sizeof(*sq->itm));
44465 +
44466 +       /* iplug->init_convert_data() */
44467 +       return init_convert_data_ctail(sq->itm, inode);
44468 +}
44469 +
44470 +/* create and attach disk cluster info used by 'convert' phase of the flush
44471 +   squalloc(); pos->child is put on every path; on error pos->sq is freed */
44472 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44473 +{
44474 +       int ret = 0;
44475 +       struct convert_item_info *info;
44476 +       struct cluster_handle *clust;
44477 +       file_plugin *fplug = inode_file_plugin(inode);
44478 +       compression_plugin *cplug = inode_compression_plugin(inode);
44479 +
44480 +       assert("edward-248", pos != NULL);
44481 +       assert("edward-249", pos->child != NULL);
44482 +       assert("edward-251", inode != NULL);
44483 +       assert("edward-682", cryptcompress_inode_ok(inode));
44484 +       assert("edward-252",
44485 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44486 +       assert("edward-473",
44487 +              item_plugin_by_coord(&pos->coord) ==
44488 +              item_plugin_by_id(CTAIL_ID));
44489 +       /* the convert info is allocated on first use */
44490 +       if (!pos->sq) {
44491 +               ret = alloc_convert_data(pos);
44492 +               if (ret)
44493 +                       return ret;
44494 +       }
44495 +       clust = &pos->sq->clust;
44496 +       ret = grab_coa(&clust->tc, cplug);
44497 +       if (ret)
44498 +               goto err;
44499 +       ret = set_cluster_by_page(clust,
44500 +                                 jnode_page(pos->child),
44501 +                                 MAX_CLUSTER_NRPAGES);
44502 +       if (ret)
44503 +               goto err;
44504 +
44505 +       assert("edward-829", pos->sq != NULL);
44506 +       assert("edward-250", item_convert_data(pos) == NULL);
44507 +
44508 +       pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44509 +
44510 +       ret = alloc_item_convert_data(pos->sq);
44511 +       if (ret)
44512 +               goto err;
44513 +       ret = init_item_convert_data(pos, inode);
44514 +       if (ret)
44515 +               goto err;
44516 +       info = item_convert_data(pos);
44517 +
44518 +       ret = checkout_logical_cluster(clust, pos->child, inode);
44519 +       if (ret)
44520 +               goto err;
44521 +       /* NOTE(review): the result of reiser4_deflate_cluster() is not checked - confirm it cannot fail here */
44522 +       reiser4_deflate_cluster(clust, inode);
44523 +       inc_item_convert_count(pos);
44524 +
44525 +       /* prepare flow for insertion */
44526 +       fplug->flow_by_inode(inode,
44527 +                            (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44528 +                            0 /* kernel space */ ,
44529 +                            clust->tc.len,
44530 +                            clust_to_off(clust->index, inode),
44531 +                            WRITE_OP, &info->flow);
44532 +       jput(pos->child);
44533 +       return 0;
44534 +      err:
44535 +       jput(pos->child);
44536 +       free_convert_data(pos);
44537 +       return ret;
44538 +}
44539 +
44540 +/* clear up disk cluster info */
44541 +static void detach_convert_idata(struct convert_info * sq)
44542 +{
44543 +       struct convert_item_info *info;
44544 +
44545 +       assert("edward-253", sq != NULL);
44546 +       assert("edward-840", sq->itm != NULL);
44547 +
44548 +       info = sq->itm;
44549 +       assert("edward-1212", info->flow.length == 0);
44550 +
44551 +       free_item_convert_data(sq);
44552 +       return;
44553 +}
44554 +
44555 +/* plugin->u.item.f.utmost_child */
44556 +
44557 +/* This function sets leftmost child for a first cluster item,
44558 +   if the child exists, and NULL in other cases.
44559 +   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44560 +
44561 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44562 +{
44563 +       reiser4_key key;
44564 +
44565 +       assert("edward-257", coord != NULL);
44566 +       assert("edward-258", child != NULL);
44567 +       assert("edward-259", side == LEFT_SIDE);
44568 +       assert("edward-260",
44569 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44570 +       /* read the item key only after @coord has been validated above */
44571 +       item_key_by_coord(coord, &key);
44572 +
44573 +       if (!is_disk_cluster_key(&key, coord))
44574 +               *child = NULL;
44575 +       else
44576 +               *child = jlookup(current_tree,
44577 +                                get_key_objectid(item_key_by_coord
44578 +                                                 (coord, &key)),
44579 +                                off_to_pg(get_key_offset(&key)));
44580 +       return 0;
44581 +}
44582 +
44583 +/* Returns true if @p2 is the next item to @p1
44584 +   in the _same_ disk cluster.
44585 +   Disk cluster is a set of items. If ->clustered() != NULL,
44586 +   with each item the whole disk cluster should be read/modified
44587 +*/
44588 +
44589 +/* Go rightward and check for the next item of the current disk
44590 + * cluster; set d_next to DC_CHAINED_ITEM if such an item exists.
44591 + * If the current position is the last item of its node, go to the
44592 + * right neighbor, skipping empty nodes. Note that right neighbors may
44593 + * not be in the slum because of races. If so, make them dirty and
44594 + * convertible.
44595 + */
44596 +static int next_item_dc_stat(flush_pos_t * pos)
44597 +{
44598 +       int ret = 0;
44599 +       int stop = 0;
44600 +       znode *cur;
44601 +       coord_t coord;
44602 +       lock_handle lh;
44603 +       lock_handle right_lock;
44604 +
44605 +       assert("edward-1232", !node_is_empty(pos->coord.node));
44606 +       assert("edward-1014",
44607 +              pos->coord.item_pos < coord_num_items(&pos->coord));
44608 +       assert("edward-1015", chaining_data_present(pos));
44609 +       assert("edward-1017",
44610 +              item_convert_data(pos)->d_next == DC_INVALID_STATE);
44611 +
44612 +       item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44613 +
44614 +       if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44615 +               return ret;
44616 +       if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44617 +               return ret;
44618 +
44619 +       /* Check next slum item.
44620 +        * Note, that it can not be killed by concurrent truncate,
44621 +        * as the last one will want the lock held by us.
44622 +        */
44623 +       init_lh(&right_lock);
44624 +       cur = pos->coord.node;
44625 +
44626 +       while (!stop) {
44627 +               init_lh(&lh);
44628 +               ret = reiser4_get_right_neighbor(&lh,
44629 +                                                cur,
44630 +                                                ZNODE_WRITE_LOCK,
44631 +                                                GN_CAN_USE_UPPER_LEVELS);
44632 +               if (ret)
44633 +                       break;
44634 +               ret = zload(lh.node);
44635 +               if (ret) {
44636 +                       done_lh(&lh);
44637 +                       break;
44638 +               }
44639 +               coord_init_before_first_item(&coord, lh.node);
44640 +
44641 +               if (node_is_empty(lh.node)) {
44642 +                       znode_make_dirty(lh.node);
44643 +                       znode_set_convertible(lh.node);
44644 +                       stop = 0;
44645 +               } else if (same_disk_cluster(&pos->coord, &coord)) {
44646 +
44647 +                       item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44648 +
44649 +                       if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44650 +                               /*
44651 +                                  warning("edward-1024",
44652 +                                  "next slum item mergeable, "
44653 +                                  "but znode %p isn't dirty\n",
44654 +                                  lh.node);
44655 +                                */
44656 +                               znode_make_dirty(lh.node);
44657 +                       }
44658 +                       if (!znode_convertible(lh.node)) {
44659 +                               /*
44660 +                                  warning("edward-1272",
44661 +                                  "next slum item mergeable, "
44662 +                                  "but znode %p isn't convertible\n",
44663 +                                  lh.node);
44664 +                                */
44665 +                               znode_set_convertible(lh.node);
44666 +                       }
44667 +                       stop = 1;
44668 +               } else
44669 +                       stop = 1;
44670 +               zrelse(lh.node);
44671 +               done_lh(&right_lock);
44672 +               copy_lh(&right_lock, &lh);
44673 +               done_lh(&lh);
44674 +               cur = right_lock.node;
44675 +       }
44676 +       done_lh(&right_lock);
44677 +
44678 +       if (ret == -E_NO_NEIGHBOR)
44679 +               ret = 0;
44680 +       return ret;
44681 +}
44682 +
44683 +static int
44684 +assign_convert_mode(struct convert_item_info * idata,
44685 +                   cryptcompress_write_mode_t * mode)
44686 +{
44687 +       int result = 0;
44688 +
44689 +       assert("edward-1025", idata != NULL);
44690 +
44691 +       if (idata->flow.length) {
44692 +               /* append or overwrite */
44693 +               switch (idata->d_cur) {
44694 +               case DC_FIRST_ITEM:
44695 +               case DC_CHAINED_ITEM:
44696 +                       *mode = CRC_OVERWRITE_ITEM;
44697 +                       break;
44698 +               case DC_AFTER_CLUSTER:
44699 +                       *mode = CRC_APPEND_ITEM;
44700 +                       break;
44701 +               default:
44702 +                       impossible("edward-1018", "wrong current item state");
44703 +               }
44704 +       } else {
44705 +               /* cut or invalidate */
44706 +               switch (idata->d_cur) {
44707 +               case DC_FIRST_ITEM:
44708 +               case DC_CHAINED_ITEM:
44709 +                       *mode = CRC_CUT_ITEM;
44710 +                       break;
44711 +               case DC_AFTER_CLUSTER:
44712 +                       result = 1;
44713 +                       break;
44714 +               default:
44715 +                       impossible("edward-1019", "wrong current item state");
44716 +               }
44717 +       }
44718 +       return result;
44719 +}
44720 +
44721 +/* plugin->u.item.f.convert */
44722 +/* write ctail in guessed mode */
44723 +int convert_ctail(flush_pos_t * pos)
44724 +{
44725 +       int result;
44726 +       int nr_items;
44727 +       cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44728 +
44729 +       assert("edward-1020", pos != NULL);
44730 +       assert("edward-1213", coord_num_items(&pos->coord) != 0);
44731 +       assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44732 +       assert("edward-1258", ctail_ok(&pos->coord));
44733 +       assert("edward-261", pos->coord.node != NULL);
44734 +
44735 +       nr_items = coord_num_items(&pos->coord);
44736 +       if (!chaining_data_present(pos)) {
44737 +               if (should_attach_convert_idata(pos)) {
44738 +                       /* attach convert item info */
44739 +                       struct inode *inode;
44740 +
44741 +                       assert("edward-264", pos->child != NULL);
44742 +                       assert("edward-265", jnode_page(pos->child) != NULL);
44743 +                       assert("edward-266",
44744 +                              jnode_page(pos->child)->mapping != NULL);
44745 +
44746 +                       inode = jnode_page(pos->child)->mapping->host;
44747 +
44748 +                       assert("edward-267", inode != NULL);
44749 +
44750 +                       /* attach item convert info by child and put the last one */
44751 +                       result = attach_convert_idata(pos, inode);
44752 +                       pos->child = NULL;
44753 +                       if (result == -E_REPEAT) {
44754 +                               /* jnode became clean, or there is no dirty
44755 +                                  pages (nothing to update in disk cluster) */
44756 +                               warning("edward-1021",
44757 +                                       "convert_ctail: nothing to attach");
44758 +                               return 0;
44759 +                       }
44760 +                       if (result != 0)
44761 +                               return result;
44762 +               } else
44763 +                       /* unconvertible */
44764 +                       return 0;
44765 +       } else {
44766 +               /* use old convert info */
44767 +
44768 +               struct convert_item_info *idata;
44769 +
44770 +               idata = item_convert_data(pos);
44771 +
44772 +               result = assign_convert_mode(idata, &mode);
44773 +               if (result) {
44774 +                       /* disk cluster is over,
44775 +                          nothing to update anymore */
44776 +                       detach_convert_idata(pos->sq);
44777 +                       return 0;
44778 +               }
44779 +       }
44780 +
44781 +       assert("edward-433", chaining_data_present(pos));
44782 +       assert("edward-1022",
44783 +              pos->coord.item_pos < coord_num_items(&pos->coord));
44784 +
44785 +       /* check if next item is of current disk cluster */
44786 +       result = next_item_dc_stat(pos);
44787 +       if (result) {
44788 +               detach_convert_idata(pos->sq);
44789 +               return result;
44790 +       }
44791 +       result = do_convert_ctail(pos, mode);
44792 +       if (result) {
44793 +               detach_convert_idata(pos->sq);
44794 +               return result;
44795 +       }
44796 +       switch (mode) {
44797 +       case CRC_CUT_ITEM:
44798 +               assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44799 +               assert("edward-1215",
44800 +                      coord_num_items(&pos->coord) == nr_items ||
44801 +                      coord_num_items(&pos->coord) == nr_items - 1);
44802 +               if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44803 +                       break;
44804 +               if (coord_num_items(&pos->coord) != nr_items) {
44805 +                       /* the item was killed, no more chained items */
44806 +                       detach_convert_idata(pos->sq);
44807 +                       if (!node_is_empty(pos->coord.node))
44808 +                               /* make sure the next item will be scanned */
44809 +                               coord_init_before_item(&pos->coord);
44810 +                       break;
44811 +               }
44812 +       case CRC_APPEND_ITEM:
44813 +               assert("edward-434", item_convert_data(pos)->flow.length == 0);
44814 +               detach_convert_idata(pos->sq);
44815 +               break;
44816 +       case CRC_OVERWRITE_ITEM:
44817 +               if (coord_is_unprepped_ctail(&pos->coord)) {
44818 +                       /* convert unprepped ctail to prepped one */
44819 +                       assert("edward-1259",
44820 +                              cluster_shift_ok(item_convert_data(pos)->
44821 +                                               cluster_shift));
44822 +                       put_unaligned((d8)item_convert_data(pos)->cluster_shift,
44823 +                                     &ctail_formatted_at(&pos->coord)->
44824 +                                     cluster_shift);
44825 +               }
44826 +               break;
44827 +       }
44828 +       return result;
44829 +}
44830 +
44831 +/* Make Linus happy.
44832 +   Local variables:
44833 +   c-indentation-style: "K&R"
44834 +   mode-name: "LC"
44835 +   c-basic-offset: 8
44836 +   tab-width: 8
44837 +   fill-column: 120
44838 +   End:
44839 +*/
44840 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.35/fs/reiser4/plugin/item/ctail.h
44841 --- linux-2.6.35.orig/fs/reiser4/plugin/item/ctail.h    1970-01-01 01:00:00.000000000 +0100
44842 +++ linux-2.6.35/fs/reiser4/plugin/item/ctail.h 2010-08-04 15:44:57.000000000 +0200
44843 @@ -0,0 +1,102 @@
44844 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44845 +
44846 +/* Ctail items are fragments (or bodies) of special type to provide
44847 +   optimal storage of encrypted and(or) compressed files. */
44848 +
44849 +
44850 +#if !defined( __FS_REISER4_CTAIL_H__ )
44851 +#define __FS_REISER4_CTAIL_H__
44852 +
44853 +/* Disk format of ctail item */
44854 +typedef struct ctail_item_format {
44855 +       /* packed shift;
44856 +          if its value is different from UCTAIL_SHIFT (see below), then
44857 +          size of disk cluster is calculated as (1 << cluster_shift) */
44858 +       d8 cluster_shift;
44859 +       /* ctail body */
44860 +       d8 body[0];
44861 +} __attribute__ ((packed)) ctail_item_format;
44862 +
44863 +/* "Unprepped" disk cluster is represented by a single ctail item
44864 +   with the following "magic" attributes: */
44865 +/* "magic" cluster_shift */
44866 +#define UCTAIL_SHIFT 0xff
44867 +/* How many units unprepped ctail item has */
44868 +#define UCTAIL_NR_UNITS 1
44869 +
44870 +/* The following is a set of various item states in a disk cluster.
44871 +   Disk cluster is a set of items whose keys belong to the interval
44872 +   [dc_key , dc_key + disk_cluster_size - 1] */
44873 +typedef enum {
44874 +       DC_INVALID_STATE = 0,
44875 +       DC_FIRST_ITEM = 1,
44876 +       DC_CHAINED_ITEM = 2,
44877 +       DC_AFTER_CLUSTER = 3
44878 +} dc_item_stat;
44879 +
44880 +/* ctail-specific extension.
44881 +   In particular this describes parameters of disk cluster an item belongs to */
44882 +struct ctail_coord_extension {
44883 +       int shift; /* this contains cluster_shift extracted from
44884 +                     ctail_item_format (above), or UCTAIL_SHIFT
44885 +                     (the last one is the "magic" of unprepped disk clusters)*/
44886 +       int dsize; /* size of a prepped disk cluster */
44887 +       int ncount; /* count of nodes occupied by a disk cluster */
44888 +};
44889 +
44890 +struct cut_list;
44891 +
44892 +/* plugin->item.b.* */
44893 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
44894 +                         const reiser4_item_data *);
44895 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
44896 +pos_in_node_t nr_units_ctail(const coord_t * coord);
44897 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
44898 +void print_ctail(const char *prefix, coord_t * coord);
44899 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
44900 +
44901 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
44902 +               carry_plugin_info * info UNUSED_ARG);
44903 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
44904 +int can_shift_ctail(unsigned free_space, coord_t * coord,
44905 +                   znode * target, shift_direction pend, unsigned *size,
44906 +                   unsigned want);
44907 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
44908 +                     unsigned count, shift_direction where_is_free_space,
44909 +                     unsigned free_space);
44910 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44911 +                   carry_cut_data *, reiser4_key * smallest_removed,
44912 +                   reiser4_key * new_first);
44913 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44914 +                    carry_kill_data *, reiser4_key * smallest_removed,
44915 +                    reiser4_key * new_first);
44916 +int ctail_ok(const coord_t * coord);
44917 +int check_ctail(const coord_t * coord, const char **error);
44918 +
44919 +/* plugin->u.item.s.* */
44920 +int read_ctail(struct file *, flow_t *, hint_t *);
44921 +int readpage_ctail(void *, struct page *);
44922 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
44923 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
44924 +int create_hook_ctail(const coord_t * coord, void *arg);
44925 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
44926 +                   carry_kill_data *);
44927 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
44928 +
44929 +/* plugin->u.item.f */
44930 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
44931 +int scan_ctail(flush_scan *);
44932 +int convert_ctail(flush_pos_t *);
44933 +size_t inode_scaled_cluster_size(struct inode *);
44934 +
44935 +#endif                         /* __FS_REISER4_CTAIL_H__ */
44936 +
44937 +/* Make Linus happy.
44938 +   Local variables:
44939 +   c-indentation-style: "K&R"
44940 +   mode-name: "LC"
44941 +   c-basic-offset: 8
44942 +   tab-width: 8
44943 +   fill-column: 120
44944 +   End:
44945 +*/
44946 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/extent.c linux-2.6.35/fs/reiser4/plugin/item/extent.c
44947 --- linux-2.6.35.orig/fs/reiser4/plugin/item/extent.c   1970-01-01 01:00:00.000000000 +0100
44948 +++ linux-2.6.35/fs/reiser4/plugin/item/extent.c        2010-08-04 15:44:57.000000000 +0200
44949 @@ -0,0 +1,197 @@
44950 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44951 +
44952 +#include "item.h"
44953 +#include "../../key.h"
44954 +#include "../../super.h"
44955 +#include "../../carry.h"
44956 +#include "../../inode.h"
44957 +#include "../../page_cache.h"
44958 +#include "../../flush.h"
44959 +#include "../object.h"
44960 +
44961 +/* prepare structure reiser4_item_data. It is used to put @nr_extents extent units into tree */
44962 +/* Audited by: green(2002.06.13) */
44963 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
44964 +                                  int nr_extents)
44965 +{
44966 +       data->data = ext_unit;
44967 +       /* data->data is kernel space */
44968 +       data->user = 0;
44969 +       data->length = sizeof(reiser4_extent) * nr_extents;
44970 +       data->arg = NULL;
44971 +       data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
44972 +       return data;
44973 +}
44974 +
44975 +/* how many bytes are addressed by @nr first extents of the extent item */
44976 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
44977 +{
44978 +       pos_in_node_t i;
44979 +       reiser4_block_nr blocks;
44980 +       reiser4_extent *ext;
44981 +
44982 +       ext = item_body_by_coord(coord);
44983 +       assert("vs-263", nr <= nr_units_extent(coord));
44984 +
44985 +       blocks = 0;
44986 +       for (i = 0; i < nr; i++, ext++) {
44987 +               blocks += extent_get_width(ext);
44988 +       }
44989 +
44990 +       return blocks * current_blocksize;
44991 +}
44992 +
44993 +extent_state state_of_extent(reiser4_extent * ext)
44994 +{
44995 +       switch ((int)extent_get_start(ext)) {
44996 +       case 0:
44997 +               return HOLE_EXTENT;
44998 +       case 1:
44999 +               return UNALLOCATED_EXTENT;
45000 +       default:
45001 +               break;
45002 +       }
45003 +       return ALLOCATED_EXTENT;
45004 +}
45005 +
45006 +int extent_is_unallocated(const coord_t * item)
45007 +{
45008 +       assert("jmacd-5133", item_is_extent(item));
45009 +
45010 +       return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
45011 +}
45012 +
45013 +/* set extent's start and width */
45014 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
45015 +                       reiser4_block_nr width)
45016 +{
45017 +       extent_set_start(ext, start);
45018 +       extent_set_width(ext, width);
45019 +}
45020 +
45021 +/**
45022 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
45023 + * @h: replace handle; the fields used as input are:
45024 + *     h->coord, h->lh - coord and lock handle of extent to be overwritten
45025 + *     h->paste_key - key the new extents are pasted at
45026 + *     h->new_extents, h->nr_new_extents - extents to be pasted (1 or 2)
45027 + *     h->overwrite - extent the overwritten unit is replaced with
45028 + *     h->flags - flags passed to insert_into_item()
45029 + * @return_inserted_position: selects where h->coord and h->lh point on return
45030 + *
45031 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one.  If
45032 + * @return_inserted_position is 1 - h->coord and h->lh are returned set to
45033 + * first of newly inserted units, if it is 0 - h->coord and h->lh are returned
45034 + * set to extent which was overwritten.
45035 + */
45036 +int reiser4_replace_extent(struct replace_handle *h,
45037 +                          int return_inserted_position)
45038 +{
45039 +       int result;
45040 +       znode *orig_znode;
45041 +       /*ON_DEBUG(reiser4_extent orig_ext);*/  /* this is for debugging */
45042 +
45043 +       assert("vs-990", coord_is_existing_unit(h->coord));
45044 +       assert("vs-1375", znode_is_write_locked(h->coord->node));
45045 +       assert("vs-1426", extent_get_width(&h->overwrite) != 0);
45046 +       assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
45047 +       assert("vs-1427", ergo(h->nr_new_extents == 2,
45048 +                              extent_get_width(&h->new_extents[1]) != 0));
45049 +
45050 +       /* compose structure for paste */
45051 +       init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
45052 +
45053 +       coord_dup(&h->coord_after, h->coord);
45054 +       init_lh(&h->lh_after);
45055 +       copy_lh(&h->lh_after, h->lh);
45056 +       reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
45057 +       reiser4_tap_monitor(&h->watch);
45058 +
45059 +       ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
45060 +       orig_znode = h->coord->node;
45061 +
45062 +#if REISER4_DEBUG
45063 +       /* make sure that key is set properly */
45064 +       unit_key_by_coord(h->coord, &h->tmp);
45065 +       set_key_offset(&h->tmp,
45066 +                      get_key_offset(&h->tmp) +
45067 +                      extent_get_width(&h->overwrite) * current_blocksize);
45068 +       assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
45069 +#endif
45070 +
45071 +       /* set insert point after unit to be replaced */
45072 +       h->coord->between = AFTER_UNIT;
45073 +
45074 +       result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
45075 +                                 &h->paste_key, &h->item, h->flags);
45076 +       if (!result) {
45077 +               /* now we have to replace the unit after which new units were
45078 +                  inserted. Its position is tracked by @watch */
45079 +               reiser4_extent *ext;
45080 +               znode *node;
45081 +
45082 +               node = h->coord_after.node;
45083 +               if (node != orig_znode) {
45084 +                       coord_clear_iplug(&h->coord_after);
45085 +                       result = zload(node);
45086 +               }
45087 +
45088 +               if (likely(!result)) {
45089 +                       ext = extent_by_coord(&h->coord_after);
45090 +
45091 +                       assert("vs-987", znode_is_loaded(node));
45092 +                       assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
45093 +
45094 +                       /* overwrite extent unit */
45095 +                       memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
45096 +                       znode_make_dirty(node);
45097 +
45098 +                       if (node != orig_znode)
45099 +                               zrelse(node);
45100 +
45101 +                       if (return_inserted_position == 0) {
45102 +                               /* coord and lh are to be set to overwritten
45103 +                                  extent */
45104 +                               assert("vs-1662",
45105 +                                      WITH_DATA(node, !memcmp(&h->overwrite,
45106 +                                                              extent_by_coord(
45107 +                                                                      &h->coord_after),
45108 +                                                              sizeof(reiser4_extent))));
45109 +
45110 +                               *h->coord = h->coord_after;
45111 +                               done_lh(h->lh);
45112 +                               copy_lh(h->lh, &h->lh_after);
45113 +                       } else {
45114 +                               /* h->coord and h->lh are to be set to first of
45115 +                                  inserted units */
45116 +                               assert("vs-1663",
45117 +                                      WITH_DATA(h->coord->node,
45118 +                                                !memcmp(&h->new_extents[0],
45119 +                                                        extent_by_coord(h->coord),
45120 +                                                        sizeof(reiser4_extent))));
45121 +                               assert("vs-1664", h->lh->node == h->coord->node);
45122 +                       }
45123 +               }
45124 +       }
45125 +       reiser4_tap_done(&h->watch);
45126 +
45127 +       return result;
45128 +}
45129 +
45130 +lock_handle *znode_lh(znode *node)
45131 +{
45132 +       assert("vs-1371", znode_is_write_locked(node));
45133 +       assert("vs-1372", znode_is_wlocked_once(node));
45134 +       return list_entry(node->lock.owners.next, lock_handle, owners_link);
45135 +}
45136 +
45137 +/*
45138 + * Local variables:
45139 + * c-indentation-style: "K&R"
45140 + * mode-name: "LC"
45141 + * c-basic-offset: 8
45142 + * tab-width: 8
45143 + * fill-column: 79
45144 + * scroll-step: 1
45145 + * End:
45146 + */
45147 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.35/fs/reiser4/plugin/item/extent_file_ops.c
45148 --- linux-2.6.35.orig/fs/reiser4/plugin/item/extent_file_ops.c  1970-01-01 01:00:00.000000000 +0100
45149 +++ linux-2.6.35/fs/reiser4/plugin/item/extent_file_ops.c       2010-08-04 15:44:57.000000000 +0200
45150 @@ -0,0 +1,1453 @@
45151 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45152 +
45153 +#include "item.h"
45154 +#include "../../inode.h"
45155 +#include "../../page_cache.h"
45156 +#include "../object.h"
45157 +
45158 +#include <linux/quotaops.h>
45159 +#include <linux/swap.h>
45160 +
45161 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
45162 +{
45163 +       reiser4_extent *ext;
45164 +
45165 +       ext = (reiser4_extent *) (zdata(node) + offset);
45166 +       return ext;
45167 +}
45168 +
45169 +/**
45170 + * check_uf_coord - verify coord extension
45171 + * @uf_coord: coord with extent extension to be verified
45172 + * @key: key @uf_coord is expected to be set to, or NULL to skip this check
45173 + *
45174 + * Debug-only check (no-op unless REISER4_DEBUG): asserts that all fields of
45175 + * @uf_coord are set properly and, if @key is given, that @uf_coord matches it.
45176 + */
45177 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
45178 +{
45179 +#if REISER4_DEBUG
45180 +       const coord_t *coord;
45181 +       const struct extent_coord_extension *ext_coord;
45182 +       reiser4_extent *ext;
45183 +
45184 +       coord = &uf_coord->coord;
45185 +       ext_coord = &uf_coord->extension.extent;
45186 +       ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
45187 +
45188 +       assert("",
45189 +              WITH_DATA(coord->node,
45190 +                        (uf_coord->valid == 1 &&
45191 +                         coord_is_iplug_set(coord) &&
45192 +                         item_is_extent(coord) &&
45193 +                         ext_coord->nr_units == nr_units_extent(coord) &&
45194 +                         ext == extent_by_coord(coord) &&
45195 +                         ext_coord->width == extent_get_width(ext) &&
45196 +                         coord->unit_pos < ext_coord->nr_units &&
45197 +                         ext_coord->pos_in_unit < ext_coord->width &&
45198 +                         memcmp(ext, &ext_coord->extent,
45199 +                                sizeof(reiser4_extent)) == 0)));
45200 +       if (key) {
45201 +               reiser4_key coord_key;
45202 +
45203 +               unit_key_by_coord(&uf_coord->coord, &coord_key);
45204 +               set_key_offset(&coord_key,
45205 +                              get_key_offset(&coord_key) +
45206 +                              (uf_coord->extension.extent.
45207 +                               pos_in_unit << PAGE_CACHE_SHIFT));
45208 +               assert("", keyeq(key, &coord_key));
45209 +       }
45210 +#endif
45211 +}
45212 +
45213 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
45214 +{
45215 +       check_uf_coord(uf_coord, NULL);
45216 +
45217 +       return ext_by_offset(uf_coord->coord.node,
45218 +                            uf_coord->extension.extent.ext_offset);
45219 +}
45220 +
45221 +#if REISER4_DEBUG
45222 +
45223 +/**
45224 + * offset_is_in_unit - check whether an offset falls into an extent unit
45225 + * @coord: coord of the extent unit
45226 + * @off: offset to check
45227 + *
45228 + * Returns 1 if offset @off is inside of extent unit pointed to by @coord,
45229 + * 0 otherwise.
45230 + */
45231 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
45232 +{
45233 +       reiser4_key unit_key;
45234 +       __u64 unit_off;
45235 +       reiser4_extent *ext;
45236 +
45237 +       ext = extent_by_coord(coord);
45238 +
45239 +       unit_key_extent(coord, &unit_key);
45240 +       unit_off = get_key_offset(&unit_key);
45241 +       if (off < unit_off)
45242 +               return 0;
45243 +       if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
45244 +               return 0;
45245 +       return 1;
45246 +}
45247 +
45248 +static int
45249 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
45250 +{
45251 +       reiser4_key item_key;
45252 +
45253 +       assert("vs-771", coord_is_existing_unit(coord));
45254 +       assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
45255 +       assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45256 +       /* @key is within the item; check it falls into the unit at @coord */
45257 +       return offset_is_in_unit(coord, get_key_offset(key));
45258 +}
45259 +
45260 +#endif
45261 +
45262 +/**
45263 + * can_append - check whether a key directly continues an extent item
45264 + * @key: key to compare with the append key
45265 + * @coord: coord set to an extent item
45266 + *
45267 + * Returns 1 if @key is equal to the append key of the item @coord is set to
45268 + */
45269 +static int can_append(const reiser4_key *key, const coord_t *coord)
45270 +{
45271 +       reiser4_key append_key;
45272 +
45273 +       return keyeq(key, append_key_extent(coord, &append_key));
45274 +}
45275 +
45276 +/**
45277 + * append_hole - append last extent item of a file with a hole up to @key
45278 + * @coord: coord set to the extent item on twig level to be appended
45279 + * @lh: lock handle, passed on to insert_into_item()
45280 + * @key: key the hole has to reach; not less than the append key of the item
45281 + * Return: 0 if the last hole unit was widened, else insert_into_item() result
45282 + */
45283 +static int append_hole(coord_t *coord, lock_handle *lh,
45284 +                      const reiser4_key *key)
45285 +{
45286 +       reiser4_key append_key;
45287 +       reiser4_block_nr hole_width;
45288 +       reiser4_extent *ext, new_ext;
45289 +       reiser4_item_data idata;
45290 +
45291 +       /* last item of file may have to be appended with hole */
45292 +       assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45293 +       assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45294 +
45295 +       /* key of first byte which is not addressed by this extent */
45296 +       append_key_extent(coord, &append_key);
45297 +
45298 +       assert("", keyle(&append_key, key));
45299 +
45300 +       /*
45301 +        * extent item has to be appended with hole. Calculate length of that
45302 +        * hole in blocks, rounding the byte distance up to a whole block
45303 +        */
45304 +       hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45305 +                      current_blocksize - 1) >> current_blocksize_bits);
45306 +       assert("vs-954", hole_width > 0);
45307 +
45308 +       /* set coord after last unit */
45309 +       coord_init_after_item_end(coord);
45310 +
45311 +       /* get last extent in the item */
45312 +       ext = extent_by_coord(coord);
45313 +       if (state_of_extent(ext) == HOLE_EXTENT) {
45314 +               /*
45315 +                * last extent of a file is hole extent. Widen that extent by
45316 +                * @hole_width blocks. Note that we do not worry about
45317 +                * overflowing - extent width is 64 bits
45318 +                */
45319 +               reiser4_set_extent(ext, HOLE_EXTENT_START,
45320 +                                  extent_get_width(ext) + hole_width);
45321 +               znode_make_dirty(coord->node);
45322 +               return 0;
45323 +       }
45324 +
45325 +       /* append last item of the file with hole extent unit */
45326 +       assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45327 +                         state_of_extent(ext) == UNALLOCATED_EXTENT));
45328 +
45329 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45330 +       init_new_extent(&idata, &new_ext, 1);
45331 +       return insert_into_item(coord, lh, &append_key, &idata, 0);
45332 +}
45333 +
45334 +/**
45335 + * check_jnodes - debug check that a twig node covers a range of jnodes
45336 + * @twig: longterm locked twig node
45337 + * @key: key of the first jnode of the range
45338 + * @count: number of jnodes (pages) in the range
45339 + */
45340 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45341 +{
45342 +#if REISER4_DEBUG
45343 +       coord_t c;
45344 +       reiser4_key node_key, jnode_key;
45345 +
45346 +       jnode_key = *key;
45347 +
45348 +       assert("", twig != NULL);
45349 +       assert("", znode_get_level(twig) == TWIG_LEVEL);
45350 +       assert("", znode_is_write_locked(twig));
45351 +
45352 +       zload(twig);
45353 +       /* get the smallest key in twig node */
45354 +       coord_init_first_unit(&c, twig);
45355 +       unit_key_by_coord(&c, &node_key);
45356 +       assert("", keyle(&node_key, &jnode_key));
45357 +       /* upper bound: key of last unit, or append key if item has one */
45358 +       coord_init_last_unit(&c, twig);
45359 +       unit_key_by_coord(&c, &node_key);
45360 +       if (item_plugin_by_coord(&c)->s.file.append_key)
45361 +               item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45362 +       set_key_offset(&jnode_key,
45363 +                      get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45364 +       assert("", keylt(&jnode_key, &node_key));
45365 +       zrelse(twig);
45366 +#endif /* REISER4_DEBUG */
45367 +}
45368 +
45369 +/**
45370 + * append_last_extent - append last file item
45371 + * @uf_coord: coord to start insertion from
45372 + * @key: key of the position to append at (offset of jnodes[0])
45373 + * @jnodes: array of jnodes
45374 + * @count: number of jnodes in the array
45375 + * Return: @count, result of append_hole() if a hole was needed, or error code
45376 + *
45377 + * Append the last extent item of the file with unallocated extent of width
45378 + * @count and assign fake block numbers to jnodes of the inserted extent.
45379 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45380 +                             jnode **jnodes, int count)
45381 +{
45382 +       int result;
45383 +       reiser4_extent new_ext;
45384 +       reiser4_item_data idata;
45385 +       coord_t *coord;
45386 +       struct extent_coord_extension *ext_coord;
45387 +       reiser4_extent *ext;
45388 +       reiser4_block_nr block;
45389 +       jnode *node;
45390 +       int i;
45391 +
45392 +       coord = &uf_coord->coord;
45393 +       ext_coord = &uf_coord->extension.extent;
45394 +       ext = ext_by_ext_coord(uf_coord);
45395 +
45396 +       /* check correctness of position in the item */
45397 +       assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45398 +       assert("vs-1311", coord->between == AFTER_UNIT);
45399 +       assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45400 +
45401 +       if (!can_append(key, coord)) {
45402 +               /* hole extent has to be inserted */
45403 +               result = append_hole(coord, uf_coord->lh, key);
45404 +               uf_coord->valid = 0;
45405 +               return result;
45406 +       }
45407 +
45408 +       if (count == 0)
45409 +               return 0;
45410 +
45411 +       assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45412 +
45413 +       result = dquot_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
45414 +                                          count);
45415 +       BUG_ON(result != 0);
45416 +       /* NOTE(review): error returns below leave the quota charged */
45417 +       switch (state_of_extent(ext)) {
45418 +       case UNALLOCATED_EXTENT:
45419 +               /*
45420 +                * last extent unit of the file is unallocated one. Increase
45421 +                * its width by @count
45422 +                */
45423 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45424 +                                  extent_get_width(ext) + count);
45425 +               znode_make_dirty(coord->node);
45426 +
45427 +               /* update coord extension */
45428 +               ext_coord->width += count;
45429 +               ON_DEBUG(extent_set_width
45430 +                        (&uf_coord->extension.extent.extent,
45431 +                         ext_coord->width));
45432 +               break;
45433 +
45434 +       case HOLE_EXTENT:
45435 +       case ALLOCATED_EXTENT:
45436 +               /*
45437 +                * last extent unit of the file is either hole or allocated
45438 +                * one. Append one unallocated extent of width @count
45439 +                */
45440 +               reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45441 +               init_new_extent(&idata, &new_ext, 1);
45442 +               result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45443 +               uf_coord->valid = 0;
45444 +               if (result)
45445 +                       return result;
45446 +               break;
45447 +
45448 +       default:
45449 +               return RETERR(-EIO);
45450 +       }
45451 +
45452 +       /*
45453 +        * make sure that we hold long term locked twig node containing all
45454 +        * jnodes we are about to capture
45455 +        */
45456 +       check_jnodes(uf_coord->lh->node, key, count);
45457 +
45458 +       /*
45459 +        * assign fake block numbers to all jnodes. FIXME: make sure whether
45460 +        * twig node containing inserted extent item is locked
45461 +        */
45462 +       block = fake_blocknr_unformatted(count);
45463 +       for (i = 0; i < count; i ++, block ++) {
45464 +               node = jnodes[i];
45465 +               spin_lock_jnode(node);
45466 +               JF_SET(node, JNODE_CREATED);
45467 +               jnode_set_block(node, &block);
45468 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45469 +               BUG_ON(result != 0);
45470 +               jnode_make_dirty_locked(node);
45471 +               spin_unlock_jnode(node);
45472 +       }
45473 +       return count;
45474 +}
45475 +
45476 +/**
45477 + * insert_first_hole - insert hole extent into tree
45478 + * @coord: coord set between items, where the new item is to be inserted
45479 + * @lh: lock handle, passed on to insert_extent_by_coord()
45480 + * @key: key of the position the hole has to reach
45481 + *
45482 + * Inserts item with one hole extent spanning offset 0 up to offset of @key.
45483 + */
45484 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
45485 +                            const reiser4_key *key)
45486 +{
45487 +       reiser4_extent new_ext;
45488 +       reiser4_item_data idata;
45489 +       reiser4_key item_key;
45490 +       reiser4_block_nr hole_width;
45491 +
45492 +       /* @coord must be set for inserting of new item */
45493 +       assert("vs-711", coord_is_between_items(coord));
45494 +
45495 +       item_key = *key;
45496 +       set_key_offset(&item_key, 0ull);
45497 +
45498 +       hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45499 +                     current_blocksize_bits);
45500 +       assert("vs-710", hole_width > 0);
45501 +
45502 +       /* compose body of hole extent and insert item into tree */
45503 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45504 +       init_new_extent(&idata, &new_ext, 1);
45505 +       return insert_extent_by_coord(coord, &idata, &item_key, lh);
45506 +}
45507 +
45508 +
45509 +/**
45510 + * insert_first_extent - insert first file item
45511 + * @uf_coord: coord to start insertion from
45512 + * @key: key of the position being written
45513 + * @jnodes: array of jnodes
45514 + * @count: number of jnodes in the array
45515 + * @inode: inode of file
45516 + *
45517 + * There are no items of file @inode in the tree yet. Insert unallocated extent
45518 + * of width @count into tree, or only a hole extent if writing not to the
45519 + * beginning. Assign fake block numbers to jnodes corresponding to the inserted
45520 + * unallocated extent. Returns number of jnodes, hole insert result, or error.
45521 + */
45522 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45523 +                              jnode **jnodes, int count,
45524 +                              struct inode *inode)
45525 +{
45526 +       int result;
45527 +       int i;
45528 +       reiser4_extent new_ext;
45529 +       reiser4_item_data idata;
45530 +       reiser4_block_nr block;
45531 +       struct unix_file_info *uf_info;
45532 +       jnode *node;
45533 +
45534 +       /* first extent insertion starts at leaf level */
45535 +       assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45536 +       assert("vs-711", coord_is_between_items(&uf_coord->coord));
45537 +
45538 +       if (get_key_offset(key) != 0) {
45539 +               result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45540 +               uf_coord->valid = 0;
45541 +               uf_info = unix_file_inode_data(inode);
45542 +
45543 +               /*
45544 +                * first item insertion is only possible when writing to empty
45545 +                * file or performing tail conversion
45546 +                */
45547 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45548 +                           (reiser4_inode_get_flag(inode,
45549 +                                                   REISER4_PART_MIXED) &&
45550 +                            reiser4_inode_get_flag(inode,
45551 +                                                   REISER4_PART_IN_CONV))));
45552 +               /* if file was empty - update its state */
45553 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45554 +                       uf_info->container = UF_CONTAINER_EXTENTS;
45555 +               return result;
45556 +       }
45557 +
45558 +       if (count == 0)
45559 +               return 0;
45560 +
45561 +       result = dquot_alloc_block_nodirty(mapping_jnode(jnodes[0])->host,
45562 +                                          count);
45563 +       BUG_ON(result != 0);
45564 +
45565 +       /*
45566 +        * prepare for tree modification: compose body of item and item data
45567 +        * structure needed for insertion
45568 +        */
45569 +       reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45570 +       init_new_extent(&idata, &new_ext, 1);
45571 +
45572 +       /* insert extent item into the tree; on error quota stays charged */
45573 +       result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45574 +                                       uf_coord->lh);
45575 +       if (result)
45576 +               return result;
45577 +
45578 +       /*
45579 +        * make sure that we hold long term locked twig node containing all
45580 +        * jnodes we are about to capture
45581 +        */
45582 +       check_jnodes(uf_coord->lh->node, key, count);
45583 +       /*
45584 +        * assign fake block numbers to all jnodes, capture and mark them dirty
45585 +        */
45586 +       block = fake_blocknr_unformatted(count);
45587 +       for (i = 0; i < count; i ++, block ++) {
45588 +               node = jnodes[i];
45589 +               spin_lock_jnode(node);
45590 +               JF_SET(node, JNODE_CREATED);
45591 +               jnode_set_block(node, &block);
45592 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45593 +               BUG_ON(result != 0);
45594 +               jnode_make_dirty_locked(node);
45595 +               spin_unlock_jnode(node);
45596 +       }
45597 +
45598 +       /*
45599 +        * invalidate coordinate, research must be performed to continue
45600 +        * because write will continue on twig level
45601 +        */
45602 +       uf_coord->valid = 0;
45603 +       return count;
45604 +}
45605 +
45606 +/**
45607 + * plug_hole - replace hole extent with unallocated and holes
45608 + * @uf_coord: coord set to a block within a hole extent unit
45609 + * @key: key of that block, used only by the debug-time coord check
45610 + * @how: output, set to 1..6 to tell which of the cases below was taken
45611 + *
45612 + * Creates an unallocated extent of width 1 within a hole. In worst case two
45613 + * additional extents can be created. Returns 0 if the item was changed in
45614 + * place, otherwise the result of reiser4_replace_extent().
45615 + */
45616 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45617 +{
45618 +       struct replace_handle rh;
45619 +       reiser4_extent *ext;
45620 +       reiser4_block_nr width, pos_in_unit;
45621 +       coord_t *coord;
45622 +       struct extent_coord_extension *ext_coord;
45623 +       int return_inserted_position;
45624 +
45625 +       check_uf_coord(uf_coord, key);
45626 +
45627 +       rh.coord = coord_by_uf_coord(uf_coord);
45628 +       rh.lh = uf_coord->lh;
45629 +       rh.flags = 0;
45630 +
45631 +       coord = coord_by_uf_coord(uf_coord);
45632 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
45633 +       ext = ext_by_ext_coord(uf_coord);
45634 +
45635 +       width = ext_coord->width;
45636 +       pos_in_unit = ext_coord->pos_in_unit;
45637 +
45638 +       *how = 0;
45639 +       if (width == 1) {
45640 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45641 +               znode_make_dirty(coord->node);
45642 +               /* update uf_coord */
45643 +               ON_DEBUG(ext_coord->extent = *ext);
45644 +               *how = 1;
45645 +               return 0;
45646 +       } else if (pos_in_unit == 0) {
45647 +               /* we deal with first element of extent */
45648 +               if (coord->unit_pos) {
45649 +                       /* there is an extent to the left */
45650 +                       if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45651 +                               /*
45652 +                                * left neighboring unit is an unallocated
45653 +                                * extent. Increase its width and decrease
45654 +                                * width of hole
45655 +                                */
45656 +                               extent_set_width(ext - 1,
45657 +                                                extent_get_width(ext - 1) + 1);
45658 +                               extent_set_width(ext, width - 1);
45659 +                               znode_make_dirty(coord->node);
45660 +
45661 +                               /* update coord extension */
45662 +                               coord->unit_pos--;
45663 +                               ext_coord->width = extent_get_width(ext - 1);
45664 +                               ext_coord->pos_in_unit = ext_coord->width - 1;
45665 +                               ext_coord->ext_offset -= sizeof(reiser4_extent);
45666 +                               ON_DEBUG(ext_coord->extent =
45667 +                                        *extent_by_coord(coord));
45668 +                               *how = 2;
45669 +                               return 0;
45670 +                       }
45671 +               }
45672 +               /* extent for replace */
45673 +               reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45674 +               /* extent to be inserted */
45675 +               reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45676 +                                  width - 1);
45677 +               rh.nr_new_extents = 1;
45678 +
45679 +               /* have reiser4_replace_extent to return with @coord and
45680 +                  @uf_coord->lh set to unit which was replaced */
45681 +               return_inserted_position = 0;
45682 +               *how = 3;
45683 +       } else if (pos_in_unit == width - 1) {
45684 +               /* we deal with last element of extent */
45685 +               if (coord->unit_pos < nr_units_extent(coord) - 1) {
45686 +                       /* there is an extent unit to the right */
45687 +                       if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45688 +                               /*
45689 +                                * right neighboring unit is an unallocated
45690 +                                * extent. Increase its width and decrease
45691 +                                * width of hole
45692 +                                */
45693 +                               extent_set_width(ext + 1,
45694 +                                                extent_get_width(ext + 1) + 1);
45695 +                               extent_set_width(ext, width - 1);
45696 +                               znode_make_dirty(coord->node);
45697 +
45698 +                               /* update coord extension */
45699 +                               coord->unit_pos++;
45700 +                               ext_coord->width = extent_get_width(ext + 1);
45701 +                               ext_coord->pos_in_unit = 0;
45702 +                               ext_coord->ext_offset += sizeof(reiser4_extent);
45703 +                               ON_DEBUG(ext_coord->extent =
45704 +                                        *extent_by_coord(coord));
45705 +                               *how = 4;
45706 +                               return 0;
45707 +                       }
45708 +               }
45709 +               /* extent for replace */
45710 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45711 +               /* extent to be inserted */
45712 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45713 +                                  1);
45714 +               rh.nr_new_extents = 1;
45715 +
45716 +               /* have reiser4_replace_extent to return with @coord and
45717 +                  @uf_coord->lh set to unit which was inserted */
45718 +               return_inserted_position = 1;
45719 +               *how = 5;
45720 +       } else {
45721 +               /* block is in the middle: hole, unallocated block, hole */
45722 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45723 +                                  pos_in_unit);
45724 +               /* extents to be inserted */
45725 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45726 +                                  1);
45727 +               reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45728 +                                  width - pos_in_unit - 1);
45729 +               rh.nr_new_extents = 2;
45730 +
45731 +               /* have reiser4_replace_extent to return with @coord and
45732 +                  @uf_coord->lh set to first of units which were inserted */
45733 +               return_inserted_position = 1;
45734 +               *how = 6;
45735 +       }
45736 +       unit_key_by_coord(coord, &rh.paste_key);
45737 +       set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45738 +                      extent_get_width(&rh.overwrite) * current_blocksize);
45739 +
45740 +       uf_coord->valid = 0;
45741 +       return reiser4_replace_extent(&rh, return_inserted_position);
45742 +}
45743 +
45744 +/**
45745 + * overwrite_one_block - assign block number to a jnode being overwritten
45746 + * @uf_coord: coord set at (AT_UNIT) the extent unit covering @node
45747 + * @key: key of the block of @node, passed on to plug_hole()
45748 + * @node: jnode to assign a block number to
45749 + * @hole_plugged: if not NULL, set to 1 when a hole had to be plugged
45750 + *
45751 + * In a hole: create unallocated extent for @node and assign fake block number.
45752 + * In an allocated extent: assign start + pos_in_unit. Returns 0 or error code.
45753 + */
45754 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45755 +                              jnode *node, int *hole_plugged)
45756 +{
45757 +       int result;
45758 +       struct extent_coord_extension *ext_coord;
45759 +       reiser4_extent *ext;
45760 +       reiser4_block_nr block;
45761 +       int how;
45762 +
45763 +       assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45764 +
45765 +       result = 0;
45766 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
45767 +       ext = ext_by_ext_coord(uf_coord);
45768 +       assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45769 +
45770 +       switch (state_of_extent(ext)) {
45771 +       case ALLOCATED_EXTENT:
45772 +               block = extent_get_start(ext) + ext_coord->pos_in_unit;
45773 +               break;
45774 +       /* NOTE(review): quota charged below is kept if plug_hole() fails */
45775 +       case HOLE_EXTENT:
45776 +               result = dquot_alloc_block_nodirty(mapping_jnode(node)->host,
45777 +                                                  1);
45778 +               BUG_ON(result != 0);
45779 +               result = plug_hole(uf_coord, key, &how);
45780 +               if (result)
45781 +                       return result;
45782 +               block = fake_blocknr_unformatted(1);
45783 +               if (hole_plugged)
45784 +                       *hole_plugged = 1;
45785 +               JF_SET(node, JNODE_CREATED);
45786 +               break;
45787 +
45788 +       default:
45789 +               return RETERR(-EIO);
45790 +       }
45791 +
45792 +       jnode_set_block(node, &block);
45793 +       return 0;
45794 +}
45795 +
45796 +/**
45797 + * move_coord - move coordinate forward
45798 + * @uf_coord: coord with extent extension to be advanced
45799 + *
45800 + * Move coordinate one data block pointer forward. Return 0 on success, 1 if
45801 + * coord is set to the last one already or is invalid (valid is then 0).
45802 + */
45803 +static int move_coord(uf_coord_t *uf_coord)
45804 +{
45805 +       struct extent_coord_extension *ext_coord;
45806 +
45807 +       if (uf_coord->valid == 0)
45808 +               return 1;
45809 +       ext_coord = &uf_coord->extension.extent;
45810 +       ext_coord->pos_in_unit ++;
45811 +       if (ext_coord->pos_in_unit < ext_coord->width)
45812 +               /* coordinate moved within the unit */
45813 +               return 0;
45814 +
45815 +       /* end of unit is reached. Try to move to next unit */
45816 +       ext_coord->pos_in_unit = 0;
45817 +       uf_coord->coord.unit_pos ++;
45818 +       if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45819 +               /* coordinate moved to next unit; refresh cached unit data */
45820 +               ext_coord->ext_offset += sizeof(reiser4_extent);
45821 +               ext_coord->width =
45822 +                       extent_get_width(ext_by_offset
45823 +                                        (uf_coord->coord.node,
45824 +                                         ext_coord->ext_offset));
45825 +               ON_DEBUG(ext_coord->extent =
45826 +                        *ext_by_offset(uf_coord->coord.node,
45827 +                                       ext_coord->ext_offset));
45828 +               return 0;
45829 +       }
45830 +       /* end of item is reached */
45831 +       uf_coord->valid = 0;
45832 +       return 1;
45833 +}
45834 +
45835 +/**
45836 + * overwrite_extent - capture jnodes which fall into existing extent units
45837 + * @uf_coord: coord of the block of jnodes[0], moved forward as jnodes are done
45838 + * @key: key of jnodes[0]; jnodes[i] is assumed to be i pages further
45839 + * Return: number of handled jnodes (can be less than @count) or error code
45840 + */
45841 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45842 +                           jnode **jnodes, int count, int *plugged_hole)
45843 +{
45844 +       int result;
45845 +       reiser4_key k;
45846 +       int i;
45847 +       jnode *node;
45848 +
45849 +       k = *key;
45850 +       for (i = 0; i < count; i ++) {
45851 +               node = jnodes[i];
45852 +               if (*jnode_get_block(node) == 0) {
45853 +                       result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
45854 +                       if (result)
45855 +                               return result;
45856 +               }
45857 +               /*
45858 +                * make sure that we hold long term locked twig node containing
45859 +                * all jnodes we are about to capture
45860 +                */
45861 +               check_jnodes(uf_coord->lh->node, &k, 1);
45862 +               /*
45863 +                * block number is set by now: capture the jnode and mark it
45864 +                * dirty
45865 +                */
45866 +               spin_lock_jnode(node);
45867 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45868 +               BUG_ON(result != 0);
45869 +               jnode_make_dirty_locked(node);
45870 +               spin_unlock_jnode(node);
45871 +
45872 +               if (uf_coord->valid == 0)
45873 +                       return i + 1;
45874 +
45875 +               check_uf_coord(uf_coord, &k);
45876 +
45877 +               if (move_coord(uf_coord)) {
45878 +                       /*
45879 +                        * failed to move to the next node pointer. Either end
45880 +                        * of file or end of twig node is reached. In the latter
45881 +                        * case we might go to the right neighbor.
45882 +                        */
45883 +                       uf_coord->valid = 0;
45884 +                       return i + 1;
45885 +               }
45886 +               set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
45887 +       }
45888 +
45889 +       return count;
45890 +}
45891 +
45892 +/**
45893 + * reiser4_update_extent - make extents of a file address one jnode
45894 + * @inode: inode of the file
45895 + * @node: jnode to be appended, overwritten or inserted as first extent
45896 + * @pos: file offset the lookup key is built from
45897 + * @plugged_hole: passed on to overwrite_extent() for the overwrite case
45898 + * Return: 0 if the jnode was handled, error code otherwise
45899 + */
45900 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
45901 +                 int *plugged_hole)
45902 +{
45903 +       int result;
45904 +       znode *loaded;
45905 +       uf_coord_t uf_coord;
45906 +       coord_t *coord;
45907 +       lock_handle lh;
45908 +       reiser4_key key;
45909 +
45910 +       assert("", reiser4_lock_counters()->d_refs == 0);
45911 +
45912 +       key_by_inode_and_offset_common(inode, pos, &key);
45913 +
45914 +       init_uf_coord(&uf_coord, &lh);
45915 +       coord = &uf_coord.coord;
45916 +       result = find_file_item_nohint(coord, &lh, &key,
45917 +                                      ZNODE_WRITE_LOCK, inode);
45918 +       if (IS_CBKERR(result)) {
45919 +               assert("", reiser4_lock_counters()->d_refs == 0);
45920 +               return result;
45921 +       }
45922 +
45923 +       result = zload(coord->node);
45924 +       BUG_ON(result != 0);
45925 +       loaded = coord->node;
45926 +
45927 +       if (coord->between == AFTER_UNIT) {
45928 +               /*
45929 +                * append existing extent item with unallocated extent of width
45930 +                * 1 (the single jnode @node)
45931 +                */
45932 +               init_coord_extension_extent(&uf_coord,
45933 +                                           get_key_offset(&key));
45934 +               result = append_last_extent(&uf_coord, &key,
45935 +                                           &node, 1);
45936 +       } else if (coord->between == AT_UNIT) {
45937 +               /*
45938 +                * overwrite
45939 +                * not optimal yet. Will be optimized if new write will show
45940 +                * performance win.
45941 +                */
45942 +               init_coord_extension_extent(&uf_coord,
45943 +                                           get_key_offset(&key));
45944 +               result = overwrite_extent(&uf_coord, &key,
45945 +                                         &node, 1, plugged_hole);
45946 +       } else {
45947 +               /*
45948 +                * there are no items of this file in the tree yet. Create
45949 +                * first item of the file inserting one unallocated extent of
45950 +                * width 1
45951 +                */
45952 +               result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
45953 +       }
45954 +       assert("", result == 1 || result < 0);
45955 +       zrelse(loaded);
45956 +       done_lh(&lh);
45957 +       assert("", reiser4_lock_counters()->d_refs == 0);
45958 +       return (result == 1) ? 0 : result;
45959 +}
45960 +
45961 +/**
45962 + * update_extents
45963 + * @file:
45964 + * @jnodes:
45965 + * @count:
45966 + * @off:
45967 + *
45968 + */
45969 +static int update_extents(struct file *file, struct inode *inode,
45970 +                         jnode **jnodes, int count, loff_t pos)
45971 +{
45972 +       struct hint hint;
45973 +       reiser4_key key;
45974 +       int result;
45975 +       znode *loaded;
45976 +
45977 +       result = load_file_hint(file, &hint);
45978 +       BUG_ON(result != 0);
45979 +
45980 +       if (count != 0)
45981 +               /*
45982 +                * count == 0 is special case: expanding truncate
45983 +                */
45984 +               pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
45985 +       key_by_inode_and_offset_common(inode, pos, &key);
45986 +
45987 +       assert("", reiser4_lock_counters()->d_refs == 0);
45988 +
45989 +       do {
45990 +               result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
45991 +               if (IS_CBKERR(result)) {
45992 +                       assert("", reiser4_lock_counters()->d_refs == 0);
45993 +                       return result;
45994 +               }
45995 +
45996 +               result = zload(hint.ext_coord.coord.node);
45997 +               BUG_ON(result != 0);
45998 +               loaded = hint.ext_coord.coord.node;
45999 +
46000 +               if (hint.ext_coord.coord.between == AFTER_UNIT) {
46001 +                       /*
46002 +                        * append existing extent item with unallocated extent
46003 +                        * of width nr_jnodes
46004 +                        */
46005 +                       if (hint.ext_coord.valid == 0)
46006 +                               /* NOTE: get statistics on this */
46007 +                               init_coord_extension_extent(&hint.ext_coord,
46008 +                                                           get_key_offset(&key));
46009 +                       result = append_last_extent(&hint.ext_coord, &key,
46010 +                                                   jnodes, count);
46011 +               } else if (hint.ext_coord.coord.between == AT_UNIT) {
46012 +                       /*
46013 +                        * overwrite
46014 +                        * not optimal yet. Will be optimized if new write will
46015 +                        * show performance win.
46016 +                        */
46017 +                       if (hint.ext_coord.valid == 0)
46018 +                               /* NOTE: get statistics on this */
46019 +                               init_coord_extension_extent(&hint.ext_coord,
46020 +                                                           get_key_offset(&key));
46021 +                       result = overwrite_extent(&hint.ext_coord, &key,
46022 +                                                 jnodes, count, NULL);
46023 +               } else {
46024 +                       /*
46025 +                        * there are no items of this file in the tree
46026 +                        * yet. Create first item of the file inserting one
46027 +                        * unallocated extent of width nr_jnodes
46028 +                        */
46029 +                       result = insert_first_extent(&hint.ext_coord, &key,
46030 +                                                    jnodes, count, inode);
46031 +               }
46032 +               zrelse(loaded);
46033 +               if (result < 0) {
46034 +                       done_lh(hint.ext_coord.lh);
46035 +                       break;
46036 +               }
46037 +
46038 +               jnodes += result;
46039 +               count -= result;
46040 +               set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
46041 +
46042 +               /* seal and unlock znode */
46043 +               if (hint.ext_coord.valid)
46044 +                       reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
46045 +               else
46046 +                       reiser4_unset_hint(&hint);
46047 +
46048 +       } while (count > 0);
46049 +
46050 +       save_file_hint(file, &hint);
46051 +       assert("", reiser4_lock_counters()->d_refs == 0);
46052 +       return result;
46053 +}
46054 +
46055 +/**
46056 + * write_extent_reserve_space - reserve space for extent write operation
46057 + * @inode: inode of the file to be written; its tree is used for the estimates
46058 + *
46059 + * Estimates and reserves space which may be required for writing
46060 + * WRITE_GRANULARITY pages of file.
46061 + */
46062 +static int write_extent_reserve_space(struct inode *inode)
46063 +{
46064 +       __u64 count;
46065 +       reiser4_tree *tree;
46066 +
46067 +       /*
46068 +        * to write WRITE_GRANULARITY pages to a file by extents we have to
46069 +        * reserve disk space for:
46070 +
46071 +        * 1. find_file_item may have to insert empty node to the tree (empty
46072 +        * leaf node between two extent items). This requires 1 block and
46073 +        * number of blocks which are necessary to perform insertion of an
46074 +        * internal item into twig level.
46075 +
46076 +        * 2. for each of written pages there might be needed 1 block and
46077 +        * number of blocks which might be necessary to perform insertion of or
46078 +        * paste to an extent item.
46079 +
46080 +        * 3. stat data update
46081 +        */
46082 +       tree = reiser4_tree_by_inode(inode);
46083 +       count = estimate_one_insert_item(tree) +
46084 +               WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
46085 +               estimate_one_insert_item(tree);
46086 +       grab_space_enable();
46087 +       return reiser4_grab_space(count, 0 /* flags */);
46088 +}
46089 +
46090 +/*
46091 + * filemap_copy_from_user no longer exists in generic code, because it
46092 + * is deadlocky (copying from user while holding the page lock is bad).
46093 + * As a temporary fix for reiser4, just define it here.
46094 + */
46095 +static inline size_t
46096 +filemap_copy_from_user(struct page *page, unsigned long offset,
46097 +                       const char __user *buf, unsigned bytes)
46098 +{
46099 +       char *kaddr;
46100 +       int left;
46101 +
46102 +       kaddr = kmap_atomic(page, KM_USER0);
46103 +       left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
46104 +       kunmap_atomic(kaddr, KM_USER0);
46105 +
46106 +       if (left != 0) {
46107 +               /* Do it the slow way */
46108 +               kaddr = kmap(page);
46109 +               left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
46110 +               kunmap(page);
46111 +       }
46112 +       return bytes - left;
46113 +}
46114 +
46115 +/**
46116 + * reiser4_write_extent - write method of extent item plugin
46117 + * @file: file to write to
46118 + * @inode: inode of the file being written
46119 + * @buf: address of user-space buffer
46120 + * @count: number of bytes to write; 0 selects the truncate case
46121 + * @pos: position in file to write to
46122 + */
46123 +ssize_t reiser4_write_extent(struct file *file, struct inode * inode,
46124 +                            const char __user *buf, size_t count, loff_t *pos)
46125 +{
46126 +       int have_to_update_extent;
46127 +       int nr_pages, nr_dirty;
46128 +       struct page *page;
46129 +       jnode *jnodes[WRITE_GRANULARITY + 1];
46130 +       unsigned long index;
46131 +       unsigned long end;
46132 +       int i;
46133 +       int to_page, page_off;
46134 +       size_t left, written;
46135 +       int result = 0;
46136 +
46137 +       if (write_extent_reserve_space(inode))
46138 +               return RETERR(-ENOSPC);
46139 +
46140 +       if (count == 0) {
46141 +               /* truncate case */
46142 +               update_extents(file, inode, jnodes, 0, *pos);
46143 +               return 0;
46144 +       }
46145 +
46146 +       BUG_ON(get_current_context()->trans->atom != NULL);
46147 +
46148 +       left = count;
46149 +       index = *pos >> PAGE_CACHE_SHIFT;
46150 +       /* calculate number of pages which are to be written */
46151 +       end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
46152 +       nr_pages = end - index + 1;
46153 +       nr_dirty = 0;
46154 +       assert("", nr_pages <= WRITE_GRANULARITY + 1);
46155 +
46156 +       /* get pages and jnodes */
46157 +       for (i = 0; i < nr_pages; i ++) {
46158 +               page = find_or_create_page(inode->i_mapping, index + i,
46159 +                                          reiser4_ctx_gfp_mask_get());
46160 +               if (page == NULL) {
46161 +                       nr_pages = i;
46162 +                       result = RETERR(-ENOMEM);
46163 +                       goto out;
46164 +               }
46165 +
46166 +               jnodes[i] = jnode_of_page(page);
46167 +               if (IS_ERR(jnodes[i])) {
46168 +                       unlock_page(page);
46169 +                       page_cache_release(page);
46170 +                       nr_pages = i;
46171 +                       result = RETERR(-ENOMEM);
46172 +                       goto out;
46173 +               }
46174 +               /* prevent jnode and page from disconnecting */
46175 +               JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
46176 +               unlock_page(page);
46177 +       }
46178 +
46179 +       BUG_ON(get_current_context()->trans->atom != NULL);
46180 +
46181 +       have_to_update_extent = 0;
46182 +
46183 +       page_off = (*pos & (PAGE_CACHE_SIZE - 1));
46184 +       for (i = 0; i < nr_pages; i ++) {
46185 +               to_page = PAGE_CACHE_SIZE - page_off;
46186 +               if (to_page > left)
46187 +                       to_page = left;
46188 +               page = jnode_page(jnodes[i]);
46189 +               if (page_offset(page) < inode->i_size &&
46190 +                   !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
46191 +                       /*
46192 +                        * the above is not optimal for partial write to last
46193 +                        * page of file when file size is not at boundary of
46194 +                        * page
46195 +                        */
46196 +                       lock_page(page);
46197 +                       if (!PageUptodate(page)) {
46198 +                               result = readpage_unix_file(NULL, page);
46199 +                               BUG_ON(result != 0);
46200 +                               /* wait for read completion */
46201 +                               lock_page(page);
46202 +                               BUG_ON(!PageUptodate(page));
46203 +                       } else
46204 +                               result = 0;
46205 +                       unlock_page(page);
46206 +               }
46207 +
46208 +               BUG_ON(get_current_context()->trans->atom != NULL);
46209 +               fault_in_pages_readable(buf, to_page);
46210 +               BUG_ON(get_current_context()->trans->atom != NULL);
46211 +
46212 +               lock_page(page);
46213 +               if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
46214 +                       zero_user_segments(page, 0, page_off,
46215 +                                          page_off + to_page,
46216 +                                          PAGE_CACHE_SIZE);
46217 +
46218 +               written = filemap_copy_from_user(page, page_off, buf, to_page);
46219 +               if (unlikely(written != to_page)) {
46220 +                       unlock_page(page);
46221 +                       result = RETERR(-EFAULT);
46222 +                       break;
46223 +               }
46224 +
46225 +               flush_dcache_page(page);
46226 +               set_page_dirty_notag(page);
46227 +               unlock_page(page);
46228 +               nr_dirty++;
46229 +
46230 +               mark_page_accessed(page);
46231 +               SetPageUptodate(page);
46232 +
46233 +               if (jnodes[i]->blocknr == 0)
46234 +                       have_to_update_extent ++;
46235 +
46236 +               page_off = 0;
46237 +               buf += to_page;
46238 +               left -= to_page;
46239 +               BUG_ON(get_current_context()->trans->atom != NULL);
46240 +       }
46241 +
46242 +       if (have_to_update_extent) {
46243 +               update_extents(file, inode, jnodes, nr_dirty, *pos);
46244 +       } else {
46245 +               for (i = 0; i < nr_dirty; i ++) {
46246 +                       int ret;
46247 +                       spin_lock_jnode(jnodes[i]);
46248 +                       ret = reiser4_try_capture(jnodes[i],
46249 +                                                    ZNODE_WRITE_LOCK, 0);
46250 +                       BUG_ON(ret != 0);
46251 +                       jnode_make_dirty_locked(jnodes[i]);
46252 +                       spin_unlock_jnode(jnodes[i]);
46253 +               }
46254 +       }
46255 +out:
46256 +       for (i = 0; i < nr_pages; i ++) {
46257 +               page_cache_release(jnode_page(jnodes[i]));
46258 +               JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46259 +               jput(jnodes[i]);
46260 +       }
46261 +
46262 +       /* the only errors handled so far are ENOMEM and
46263 +          EFAULT on copy_from_user */
46264 +
46265 +       return (count - left) ? (count - left) : result;
46266 +}
46267 +
46268 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46269 +                              struct page *page)
46270 +{
46271 +       jnode *j;
46272 +       struct address_space *mapping;
46273 +       unsigned long index;
46274 +       oid_t oid;
46275 +       reiser4_block_nr block;
46276 +
46277 +       mapping = page->mapping;
46278 +       oid = get_inode_oid(mapping->host);
46279 +       index = page->index;
46280 +
46281 +       switch (state_of_extent(ext)) {
46282 +       case HOLE_EXTENT:
46283 +               /*
46284 +                * it is possible to have hole page with jnode, if page was
46285 +                * eflushed previously.
46286 +                */
46287 +               j = jfind(mapping, index);
46288 +               if (j == NULL) {
46289 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
46290 +                       SetPageUptodate(page);
46291 +                       unlock_page(page);
46292 +                       return 0;
46293 +               }
46294 +               spin_lock_jnode(j);
46295 +               if (!jnode_page(j)) {
46296 +                       jnode_attach_page(j, page);
46297 +               } else {
46298 +                       BUG_ON(jnode_page(j) != page);
46299 +                       assert("vs-1504", jnode_page(j) == page);
46300 +               }
46301 +               block = *jnode_get_io_block(j);
46302 +               spin_unlock_jnode(j);
46303 +               if (block == 0) {
46304 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
46305 +                       SetPageUptodate(page);
46306 +                       unlock_page(page);
46307 +                       jput(j);
46308 +                       return 0;
46309 +               }
46310 +               break;
46311 +
46312 +       case ALLOCATED_EXTENT:
46313 +               j = jnode_of_page(page);
46314 +               if (IS_ERR(j))
46315 +                       return PTR_ERR(j);
46316 +               if (*jnode_get_block(j) == 0) {
46317 +                       reiser4_block_nr blocknr;
46318 +
46319 +                       blocknr = extent_get_start(ext) + pos;
46320 +                       jnode_set_block(j, &blocknr);
46321 +               } else
46322 +                       assert("vs-1403",
46323 +                              j->blocknr == extent_get_start(ext) + pos);
46324 +               break;
46325 +
46326 +       case UNALLOCATED_EXTENT:
46327 +               j = jfind(mapping, index);
46328 +               assert("nikita-2688", j);
46329 +               assert("vs-1426", jnode_page(j) == NULL);
46330 +
46331 +               spin_lock_jnode(j);
46332 +               jnode_attach_page(j, page);
46333 +               spin_unlock_jnode(j);
46334 +               break;
46335 +
46336 +       default:
46337 +               warning("vs-957", "wrong extent\n");
46338 +               return RETERR(-EIO);
46339 +       }
46340 +
46341 +       BUG_ON(j == 0);
46342 +       reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46343 +       jput(j);
46344 +       return 0;
46345 +}
46346 +
46347 +/* plugin->u.item.s.file.read for extent items: copy @flow to user space page by page; 0 or error */
46348 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46349 +{
46350 +       int result;
46351 +       struct page *page;
46352 +       unsigned long cur_page, next_page;
46353 +       unsigned long page_off, count;
46354 +       struct address_space *mapping;
46355 +       loff_t file_off;
46356 +       uf_coord_t *uf_coord;
46357 +       coord_t *coord;
46358 +       struct extent_coord_extension *ext_coord;
46359 +       unsigned long nr_pages;
46360 +       char *kaddr;
46361 +
46362 +       assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46363 +       assert("vs-572", flow->user == 1);
46364 +       assert("vs-1351", flow->length > 0);
46365 +
46366 +       uf_coord = &hint->ext_coord;
46367 +
46368 +       check_uf_coord(uf_coord, NULL);
46369 +       assert("vs-33", uf_coord->lh == &hint->lh);
46370 +
46371 +       coord = &uf_coord->coord;
46372 +       assert("vs-1119", znode_is_rlocked(coord->node));
46373 +       assert("vs-1120", znode_is_loaded(coord->node));
46374 +       assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46375 +
46376 +       mapping = file->f_dentry->d_inode->i_mapping;
46377 +       ext_coord = &uf_coord->extension.extent;
46378 +
46379 +       /* offset in a file to start read from */
46380 +       file_off = get_key_offset(&flow->key);
46381 +       /* offset within the page to start read from */
46382 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46383 +       /* bytes which can be read from the page which contains file_off */
46384 +       count = PAGE_CACHE_SIZE - page_off;
46385 +
46386 +       /* index of page containing offset read is to start from */
46387 +       cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46388 +       next_page = cur_page;
46389 +       /* number of pages flow spans over */
46390 +       nr_pages =
46391 +           ((file_off + flow->length + PAGE_CACHE_SIZE -
46392 +             1) >> PAGE_CACHE_SHIFT) - cur_page;
46393 +
46394 +       /* we start having twig node read locked. However, we do not want to
46395 +          keep that lock all the time readahead works. So, set a seal and
46396 +          release twig node. */
46397 +       reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46398 +       /* &hint->lh is done-ed */
46399 +
46400 +       do {
46401 +               reiser4_txn_restart_current();
46402 +               page = read_mapping_page(mapping, cur_page, file);
46403 +               if (IS_ERR(page))
46404 +                       return PTR_ERR(page);
46405 +               lock_page(page);
46406 +               if (!PageUptodate(page)) {
46407 +                       unlock_page(page);
46408 +                       page_cache_release(page);
46409 +                       warning("jmacd-97178", "extent_read: page is not up to date");
46410 +                       return RETERR(-EIO);
46411 +               }
46412 +               mark_page_accessed(page);
46413 +               unlock_page(page);
46414 +
46415 +               /* If users can be writing to this page using arbitrary virtual
46416 +                  addresses, take care about potential aliasing before reading
46417 +                  the page on the kernel side.
46418 +                */
46419 +               if (mapping_writably_mapped(mapping))
46420 +                       flush_dcache_page(page);
46421 +
46422 +               assert("nikita-3034", reiser4_schedulable());
46423 +
46424 +               /* number of bytes which are to be read from the page */
46425 +               if (count > flow->length)
46426 +                       count = flow->length;
46427 +
46428 +               result = fault_in_pages_writeable(flow->data, count);
46429 +               if (result) {
46430 +                       page_cache_release(page);
46431 +                       return RETERR(-EFAULT);
46432 +               }
46433 +
46434 +               kaddr = kmap_atomic(page, KM_USER0);
46435 +               result = __copy_to_user_inatomic(flow->data,
46436 +                                              kaddr + page_off, count);
46437 +               kunmap_atomic(kaddr, KM_USER0);
46438 +               if (result != 0) {
46439 +                       kaddr = kmap(page);
46440 +                       result = __copy_to_user(flow->data, kaddr + page_off, count);
46441 +                       kunmap(page);
46442 +               }
46443 +               /* drop the page reference before bailing out on a failed copy */
46444 +               page_cache_release(page);
46445 +               if (unlikely(result))
46446 +                       return RETERR(-EFAULT);
46447 +
46448 +               /* increase key (flow->key), update user area pointer (flow->data) */
46449 +               move_flow_forward(flow, count);
46450 +
46451 +               page_off = 0;
46452 +               cur_page ++;
46453 +               count = PAGE_CACHE_SIZE;
46454 +               nr_pages--;
46455 +       } while (flow->length);
46456 +
46457 +       return 0;
46458 +}
46459 +
46460 +/*
46461 +   plugin->s.file.readpage
46462 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
46463 +   or
46464 +   filemap_fault->reiser4_readpage->readpage_unix_file->readpage_extent
46465 +
46466 +   At the beginning: coord->node is read locked, zloaded, page is locked,
46467 +   coord is set to an existing unit inside of extent item (coord need not match page->index)
46468 +*/
46469 +int reiser4_readpage_extent(void *vp, struct page *page)
46470 +{
46471 +       uf_coord_t *uf_coord = vp;
46472 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
46473 +       ON_DEBUG(reiser4_key key);
46474 +
46475 +       assert("vs-1040", PageLocked(page));
46476 +       assert("vs-1050", !PageUptodate(page));
46477 +       assert("vs-1039", page->mapping && page->mapping->host);
46478 +
46479 +       assert("vs-1044", znode_is_loaded(coord->node));
46480 +       assert("vs-758", item_is_extent(coord));
46481 +       assert("vs-1046", coord_is_existing_unit(coord));
46482 +       assert("vs-1045", znode_is_rlocked(coord->node));
46483 +       assert("vs-1047",
46484 +              page->mapping->host->i_ino ==
46485 +              get_key_objectid(item_key_by_coord(coord, &key)));
46486 +       check_uf_coord(uf_coord, NULL);
46487 +
46488 +       return reiser4_do_readpage_extent(
46489 +               ext_by_ext_coord(uf_coord),
46490 +               uf_coord->extension.extent.pos_in_unit, page);
46491 +}
46492 +
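46493 +/**
46494 + * get_block_address_extent - map a file block number to a disk block number
46495 + * @coord: coord which has to be set to an existing extent unit
46496 + * @block: block number within the file
46497 + * @result: where to store the disk block number; 0 if extent is not allocated
46498 + *
46499 + * Returns 0 on success, -EINVAL if @coord is not set to an existing unit.
46500 + */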
46501 +int get_block_address_extent(const coord_t *coord, sector_t block,
46502 +                            sector_t *result)
46503 +{
46504 +       reiser4_extent *ext;
46505 +
46506 +       if (!coord_is_existing_unit(coord))
46507 +               return RETERR(-EINVAL);
46508 +
46509 +       ext = extent_by_coord(coord);
46510 +
46511 +       if (state_of_extent(ext) != ALLOCATED_EXTENT)
46512 +               /* FIXME: bad things may happen if it is unallocated extent */
46513 +               *result = 0;
46514 +       else {
46515 +               reiser4_key key;
46516 +
46517 +               unit_key_by_coord(coord, &key);
46518 +               assert("vs-1645",
46519 +                      block >= get_key_offset(&key) >> current_blocksize_bits);
46520 +               assert("vs-1646",
46521 +                      block <
46522 +                      (get_key_offset(&key) >> current_blocksize_bits) +
46523 +                      extent_get_width(ext));
46524 +               *result =
46525 +                   extent_get_start(ext) + (block -
46526 +                                            (get_key_offset(&key) >>
46527 +                                             current_blocksize_bits));
46528 +       }
46529 +       return 0;
46530 +}
46531 +
46532 +/*
46533 +  plugin->u.item.s.file.append_key
46534 +  key of the first byte following the last byte addressed by this extent
46535 +*/
46536 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46537 +{
46538 +       item_key_by_coord(coord, key);
46539 +       set_key_offset(key,
46540 +                      get_key_offset(key) + reiser4_extent_size(coord,
46541 +                                                                nr_units_extent
46542 +                                                                (coord)));
46543 +
46544 +       assert("vs-610", get_key_offset(key)
46545 +              && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46546 +       return key;
46547 +}
46548 +
46549 +/* plugin->u.item.s.file.init_coord_extension */
46550 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46551 +{
46552 +       coord_t *coord;
46553 +       struct extent_coord_extension *ext_coord;
46554 +       reiser4_key key;
46555 +       loff_t offset;
46556 +
46557 +       assert("vs-1295", uf_coord->valid == 0);
46558 +
46559 +       coord = &uf_coord->coord;
46560 +       assert("vs-1288", coord_is_iplug_set(coord));
46561 +       assert("vs-1327", znode_is_loaded(coord->node));
46562 +
46563 +       if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46564 +               return;
46565 +
46566 +       ext_coord = &uf_coord->extension.extent;
46567 +       ext_coord->nr_units = nr_units_extent(coord);
46568 +       ext_coord->ext_offset =
46569 +           (char *)extent_by_coord(coord) - zdata(coord->node);
46570 +       ext_coord->width = extent_get_width(extent_by_coord(coord));
46571 +       ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
46572 +       uf_coord->valid = 1;
46573 +
46574 +       /* pos_in_unit is the only uninitialized field in extended coord */
46575 +       if (coord->between == AFTER_UNIT) {
46576 +               assert("vs-1330",
46577 +                      coord->unit_pos == nr_units_extent(coord) - 1);
46578 +
46579 +               ext_coord->pos_in_unit = ext_coord->width - 1;
46580 +       } else {
46581 +               /* AT_UNIT */
46582 +               unit_key_by_coord(coord, &key);
46583 +               offset = get_key_offset(&key);
46584 +
46585 +               assert("vs-1328", offset <= lookuped);
46586 +               assert("vs-1329",
46587 +                      lookuped <
46588 +                      offset + ext_coord->width * current_blocksize);
46589 +               ext_coord->pos_in_unit =
46590 +                   ((lookuped - offset) >> current_blocksize_bits);
46591 +       }
46592 +}
46593 +
46594 +/*
46595 + * Local variables:
46596 + * c-indentation-style: "K&R"
46597 + * mode-name: "LC"
46598 + * c-basic-offset: 8
46599 + * tab-width: 8
46600 + * fill-column: 79
46601 + * scroll-step: 1
46602 + * End:
46603 + */
46604 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.35/fs/reiser4/plugin/item/extent_flush_ops.c
46605 --- linux-2.6.35.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 01:00:00.000000000 +0100
46606 +++ linux-2.6.35/fs/reiser4/plugin/item/extent_flush_ops.c      2010-08-04 15:44:57.000000000 +0200
46607 @@ -0,0 +1,1028 @@
46608 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46609 +
46610 +#include "item.h"
46611 +#include "../../tree.h"
46612 +#include "../../jnode.h"
46613 +#include "../../super.h"
46614 +#include "../../flush.h"
46615 +#include "../../carry.h"
46616 +#include "../object.h"
46617 +
46618 +#include <linux/pagemap.h>
46619 +
46620 +static reiser4_block_nr extent_unit_start(const coord_t * item);
46621 +
46622 +/* Return either first or last extent (depending on @side) of the item
46623 +   @coord is set to. Set @pos_in_unit either to first or to last block
46624 +   of extent. */
46625 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46626 +                                        reiser4_block_nr * pos_in_unit)
46627 +{
46628 +       reiser4_extent *ext;
46629 +
46630 +       if (side == LEFT_SIDE) {
46631 +               /* get first extent of item */
46632 +               ext = extent_item(coord);
46633 +               *pos_in_unit = 0;
46634 +       } else {
46635 +               /* get last extent of item and last position within it */
46636 +               assert("vs-363", side == RIGHT_SIDE);
46637 +               ext = extent_item(coord) + coord_last_unit_pos(coord);
46638 +               *pos_in_unit = extent_get_width(ext) - 1;
46639 +       }
46640 +
46641 +       return ext;
46642 +}
46643 +
46644 +/* item_plugin->f.utmost_child */
46645 +/* Return the child. Coord is set to extent item. Find jnode corresponding
46646 +   either to first or to last unformatted node pointed by the item */
46647 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46648 +{
46649 +       reiser4_extent *ext;
46650 +       reiser4_block_nr pos_in_unit;
46651 +
46652 +       ext = extent_utmost_ext(coord, side, &pos_in_unit);
46653 +
46654 +       switch (state_of_extent(ext)) {
46655 +       case HOLE_EXTENT:
46656 +               *childp = NULL;
46657 +               return 0;
46658 +       case ALLOCATED_EXTENT:
46659 +       case UNALLOCATED_EXTENT:
46660 +               break;
46661 +       default:
46662 +               /* this should never happen */
46663 +               assert("vs-1417", 0);
46664 +       }
46665 +
46666 +       {
46667 +               reiser4_key key;
46668 +               reiser4_tree *tree;
46669 +               unsigned long index;
46670 +
46671 +               if (side == LEFT_SIDE) {
46672 +                       /* get key of first byte addressed by the extent */
46673 +                       item_key_by_coord(coord, &key);
46674 +               } else {
46675 +                       /* get key of the byte which is next after the last byte addressed by the extent */
46676 +                       append_key_extent(coord, &key);
46677 +               }
46678 +
46679 +               assert("vs-544",
46680 +                      (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46681 +               /* index of first or last (depending on @side) page addressed
46682 +                  by the extent */
46683 +               index =
46684 +                   (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46685 +               if (side == RIGHT_SIDE)
46686 +                       index--;
46687 +
46688 +               tree = coord->node->zjnode.tree;
46689 +               *childp = jlookup(tree, get_key_objectid(&key), index);
46690 +       }
46691 +
46692 +       return 0;
46693 +}
46694 +
46695 +/* item_plugin->f.utmost_child_real_block */
46696 +/* Return the child's block, if allocated. */
46697 +int
46698 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
46699 +                              reiser4_block_nr * block)
46700 +{
46701 +       reiser4_extent *ext;
46702 +
46703 +       ext = extent_by_coord(coord);
46704 +
46705 +       switch (state_of_extent(ext)) {
46706 +       case ALLOCATED_EXTENT:
46707 +               *block = extent_get_start(ext);
46708 +               if (side == RIGHT_SIDE)
46709 +                       *block += extent_get_width(ext) - 1;
46710 +               break;
46711 +       case HOLE_EXTENT:
46712 +       case UNALLOCATED_EXTENT:
46713 +               *block = 0;
46714 +               break;
46715 +       default:
46716 +               /* this should never happen */
46717 +               assert("vs-1418", 0);
46718 +       }
46719 +
46720 +       return 0;
46721 +}
46722 +
46723 +/* item_plugin->f.scan */
46724 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46725 +   This scan continues, advancing the parent coordinate, until either it encounters a
46726 +   formatted child or it finishes scanning this node.
46727 +
46728 +   If unallocated, the entire extent must be dirty and in the same atom.  (Actually, I'm
46729 +   not sure this last property (same atom) is enforced, but it should be the case since
46730 +   one atom must write the parent and the others must read the parent, thus fusing?).  In
46731 +   any case, the code below asserts this case for unallocated extents.  Unallocated
46732 +   extents are thus optimized because we can skip to the endpoint when scanning.
46733 +
46734 +   It returns control to reiser4_scan_extent, handles these terminating conditions,
46735 +   e.g., by loading the next twig.
46736 +*/
46737 +int reiser4_scan_extent(flush_scan * scan)
46738 +{
46739 +       coord_t coord;
46740 +       jnode *neighbor;
46741 +       unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46742 +       reiser4_block_nr unit_start;
46743 +       __u64 oid;
46744 +       reiser4_key key;
46745 +       int ret = 0, allocated, incr;
46746 +       reiser4_tree *tree;
46747 +
46748 +       if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46749 +               scan->stop = 1;
46750 +               return 0;       /* Race with truncate, this node is already
46751 +                                * truncated. */
46752 +       }
46753 +
46754 +       coord_dup(&coord, &scan->parent_coord);
46755 +
46756 +       assert("jmacd-1404", !reiser4_scan_finished(scan));
46757 +       assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46758 +       assert("jmacd-1406", jnode_is_unformatted(scan->node));
46759 +
46760 +       /* The scan_index variable corresponds to the current page index of the
46761 +          unformatted block scan position. */
46762 +       scan_index = index_jnode(scan->node);
46763 +
46764 +       assert("jmacd-7889", item_is_extent(&coord));
46765 +
46766 +      repeat:
46767 +       /* objectid of file */
46768 +       oid = get_key_objectid(item_key_by_coord(&coord, &key));
46769 +
46770 +       allocated = !extent_is_unallocated(&coord);
46771 +       /* Get the values of this extent unit: */
46772 +       unit_index = extent_unit_index(&coord);
46773 +       unit_width = extent_unit_width(&coord);
46774 +       unit_start = extent_unit_start(&coord);
46775 +
46776 +       assert("jmacd-7187", unit_width > 0);
46777 +       assert("jmacd-7188", scan_index >= unit_index);
46778 +       assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46779 +
46780 +       /* Depending on the scan direction, we set different maximum values for scan_index
46781 +          (scan_max) and the number of nodes that would be passed if the scan goes the
46782 +          entire way (scan_dist).  Incr is an integer reflecting the incremental
46783 +          direction of scan_index. */
46784 +       if (reiser4_scanning_left(scan)) {
46785 +               scan_max = unit_index;
46786 +               scan_dist = scan_index - unit_index;
46787 +               incr = -1;
46788 +       } else {
46789 +               scan_max = unit_index + unit_width - 1;
46790 +               scan_dist = scan_max - unit_index;
46791 +               incr = +1;
46792 +       }
46793 +
46794 +       tree = coord.node->zjnode.tree;
46795 +
46796 +       /* If the extent is allocated we have to check each of its blocks.  If the extent
46797 +          is unallocated we can skip to the scan_max. */
46798 +       if (allocated) {
46799 +               do {
46800 +                       neighbor = jlookup(tree, oid, scan_index);
46801 +                       if (neighbor == NULL)
46802 +                               goto stop_same_parent;
46803 +
46804 +                       if (scan->node != neighbor
46805 +                           && !reiser4_scan_goto(scan, neighbor)) {
46806 +                               /* @neighbor was jput() by reiser4_scan_goto */
46807 +                               goto stop_same_parent;
46808 +                       }
46809 +
46810 +                       ret = scan_set_current(scan, neighbor, 1, &coord);
46811 +                       if (ret != 0) {
46812 +                               goto exit;
46813 +                       }
46814 +
46815 +                       /* reference to @neighbor is stored in @scan, no need
46816 +                          to jput(). */
46817 +                       scan_index += incr;
46818 +
46819 +               } while (incr + scan_max != scan_index);
46820 +
46821 +       } else {
46822 +               /* Optimized case for unallocated extents, skip to the end. */
46823 +               neighbor = jlookup(tree, oid, scan_max /*index */ );
46824 +               if (neighbor == NULL) {
46825 +                       /* Race with truncate */
46826 +                       scan->stop = 1;
46827 +                       ret = 0;
46828 +                       goto exit;
46829 +               }
46830 +
46831 +               assert("zam-1043",
46832 +                      reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46833 +
46834 +               ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46835 +               if (ret != 0) {
46836 +                       goto exit;
46837 +               }
46838 +       }
46839 +
46840 +       if (coord_sideof_unit(&coord, scan->direction) == 0
46841 +           && item_is_extent(&coord)) {
46842 +               /* Continue as long as there are more extent units. */
46843 +
46844 +               scan_index =
46845 +                   extent_unit_index(&coord) +
46846 +                   (reiser4_scanning_left(scan) ?
46847 +                    extent_unit_width(&coord) - 1 : 0);
46848 +               goto repeat;
46849 +       }
46850 +
46851 +       if (0) {
46852 +             stop_same_parent:
46853 +
46854 +               /* If we are scanning left and we stop in the middle of an allocated
46855 +                  extent, we know the preceder immediately.. */
46856 +               /* middle of extent is (scan_index - unit_index) != 0. */
46857 +               if (reiser4_scanning_left(scan) &&
46858 +                   (scan_index - unit_index) != 0) {
46859 +                       /* FIXME(B): Someone should step-through and verify that this preceder
46860 +                          calculation is indeed correct. */
46861 +                       /* @unit_start is starting block (number) of extent
46862 +                          unit. Flush stopped at the @scan_index block from
46863 +                          the beginning of the file, which is (scan_index -
46864 +                          unit_index) block within extent.
46865 +                        */
46866 +                       if (unit_start) {
46867 +                               /* skip preceder update when we are at hole */
46868 +                               scan->preceder_blk =
46869 +                                   unit_start + scan_index - unit_index;
46870 +                               check_preceder(scan->preceder_blk);
46871 +                       }
46872 +               }
46873 +
46874 +               /* In this case, we leave coord set to the parent of scan->node. */
46875 +               scan->stop = 1;
46876 +
46877 +       } else {
46878 +               /* In this case, we are still scanning, coord is set to the next item which is
46879 +                  either off-the-end of the node or not an extent. */
46880 +               assert("jmacd-8912", scan->stop == 0);
46881 +               assert("jmacd-7812",
46882 +                      (coord_is_after_sideof_unit(&coord, scan->direction)
46883 +                       || !item_is_extent(&coord)));
46884 +       }
46885 +
46886 +       ret = 0;
46887 +      exit:
46888 +       return ret;
46889 +}
46890 +
46891 +/* ask block allocator for @wanted_count blocks; start goes to @first_allocated, count to @allocated, @preceder->blk is set to the last block */
46892 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
46893 +                                  reiser4_block_nr wanted_count,
46894 +                                  reiser4_block_nr *first_allocated,
46895 +                                  reiser4_block_nr *allocated,
46896 +                                  block_stage_t block_stage)
46897 +{
46898 +       *allocated = wanted_count;
46899 +       preceder->max_dist = 0; /* scan whole disk, if needed */
46900 +
46901 +       /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
46902 +       preceder->block_stage = block_stage;
46903 +
46904 +       /* FIXME: we do not handle errors here now */
46905 +       check_me("vs-420",
46906 +                reiser4_alloc_blocks(preceder, first_allocated, allocated,
46907 +                                     BA_PERMANENT) == 0);
46908 +       /* update flush_pos's preceder to last allocated block number */
46909 +       preceder->blk = *first_allocated + *allocated - 1;
46910 +}
46911 +
46912 +/* At flush time, when an unallocated extent is to be replaced with an allocated one, it may happen that one unallocated
46913 +   extent has to be replaced with a set of allocated extents. In this case insert_into_item will be called, which may have
46914 +   to add new nodes into tree. Space for that is taken from inviolable reserve (5%). Returns grabbed_blocks sampled before the grab. */
46915 +static reiser4_block_nr reserve_replace(void)
46916 +{
46917 +       reiser4_block_nr grabbed, needed;
46918 +
46919 +       grabbed = get_current_context()->grabbed_blocks;
46920 +       needed = estimate_one_insert_into_item(current_tree);
46921 +       check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
46922 +       return grabbed;
46923 +}
46924 +
46925 +static void free_replace_reserved(reiser4_block_nr grabbed)
46926 +{
46927 +       reiser4_context *ctx;
46928 +       /* give back whatever the context has grabbed beyond @grabbed (callers pass the value reserve_replace() returned) */
46929 +       ctx = get_current_context();
46930 +       grabbed2free(ctx, get_super_private(ctx->super),
46931 +                    ctx->grabbed_blocks - grabbed);
46932 +}
46933 +
46934 +/* Index of the first block addressed by the unit @item is set to: the unit key's offset >> current_blocksize_bits */
46935 +__u64 extent_unit_index(const coord_t * item)
46936 +{
46937 +       reiser4_key key;
46938 +
46939 +       assert("vs-648", coord_is_existing_unit(item));
46940 +       unit_key_by_coord(item, &key);
46941 +       return get_key_offset(&key) >> current_blocksize_bits;
46942 +}
46943 +
46944 +/* Width of the unit @item is set to, as reported by width_by_coord(). AUDIT: shouldn't the return value be of reiser4_block_nr type?
46945 +   Josh's answer: who knows?  Is a "number of blocks" the same type as "block offset"? */
46946 +__u64 extent_unit_width(const coord_t * item)
46947 +{
46948 +       assert("vs-649", coord_is_existing_unit(item));
46949 +       return width_by_coord(item);
46950 +}
46951 +
46952 +/* Starting block number (extent_get_start) of the extent unit @item is set to */
46953 +static reiser4_block_nr extent_unit_start(const coord_t * item)
46954 +{
46955 +       return extent_get_start(extent_by_coord(item));
46956 +}
46957 +
46958 +/**
46959 + * split_allocated_extent - split an allocated extent unit in two
46960 + * @coord: coordinate of the allocated extent unit to split
46961 + * @pos_in_unit: number of blocks that stay in the first (overwritten) unit
46962 + *
46963 + * Replaces allocated extent with two allocated extents. Returns -ENOMEM or the result of reiser4_replace_extent().
46964 + */
46965 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
46966 +{
46967 +       int result;
46968 +       struct replace_handle *h;
46969 +       reiser4_extent *ext;
46970 +       reiser4_block_nr grabbed;
46971 +
46972 +       ext = extent_by_coord(coord);
46973 +       assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
46974 +       assert("vs-1411", extent_get_width(ext) > pos_in_unit);
46975 +
46976 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46977 +       if (h == NULL)
46978 +               return RETERR(-ENOMEM);
46979 +       h->coord = coord;
46980 +       h->lh = znode_lh(coord->node);
46981 +       h->pkey = &h->key;
46982 +       unit_key_by_coord(coord, h->pkey);
46983 +       set_key_offset(h->pkey,
46984 +                      (get_key_offset(h->pkey) +
46985 +                       pos_in_unit * current_blocksize));
46986 +       reiser4_set_extent(&h->overwrite, extent_get_start(ext),
46987 +                          pos_in_unit);
46988 +       reiser4_set_extent(&h->new_extents[0],
46989 +                          extent_get_start(ext) + pos_in_unit,
46990 +                          extent_get_width(ext) - pos_in_unit);
46991 +       h->nr_new_extents = 1;
46992 +       h->flags = COPI_DONT_SHIFT_LEFT;
46993 +       h->paste_key = h->key;
46994 +
46995 +       /* reserve space for extent unit paste; @grabbed remembers what was grabbed before */
46996 +       grabbed = reserve_replace();
46997 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46998 +                                               extent */);
46999 +       /* restore reserved */
47000 +       free_replace_reserved(grabbed);
47001 +       kfree(h);
47002 +       return result;
47003 +}
47004 +
47005 +/* replace extent @ext by extent @replace. Try to merge @replace with previous extent of the item (if there is
47006 +   one). Return 1 if it succeeded (@coord is then moved one unit back), 0 - otherwise (nothing is modified) */
47007 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
47008 +                      reiser4_extent *replace)
47009 +{
47010 +       assert("vs-1415", extent_by_coord(coord) == ext);
47011 +
47012 +       if (coord->unit_pos == 0
47013 +           || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
47014 +               /* left neighbor of @ext either does not exist or is not an allocated extent */
47015 +               return 0;
47016 +       if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
47017 +           extent_get_start(replace))
47018 +               return 0;
47019 +
47020 +       /* we can glue, widen previous unit */
47021 +       extent_set_width(ext - 1,
47022 +                        extent_get_width(ext - 1) + extent_get_width(replace));
47023 +
47024 +       if (extent_get_width(ext) != extent_get_width(replace)) {
47025 +               /* make current extent narrower */
47026 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
47027 +                       extent_set_start(ext,
47028 +                                        extent_get_start(ext) +
47029 +                                        extent_get_width(replace));
47030 +               extent_set_width(ext,
47031 +                                extent_get_width(ext) -
47032 +                                extent_get_width(replace));
47033 +       } else {
47034 +               /* current extent completely glued with its left neighbor, remove it */
47035 +               coord_t from, to;
47036 +
47037 +               coord_dup(&from, coord);
47038 +               from.unit_pos = nr_units_extent(coord) - 1;
47039 +               coord_dup(&to, &from);
47040 +
47041 +               /* currently cut from extent can cut either from the beginning or from the end. Move place which got
47042 +                  freed after unit removal to end of item */
47043 +               memmove(ext, ext + 1,
47044 +                       (from.unit_pos -
47045 +                        coord->unit_pos) * sizeof(reiser4_extent));
47046 +               /* wipe part of item which is going to be cut, so that node_check will not be confused */
47047 +               cut_node_content(&from, &to, NULL, NULL, NULL);
47048 +       }
47049 +       znode_make_dirty(coord->node);
47050 +       /* move coord back */
47051 +       coord->unit_pos--;
47052 +       return 1;
47053 +}
47054 +
47055 +/**
47056 + * conv_extent - replace extent with 2 ones
47057 + * @coord: coordinate of extent to be replaced
47058 + * @replace: extent to overwrite the one @coord is set to
47059 + *
47060 + * Overwrites extent @coord is set to (merging into the left neighbor when possible) and pastes one extent unit after the
47061 + * overwritten one if @replace is shorter than initial extent. Returns 0, -ENOMEM or the reiser4_replace_extent() result.
47062 + */
47063 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
47064 +{
47065 +       int result;
47066 +       struct replace_handle *h;
47067 +       reiser4_extent *ext;
47068 +       reiser4_block_nr start, width, new_width;
47069 +       reiser4_block_nr grabbed;
47070 +       extent_state state;
47071 +
47072 +       ext = extent_by_coord(coord);
47073 +       state = state_of_extent(ext);
47074 +       start = extent_get_start(ext);
47075 +       width = extent_get_width(ext);
47076 +       new_width = extent_get_width(replace);
47077 +
47078 +       assert("vs-1458", (state == UNALLOCATED_EXTENT ||
47079 +                          state == ALLOCATED_EXTENT));
47080 +       assert("vs-1459", width >= new_width);
47081 +
47082 +       if (try_to_merge_with_left(coord, ext, replace)) {
47083 +               /* merged @replace with left neighbor. Current unit is either
47084 +                  removed or narrowed */
47085 +               return 0;
47086 +       }
47087 +
47088 +       if (width == new_width) {
47089 +               /* replace current extent with @replace */
47090 +               *ext = *replace;
47091 +               znode_make_dirty(coord->node);
47092 +               return 0;
47093 +       }
47094 +
47095 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
47096 +       if (h == NULL)
47097 +               return RETERR(-ENOMEM);
47098 +       h->coord = coord;
47099 +       h->lh = znode_lh(coord->node);
47100 +       h->pkey = &h->key;
47101 +       unit_key_by_coord(coord, h->pkey);
47102 +       set_key_offset(h->pkey,
47103 +                      (get_key_offset(h->pkey) + new_width * current_blocksize));
47104 +       h->overwrite = *replace;
47105 +
47106 +       /* replace @ext with @replace and padding extent */
47107 +       reiser4_set_extent(&h->new_extents[0],
47108 +                          (state == ALLOCATED_EXTENT) ?
47109 +                          (start + new_width) :
47110 +                          UNALLOCATED_EXTENT_START,
47111 +                          width - new_width);
47112 +       h->nr_new_extents = 1;
47113 +       h->flags = COPI_DONT_SHIFT_LEFT;
47114 +       h->paste_key = h->key;
47115 +
47116 +       /* reserve space for extent unit paste; @grabbed remembers what was grabbed before */
47117 +       grabbed = reserve_replace();
47118 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
47119 +                                               extent */);
47120 +
47121 +       /* restore reserved */
47122 +       free_replace_reserved(grabbed);
47123 +       kfree(h);
47124 +       return result;
47125 +}
47126 +
47127 +/**
47128 + * assign_real_blocknrs
47129 + * @flush_pos: flush position; its flush queue (->fq) is used to find the atom
47130 + * @oid: objectid of the file the jnodes belong to
47131 + * @index: index of the first jnode in the range
47132 + * @count: number of jnodes to assign block numbers to
47133 + * @first: start of allocated block range
47134 + *
47135 + * Assigns consecutive block numbers, starting at @first, to each of @count
47136 + * jnodes. Index of first jnode is @index. Jnodes are looked up with jlookup.
47137 + */
47138 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
47139 +                                unsigned long index, reiser4_block_nr count,
47140 +                                reiser4_block_nr first)
47141 +{
47142 +       unsigned long i;
47143 +       reiser4_tree *tree;
47144 +       txn_atom *atom;
47145 +       int nr;
47146 +
47147 +       atom = atom_locked_by_fq(flush_pos->fq);
47148 +       assert("vs-1468", atom);
47149 +       BUG_ON(atom == NULL);
47150 +
47151 +       nr = 0;
47152 +       tree = current_tree;
47153 +       for (i = 0; i < count; ++i, ++index) {
47154 +               jnode *node;
47155 +
47156 +               node = jlookup(tree, oid, index);
47157 +               assert("", node != NULL);
47158 +               BUG_ON(node == NULL);
47159 +
47160 +               spin_lock_jnode(node);
47161 +               assert("", !jnode_is_flushprepped(node));
47162 +               assert("vs-1475", node->atom == atom);
47163 +               assert("vs-1476", atomic_read(&node->x_count) > 0);
47164 +
47165 +               JF_CLR(node, JNODE_FLUSH_RESERVED);
47166 +               jnode_set_block(node, &first);
47167 +               unformatted_make_reloc(node, flush_pos->fq);
47168 +               ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
47169 +                                    FQ_LIST, 0));
47170 +               spin_unlock_jnode(node);
47171 +               first++;
47172 +
47173 +               atomic_dec(&node->x_count);
47174 +               nr ++; /* @nr is only counted, never read in this function */
47175 +       }
47176 +
47177 +       spin_unlock_atom(atom);
47178 +       return;
47179 +}
47180 +
47181 +/**
47182 + * make_node_ovrwr - assign node to overwrite set
47183 + * @jnodes: list head accumulating nodes destined for the overwrite set
47184 + * @node: jnode to belong to overwrite set
47185 + *
47186 + * Under the jnode spin lock, sets OVRWR jnode state bit and moves @node to the
47187 + * tail of list @jnodes, which is an accumulator for nodes before they get to
47188 + * the overwrite set list of the atom.
47189 + */
47190 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
47191 +{
47192 +       spin_lock_jnode(node);
47193 +
47194 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
47195 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
47196 +
47197 +       JF_SET(node, JNODE_OVRWR);
47198 +       list_move_tail(&node->capture_link, jnodes);
47199 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
47200 +
47201 +       spin_unlock_jnode(node);
47202 +}
47203 +
47204 +/**
47205 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
47206 + * @flush_pos: flush position
47207 + * @oid: objectid of file jnodes belong to
47208 + * @index: starting index
47209 + * @width: extent width
47210 + *
47211 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
47212 + * overwrite set. Starting from the one with index @index. If end of slum is
47213 + * detected (node is not found, is flushprepped, or belongs to another atom) -
47214 + * stop iterating and set flush position's state to POS_INVALID.
47215 + */
47216 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
47217 +                                 unsigned long index, reiser4_block_nr width)
47218 +{
47219 +       unsigned long i;
47220 +       reiser4_tree *tree;
47221 +       jnode *node;
47222 +       txn_atom *atom;
47223 +       LIST_HEAD(jnodes);
47224 +
47225 +       tree = current_tree;
47226 +
47227 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47228 +       assert("vs-1478", atom);
47229 +       /* @i counts positions within the unit, starting at pos_in_unit; @index advances in step with it */
47230 +       for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
47231 +               node = jlookup(tree, oid, index);
47232 +               if (!node) {
47233 +                       flush_pos->state = POS_INVALID;
47234 +                       break;
47235 +               }
47236 +               if (jnode_check_flushprepped(node)) {
47237 +                       flush_pos->state = POS_INVALID;
47238 +                       atomic_dec(&node->x_count);
47239 +                       break;
47240 +               }
47241 +               if (node->atom != atom) {
47242 +                       flush_pos->state = POS_INVALID;
47243 +                       atomic_dec(&node->x_count);
47244 +                       break;
47245 +               }
47246 +               make_node_ovrwr(&jnodes, node);
47247 +               atomic_dec(&node->x_count);
47248 +       }
47249 +       /* hand the accumulated nodes over to the atom's overwrite list, then drop the atom lock */
47250 +       list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
47251 +       spin_unlock_atom(atom);
47252 +}
47253 +
47254 +/**
47255 + * allocated_extent_slum_size - count leading not flushprepped jnodes of the atom
47256 + * @flush_pos: flush position; its flush queue is used to find the atom
47257 + * @oid: objectid of the file the jnodes belong to
47258 + * @index: index of the first jnode to look at
47259 + * @count: maximal number of jnodes to look at
47260 + *
47261 + * Returns number of consecutive jnodes from @index that jlookup finds, that are not flushprepped and are in the atom.
47262 + */
47263 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47264 +                                     unsigned long index, unsigned long count)
47265 +{
47266 +       unsigned long i;
47267 +       reiser4_tree *tree;
47268 +       txn_atom *atom;
47269 +       int nr;
47270 +
47271 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47272 +       assert("vs-1468", atom);
47273 +
47274 +       nr = 0;
47275 +       tree = current_tree;
47276 +       for (i = 0; i < count; ++i, ++index) {
47277 +               jnode *node;
47278 +
47279 +               node = jlookup(tree, oid, index);
47280 +               if (!node)
47281 +                       break;
47282 +
47283 +               if (jnode_check_flushprepped(node)) {
47284 +                       atomic_dec(&node->x_count);
47285 +                       break;
47286 +               }
47287 +
47288 +               if (node->atom != atom) {
47289 +                       /*
47290 +                        * this is possible on overwrite: extent_write may
47291 +                        * capture several unformatted nodes without capturing
47292 +                        * any formatted nodes.
47293 +                        */
47294 +                       atomic_dec(&node->x_count);
47295 +                       break;
47296 +               }
47297 +
47298 +               assert("vs-1476", atomic_read(&node->x_count) > 1);
47299 +               atomic_dec(&node->x_count);
47300 +               nr ++;
47301 +       }
47302 +
47303 +       spin_unlock_atom(atom);
47304 +       return nr;
47305 +}
47306 +
47307 +/**
47308 + * reiser4_alloc_extent - prepare the slum at an extent unit for flushing
47309 + * @flush_pos: flush position; ->coord must be set to an existing extent unit
47310 + *
47311 + * This is called by handle_pos_on_twig to process the extent unit
47312 + * flush_pos->coord is set to. It prepares a sequence of not flushprepped nodes
47313 + * (slum) for flushing, assuming the slum starts at flush_pos->pos_in_unit.
47314 + * Slum goes to the relocate set if flush_pos->leaf_relocate is set or the
47315 + * extent is unallocated, and to the overwrite set otherwise.
47316 + * Returns 0 or an error; note that -ENOMEM from conv_extent() is not returned.
47317 + */
47318 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
47319 +{
47320 +       coord_t *coord;
47321 +       reiser4_extent *ext;
47322 +       reiser4_extent replace_ext;
47323 +       oid_t oid;
47324 +       reiser4_block_nr protected;
47325 +       reiser4_block_nr start;
47326 +       __u64 index;
47327 +       __u64 width;
47328 +       extent_state state;
47329 +       int result;
47330 +       reiser4_block_nr first_allocated;
47331 +       __u64 allocated;
47332 +       reiser4_key key;
47333 +       block_stage_t block_stage;
47334 +
47335 +       assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47336 +       assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47337 +              && item_is_extent(&flush_pos->coord));
47338 +
47339 +       coord = &flush_pos->coord;
47340 +
47341 +       ext = extent_by_coord(coord);
47342 +       state = state_of_extent(ext);
47343 +       if (state == HOLE_EXTENT) {
47344 +               flush_pos->state = POS_INVALID;
47345 +               return 0;
47346 +       }
47347 +
47348 +       item_key_by_coord(coord, &key);
47349 +       oid = get_key_objectid(&key);
47350 +       index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47351 +       start = extent_get_start(ext);
47352 +       width = extent_get_width(ext);
47353 +
47354 +       assert("vs-1457", width > flush_pos->pos_in_unit);
47355 +
47356 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47357 +               /* relocate */
47358 +               if (flush_pos->pos_in_unit) {
47359 +                       /* split extent unit into two */
47360 +                       result =
47361 +                           split_allocated_extent(coord,
47362 +                                                  flush_pos->pos_in_unit);
47363 +                       flush_pos->pos_in_unit = 0;
47364 +                       return result;
47365 +               }
47366 +
47367 +               /* limit number of nodes to allocate */
47368 +               if (flush_pos->nr_to_write < width)
47369 +                       width = flush_pos->nr_to_write;
47370 +
47371 +               if (state == ALLOCATED_EXTENT) {
47372 +                       /*
47373 +                        * all protected nodes are not flushprepped, therefore
47374 +                        * they are counted as flush_reserved
47375 +                        */
47376 +                       block_stage = BLOCK_FLUSH_RESERVED;
47377 +                       protected = allocated_extent_slum_size(flush_pos, oid,
47378 +                                                              index, width);
47379 +                       if (protected == 0) {
47380 +                               flush_pos->state = POS_INVALID;
47381 +                               flush_pos->pos_in_unit = 0;
47382 +                               return 0;
47383 +                       }
47384 +               } else {
47385 +                       block_stage = BLOCK_UNALLOCATED;
47386 +                       protected = width;
47387 +               }
47388 +
47389 +               /*
47390 +                * look at previous unit if possible. If it is allocated, make
47391 +                * preceder more precise
47392 +                */
47393 +               if (coord->unit_pos &&
47394 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47395 +                       reiser4_pos_hint(flush_pos)->blk =
47396 +                               extent_get_start(ext - 1) +
47397 +                               extent_get_width(ext - 1);
47398 +
47399 +               /* allocate new block numbers for protected nodes */
47400 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47401 +                                      protected,
47402 +                                      &first_allocated, &allocated,
47403 +                                      block_stage);
47404 +
47405 +               if (state == ALLOCATED_EXTENT)
47406 +                       /*
47407 +                        * on relocating - free nodes which are going to be
47408 +                        * relocated
47409 +                        */
47410 +                       reiser4_dealloc_blocks(&start, &allocated,
47411 +                                              BLOCK_ALLOCATED, BA_DEFER);
47412 +
47413 +               /* assign new block numbers to protected nodes */
47414 +               assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47415 +
47416 +               /* prepare extent which will replace current one */
47417 +               reiser4_set_extent(&replace_ext, first_allocated, allocated);
47418 +
47419 +               /* adjust extent item; NOTE(review): -ENOMEM skips the return below and ends in "return 0" - confirm intended */
47420 +               result = conv_extent(coord, &replace_ext);
47421 +               if (result != 0 && result != -ENOMEM) {
47422 +                       warning("vs-1461",
47423 +                               "Failed to allocate extent. Should not happen\n");
47424 +                       return result;
47425 +               }
47426 +
47427 +               /*
47428 +                * break flush: we prepared for flushing as many blocks as we
47429 +                * were asked for
47430 +                */
47431 +               if (flush_pos->nr_to_write == allocated)
47432 +                       flush_pos->state = POS_INVALID;
47433 +       } else {
47434 +               /* overwrite */
47435 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
47436 +       }
47437 +       flush_pos->pos_in_unit = 0;
47438 +       return 0;
47439 +}
47440 +
47441 +/* returns 0 if @coord is set to an extent item whose append key equals @key (@key can be glued to it), 1 otherwise */
47442 +static int must_insert(const coord_t *coord, const reiser4_key *key)
47443 +{
47444 +       reiser4_key last;
47445 +
47446 +       if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47447 +           && keyeq(append_key_extent(coord, &last), key))
47448 +               return 0;
47449 +       return 1;
47450 +}
47451 +
47452 +/* copy extent @copy_ext to the end of @node. It may have to either insert new item after the last one, or append last item,
47453 +   or modify last unit of last item to have greater width. Returns 0 or -E_NODE_FULL (asserted below) */
47454 +static int put_unit_to_end(znode *node, const reiser4_key *key,
47455 +                          reiser4_extent *copy_ext)
47456 +{
47457 +       int result;
47458 +       coord_t coord;
47459 +       cop_insert_flag flags;
47460 +       reiser4_extent *last_ext;
47461 +       reiser4_item_data data;
47462 +
47463 +       /* set coord after last unit in an item */
47464 +       coord_init_last_unit(&coord, node);
47465 +       coord.between = AFTER_UNIT;
47466 +
47467 +       flags =
47468 +           COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47469 +       if (must_insert(&coord, key)) {
47470 +               result =
47471 +                   insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47472 +                                   key, NULL /*lh */ , flags);
47473 +
47474 +       } else {
47475 +               /* try to glue with last unit */
47476 +               last_ext = extent_by_coord(&coord);
47477 +               if (state_of_extent(last_ext) && /* NOTE(review): tests the state for non-zero; presumably excludes holes - confirm */
47478 +                   extent_get_start(last_ext) + extent_get_width(last_ext) ==
47479 +                   extent_get_start(copy_ext)) {
47480 +                       /* widen last unit of node */
47481 +                       extent_set_width(last_ext,
47482 +                                        extent_get_width(last_ext) +
47483 +                                        extent_get_width(copy_ext));
47484 +                       znode_make_dirty(node);
47485 +                       return 0;
47486 +               }
47487 +
47488 +               /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
47489 +               result =
47490 +                   insert_into_item(&coord, NULL /*lh */ , key,
47491 +                                    init_new_extent(&data, copy_ext, 1),
47492 +                                    flags);
47493 +       }
47494 +
47495 +       assert("vs-438", result == 0 || result == -E_NODE_FULL);
47496 +       return result;
47497 +}
47498 +
47499 +/* @coord is set to extent unit (asserted to be the leftmost unit of an extent item). The unit is copied to the end of @left, either with newly allocated block numbers (relocate) or as is (overwrite). Returns SQUEEZE_TARGET_FULL when put_unit_to_end() reports -E_NODE_FULL; otherwise stores in @stop_key the key following the processed blocks and returns SQUEEZE_CONTINUE */
47500 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47501 +                              flush_pos_t *flush_pos,
47502 +                              reiser4_key *stop_key)
47503 +{
47504 +       reiser4_extent *ext;
47505 +       __u64 index;
47506 +       __u64 width;
47507 +       reiser4_block_nr start;
47508 +       extent_state state;
47509 +       oid_t oid;
47510 +       reiser4_block_nr first_allocated;
47511 +       __u64 allocated;
47512 +       __u64 protected;
47513 +       reiser4_extent copy_extent;
47514 +       reiser4_key key;
47515 +       int result;
47516 +       block_stage_t block_stage;
47517 +
47518 +       assert("vs-1457", flush_pos->pos_in_unit == 0);
47519 +       assert("vs-1467", coord_is_leftmost_unit(coord));
47520 +       assert("vs-1467", item_is_extent(coord));       /* NOTE(review): label vs-1467 duplicates the assertion above */
47521 +
47522 +       ext = extent_by_coord(coord);
47523 +       index = extent_unit_index(coord);
47524 +       start = extent_get_start(ext);
47525 +       width = extent_get_width(ext);
47526 +       state = state_of_extent(ext);
47527 +       unit_key_by_coord(coord, &key);
47528 +       oid = get_key_objectid(&key);
47529 +
47530 +       if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47531 +           (state == UNALLOCATED_EXTENT)) {
47532 +               /* relocate */
47533 +               if (state == ALLOCATED_EXTENT) {
47534 +                       /* all protected nodes are not flushprepped, therefore
47535 +                        * they are counted as flush_reserved */
47536 +                       block_stage = BLOCK_FLUSH_RESERVED;
47537 +                       protected = allocated_extent_slum_size(flush_pos, oid,
47538 +                                                              index, width);
47539 +                       if (protected == 0) {
47540 +                               flush_pos->state = POS_INVALID;
47541 +                               flush_pos->pos_in_unit = 0;
47542 +                               return 0;       /* NOTE(review): literal 0 in a function returning squeeze_result - confirm the intended enumerator */
47543 +                       }
47544 +               } else {
47545 +                       block_stage = BLOCK_UNALLOCATED;
47546 +                       protected = width;
47547 +               }
47548 +
47549 +               /*
47550 +                * look at previous unit if possible. If it is allocated, make
47551 +                * preceder more precise. NOTE(review): the assertion above requires @coord to be the leftmost unit; if that implies unit_pos == 0 this branch is never taken - confirm
47552 +                */
47553 +               if (coord->unit_pos &&
47554 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47555 +                       reiser4_pos_hint(flush_pos)->blk =
47556 +                               extent_get_start(ext - 1) +
47557 +                               extent_get_width(ext - 1);
47558 +
47559 +               /* allocate new block numbers for protected nodes; the number actually obtained is returned in @allocated and is what gets copied below */
47560 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47561 +                                      protected,
47562 +                                      &first_allocated, &allocated,
47563 +                                      block_stage);
47564 +
47565 +               /* prepare extent which will be copied to left */
47566 +               reiser4_set_extent(&copy_extent, first_allocated, allocated);
47567 +
47568 +               result = put_unit_to_end(left, &key, &copy_extent);
47569 +               if (result == -E_NODE_FULL) {
47570 +                       int target_block_stage;
47571 +
47572 +                       /* free blocks which were just allocated */
47573 +                       target_block_stage =
47574 +                           (state ==
47575 +                            ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47576 +                           BLOCK_UNALLOCATED;
47577 +                       reiser4_dealloc_blocks(&first_allocated, &allocated,
47578 +                                              target_block_stage,
47579 +                                              BA_PERMANENT);
47580 +
47581 +                       /* rewind the preceder. */
47582 +                       flush_pos->preceder.blk = first_allocated;
47583 +                       check_preceder(flush_pos->preceder.blk);
47584 +
47585 +                       return SQUEEZE_TARGET_FULL;
47586 +               }
47587 +
47588 +               if (state == ALLOCATED_EXTENT) {
47589 +                       /* free nodes which were relocated */
47590 +                       reiser4_dealloc_blocks(&start, &allocated,
47591 +                                              BLOCK_ALLOCATED, BA_DEFER);
47592 +               }
47593 +
47594 +               /* assign new block numbers to protected nodes */
47595 +               assign_real_blocknrs(flush_pos, oid, index, allocated,
47596 +                                    first_allocated);
47597 +
47598 +               set_key_offset(&key,
47599 +                              get_key_offset(&key) +
47600 +                              (allocated << current_blocksize_bits));
47601 +       } else {
47602 +               /*
47603 +                * overwrite: try to copy unit as it is to left neighbor and
47604 +                * make all first not flushprepped nodes overwrite nodes
47605 +                */
47606 +               reiser4_set_extent(&copy_extent, start, width);
47607 +               result = put_unit_to_end(left, &key, &copy_extent);
47608 +               if (result == -E_NODE_FULL)
47609 +                       return SQUEEZE_TARGET_FULL;
47610 +
47611 +               if (state != HOLE_EXTENT)
47612 +                       mark_jnodes_overwrite(flush_pos, oid, index, width);
47613 +               set_key_offset(&key,
47614 +                              get_key_offset(&key) +
47615 +                              (width << current_blocksize_bits));
47616 +       }
47617 +       *stop_key = key;
47618 +       return SQUEEZE_CONTINUE;
47619 +}
47620 +
47621 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47622 +{      /* passes all arguments unchanged to key_by_inode_and_offset_common() and returns its result */
47623 +       return key_by_inode_and_offset_common(inode, off, key);
47624 +}
47625 +
47626 +/*
47627 + * Local variables:
47628 + * c-indentation-style: "K&R"
47629 + * mode-name: "LC"
47630 + * c-basic-offset: 8
47631 + * tab-width: 8
47632 + * fill-column: 79
47633 + * scroll-step: 1
47634 + * End:
47635 + */
47636 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/extent.h linux-2.6.35/fs/reiser4/plugin/item/extent.h
47637 --- linux-2.6.35.orig/fs/reiser4/plugin/item/extent.h   1970-01-01 01:00:00.000000000 +0100
47638 +++ linux-2.6.35/fs/reiser4/plugin/item/extent.h        2010-08-04 15:44:57.000000000 +0200
47639 @@ -0,0 +1,231 @@
47640 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47641 +
47642 +#ifndef __REISER4_EXTENT_H__
47643 +#define __REISER4_EXTENT_H__
47644 +
47645 +/* on disk extent: one unit of an extent item. Both fields are 8 bytes wide and kept little-endian; use extent_get_*() and extent_set_*() below to access them */
47646 +typedef struct {
47647 +       reiser4_dblock_nr start;        /* start block number; presumably 0 and 1 are the HOLE_EXTENT_START / UNALLOCATED_EXTENT_START markers - confirm */
47648 +       reiser4_dblock_nr width;        /* width of the unit (extent_unit_width() below describes it as the number of blocks in the unit) */
47649 +} reiser4_extent;
47650 +
47651 +struct extent_stat {
47652 +       int unallocated_units;
47653 +       int unallocated_blocks;
47654 +       int allocated_units;
47655 +       int allocated_blocks;
47656 +       int hole_units;
47657 +       int hole_blocks;
47658 +};
47659 +
47660 +/* extents in an extent item can be either holes, or unallocated or allocated
47661 +   extents */
47662 +typedef enum {
47663 +       HOLE_EXTENT,
47664 +       UNALLOCATED_EXTENT,
47665 +       ALLOCATED_EXTENT
47666 +} extent_state;
47667 +
47668 +#define HOLE_EXTENT_START 0
47669 +#define UNALLOCATED_EXTENT_START 1
47670 +#define UNALLOCATED_EXTENT_START2 2
47671 +
47672 +struct extent_coord_extension {
47673 +       reiser4_block_nr pos_in_unit;
47674 +       reiser4_block_nr width; /* width of current unit */
47675 +       pos_in_node_t nr_units; /* number of units */
47676 +       int ext_offset;         /* offset from the beginning of zdata() */
47677 +       unsigned long expected_page;
47678 +#if REISER4_DEBUG
47679 +       reiser4_extent extent;
47680 +#endif
47681 +};
47682 +
47683 +/* inline helpers to get/set fields of on-disk extent (the fields are stored little-endian) */
47684 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47685 +{
47686 +       return le64_to_cpu(ext->start);
47687 +}
47688 +
47689 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47690 +{
47691 +       return le64_to_cpu(ext->width);
47692 +}
47693 +
47694 +extern __u64 reiser4_current_block_count(void);
47695 +
47696 +static inline void
47697 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47698 +{
47699 +       cassert(sizeof(ext->start) == 8);
47700 +       assert("nikita-2510",
47701 +              ergo(start > 1, start < reiser4_current_block_count()));
47702 +       put_unaligned(cpu_to_le64(start), &ext->start);
47703 +}
47704 +
47705 +static inline void
47706 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47707 +{      /* store @width, which must be greater than 0, into the on-disk extent @ext */
47708 +       cassert(sizeof(ext->width) == 8);
47709 +       assert("", width > 0);  /* NOTE(review): empty assertion label, unlike the labelled assertions around it */
47710 +       put_unaligned(cpu_to_le64(width), &ext->width); /* converted to little-endian before being stored */
47711 +       assert("nikita-2511",
47712 +              ergo(extent_get_start(ext) > 1,
47713 +                   extent_get_start(ext) + width <=
47714 +                   reiser4_current_block_count()));     /* presumably: an extent with a real start block (> 1) must end within the current block count - confirm ergo() semantics */
47715 +}
47716 +
47717 +#define extent_item(coord)                                     \
47718 +({                                                             \
47719 +       assert("nikita-3143", item_is_extent(coord));           \
47720 +       ((reiser4_extent *)item_body_by_coord (coord));         \
47721 +})
47722 +
47723 +#define extent_by_coord(coord)                                 \
47724 +({                                                             \
47725 +       assert("nikita-3144", item_is_extent(coord));           \
47726 +       (extent_item (coord) + (coord)->unit_pos);              \
47727 +})
47728 +
47729 +#define width_by_coord(coord)                                  \
47730 +({                                                             \
47731 +       assert("nikita-3145", item_is_extent(coord));           \
47732 +       extent_get_width (extent_by_coord(coord));              \
47733 +})
47734 +
47735 +struct carry_cut_data;
47736 +struct carry_kill_data;
47737 +
47738 +/* plugin->u.item.b.* */
47739 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47740 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47741 +                          const reiser4_item_data *);
47742 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47743 +pos_in_node_t nr_units_extent(const coord_t *);
47744 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47745 +void init_coord_extent(coord_t *);
47746 +int init_extent(coord_t *, reiser4_item_data *);
47747 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47748 +int can_shift_extent(unsigned free_space,
47749 +                    coord_t * source, znode * target, shift_direction,
47750 +                    unsigned *size, unsigned want);
47751 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47752 +                      unsigned count, shift_direction where_is_free_space,
47753 +                      unsigned free_space);
47754 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47755 +                    struct carry_kill_data *);
47756 +int create_hook_extent(const coord_t * coord, void *arg);
47757 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47758 +                    struct carry_cut_data *, reiser4_key * smallest_removed,
47759 +                    reiser4_key * new_first);
47760 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47761 +                     struct carry_kill_data *, reiser4_key * smallest_removed,
47762 +                     reiser4_key * new_first);
47763 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47764 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47765 +void print_extent(const char *, coord_t *);
47766 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47767 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47768 +                                  reiser4_block_nr * block);
47769 +void item_stat_extent(const coord_t * coord, void *vp);
47770 +int reiser4_check_extent(const coord_t * coord, const char **error);
47771 +
47772 +/* plugin->u.item.s.file.* */
47773 +ssize_t reiser4_write_extent(struct file *, struct inode * inode,
47774 +                            const char __user *, size_t, loff_t *);
47775 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47776 +int reiser4_readpage_extent(void *, struct page *);
47777 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47778 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47779 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47780 +int get_block_address_extent(const coord_t *, sector_t block,
47781 +                            sector_t * result);
47782 +
47783 +/* these are used in flush.c
47784 +   FIXME-VS: should they be somewhere in item_plugin? */
47785 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47786 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47787 +                            reiser4_key * stop_key);
47788 +
47789 +int extent_is_unallocated(const coord_t * item);       /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47790 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47791 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47792 +
47793 +/* plugin->u.item.f. */
47794 +int reiser4_scan_extent(flush_scan * scan);
47795 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47796 +
47797 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47798 +                                  int nr_extents);
47799 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47800 +extent_state state_of_extent(reiser4_extent * ext);
47801 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47802 +                       reiser4_block_nr width);
47803 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47804 +                         int *plugged_hole);
47805 +
47806 +#include "../../coord.h"
47807 +#include "../../lock.h"
47808 +#include "../../tap.h"
47809 +
47810 +struct replace_handle {
47811 +       /* these are to be set before calling reiser4_replace_extent */
47812 +       coord_t *coord;
47813 +       lock_handle *lh;
47814 +       reiser4_key key;
47815 +       reiser4_key *pkey;
47816 +       reiser4_extent overwrite;
47817 +       reiser4_extent new_extents[2];
47818 +       int nr_new_extents;
47819 +       unsigned flags;
47820 +
47821 +       /* these are used by reiser4_replace_extent */
47822 +       reiser4_item_data item;
47823 +       coord_t coord_after;
47824 +       lock_handle lh_after;
47825 +       tap_t watch;
47826 +       reiser4_key paste_key;
47827 +#if REISER4_DEBUG
47828 +       reiser4_extent orig_ext;
47829 +       reiser4_key tmp;
47830 +#endif
47831 +};
47832 +
47833 +/* this structure is kmalloced before calling make_extent to avoid excessive
47834 +   stack consumption on plug_hole->reiser4_replace_extent */
47835 +struct make_extent_handle {
47836 +       uf_coord_t *uf_coord;
47837 +       reiser4_block_nr blocknr;
47838 +       int created;
47839 +       struct inode *inode;
47840 +       union {
47841 +               struct {
47842 +               } append;
47843 +               struct replace_handle replace;
47844 +       } u;
47845 +};
47846 +
47847 +int reiser4_replace_extent(struct replace_handle *,
47848 +                          int return_inserted_position);
47849 +lock_handle *znode_lh(znode *);
47850 +
47851 +/* the reiser4 repacker support */
47852 +struct repacker_cursor;
47853 +extern int process_extent_backward_for_repacking(tap_t *,
47854 +                                                struct repacker_cursor *);
47855 +extern int mark_extent_for_repacking(tap_t *, int);
47856 +
47857 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47858 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47859 +
47860 +/* __REISER4_EXTENT_H__ */
47861 +#endif
47862 +/*
47863 +   Local variables:
47864 +   c-indentation-style: "K&R"
47865 +   mode-name: "LC"
47866 +   c-basic-offset: 8
47867 +   tab-width: 8
47868 +   fill-column: 120
47869 +   End:
47870 +*/
47871 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.35/fs/reiser4/plugin/item/extent_item_ops.c
47872 --- linux-2.6.35.orig/fs/reiser4/plugin/item/extent_item_ops.c  1970-01-01 01:00:00.000000000 +0100
47873 +++ linux-2.6.35/fs/reiser4/plugin/item/extent_item_ops.c       2010-08-04 15:44:57.000000000 +0200
47874 @@ -0,0 +1,889 @@
47875 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47876 +
47877 +#include "item.h"
47878 +#include "../../inode.h"
47879 +#include "../../tree_walk.h"   /* check_sibling_list() */
47880 +#include "../../page_cache.h"
47881 +#include "../../carry.h"
47882 +
47883 +#include <linux/quotaops.h>
47884 +
47885 +/* item_plugin->b.max_key_inside: stores in @key the key of the item @coord is set to, with the offset replaced by the offset of reiser4_max_key(), and returns @key */
47886 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
47887 +{
47888 +       item_key_by_coord(coord, key);
47889 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
47890 +       return key;
47891 +}
47892 +
47893 +/* item_plugin->b.can_contain_key
47894 +   this checks whether @key of @data matches the item @coord is set to: returns 1 if the item uses plugin @data->iplug and its key has the same locality, objectid and ordering as @key, 0 otherwise */
47895 +int
47896 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47897 +                      const reiser4_item_data * data)
47898 +{
47899 +       reiser4_key item_key;
47900 +
47901 +       if (item_plugin_by_coord(coord) != data->iplug)
47902 +               return 0;
47903 +
47904 +       item_key_by_coord(coord, &item_key);
47905 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
47906 +           get_key_objectid(key) != get_key_objectid(&item_key) ||
47907 +           get_key_ordering(key) != get_key_ordering(&item_key))
47908 +               return 0;
47909 +
47910 +       return 1;
47911 +}
47912 +
47913 +/* item_plugin->b.mergeable
47914 +   first item is of extent type (asserted). Returns 1 if @p2 is an extent item with the same locality, objectid, ordering and key type whose key offset equals the key offset of @p1 plus reiser4_extent_size() over all units of @p1; 0 otherwise */
47915 +/* Audited by: green(2002.06.13) */
47916 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
47917 +{
47918 +       reiser4_key key1, key2;
47919 +
47920 +       assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
47921 +       /* FIXME-VS: Which is it? Assert or return 0 */
47922 +       if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
47923 +               return 0;
47924 +       }
47925 +
47926 +       item_key_by_coord(p1, &key1);
47927 +       item_key_by_coord(p2, &key2);
47928 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
47929 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
47930 +           get_key_ordering(&key1) != get_key_ordering(&key2) ||
47931 +           get_key_type(&key1) != get_key_type(&key2))
47932 +               return 0;
47933 +       if (get_key_offset(&key1) +
47934 +           reiser4_extent_size(p1, nr_units_extent(p1)) !=
47935 +           get_key_offset(&key2))
47936 +               return 0;
47937 +       return 1;
47938 +}
47939 +
47940 +/* item_plugin->b.nr_units: number of units in the extent item @coord is set to, i.e. the item length divided by sizeof(reiser4_extent) */
47941 +pos_in_node_t nr_units_extent(const coord_t * coord)
47942 +{
47943 +       /* length of extent item has to be multiple of extent size */
47944 +       assert("vs-1424",
47945 +              (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
47946 +       return item_length_by_coord(coord) / sizeof(reiser4_extent);
47947 +}
47948 +
47949 +/* item_plugin->b.lookup: walks the units of the extent item and sets @coord to the unit whose byte range contains the offset of @key (AT_UNIT), or after the last unit (AFTER_UNIT) when the offset lies beyond all units; always returns CBK_COORD_FOUND */
47950 +lookup_result
47951 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
47952 +             coord_t * coord)
47953 +{                              /* znode and item_pos are
47954 +                                  set to an extent item to
47955 +                                  look through */
47956 +       reiser4_key item_key;
47957 +       reiser4_block_nr lookuped, offset;
47958 +       unsigned i, nr_units;
47959 +       reiser4_extent *ext;
47960 +       unsigned blocksize;
47961 +       unsigned char blocksize_bits;
47962 +
47963 +       item_key_by_coord(coord, &item_key);
47964 +       offset = get_key_offset(&item_key);
47965 +
47966 +       /* key we are looking for must be greater than key of item @coord */
47967 +       assert("vs-414", keygt(key, &item_key));
47968 +
47969 +       assert("umka-99945",
47970 +              !keygt(key, max_key_inside_extent(coord, &item_key)));
47971 +
47972 +       ext = extent_item(coord);
47973 +       assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
47974 +
47975 +       blocksize = current_blocksize;  /* NOTE(review): blocksize is assigned here but never read in this function */
47976 +       blocksize_bits = current_blocksize_bits;
47977 +
47978 +       /* offset we are looking for */
47979 +       lookuped = get_key_offset(key);
47980 +
47981 +       nr_units = nr_units_extent(coord);
47982 +       /* go through all extents until the one which addresses the given offset; @offset accumulates the end offset of each unit */
47983 +       for (i = 0; i < nr_units; i++, ext++) {
47984 +               offset += (extent_get_width(ext) << blocksize_bits);
47985 +               if (offset > lookuped) {
47986 +                       /* desired byte is somewhere in this extent */
47987 +                       coord->unit_pos = i;
47988 +                       coord->between = AT_UNIT;
47989 +                       return CBK_COORD_FOUND;
47990 +               }
47991 +       }
47992 +
47993 +       /* set coord after last unit */
47994 +       coord->unit_pos = nr_units - 1;
47995 +       coord->between = AFTER_UNIT;
47996 +       return CBK_COORD_FOUND;
47997 +}
47998 +
47999 +/* item_plugin->b.paste
48000 +   item @coord is set to has been appended with @data->length of free
48001 +   space. data->data contains data to be pasted into the item in position
48002 +   @coord->unit_pos. It must fit into that free space.
48003 +   @coord must be set between units. On return @coord is set to the first of the pasted units; always returns 0.
48004 +*/
48005 +int
48006 +paste_extent(coord_t * coord, reiser4_item_data * data,
48007 +            carry_plugin_info * info UNUSED_ARG)
48008 +{
48009 +       unsigned old_nr_units;
48010 +       reiser4_extent *ext;
48011 +       int item_length;
48012 +
48013 +       ext = extent_item(coord);
48014 +       item_length = item_length_by_coord(coord);
48015 +       old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
48016 +
48017 +       /* this is also used to copy extent into newly created item, so
48018 +          old_nr_units could be 0 */
48019 +       assert("vs-260", item_length >= data->length);
48020 +
48021 +       /* make sure that coord is set properly */
48022 +       assert("vs-35",
48023 +              ((!coord_is_existing_unit(coord))
48024 +               || (!old_nr_units && !coord->unit_pos)));
48025 +
48026 +       /* first unit to be moved */
48027 +       switch (coord->between) {
48028 +       case AFTER_UNIT:
48029 +               coord->unit_pos++;      /* fall through */
48030 +       case BEFORE_UNIT:
48031 +               coord->between = AT_UNIT;
48032 +               break;
48033 +       case AT_UNIT:
48034 +               assert("vs-331", !old_nr_units && !coord->unit_pos);
48035 +               break;
48036 +       default:
48037 +               impossible("vs-330", "coord is set improperly");
48038 +       }
48039 +
48040 +       /* prepare space for new units: move the old units at and after the paste position up by the number of pasted units */
48041 +       memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
48042 +               ext + coord->unit_pos,
48043 +               (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
48044 +
48045 +       /* copy new data from kernel space */
48046 +       assert("vs-556", data->user == 0);
48047 +       memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
48048 +
48049 +       /* after paste @coord is set to first of pasted units */
48050 +       assert("vs-332", coord_is_existing_unit(coord));
48051 +       assert("vs-333",
48052 +              !memcmp(data->data, extent_by_coord(coord),
48053 +                      (unsigned)data->length));
48054 +       return 0;
48055 +}
48056 +
48057 +/* item_plugin->b.can_shift: stores in *@size the number of bytes of extent item @source that may be shifted, limited by @free_space and by @want units, and returns that amount expressed in units. A unit is never split */
48058 +int
48059 +can_shift_extent(unsigned free_space, coord_t * source,
48060 +                znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
48061 +                unsigned *size, unsigned want)
48062 +{
48063 +       *size = item_length_by_coord(source);
48064 +       if (*size > free_space)
48065 +               /* never split a unit of extent item */
48066 +               *size = free_space - free_space % sizeof(reiser4_extent);
48067 +
48068 +       /* we can shift *size bytes, calculate how many do we want to shift */
48069 +       if (*size > want * sizeof(reiser4_extent))
48070 +               *size = want * sizeof(reiser4_extent);
48071 +
48072 +       if (*size % sizeof(reiser4_extent) != 0)
48073 +               impossible("vs-119", "Wrong extent size: %u %zu", *size,
48074 +                          sizeof(reiser4_extent));     /* both arguments are unsigned: *size is unsigned, sizeof yields size_t */
48075 +       return *size / sizeof(reiser4_extent);
48076 +
48077 +}
48078 +
48079 +/* item_plugin->b.copy_units: copies @free_space bytes (asserted to equal @count units) from item @source to item @target. For SHIFT_LEFT the units starting at @from (asserted 0) go to the tail of @target; otherwise the last units of @source go to the head of @target, whose item key is updated to the key of unit @from */
48080 +void
48081 +copy_units_extent(coord_t * target, coord_t * source,
48082 +                 unsigned from, unsigned count,
48083 +                 shift_direction where_is_free_space, unsigned free_space)
48084 +{
48085 +       char *from_ext, *to_ext;
48086 +
48087 +       assert("vs-217", free_space == count * sizeof(reiser4_extent));
48088 +
48089 +       from_ext = item_body_by_coord(source);
48090 +       to_ext = item_body_by_coord(target);
48091 +
48092 +       if (where_is_free_space == SHIFT_LEFT) {
48093 +               assert("vs-215", from == 0);
48094 +
48095 +               /* At this moment, item length was already updated in the item
48096 +                  header by shifting code, hence nr_units_extent() will
48097 +                  return "new" number of units---one we obtain after copying
48098 +                  units.
48099 +                */
48100 +               to_ext +=
48101 +                   (nr_units_extent(target) - count) * sizeof(reiser4_extent);
48102 +       } else {
48103 +               reiser4_key key;
48104 +               coord_t coord;
48105 +
48106 +               assert("vs-216",
48107 +                      from + count == coord_last_unit_pos(source) + 1);
48108 +
48109 +               from_ext += item_length_by_coord(source) - free_space;  /* start of the last @free_space bytes of @source */
48110 +
48111 +               /* new units are inserted before first unit in an item,
48112 +                  therefore, we have to update item key */
48113 +               coord = *source;
48114 +               coord.unit_pos = from;
48115 +               unit_key_extent(&coord, &key);
48116 +
48117 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
48118 +                                                                  NULL /*info */);
48119 +       }
48120 +
48121 +       memcpy(to_ext, from_ext, free_space);
48122 +}
48123 +
48124 +/* item_plugin->b.create_hook
48125 +   @arg, when not NULL, is used as a coord_t * whose node is asserted to be on the leaf level. The right delimiting key of that node, or of its left neighbor (chosen by coord_wrt()), is set to the key of the item @coord is set to, and the sibling link to the node on its right is broken. Always returns 0 */
48126 +int create_hook_extent(const coord_t * coord, void *arg)
48127 +{
48128 +       coord_t *child_coord;
48129 +       znode *node;
48130 +       reiser4_key key;
48131 +       reiser4_tree *tree;
48132 +
48133 +       if (!arg)
48134 +               return 0;
48135 +
48136 +       child_coord = arg;
48137 +       tree = znode_get_tree(coord->node);
48138 +
48139 +       assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
48140 +
48141 +       write_lock_tree(tree);
48142 +       write_lock_dk(tree);
48143 +       /* find a node on the leaf level for which right delimiting key has to
48144 +          be updated */
48145 +       if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
48146 +               assert("vs-411", znode_is_left_connected(child_coord->node));
48147 +               node = child_coord->node->left;
48148 +       } else {
48149 +               assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
48150 +               node = child_coord->node;
48151 +               assert("nikita-3314", node != NULL);
48152 +       }
48153 +
48154 +       if (node != NULL) {
48155 +               znode_set_rd_key(node, item_key_by_coord(coord, &key));
48156 +
48157 +               assert("nikita-3282", check_sibling_list(node));
48158 +               /* break sibling links */
48159 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
48160 +                       ON_DEBUG(node->right->left_version =
48161 +                                atomic_inc_return(&delim_key_version);
48162 +                                node->right_version =
48163 +                                atomic_inc_return(&delim_key_version););
48164 +
48165 +                       node->right->left = NULL;
48166 +                       node->right = NULL;
48167 +               }
48168 +       }
48169 +       write_unlock_dk(tree);
48170 +       write_unlock_tree(tree);
48171 +       return 0;
48172 +}
48173 +
48174 +#define ITEM_TAIL_KILLED 0
48175 +#define ITEM_HEAD_KILLED 1
48176 +#define ITEM_KILLED 2
48177 +
48178 +/* item_plugin->b.kill_hook: called when @count units starting from the @from-th one are going to be removed.
48179 +   Invalidates the pages of the killed range and releases the blocks of every non-hole extent in it.
48180 +   Returns ITEM_KILLED, ITEM_TAIL_KILLED or ITEM_HEAD_KILLED depending on which part of the item goes away. */
48181 +int
48182 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
48183 +                struct carry_kill_data *kdata)
48184 +{
48185 +       reiser4_extent *ext;
48186 +       reiser4_block_nr start, length;
48187 +       const reiser4_key *pfrom_key, *pto_key;
48188 +       struct inode *inode;
48189 +       reiser4_tree *tree;
48190 +       pgoff_t from_off, to_off, offset, skip;
48191 +       int retval;
48192 +
48193 +       /* these are located in memory kmalloc-ed by kill_node_content */
48194 +       reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
48195 +       coord_t *dup, *next;
48196 +
48197 +       assert("zam-811", znode_is_write_locked(coord->node));
48198 +       assert("nikita-3315", kdata != NULL);
48199 +       assert("vs-34", kdata->buf != NULL);
48200 +
48201 +       /* carve five keys followed by two coords out of kdata->buf */
48202 +       min_item_key = (reiser4_key *) (kdata->buf);
48203 +       max_item_key = min_item_key + 1;
48204 +       from_key = max_item_key + 1;
48205 +       to_key = from_key + 1;
48206 +       key = to_key + 1;
48207 +       dup = (coord_t *) (key + 1);
48208 +       next = dup + 1;
48209 +
48210 +       item_key_by_coord(coord, min_item_key);
48211 +       max_item_key_by_coord(coord, max_item_key);
48212 +
48213 +       if (kdata->params.from_key) {
48214 +               pfrom_key = kdata->params.from_key;
48215 +               pto_key = kdata->params.to_key;
48216 +       } else {
48217 +               assert("vs-1549", from == coord->unit_pos);
48218 +               unit_key_by_coord(coord, from_key);
48219 +               pfrom_key = from_key;
48220 +
48221 +               coord_dup(dup, coord);
48222 +               dup->unit_pos = from + count - 1;
48223 +               max_unit_key_by_coord(dup, to_key);
48224 +               pto_key = to_key;
48225 +       }
48226 +
48227 +       if (!keylt(pto_key, max_item_key)) {
48228 +               if (!keygt(pfrom_key, min_item_key)) {
48229 +                       znode *left, *right;
48230 +
48231 +                       /* item is to be removed completely */
48232 +                       assert("nikita-3316", kdata->left != NULL
48233 +                              && kdata->right != NULL);
48234 +
48235 +                       left = kdata->left->node;
48236 +                       right = kdata->right->node;
48237 +
48238 +                       tree = current_tree;
48239 +                       /* we have to do two things:
48240 +                        *
48241 +                        *     1. link left and right formatted neighbors of
48242 +                        *        extent being removed, and
48243 +                        *
48244 +                        *     2. update their delimiting keys.
48245 +                        *
48246 +                        * atomicity of these operations is protected by
48247 +                        * taking dk-lock and tree-lock.
48248 +                        */
48249 +                       /* if neighbors of item being removed are znodes -
48250 +                        * link them */
48251 +                       write_lock_tree(tree);
48252 +                       write_lock_dk(tree);
48253 +                       link_left_and_right(left, right);
48254 +                       if (left) {
48255 +                               /* update right delimiting key of left
48256 +                                * neighbor of extent item */
48257 +                               /* @next and @key are not locals: they
48258 +                                * live in kdata->buf (mapped above) */
48259 +
48260 +                               coord_dup(next, coord);
48261 +
48262 +                               if (coord_next_item(next))
48263 +                                       *key = *znode_get_rd_key(coord->node);
48264 +                               else
48265 +                                       item_key_by_coord(next, key);
48266 +                               znode_set_rd_key(left, key);
48267 +                       }
48268 +                       write_unlock_dk(tree);
48269 +                       write_unlock_tree(tree);
48270 +
48271 +                       from_off =
48272 +                           get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48273 +                       to_off =
48274 +                           (get_key_offset(max_item_key) +
48275 +                            1) >> PAGE_CACHE_SHIFT;
48276 +                       retval = ITEM_KILLED;
48277 +               } else {
48278 +                       /* tail of item is to be removed */
48279 +                       from_off =
48280 +                           (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48281 +                            1) >> PAGE_CACHE_SHIFT;
48282 +                       to_off =
48283 +                           (get_key_offset(max_item_key) +
48284 +                            1) >> PAGE_CACHE_SHIFT;
48285 +                       retval = ITEM_TAIL_KILLED;
48286 +               }
48287 +       } else {
48288 +               /* head of item is to be removed */
48289 +               assert("vs-1571", keyeq(pfrom_key, min_item_key));
48290 +               assert("vs-1572",
48291 +                      (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48292 +                      0);
48293 +               assert("vs-1573",
48294 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48295 +                                                        1)) == 0);
48296 +
48297 +               if (kdata->left->node) {
48298 +                       /* update right delimiting key of left neighbor of extent item */
48299 +                       /* @key lives in kdata->buf (mapped above) */
48300 +
48301 +                       *key = *pto_key;
48302 +                       set_key_offset(key, get_key_offset(pto_key) + 1);
48303 +
48304 +                       write_lock_dk(current_tree);
48305 +                       znode_set_rd_key(kdata->left->node, key);
48306 +                       write_unlock_dk(current_tree);
48307 +               }
48308 +
48309 +               from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48310 +               to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48311 +               retval = ITEM_HEAD_KILLED;
48312 +       }
48313 +
48314 +       inode = kdata->inode;
48315 +       assert("vs-1545", inode != NULL);
48316 +       if (inode != NULL)
48317 +               /* take care of pages and jnodes corresponding to part of item being killed */
48318 +               reiser4_invalidate_pages(inode->i_mapping, from_off,
48319 +                                        to_off - from_off,
48320 +                                        kdata->params.truncate);
48321 +
48322 +       ext = extent_item(coord) + from;
48323 +       offset =
48324 +           (get_key_offset(min_item_key) +
48325 +            reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48326 +
48327 +       assert("vs-1551", from_off >= offset);
48328 +       assert("vs-1552", from_off - offset <= extent_get_width(ext));
48329 +       skip = from_off - offset; /* pages at the start of unit @from that stay */
48330 +       offset = from_off;
48331 +
48332 +       while (offset < to_off) {
48333 +               length = extent_get_width(ext) - skip;
48334 +               if (state_of_extent(ext) == HOLE_EXTENT) {
48335 +                       skip = 0;
48336 +                       offset += length;
48337 +                       ext++;
48338 +                       continue;
48339 +               }
48340 +
48341 +               if (offset + length > to_off) {
48342 +                       length = to_off - offset;
48343 +               }
48344 +               /* NOTE(review): @inode is NULL-checked before reiser4_invalidate_pages() but not here - confirm */
48345 +               dquot_free_block_nodirty(inode, length);
48346 +
48347 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48348 +                       /* unallocated extent: pass @length to fake_allocated2free(); reiser4_dealloc_blocks() is skipped */
48349 +                       fake_allocated2free(length, 0 /* unformatted */ );
48350 +
48351 +                       skip = 0;
48352 +                       offset += length;
48353 +                       ext++;
48354 +                       continue;
48355 +               }
48356 +
48357 +               assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48358 +
48359 +               if (length != 0) {
48360 +                       start = extent_get_start(ext) + skip;
48361 +
48362 +                       /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
48363 +                          immediately */
48364 +                       reiser4_dealloc_blocks(&start, &length,
48365 +                                              0 /* not used */ ,
48366 +                                              BA_DEFER
48367 +                                              /* unformatted with defer */ );
48368 +               }
48369 +               skip = 0;
48370 +               offset += length;
48371 +               ext++;
48372 +       }
48373 +       return retval;
48374 +}
48375 +
48376 +/* item_plugin->b.kill_units: runs kill_hook_extent() on units @from..@to; returns size of wholly removed units */
48377 +int
48378 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48379 +                 struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48380 +                 reiser4_key * new_first)
48381 +{
48382 +       reiser4_extent *ext;
48383 +       reiser4_key item_key;
48384 +       pos_in_node_t count;
48385 +       reiser4_key from_key, to_key;
48386 +       const reiser4_key *pfrom_key, *pto_key;
48387 +       loff_t off;
48388 +       int result;
48389 +
48390 +       assert("vs-1541",
48391 +              ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48392 +               || (kdata->params.from_key != NULL
48393 +                   && kdata->params.to_key != NULL)));
48394 +
48395 +       if (kdata->params.from_key) {
48396 +               pfrom_key = kdata->params.from_key;
48397 +               pto_key = kdata->params.to_key;
48398 +       } else {
48399 +               coord_t dup;
48400 +
48401 +               /* calculate key range of kill */
48402 +               assert("vs-1549", from == coord->unit_pos);
48403 +               unit_key_by_coord(coord, &from_key);
48404 +               pfrom_key = &from_key;
48405 +
48406 +               coord_dup(&dup, coord);
48407 +               dup.unit_pos = to;
48408 +               max_unit_key_by_coord(&dup, &to_key);
48409 +               pto_key = &to_key;
48410 +       }
48411 +
48412 +       item_key_by_coord(coord, &item_key);
48413 +
48414 +#if REISER4_DEBUG
48415 +       {
48416 +               reiser4_key max_item_key;
48417 +
48418 +               max_item_key_by_coord(coord, &max_item_key);
48419 +
48420 +               if (new_first) {
48421 +                       /* head of item is to be cut */
48422 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
48423 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
48424 +               } else {
48425 +                       /* tail of item is to be cut */
48426 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
48427 +                       assert("vs-1543", !keylt(pto_key, &max_item_key));
48428 +               }
48429 +       }
48430 +#endif
48431 +
48432 +       if (smallest_removed)
48433 +               *smallest_removed = *pfrom_key;
48434 +
48435 +       if (new_first) {
48436 +               /* item head is cut. Item key will change. This new key is calculated here */
48437 +               assert("vs-1556",
48438 +                      (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48439 +                      (PAGE_CACHE_SIZE - 1));
48440 +               *new_first = *pto_key;
48441 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
48442 +       }
48443 +
48444 +       count = to - from + 1;
48445 +       result = kill_hook_extent(coord, from, count, kdata);
48446 +       if (result == ITEM_TAIL_KILLED) {
48447 +               assert("vs-1553",
48448 +                      get_key_offset(pfrom_key) >=
48449 +                      get_key_offset(&item_key) +
48450 +                      reiser4_extent_size(coord, from));
48451 +               off =
48452 +                   get_key_offset(pfrom_key) -
48453 +                       (get_key_offset(&item_key) +
48454 +                        reiser4_extent_size(coord, from));
48455 +               if (off) {
48456 +                       /* unit @from is to be cut partially. Its width decreases */
48457 +                       ext = extent_item(coord) + from;
48458 +                       extent_set_width(ext,
48459 +                                        (off + PAGE_CACHE_SIZE -
48460 +                                         1) >> PAGE_CACHE_SHIFT);
48461 +                       count--;
48462 +               }
48463 +       } else {
48464 +               __u64 max_to_offset;
48465 +               __u64 rest;
48466 +               /* anything but a tail kill must be a head kill: ITEM_KILLED is not expected (vs-1575) */
48467 +               assert("vs-1575", result == ITEM_HEAD_KILLED);
48468 +               assert("", from == 0);
48469 +               assert("",
48470 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48471 +                                                        1)) == 0);
48472 +               assert("",
48473 +                      get_key_offset(pto_key) + 1 >
48474 +                      get_key_offset(&item_key) +
48475 +                      reiser4_extent_size(coord, to));
48476 +               max_to_offset =
48477 +                   get_key_offset(&item_key) +
48478 +                       reiser4_extent_size(coord, to + 1) - 1;
48479 +               assert("", get_key_offset(pto_key) <= max_to_offset);
48480 +
48481 +               rest =
48482 +                   (max_to_offset -
48483 +                    get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48484 +               if (rest) {
48485 +                       /* unit @to is to be cut partially */
48486 +                       ext = extent_item(coord) + to;
48487 +
48488 +                       assert("", extent_get_width(ext) > rest);
48489 +
48490 +                       if (state_of_extent(ext) == ALLOCATED_EXTENT)
48491 +                               extent_set_start(ext,
48492 +                                                extent_get_start(ext) +
48493 +                                                (extent_get_width(ext) -
48494 +                                                 rest));
48495 +
48496 +                       extent_set_width(ext, rest);
48497 +                       count--;
48498 +               }
48499 +       }
48500 +       return count * sizeof(reiser4_extent);
48501 +}
48502 +
48503 +/* item_plugin->b.cut_units: counterpart of kill_units_extent that does not call kill_hook_extent();
48504 +   the two are very similar. Returns the size of the units that are removed completely. */
48505 +int
48506 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48507 +                struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48508 +                reiser4_key * new_first)
48509 +{
48510 +       reiser4_extent *ext;
48511 +       reiser4_key item_key;
48512 +       pos_in_node_t count;
48513 +       reiser4_key from_key, to_key;
48514 +       const reiser4_key *pfrom_key, *pto_key;
48515 +       loff_t off;
48516 +
48517 +       assert("vs-1541",
48518 +              ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48519 +               || (cdata->params.from_key != NULL
48520 +                   && cdata->params.to_key != NULL)));
48521 +
48522 +       if (cdata->params.from_key) {
48523 +               pfrom_key = cdata->params.from_key;
48524 +               pto_key = cdata->params.to_key;
48525 +       } else {
48526 +               coord_t dup;
48527 +
48528 +               /* calculate key range of cut */
48529 +               coord_dup(&dup, coord);
48530 +               dup.unit_pos = from;
48531 +               unit_key_by_coord(&dup, &from_key);
48532 +
48533 +               dup.unit_pos = to;
48534 +               max_unit_key_by_coord(&dup, &to_key);
48535 +
48536 +               pfrom_key = &from_key;
48537 +               pto_key = &to_key;
48538 +       }
48539 +
48540 +       assert("vs-1555",
48541 +              (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48542 +       assert("vs-1556",
48543 +              (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48544 +              (PAGE_CACHE_SIZE - 1));
48545 +
48546 +       item_key_by_coord(coord, &item_key);
48547 +
48548 +#if REISER4_DEBUG
48549 +       {
48550 +               reiser4_key max_item_key;
48551 +
48552 +               assert("vs-1584",
48553 +                      get_key_locality(pfrom_key) ==
48554 +                      get_key_locality(&item_key));
48555 +               assert("vs-1585",
48556 +                      get_key_type(pfrom_key) == get_key_type(&item_key));
48557 +               assert("vs-1586",
48558 +                      get_key_objectid(pfrom_key) ==
48559 +                      get_key_objectid(&item_key));
48560 +               assert("vs-1587",
48561 +                      get_key_ordering(pfrom_key) ==
48562 +                      get_key_ordering(&item_key));
48563 +
48564 +               max_item_key_by_coord(coord, &max_item_key);
48565 +
48566 +               if (new_first != NULL) {
48567 +                       /* head of item is to be cut */
48568 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
48569 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
48570 +               } else {
48571 +                       /* tail of item is to be cut */
48572 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
48573 +                       assert("vs-1543", keyeq(pto_key, &max_item_key));
48574 +               }
48575 +       }
48576 +#endif
48577 +
48578 +       if (smallest_removed)
48579 +               *smallest_removed = *pfrom_key;
48580 +
48581 +       if (new_first) {
48582 +               /* item head is cut. Item key will change. This new key is calculated here */
48583 +               *new_first = *pto_key;
48584 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
48585 +       }
48586 +
48587 +       count = to - from + 1;
48588 +
48589 +       assert("vs-1553",
48590 +              get_key_offset(pfrom_key) >=
48591 +              get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48592 +       off =
48593 +           get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48594 +                                        reiser4_extent_size(coord, from));
48595 +       if (off) {
48596 +               /* tail of unit @from is to be cut partially. Its width decreases */
48597 +               assert("vs-1582", new_first == NULL);
48598 +               ext = extent_item(coord) + from;
48599 +               extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48600 +               count--;
48601 +       }
48602 +
48603 +       assert("vs-1554",
48604 +              get_key_offset(pto_key) <=
48605 +              get_key_offset(&item_key) +
48606 +              reiser4_extent_size(coord, to + 1) - 1);
48607 +       off =
48608 +               (get_key_offset(&item_key) +
48609 +                reiser4_extent_size(coord, to + 1) - 1) -
48610 +               get_key_offset(pto_key);
48611 +       if (off) {
48612 +               /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
48613 +                  and width decreased. */
48614 +               assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48615 +               ext = extent_item(coord) + to;
48616 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
48617 +                       extent_set_start(ext,
48618 +                                        extent_get_start(ext) +
48619 +                                        (extent_get_width(ext) -
48620 +                                         (off >> PAGE_CACHE_SHIFT)));
48621 +
48622 +               extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48623 +               count--;
48624 +       }
48625 +       return count * sizeof(reiser4_extent);
48626 +}
48627 +
48628 +/* item_plugin->b.unit_key: item key with offset advanced by reiser4_extent_size(coord, unit_pos); returns @key */
48629 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48630 +{
48631 +       assert("vs-300", coord_is_existing_unit(coord));
48632 +
48633 +       item_key_by_coord(coord, key);
48634 +       set_key_offset(key,
48635 +                      (get_key_offset(key) +
48636 +                       reiser4_extent_size(coord, coord->unit_pos)));
48637 +
48638 +       return key;
48639 +}
48640 +
48641 +/* item_plugin->b.max_unit_key: item key with offset set one below the start of unit unit_pos + 1; returns @key */
48642 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48643 +{
48644 +       assert("vs-300", coord_is_existing_unit(coord));
48645 +
48646 +       item_key_by_coord(coord, key);
48647 +       set_key_offset(key,
48648 +                      (get_key_offset(key) +
48649 +                       reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
48650 +       return key;
48651 +}
48652 +
48653 +/* item_plugin->b.estimate
48654 +   item_plugin->b.item_data_by_flow */
48655 +
48656 +#if REISER4_DEBUG
48657 +
48658 +/* item_plugin->b.check
48659 +   used for debugging: every item should have here the most complete
48660 +   possible consistency check that its inventor can construct.
48661 +   Returns 0 if the item looks sane, -1 with *@error set otherwise.
48662 +*/
48663 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48664 +                        const char **error /* where to store error message */)
48665 +{
48666 +       reiser4_extent *ext, *first;
48667 +       unsigned i, j;
48668 +       reiser4_block_nr start, width, blk_cnt;
48669 +       unsigned num_units;
48670 +       reiser4_tree *tree;
48671 +       oid_t oid;
48672 +       reiser4_key key;
48673 +       coord_t scan;
48674 +
48675 +       assert("vs-933", REISER4_DEBUG);
48676 +
48677 +       if (znode_get_level(coord->node) != TWIG_LEVEL) {
48678 +               *error = "Extent on the wrong level";
48679 +               return -1;
48680 +       }
48681 +       if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48682 +               *error = "Wrong item size";
48683 +               return -1;
48684 +       }
48685 +       ext = first = extent_item(coord);
48686 +       blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48687 +       num_units = coord_num_units(coord);
48688 +       tree = znode_get_tree(coord->node); /* only read by the #if 0 check below */
48689 +       item_key_by_coord(coord, &key);
48690 +       oid = get_key_objectid(&key); /* only read by the #if 0 check below */
48691 +       coord_dup(&scan, coord);
48692 +
48693 +       for (i = 0; i < num_units; ++i, ++ext) {
48694 +               __u64 index;
48695 +
48696 +               scan.unit_pos = i;
48697 +               index = extent_unit_index(&scan);
48698 +
48699 +#if 0
48700 +               /* check that all jnodes are present for the unallocated
48701 +                * extent */
48702 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48703 +                       for (j = 0; j < extent_get_width(ext); j++) {
48704 +                               jnode *node;
48705 +
48706 +                               node = jlookup(tree, oid, index + j);
48707 +                               if (node == NULL) {
48708 +                                       print_coord("scan", &scan, 0);
48709 +                                       *error = "Jnode missing";
48710 +                                       return -1;
48711 +                               }
48712 +                               jput(node);
48713 +                       }
48714 +               }
48715 +#endif
48716 +
48717 +               start = extent_get_start(ext);
48718 +               if (start < 2)
48719 +                       continue;
48720 +               /* start >= 2: extent is an allocated one */
48721 +               width = extent_get_width(ext);
48722 +               if (start >= blk_cnt) {
48723 +                       *error = "Start too large";
48724 +                       return -1;
48725 +               }
48726 +               if (start + width > blk_cnt) {
48727 +                       *error = "End too large";
48728 +                       return -1;
48729 +               }
48730 +               /* make sure that this extent does not overlap with the
48731 +                  allocated extents that precede it in the item */
48732 +               for (j = 0; j < i; j++) {
48733 +                       if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48734 +                               continue;
48735 +                       if (!
48736 +                           ((extent_get_start(ext) >=
48737 +                             extent_get_start(first + j) +
48738 +                             extent_get_width(first + j))
48739 +                            || (extent_get_start(ext) +
48740 +                                extent_get_width(ext) <=
48741 +                                extent_get_start(first + j)))) {
48742 +                               *error = "Extent overlaps with others";
48743 +                               return -1;
48744 +                       }
48745 +               }
48746 +
48747 +       }
48748 +
48749 +       return 0;
48750 +}
48751 +
48752 +#endif                         /* REISER4_DEBUG */
48753 +
48754 +/*
48755 +   Local variables:
48756 +   c-indentation-style: "K&R"
48757 +   mode-name: "LC"
48758 +   c-basic-offset: 8
48759 +   tab-width: 8
48760 +   fill-column: 120
48761 +   scroll-step: 1
48762 +   End:
48763 +*/
48764 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/internal.c linux-2.6.35/fs/reiser4/plugin/item/internal.c
48765 --- linux-2.6.35.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 01:00:00.000000000 +0100
48766 +++ linux-2.6.35/fs/reiser4/plugin/item/internal.c      2010-08-04 15:44:57.000000000 +0200
48767 @@ -0,0 +1,404 @@
48768 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48769 +
48770 +/* Implementation of internal-item plugin methods. */
48771 +
48772 +#include "../../forward.h"
48773 +#include "../../debug.h"
48774 +#include "../../dformat.h"
48775 +#include "../../key.h"
48776 +#include "../../coord.h"
48777 +#include "internal.h"
48778 +#include "item.h"
48779 +#include "../node/node.h"
48780 +#include "../plugin.h"
48781 +#include "../../jnode.h"
48782 +#include "../../znode.h"
48783 +#include "../../tree_walk.h"
48784 +#include "../../tree_mod.h"
48785 +#include "../../tree.h"
48786 +#include "../../super.h"
48787 +#include "../../block_alloc.h"
48788 +
48789 +/* see internal.h for explanation */
48790 +
48791 +/* plugin->u.item.b.mergeable: always returns 0, both arguments are ignored */
48792 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48793 +                      const coord_t * p2 UNUSED_ARG /* second item */ )
48794 +{
48795 +       /* internal items are not mergeable */
48796 +       return 0;
48797 +}
48798 +
48799 +/* ->lookup() method for internal items: compares the unit key at @coord with @key */
48800 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48801 +                             lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48802 +                             coord_t * coord /* coord of item */ )
48803 +{
48804 +       reiser4_key ukey;
48805 +
48806 +       switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48807 +       default:
48808 +               impossible("", "keycmp()?!"); /* NOTE(review): no break - falls into LESS_THAN if impossible() returns */
48809 +       case LESS_THAN:
48810 +               /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
48811 +                  item plugin can not be taken using coord set this way */
48812 +               assert("vs-681", coord->unit_pos == 0);
48813 +               coord->between = AFTER_UNIT; /* fall through */
48814 +       case EQUAL_TO:
48815 +               return CBK_COORD_FOUND;
48816 +       case GREATER_THAN:
48817 +               return CBK_COORD_NOTFOUND;
48818 +       }
48819 +}
48820 +
48821 +/* return body of internal item at @coord; asserts that the item uses the NODE_POINTER_ID plugin */
48822 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
48823 +                                                                * item */ )
48824 +{
48825 +       assert("nikita-607", coord != NULL);
48826 +       assert("nikita-1650",
48827 +              item_plugin_by_coord(coord) ==
48828 +              item_plugin_by_id(NODE_POINTER_ID));
48829 +       return (internal_item_layout *) item_body_by_coord(coord);
48830 +}
48831 +
48832 +void reiser4_update_internal(const coord_t * coord /* coord of internal item */ ,
48833 +                            const reiser4_block_nr * blocknr /* new child block number */ )
48834 +{
48835 +       internal_item_layout *item = internal_at(coord);
48836 +       assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48837 +       /* store as little-endian, without assuming alignment of the item body */
48838 +       put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
48839 +}
48840 +
48841 +/* return child block number stored in the internal item at @coord, converted from little-endian */
48842 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48843 +{
48844 +       assert("nikita-608", coord != NULL);
48845 +       return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48846 +}
48847 +
48848 +/* get znode pointed to by internal @item; callers in this file release the result with zput() */
48849 +static znode *znode_at(const coord_t * item /* coord of item */ ,
48850 +                      znode * parent /* parent node */ )
48851 +{
48852 +       return child_znode(item, parent, 1, 0); /* NOTE(review): meaning of (1, 0) is defined by child_znode() - confirm */
48853 +}
48854 +
48855 +/* store pointer from internal item into "block". Implementation of
48856 +    ->down_link() method. @key is only looked at by the nikita-612 assertion. */
48857 +void down_link_internal(const coord_t * coord /* coord of item */ ,
48858 +                       const reiser4_key * key UNUSED_ARG      /* key to get
48859 +                                                                * pointer for */ ,
48860 +                       reiser4_block_nr * block /* resulting block number */ )
48861 +{
48862 +       ON_DEBUG(reiser4_key item_key);
48863 +
48864 +       assert("nikita-609", coord != NULL);
48865 +       assert("nikita-611", block != NULL);
48866 +       assert("nikita-612", (key == NULL) ||
48867 +              /* twig horrors */
48868 +              (znode_get_level(coord->node) == TWIG_LEVEL)
48869 +              || keyle(item_key_by_coord(coord, &item_key), key));
48870 +
48871 +       *block = pointer_at(coord);
48872 +       assert("nikita-2960", reiser4_blocknr_is_sane(block));
48873 +}
48874 +
48875 +/* Get the child's block number into *@block, or 0 if it is fake (block is unallocated). Always returns 0. */
48876 +int
48877 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
48878 +                                reiser4_block_nr * block)
48879 +{
48880 +       assert("jmacd-2059", coord != NULL);
48881 +
48882 +       *block = pointer_at(coord);
48883 +       assert("nikita-2961", reiser4_blocknr_is_sane(block));
48884 +
48885 +       if (reiser4_blocknr_is_fake(block)) {
48886 +               *block = 0;
48887 +       }
48888 +
48889 +       return 0;
48890 +}
48891 +
48892 +/* Look up the child znode with zlook() and store its jnode in *@childp; returns 0, or PTR_ERR() of the lookup. */
48893 +int
48894 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
48895 +                     jnode ** childp)
48896 +{
48897 +       reiser4_block_nr block = pointer_at(coord);
48898 +       znode *child;
48899 +
48900 +       assert("jmacd-2059", childp != NULL);
48901 +       assert("nikita-2962", reiser4_blocknr_is_sane(&block));
48902 +
48903 +       child = zlook(znode_get_tree(coord->node), &block);
48904 +
48905 +       if (IS_ERR(child)) {
48906 +               return PTR_ERR(child);
48907 +       }
48908 +
48909 +       *childp = ZJNODE(child);
48910 +
48911 +       return 0;
48912 +}
48913 +
48914 +#if REISER4_DEBUG
48915 +
48916 +static void check_link(znode * left, znode * right)
48917 +{
48918 +       znode *scan;
48919 +       /* walk ->right from @left towards @right asserting sibling-link invariants; stop at RIP or unlinked nodes */
48920 +       for (scan = left; scan != right; scan = scan->right) {
48921 +               if (ZF_ISSET(scan, JNODE_RIP))
48922 +                       break;
48923 +               if (znode_is_right_connected(scan) && scan->right != NULL) {
48924 +                       if (ZF_ISSET(scan->right, JNODE_RIP))
48925 +                               break;
48926 +                       assert("nikita-3285",
48927 +                              znode_is_left_connected(scan->right));
48928 +                       assert("nikita-3265",
48929 +                              ergo(scan != left,
48930 +                                   ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
48931 +                       assert("nikita-3284", scan->right->left == scan);
48932 +               } else
48933 +                       break;
48934 +       }
48935 +}
48936 +
48937 +int check__internal(const coord_t * coord, const char **error)
48938 +{
48939 +       reiser4_block_nr blk;
48940 +       znode *child;
48941 +       coord_t cpy;
48942 +       /* returns 0, or -1 with *@error set when the stored block number is not sane */
48943 +       blk = pointer_at(coord);
48944 +       if (!reiser4_blocknr_is_sane(&blk)) {
48945 +               *error = "Invalid pointer";
48946 +               return -1;
48947 +       }
48948 +       coord_dup(&cpy, coord);
48949 +       child = znode_at(&cpy, cpy.node);
48950 +       if (child != NULL) {
48951 +               znode *left_child;
48952 +               znode *right_child;
48953 +
48954 +               left_child = right_child = NULL;
48955 +
48956 +               assert("nikita-3256", znode_invariant(child));
48957 +               if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
48958 +                       left_child = znode_at(&cpy, cpy.node);
48959 +                       if (left_child != NULL) {
48960 +                               read_lock_tree(znode_get_tree(child));
48961 +                               check_link(left_child, child);
48962 +                               read_unlock_tree(znode_get_tree(child));
48963 +                               zput(left_child);
48964 +                       }
48965 +               }
48966 +               coord_dup(&cpy, coord); /* rewind: coord_prev_item() above was applied to @cpy */
48967 +               if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
48968 +                       right_child = znode_at(&cpy, cpy.node);
48969 +                       if (right_child != NULL) {
48970 +                               read_lock_tree(znode_get_tree(child));
48971 +                               check_link(child, right_child);
48972 +                               read_unlock_tree(znode_get_tree(child));
48973 +                               zput(right_child);
48974 +                       }
48975 +               }
48976 +               zput(child);
48977 +       }
48978 +       return 0;
48979 +}
48980 +
48981 +#endif  /*  REISER4_DEBUG  */
48982 +
48983 +/* return true only if this item really points to "block" */
48984 +/* Audited by: green(2002.06.14) */
48985 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
48986 +                           const reiser4_block_nr * block /* expected block */ )
48987 +{
48988 +       assert("nikita-613", coord != NULL);
48989 +       assert("nikita-614", block != NULL);
48990 +
48991 +       /* compare the block number stored in the item with the one given */
48992 +       return *block == pointer_at(coord);
48993 +}
48994 +
48995 +/* hook called by ->create_item() method of node plugin after new internal
48996 +   item was just created.
48997 +
48998 +   This is point where pointer to new node is inserted into tree. Initialize
48999 +   parent pointer in child znode, insert child into sibling list and slum.
49000 +
49001 +*/
49002 +int create_hook_internal(const coord_t * item /* coord of item */ ,
49003 +                        void *arg /* child's left neighbor, if any */ )
49004 +{
49005 +       reiser4_tree *tree;
49006 +       znode *child;
49007 +       znode *left;
49008 +       __u64 child_ptr;
49009 +
49010 +       assert("nikita-1252", item != NULL);
49011 +       assert("nikita-1253", item->node != NULL);
49012 +       assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
49013 +       assert("nikita-1450", item->unit_pos == 0);
49014 +
49015 +       /*
49016 +        * build_child_ptr_data pointed the data to be inserted at the jnode's
49017 +        * blocknr, which is in cpu byte order, and node's create_item simply
49018 +        * copied it. So the item body holds the child pointer in cpu byte
49019 +        * order: read it back and convert the item to little endian.
49020 +        */
49021 +       child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
49022 +       reiser4_update_internal(item, &child_ptr);
49023 +
49024 +       child = znode_at(item, item->node);
49025 +       if (child == NULL)
49026 +               return -EIO;
49027 +       if (IS_ERR(child))
49028 +               return PTR_ERR(child);
49029 +
49030 +       /*
49031 +        * link the child in with both tree and dk locks taken for writing
49032 +        */
49033 +       left = arg;
49034 +       tree = znode_get_tree(item->node);
49035 +       write_lock_tree(tree);
49036 +       write_lock_dk(tree);
49037 +       assert("nikita-1400", child->in_parent.node == NULL ||
49038 +              znode_above_root(child->in_parent.node));
49039 +       ++item->node->c_count;
49040 +       coord_to_parent_coord(item, &child->in_parent);
49041 +       sibling_list_insert_nolock(child, left);
49042 +
49043 +       assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
49044 +       ZF_CLR(child, JNODE_ORPHAN);
49045 +
49046 +       /* new child takes over the right delimiting key of its left neighbor */
49047 +       if (left != NULL &&
49048 +           !keyeq(znode_get_rd_key(left), znode_get_rd_key(child)))
49049 +               znode_set_rd_key(child, znode_get_rd_key(left));
49050 +
49051 +       write_unlock_dk(tree);
49052 +       write_unlock_tree(tree);
49053 +       zput(child);
49054 +       return 0;
49055 +}
49056 +
49057 +/* hook called by ->cut_and_kill() method of node plugin just before internal
49058 +   item is removed.
49059 +
49060 +   This is point where empty node is removed from the tree. Clear parent
49061 +   pointer in child, and mark node for pending deletion.
49062 +
49063 +   Node will actually be deleted later, in several installments:
49064 +
49065 +    . when last lock on this node will be released, node will be removed from
49066 +    the sibling list and its lock will be invalidated
49067 +
49068 +    . when last reference to this node will be dropped, bitmap will be updated
49069 +    and node will be actually removed from the memory.
49070 +
49071 +*/
49072 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
49073 +                      pos_in_node_t from UNUSED_ARG /* first unit, always 0 */ ,
49074 +                      pos_in_node_t count UNUSED_ARG /* nr of units, always 1 */ ,
49075 +                      struct carry_kill_data *p UNUSED_ARG)
49076 +{
49077 +       reiser4_tree *tree;
49078 +       znode *child;
49079 +       int ret;
49080 +
49081 +       assert("nikita-1222", item != NULL);
49082 +       assert("nikita-1224", from == 0);
49083 +       assert("nikita-1225", count == 1);
49084 +
49085 +       child = znode_at(item, item->node);
49086 +       if (child == NULL)
49087 +               return 0;
49088 +       if (IS_ERR(child))
49089 +               return PTR_ERR(child);
49090 +       ret = zload(child);
49091 +       if (ret != 0)
49092 +               goto out_put;
49093 +       if (!node_is_empty(child)) {
49094 +               warning("nikita-1223",
49095 +                       "Cowardly refuse to remove link to non-empty node");
49096 +               ret = RETERR(-EIO);
49097 +               goto out_relse;
49098 +       }
49099 +
49100 +       assert("nikita-1397", znode_is_write_locked(child));
49101 +       assert("nikita-1398", child->c_count == 0);
49102 +       assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
49103 +       /* reset child's parent coord and drop parent's c_count under tree lock */
49104 +       tree = znode_get_tree(item->node);
49105 +       write_lock_tree(tree);
49106 +       init_parent_coord(&child->in_parent, NULL);
49107 +       --item->node->c_count;
49108 +       write_unlock_tree(tree);
49109 +out_relse:
49110 +       zrelse(child);
49111 +out_put:
49112 +       zput(child);
49113 +       return ret;
49114 +}
49115 +
49116 +/* hook called by ->shift() node plugin method when internal item was just
49117 +   moved from one node to another.
49118 +
49119 +   Update parent pointer in child and c_counts in old and new parent
49120 +
49121 +*/
49122 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
49123 +                       unsigned from UNUSED_ARG /* first unit, always 0 */ ,
49124 +                       unsigned count UNUSED_ARG /* nr of units, always 1 */ ,
49125 +                       znode * old_node /* old parent */ )
49126 +{
49127 +       reiser4_tree *tree;
49128 +       znode *new_node;
49129 +       znode *child;
49130 +
49131 +       assert("nikita-1276", item != NULL);
49132 +       assert("nikita-1277", from == 0);
49133 +       assert("nikita-1278", count == 1);
49134 +       assert("nikita-1451", item->unit_pos == 0);
49135 +
49136 +       new_node = item->node;
49137 +       assert("nikita-2132", new_node != old_node);
49138 +       tree = znode_get_tree(new_node);
49139 +       child = child_znode(item, old_node, 1, 0);
49140 +       if (child == NULL)
49141 +               return 0;
49142 +       if (IS_ERR(child))
49143 +               return PTR_ERR(child);
49144 +
49145 +       /* re-parent child: one c_count moves from @old_node to the new node */
49146 +       write_lock_tree(tree);
49147 +       ++new_node->c_count;
49148 +       assert("nikita-1395", znode_parent(child) == old_node);
49149 +       assert("nikita-1396", old_node->c_count > 0);
49150 +       coord_to_parent_coord(item, &child->in_parent);
49151 +       assert("nikita-1781", znode_parent(child) == new_node);
49152 +       assert("nikita-1782", check_tree_pointer(item, child) == NS_FOUND);
49153 +       --old_node->c_count;
49154 +       write_unlock_tree(tree);
49155 +       zput(child);
49156 +       return 0;
49157 +}
49158 +
49159 +/* plugin->u.item.b.max_key_inside - not defined */
49160 +
49161 +/* plugin->u.item.b.nr_units - item.c:single_unit */
49162 +
49163 +/* Make Linus happy.
49164 +   Local variables:
49165 +   c-indentation-style: "K&R"
49166 +   mode-name: "LC"
49167 +   c-basic-offset: 8
49168 +   tab-width: 8
49169 +   fill-column: 120
49170 +   End:
49171 +*/
49172 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/internal.h linux-2.6.35/fs/reiser4/plugin/item/internal.h
49173 --- linux-2.6.35.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 01:00:00.000000000 +0100
49174 +++ linux-2.6.35/fs/reiser4/plugin/item/internal.h      2010-08-04 15:44:57.000000000 +0200
49175 @@ -0,0 +1,57 @@
49176 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49177 +/* Internal item contains down-link to the child of the internal/twig
49178 +   node in a tree. It is internal items that are actually used during
49179 +   tree traversal. */
49180 +
49181 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
49182 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
49183 +
49184 +#include "../../forward.h"
49185 +#include "../../dformat.h"
49186 +
49187 +/* on-disk layout of internal item */
49188 +typedef struct internal_item_layout {
49189 +       /*  0 */ reiser4_dblock_nr pointer;
49190 +       /*  8 (NOTE(review): original said 4; 8 assumes a 64-bit reiser4_dblock_nr) */
49191 +} internal_item_layout;
49192 +
49193 +struct cut_list;
49194 +
49195 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
49196 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
49197 +                             coord_t * coord);
49198 +/* store pointer from internal item into "block". Implementation of
49199 +    ->down_link() method */
49200 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
49201 +                              reiser4_block_nr * block);
49202 +extern int has_pointer_to_internal(const coord_t * coord,
49203 +                                  const reiser4_block_nr * block);
49204 +extern int create_hook_internal(const coord_t * item, void *arg);
49205 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
49206 +                             pos_in_node_t count, struct carry_kill_data *);
49207 +extern int shift_hook_internal(const coord_t * item, unsigned from,
49208 +                              unsigned count, znode * old_node);
49209 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
49210 +
49211 +extern int utmost_child_internal(const coord_t * coord, sideof side,
49212 +                                jnode ** child);
49213 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
49214 +                                    reiser4_block_nr * block);
49215 +
49216 +extern void reiser4_update_internal(const coord_t * coord,
49217 +                                   const reiser4_block_nr * blocknr);
49218 +/* FIXME: reiserfs has check_internal */
49219 +extern int check__internal(const coord_t * coord, const char **error);
49220 +
49221 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
49222 +#endif
49223 +
49224 +/* Make Linus happy.
49225 +   Local variables:
49226 +   c-indentation-style: "K&R"
49227 +   mode-name: "LC"
49228 +   c-basic-offset: 8
49229 +   tab-width: 8
49230 +   fill-column: 120
49231 +   End:
49232 +*/
49233 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/item.c linux-2.6.35/fs/reiser4/plugin/item/item.c
49234 --- linux-2.6.35.orig/fs/reiser4/plugin/item/item.c     1970-01-01 01:00:00.000000000 +0100
49235 +++ linux-2.6.35/fs/reiser4/plugin/item/item.c  2010-08-04 15:44:57.000000000 +0200
49236 @@ -0,0 +1,719 @@
49237 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49238 +
49239 +/* definition of item plugins. */
49240 +
49241 +#include "../../forward.h"
49242 +#include "../../debug.h"
49243 +#include "../../key.h"
49244 +#include "../../coord.h"
49245 +#include "../plugin_header.h"
49246 +#include "sde.h"
49247 +#include "internal.h"
49248 +#include "item.h"
49249 +#include "static_stat.h"
49250 +#include "../plugin.h"
49251 +#include "../../znode.h"
49252 +#include "../../tree.h"
49253 +#include "../../context.h"
49254 +#include "ctail.h"
49255 +
49256 +/* store in coord->offset the offset of the item body relative to zdata(coord->node) */
49257 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
49258 +{
49259 +       assert("nikita-324", coord != NULL);
49260 +       assert("nikita-325", coord->node != NULL);
49261 +       assert("nikita-326", znode_is_loaded(coord->node));
49262 +       assert("nikita-3200", coord->offset == INVALID_OFFSET);
49263 +
49264 +       coord->offset =
49265 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
49266 +           zdata(coord->node);
49267 +       ON_DEBUG(coord->body_v = coord->node->times_locked);
49268 +}
49269 +
49270 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49271 +{
49272 +       return zdata(coord->node) + coord->offset;
49273 +}
49274 +
49275 +#if REISER4_DEBUG
49276 +
49277 +int item_body_is_valid(const coord_t * coord)
49278 +{
49279 +       return
49280 +           coord->offset ==
49281 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
49282 +           zdata(coord->node);
49283 +}
49284 +
49285 +#endif
49286 +
49287 +/* return length of item at @coord */
49288 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49289 +{
49290 +       assert("nikita-327", coord != NULL);
49291 +       assert("nikita-328", coord->node != NULL);
49292 +       assert("nikita-329", znode_is_loaded(coord->node));
49293 +
49294 +       /*
49295 +        * length comes from ->length_by_coord() method of the node plugin
49296 +        */
49297 +       return node_plugin_by_node(coord->node)->length_by_coord(coord);
49298 +}
49299 +
49300 +void obtain_item_plugin(const coord_t * coord)
49301 +{
49302 +       assert("nikita-330", coord != NULL);
49303 +       assert("nikita-331", coord->node != NULL);
49304 +       assert("nikita-332", znode_is_loaded(coord->node));
49305 +
49306 +       coord_set_iplug((coord_t *) coord,
49307 +                       node_plugin_by_node(coord->node)->
49308 +                       plugin_by_coord(coord));
49309 +       assert("nikita-2479",
49310 +              coord_iplug(coord) ==
49311 +              node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49312 +}
49313 +
49314 +/* return id of item */
49315 +/* Audited by: green(2002.06.15) */
49316 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49317 +{
49318 +       assert("vs-539", coord != NULL);
49319 +       assert("vs-538", coord->node != NULL);
49320 +       assert("vs-537", znode_is_loaded(coord->node));
49321 +       assert("vs-536", item_plugin_by_coord(coord) != NULL);
49322 +       assert("vs-540",
49323 +              item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49324 +
49325 +       return item_id_by_plugin(item_plugin_by_coord(coord));
49326 +}
49327 +
49328 +/* return key of item at @coord */
49329 +/* Audited by: green(2002.06.15) */
49330 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49331 +                              reiser4_key * key /* result */ )
49332 +{
49333 +       assert("nikita-338", coord != NULL);
49334 +       assert("nikita-339", coord->node != NULL);
49335 +       assert("nikita-340", znode_is_loaded(coord->node));
49336 +
49337 +       return node_plugin_by_node(coord->node)->key_at(coord, key);
49338 +}
49339 +
49340 +/* this returns max key in the item */
49341 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49342 +                                  reiser4_key * key /* result */ )
49343 +{
49344 +       coord_t last_unit;
49345 +
49346 +       assert("nikita-338", coord != NULL);
49347 +       assert("nikita-339", coord->node != NULL);
49348 +       assert("nikita-340", znode_is_loaded(coord->node));
49349 +
49350 +       /* aim a private copy of @coord at the last unit of the item ... */
49351 +       coord_dup(&last_unit, coord);
49352 +       last_unit.unit_pos = coord_num_units(&last_unit) - 1;
49353 +       assert("vs-1560", coord_is_existing_unit(&last_unit));
49354 +       /* ... and store the max key of that unit in @key */
49355 +       max_unit_key_by_coord(&last_unit, key);
49356 +       return key;
49357 +}
49358 +
49359 +/* return key of unit at @coord */
49360 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49361 +                              reiser4_key * key /* result */ )
49362 +{
49363 +       assert("nikita-772", coord != NULL);
49364 +       assert("nikita-774", coord->node != NULL);
49365 +       assert("nikita-775", znode_is_loaded(coord->node));
49366 +
49367 +       /* items whose plugin has no ->unit_key() use the key of the item */
49368 +       if (item_plugin_by_coord(coord)->b.unit_key == NULL)
49369 +               return item_key_by_coord(coord, key);
49370 +       return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49371 +}
49372 +
49373 +/* return the biggest key contained in the unit @coord */
49374 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49375 +                                  reiser4_key * key /* result */ )
49376 +{
49377 +       assert("nikita-772", coord != NULL);
49378 +       assert("nikita-774", coord->node != NULL);
49379 +       assert("nikita-775", znode_is_loaded(coord->node));
49380 +
49381 +       /* fall back to unit_key_by_coord() if plugin has no ->max_unit_key() */
49382 +       if (item_plugin_by_coord(coord)->b.max_unit_key == NULL)
49383 +               return unit_key_by_coord(coord, key);
49384 +       return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49385 +}
49386 +
49387 +/* ->max_key_inside() method for items consisting of exactly one key (like
49388 +    stat-data) */
49389 +static reiser4_key *max_key_inside_single_key(const coord_t *
49390 +                                             coord /* coord of item */ ,
49391 +                                             reiser4_key *
49392 +                                             result /* resulting key */ )
49393 +{
49394 +       assert("nikita-604", coord != NULL);
49395 +
49396 +       /* coord -> key is starting key of this item and it has to be already
49397 +          filled in */
49398 +       return unit_key_by_coord(coord, result);
49399 +}
49400 +
49401 +/* ->nr_units() method for items consisting of exactly one unit always */
49402 +pos_in_node_t
49403 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49404 +{
49405 +       return 1;
49406 +}
49407 +
49408 +static int
49409 +paste_no_paste(coord_t * coord UNUSED_ARG,
49410 +              reiser4_item_data * data UNUSED_ARG,
49411 +              carry_plugin_info * info UNUSED_ARG)
49412 +{
49413 +       return 0;
49414 +}
49415 +
49416 +/* default ->fast_paste() method */
49417 +static int
49418 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49419 +{
49420 +       return 1;
49421 +}
49422 +
49423 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
49424 +                        const reiser4_key * key /* key to check */ ,
49425 +                        const reiser4_item_data * data /* parameters of item
49426 +                                                        * being created */ )
49427 +{
49428 +       item_plugin *iplug;
49429 +       reiser4_key min_key;
49430 +       reiser4_key max_key;
49431 +
49432 +       assert("nikita-1658", item != NULL);
49433 +       assert("nikita-1659", key != NULL);
49434 +
49435 +       iplug = item_plugin_by_coord(item);
49436 +       /* plugin-specific ->can_contain_key(), when present, decides alone */
49437 +       if (iplug->b.can_contain_key != NULL)
49438 +               return iplug->b.can_contain_key(item, key, data);
49439 +
49440 +       /*
49441 +        * No plugin-specific method: fall back to the generic range test.
49442 +        * @key fits when
49443 +        *     key of item <= @key <= ->max_key_inside() of item
49444 +        */
49445 +       assert("nikita-1681", iplug->b.max_key_inside != NULL);
49446 +       item_key_by_coord(item, &min_key);
49447 +       iplug->b.max_key_inside(item, &max_key);
49448 +
49449 +       return keyle(&min_key, key) && keyle(key, &max_key);
49450 +}
49451 +
49452 +/* mergeable method for non mergeable items */
49453 +static int
49454 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49455 +{
49456 +       return 0;
49457 +}
49458 +
49459 +/* return 0 if items @i1 and @i2 are not mergeable, !0 - otherwise */
49460 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49461 +                       const coord_t * i2 /* coord of second item */ )
49462 +{
49463 +       item_plugin *iplug;
49464 +       reiser4_key k1;
49465 +       reiser4_key k2;
49466 +
49467 +       assert("nikita-1336", i1 != NULL);
49468 +       assert("nikita-1337", i2 != NULL);
49469 +
49470 +       iplug = item_plugin_by_coord(i1);
49471 +       assert("nikita-1338", iplug != NULL);
49472 +
49473 +       /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49474 +          shifting code when nodes are in "suspended" state. */
49475 +       assert("nikita-1663",
49476 +              keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49477 +
49478 +       /* plugin of the first item decides, if it has ->mergeable() */
49479 +       if (iplug->b.mergeable != NULL)
49480 +               return iplug->b.mergeable(i1, i2);
49481 +
49482 +       if (iplug->b.max_key_inside != NULL) {
49483 +               /* mergeable if ->max_key_inside() >= key of i2; each key is
49484 +                  computed once, directly as an argument of the comparison */
49485 +               return keyge(iplug->b.max_key_inside(i1, &k1),
49486 +                            item_key_by_coord(i2, &k2));
49487 +       }
49488 +
49489 +       /* no plugin method to ask: same locality, objectid and item plugin */
49490 +       item_key_by_coord(i1, &k1);
49491 +       item_key_by_coord(i2, &k2);
49492 +
49493 +       return (get_key_locality(&k1) == get_key_locality(&k2)) &&
49494 +           (get_key_objectid(&k1) == get_key_objectid(&k2)) &&
49495 +           (iplug == item_plugin_by_coord(i2));
49496 +}
49497 +
49498 +int item_is_extent(const coord_t * item)
49499 +{
49500 +       assert("vs-482", coord_is_existing_item(item));
49501 +       return item_id_by_coord(item) == EXTENT_POINTER_ID;
49502 +}
49503 +
49504 +int item_is_tail(const coord_t * item)
49505 +{
49506 +       assert("vs-482", coord_is_existing_item(item));
49507 +       return item_id_by_coord(item) == FORMATTING_ID;
49508 +}
49509 +
49510 +#if REISER4_DEBUG
49511 +
49512 +int item_is_statdata(const coord_t * item)
49513 +{
49514 +       assert("vs-516", coord_is_existing_item(item));
49515 +       return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49516 +}
49517 +
49518 +int item_is_ctail(const coord_t * item)
49519 +{
49520 +       assert("edward-xx", coord_is_existing_item(item));
49521 +       return item_id_by_coord(item) == CTAIL_ID;
49522 +}
49523 +
49524 +#endif  /*  REISER4_DEBUG  */
49525 +
49526 +static int change_item(struct inode *inode,
49527 +                      reiser4_plugin * plugin,
49528 +                      pset_member memb)
49529 +{
49530 +       /* cannot change constituent item (sd, or dir_item) */
49531 +       return RETERR(-EINVAL);
49532 +}
49533 +
49534 +static reiser4_plugin_ops item_plugin_ops = {
49535 +       .init = NULL,
49536 +       .load = NULL,
49537 +       .save_len = NULL,
49538 +       .save = NULL,
49539 +       .change = change_item
49540 +};
49541 +
49542 +item_plugin item_plugins[LAST_ITEM_ID] = {
49543 +       [STATIC_STAT_DATA_ID] = {
49544 +               .h = {
49545 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49546 +                       .id = STATIC_STAT_DATA_ID,
49547 +                       .groups = (1 << STAT_DATA_ITEM_TYPE),
49548 +                       .pops = &item_plugin_ops,
49549 +                       .label = "sd",
49550 +                       .desc = "stat-data",
49551 +                       .linkage = {NULL, NULL}
49552 +               },
49553 +               .b = {
49554 +                       .max_key_inside = max_key_inside_single_key,
49555 +                       .can_contain_key = NULL,
49556 +                       .mergeable = not_mergeable,
49557 +                       .nr_units = nr_units_single_unit,
49558 +                       .lookup = NULL,
49559 +                       .init = NULL,
49560 +                       .paste = paste_no_paste,
49561 +                       .fast_paste = NULL,
49562 +                       .can_shift = NULL,
49563 +                       .copy_units = NULL,
49564 +                       .create_hook = NULL,
49565 +                       .kill_hook = NULL,
49566 +                       .shift_hook = NULL,
49567 +                       .cut_units = NULL,
49568 +                       .kill_units = NULL,
49569 +                       .unit_key = NULL,
49570 +                       .max_unit_key = NULL,
49571 +                       .estimate = NULL,
49572 +                       .item_data_by_flow = NULL,
49573 +#if REISER4_DEBUG
49574 +                       .check = NULL
49575 +#endif
49576 +               },
49577 +               .f = {
49578 +                       .utmost_child = NULL,
49579 +                       .utmost_child_real_block = NULL,
49580 +                       .update = NULL,
49581 +                       .scan = NULL,
49582 +                       .convert = NULL
49583 +               },
49584 +               .s = {
49585 +                       .sd = {
49586 +                               .init_inode = init_inode_static_sd,
49587 +                               .save_len = save_len_static_sd,
49588 +                               .save = save_static_sd
49589 +                       }
49590 +               }
49591 +       },
49592 +       [SIMPLE_DIR_ENTRY_ID] = {
49593 +               .h = {
49594 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49595 +                       .id = SIMPLE_DIR_ENTRY_ID,
49596 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49597 +                       .pops = &item_plugin_ops,
49598 +                       .label = "de",
49599 +                       .desc = "directory entry",
49600 +                       .linkage = {NULL, NULL}
49601 +               },
49602 +               .b = {
49603 +                       .max_key_inside = max_key_inside_single_key,
49604 +                       .can_contain_key = NULL,
49605 +                       .mergeable = NULL,
49606 +                       .nr_units = nr_units_single_unit,
49607 +                       .lookup = NULL,
49608 +                       .init = NULL,
49609 +                       .paste = NULL,
49610 +                       .fast_paste = NULL,
49611 +                       .can_shift = NULL,
49612 +                       .copy_units = NULL,
49613 +                       .create_hook = NULL,
49614 +                       .kill_hook = NULL,
49615 +                       .shift_hook = NULL,
49616 +                       .cut_units = NULL,
49617 +                       .kill_units = NULL,
49618 +                       .unit_key = NULL,
49619 +                       .max_unit_key = NULL,
49620 +                       .estimate = NULL,
49621 +                       .item_data_by_flow = NULL,
49622 +#if REISER4_DEBUG
49623 +                       .check = NULL
49624 +#endif
49625 +               },
49626 +               .f = {
49627 +                       .utmost_child = NULL,
49628 +                       .utmost_child_real_block = NULL,
49629 +                       .update = NULL,
49630 +                       .scan = NULL,
49631 +                       .convert = NULL
49632 +               },
49633 +               .s = {
49634 +                       .dir = {
49635 +                               .extract_key = extract_key_de,
49636 +                               .update_key = update_key_de,
49637 +                               .extract_name = extract_name_de,
49638 +                               .extract_file_type = extract_file_type_de,
49639 +                               .add_entry = add_entry_de,
49640 +                               .rem_entry = rem_entry_de,
49641 +                               .max_name_len = max_name_len_de
49642 +                       }
49643 +               }
49644 +       },
49645 +       [COMPOUND_DIR_ID] = {
49646 +               .h = {
49647 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49648 +                       .id = COMPOUND_DIR_ID,
49649 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49650 +                       .pops = &item_plugin_ops,
49651 +                       .label = "cde",
49652 +                       .desc = "compressed directory entry",
49653 +                       .linkage = {NULL, NULL}
49654 +               },
49655 +               .b = {
49656 +                       .max_key_inside = max_key_inside_cde,
49657 +                       .can_contain_key = can_contain_key_cde,
49658 +                       .mergeable = mergeable_cde,
49659 +                       .nr_units = nr_units_cde,
49660 +                       .lookup = lookup_cde,
49661 +                       .init = init_cde,
49662 +                       .paste = paste_cde,
49663 +                       .fast_paste = agree_to_fast_op,
49664 +                       .can_shift = can_shift_cde,
49665 +                       .copy_units = copy_units_cde,
49666 +                       .create_hook = NULL,
49667 +                       .kill_hook = NULL,
49668 +                       .shift_hook = NULL,
49669 +                       .cut_units = cut_units_cde,
49670 +                       .kill_units = kill_units_cde,
49671 +                       .unit_key = unit_key_cde,
49672 +                       .max_unit_key = unit_key_cde,
49673 +                       .estimate = estimate_cde,
49674 +                       .item_data_by_flow = NULL,
49675 +#if REISER4_DEBUG
49676 +                       .check = reiser4_check_cde
49677 +#endif
49678 +               },
49679 +               .f = {
49680 +                       .utmost_child = NULL,
49681 +                       .utmost_child_real_block = NULL,
49682 +                       .update = NULL,
49683 +                       .scan = NULL,
49684 +                       .convert = NULL
49685 +               },
49686 +               .s = {
49687 +                       .dir = {
49688 +                               .extract_key = extract_key_cde,
49689 +                               .update_key = update_key_cde,
49690 +                               .extract_name = extract_name_cde,
49691 +                               .extract_file_type = extract_file_type_de,
49692 +                               .add_entry = add_entry_cde,
49693 +                               .rem_entry = rem_entry_cde,
49694 +                               .max_name_len = max_name_len_cde
49695 +                       }
49696 +               }
49697 +       },
49698 +       [NODE_POINTER_ID] = {
49699 +               .h = {
49700 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49701 +                       .id = NODE_POINTER_ID,
49702 +                       .groups = (1 << INTERNAL_ITEM_TYPE),
49703 +                       .pops = NULL,
49704 +                       .label = "internal",
49705 +                       .desc = "internal item",
49706 +                       .linkage = {NULL, NULL}
49707 +               },
49708 +               .b = {
49709 +                       .max_key_inside = NULL,
49710 +                       .can_contain_key = NULL,
49711 +                       .mergeable = mergeable_internal,
49712 +                       .nr_units = nr_units_single_unit,
49713 +                       .lookup = lookup_internal,
49714 +                       .init = NULL,
49715 +                       .paste = NULL,
49716 +                       .fast_paste = NULL,
49717 +                       .can_shift = NULL,
49718 +                       .copy_units = NULL,
49719 +                       .create_hook = create_hook_internal,
49720 +                       .kill_hook = kill_hook_internal,
49721 +                       .shift_hook = shift_hook_internal,
49722 +                       .cut_units = NULL,
49723 +                       .kill_units = NULL,
49724 +                       .unit_key = NULL,
49725 +                       .max_unit_key = NULL,
49726 +                       .estimate = NULL,
49727 +                       .item_data_by_flow = NULL,
49728 +#if REISER4_DEBUG
49729 +                       .check = check__internal
49730 +#endif
49731 +               },
49732 +               .f = {
49733 +                       .utmost_child = utmost_child_internal,
49734 +                       .utmost_child_real_block =
49735 +                       utmost_child_real_block_internal,
49736 +                       .update = reiser4_update_internal,
49737 +                       .scan = NULL,
49738 +                       .convert = NULL
49739 +               },
49740 +               .s = {
49741 +                       .internal = {
49742 +                               .down_link = down_link_internal,
49743 +                               .has_pointer_to = has_pointer_to_internal
49744 +                       }
49745 +               }
49746 +       },
49747 +       [EXTENT_POINTER_ID] = {
49748 +               .h = {
49749 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49750 +                       .id = EXTENT_POINTER_ID,
49751 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49752 +                       .pops = NULL,
49753 +                       .label = "extent",
49754 +                       .desc = "extent item",
49755 +                       .linkage = {NULL, NULL}
49756 +               },
49757 +               .b = {
49758 +                       .max_key_inside = max_key_inside_extent,
49759 +                       .can_contain_key = can_contain_key_extent,
49760 +                       .mergeable = mergeable_extent,
49761 +                       .nr_units = nr_units_extent,
49762 +                       .lookup = lookup_extent,
49763 +                       .init = NULL,
49764 +                       .paste = paste_extent,
49765 +                       .fast_paste = agree_to_fast_op,
49766 +                       .can_shift = can_shift_extent,
49767 +                       .create_hook = create_hook_extent,
49768 +                       .copy_units = copy_units_extent,
49769 +                       .kill_hook = kill_hook_extent,
49770 +                       .shift_hook = NULL,
49771 +                       .cut_units = cut_units_extent,
49772 +                       .kill_units = kill_units_extent,
49773 +                       .unit_key = unit_key_extent,
49774 +                       .max_unit_key = max_unit_key_extent,
49775 +                       .estimate = NULL,
49776 +                       .item_data_by_flow = NULL,
49777 +#if REISER4_DEBUG
49778 +                       .check = reiser4_check_extent
49779 +#endif
49780 +               },
49781 +               .f = {
49782 +                       .utmost_child = utmost_child_extent,
49783 +                       .utmost_child_real_block =
49784 +                       utmost_child_real_block_extent,
49785 +                       .update = NULL,
49786 +                       .scan = reiser4_scan_extent,
49787 +                       .convert = NULL,
49788 +                       .key_by_offset = key_by_offset_extent
49789 +               },
49790 +               .s = {
49791 +                       .file = {
49792 +                               .write = reiser4_write_extent,
49793 +                               .read = reiser4_read_extent,
49794 +                               .readpage = reiser4_readpage_extent,
49795 +                               .get_block = get_block_address_extent,
49796 +                               .append_key = append_key_extent,
49797 +                               .init_coord_extension =
49798 +                               init_coord_extension_extent
49799 +                       }
49800 +               }
49801 +       },
49802 +       [FORMATTING_ID] = {
49803 +               .h = {
49804 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49805 +                       .id = FORMATTING_ID,
49806 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49807 +                       .pops = NULL,
49808 +                       .label = "body",
49809 +                       .desc = "body (or tail?) item",
49810 +                       .linkage = {NULL, NULL}
49811 +               },
49812 +               .b = {
49813 +                       .max_key_inside = max_key_inside_tail,
49814 +                       .can_contain_key = can_contain_key_tail,
49815 +                       .mergeable = mergeable_tail,
49816 +                       .nr_units = nr_units_tail,
49817 +                       .lookup = lookup_tail,
49818 +                       .init = NULL,
49819 +                       .paste = paste_tail,
49820 +                       .fast_paste = agree_to_fast_op,
49821 +                       .can_shift = can_shift_tail,
49822 +                       .create_hook = NULL,
49823 +                       .copy_units = copy_units_tail,
49824 +                       .kill_hook = kill_hook_tail,
49825 +                       .shift_hook = NULL,
49826 +                       .cut_units = cut_units_tail,
49827 +                       .kill_units = kill_units_tail,
49828 +                       .unit_key = unit_key_tail,
49829 +                       .max_unit_key = unit_key_tail,
49830 +                       .estimate = NULL,
49831 +                       .item_data_by_flow = NULL,
49832 +#if REISER4_DEBUG
49833 +                       .check = NULL
49834 +#endif
49835 +               },
49836 +               .f = {
49837 +                       .utmost_child = NULL,
49838 +                       .utmost_child_real_block = NULL,
49839 +                       .update = NULL,
49840 +                       .scan = NULL,
49841 +                       .convert = NULL
49842 +               },
49843 +               .s = {
49844 +                       .file = {
49845 +                               .write = reiser4_write_tail,
49846 +                               .read = reiser4_read_tail,
49847 +                               .readpage = readpage_tail,
49848 +                               .get_block = get_block_address_tail,
49849 +                               .append_key = append_key_tail,
49850 +                               .init_coord_extension =
49851 +                               init_coord_extension_tail
49852 +                       }
49853 +               }
49854 +       },
49855 +       [CTAIL_ID] = {
49856 +               .h = {
49857 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49858 +                       .id = CTAIL_ID,
49859 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49860 +                       .pops = NULL,
49861 +                       .label = "ctail",
49862 +                       .desc = "cryptcompress tail item",
49863 +                       .linkage = {NULL, NULL}
49864 +               },
49865 +               .b = {
49866 +                       .max_key_inside = max_key_inside_tail,
49867 +                       .can_contain_key = can_contain_key_ctail,
49868 +                       .mergeable = mergeable_ctail,
49869 +                       .nr_units = nr_units_ctail,
49870 +                       .lookup = NULL,
49871 +                       .init = init_ctail,
49872 +                       .paste = paste_ctail,
49873 +                       .fast_paste = agree_to_fast_op,
49874 +                       .can_shift = can_shift_ctail,
49875 +                       .create_hook = create_hook_ctail,
49876 +                       .copy_units = copy_units_ctail,
49877 +                       .kill_hook = kill_hook_ctail,
49878 +                       .shift_hook = shift_hook_ctail,
49879 +                       .cut_units = cut_units_ctail,
49880 +                       .kill_units = kill_units_ctail,
49881 +                       .unit_key = unit_key_tail,
49882 +                       .max_unit_key = unit_key_tail,
49883 +                       .estimate = estimate_ctail,
49884 +                       .item_data_by_flow = NULL,
49885 +#if REISER4_DEBUG
49886 +                       .check = check_ctail
49887 +#endif
49888 +               },
49889 +               .f = {
49890 +                       .utmost_child = utmost_child_ctail,
49891 +                       /* FIXME-EDWARD: write this */
49892 +                       .utmost_child_real_block = NULL,
49893 +                       .update = NULL,
49894 +                       .scan = scan_ctail,
49895 +                       .convert = convert_ctail
49896 +               },
49897 +               .s = {
49898 +                       .file = {
49899 +                               .write = NULL,
49900 +                               .read = read_ctail,
49901 +                               .readpage = readpage_ctail,
49902 +                               .get_block = get_block_address_tail,
49903 +                               .append_key = append_key_ctail,
49904 +                               .init_coord_extension =
49905 +                               init_coord_extension_tail
49906 +                       }
49907 +               }
49908 +       },
49909 +       [BLACK_BOX_ID] = {
49910 +               .h = {
49911 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49912 +                       .id = BLACK_BOX_ID,
49913 +                       .groups = (1 << OTHER_ITEM_TYPE),
49914 +                       .pops = NULL,
49915 +                       .label = "blackbox",
49916 +                       .desc = "black box item",
49917 +                       .linkage = {NULL, NULL}
49918 +               },
49919 +               .b = {
49920 +                       .max_key_inside = NULL,
49921 +                       .can_contain_key = NULL,
49922 +                       .mergeable = not_mergeable,
49923 +                       .nr_units = nr_units_single_unit,
49924 +                       /* no need for ->lookup method */
49925 +                       .lookup = NULL,
49926 +                       .init = NULL,
49927 +                       .paste = NULL,
49928 +                       .fast_paste = NULL,
49929 +                       .can_shift = NULL,
49930 +                       .copy_units = NULL,
49931 +                       .create_hook = NULL,
49932 +                       .kill_hook = NULL,
49933 +                       .shift_hook = NULL,
49934 +                       .cut_units = NULL,
49935 +                       .kill_units = NULL,
49936 +                       .unit_key = NULL,
49937 +                       .max_unit_key = NULL,
49938 +                       .estimate = NULL,
49939 +                       .item_data_by_flow = NULL,
49940 +#if REISER4_DEBUG
49941 +                       .check = NULL
49942 +#endif
49943 +               }
49944 +       }
49945 +};
49946 +
49947 +/* Make Linus happy.
49948 +   Local variables:
49949 +   c-indentation-style: "K&R"
49950 +   mode-name: "LC"
49951 +   c-basic-offset: 8
49952 +   tab-width: 8
49953 +   fill-column: 120
49954 +   End:
49955 +*/
49956 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/item.h linux-2.6.35/fs/reiser4/plugin/item/item.h
49957 --- linux-2.6.35.orig/fs/reiser4/plugin/item/item.h     1970-01-01 01:00:00.000000000 +0100
49958 +++ linux-2.6.35/fs/reiser4/plugin/item/item.h  2010-08-04 15:44:57.000000000 +0200
49959 @@ -0,0 +1,398 @@
49960 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49961 +
49962 +/* first read balance.c comments before reading this */
49963 +
49964 +/* An item_plugin implements all of the operations required for
49965 +   balancing that are item specific. */
49966 +
49967 +/* an item plugin also implements other operations that are specific to that
49968 +   item.  These go into the item specific operations portion of the item
49969 +   handler, and all of the item specific portions of the item handler are put
49970 +   into a union. */
49971 +
49972 +#if !defined( __REISER4_ITEM_H__ )
49973 +#define __REISER4_ITEM_H__
49974 +
49975 +#include "../../forward.h"
49976 +#include "../plugin_header.h"
49977 +#include "../../dformat.h"
49978 +#include "../../seal.h"
49979 +#include "../../plugin/file/file.h"
49980 +
49981 +#include <linux/fs.h>          /* for struct file, struct inode  */
49982 +#include <linux/mm.h>          /* for struct page */
49983 +#include <linux/dcache.h>      /* for struct dentry */
49984 +
49985 +typedef enum {
49986 +       STAT_DATA_ITEM_TYPE,
49987 +       DIR_ENTRY_ITEM_TYPE,
49988 +       INTERNAL_ITEM_TYPE,
49989 +       UNIX_FILE_METADATA_ITEM_TYPE,
49990 +       OTHER_ITEM_TYPE
49991 +} item_type_id;
49992 +
49993 +/* this is the part of each item plugin that all items are expected to
49994 +   support or at least explicitly fail to support by setting the
49995 +   pointer to null. */
49996 +struct balance_ops {
49997 +       /* operations called by balancing
49998 +
49999 +          It is interesting to consider that some of these item
50000 +          operations could be given sources or targets that are not
50001 +          really items in nodes.  This could be ok/useful.
50002 +
50003 +        */
50004 +       /* maximal key that can _possibly_ be occupied by this item
50005 +
50006 +          When inserting, and node ->lookup() method (called by
50007 +          coord_by_key()) reaches an item after binary search,
50008 +          the  ->max_key_inside() item plugin method is used to determine
50009 +          whether new item should be pasted into existing item
50010 +          (new_key<=max_key_inside()) or new item has to be created
50011 +          (new_key>max_key_inside()).
50012 +
50013 +          For items that occupy exactly one key (like stat-data)
50014 +          this method should return this key. For items that can
50015 +          grow indefinitely (extent, directory item) this should
50016 +          return reiser4_max_key().
50017 +
50018 +          For example extent with the key
50019 +
50020 +          (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
50021 +
50022 +          ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
50023 +        */
50024 +       reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
50025 +
50026 +       /* true if item @coord can merge data at @key. */
50027 +       int (*can_contain_key) (const coord_t *, const reiser4_key *,
50028 +                               const reiser4_item_data *);
50029 +       /* mergeable() - check items for mergeability
50030 +
50031 +          Optional method. Returns true if two items can be merged.
50032 +
50033 +        */
50034 +       int (*mergeable) (const coord_t *, const coord_t *);
50035 +
50036 +       /* number of atomic things in an item.
50037 +          NOTE FOR CONTRIBUTORS: use a generic method
50038 +          nr_units_single_unit() for solid (atomic) items, as
50039 +          tree operations use it as a criterion of solidness
50040 +          (see is_solid_item macro) */
50041 +       pos_in_node_t(*nr_units) (const coord_t *);
50042 +
50043 +       /* search within item for a unit within the item, and return a
50044 +          pointer to it.  This can be used to calculate how many
50045 +          bytes to shrink an item if you use pointer arithmetic and
50046 +          compare to the start of the item body if the item's data
50047 +          are continuous in the node, if the item's data are not
50048 +          continuous in the node, all sorts of other things are maybe
50049 +          going to break as well. */
50050 +        lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
50051 +       /* method called by node_plugin->create_item() to initialise new
50052 +          item */
50053 +       int (*init) (coord_t * target, coord_t * from,
50054 +                    reiser4_item_data * data);
50055 +       /* method called (e.g., by reiser4_resize_item()) to place new data
50056 +          into item when it grows */
50057 +       int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
50058 +       /* return true if paste into @coord is allowed to skip
50059 +          carry. That is, if such paste would not require any
50060 +          changes at the parent level
50061 +        */
50062 +       int (*fast_paste) (const coord_t *);
50063 +       /* how many but not more than @want units of @source can be
50064 +          shifted into @target node. If pend == append - we try to
50065 +          append last item of @target by first units of @source. If
50066 +          pend == prepend - we try to "prepend" first item in @target
50067 +          by last units of @source. @target node has @free_space
50068 +          bytes of free space. Total size of those units are returned
50069 +          via @size.
50070 +
50071 +          @target is not NULL if shifting to the mergeable item and
50072 +          NULL if new item will be created during shifting.
50073 +        */
50074 +       int (*can_shift) (unsigned free_space, coord_t *,
50075 +                         znode *, shift_direction, unsigned *size,
50076 +                         unsigned want);
50077 +
50078 +       /* starting off @from-th unit of item @source append or
50079 +          prepend @count units to @target. @target has been already
50080 +          expanded by @free_space bytes. That must be exactly what is
50081 +          needed for those items in @target. If @where_is_free_space
50082 +          == SHIFT_LEFT - free space is at the end of @target item,
50083 +          otherwise - it is in the beginning of it. */
50084 +       void (*copy_units) (coord_t *, coord_t *,
50085 +                           unsigned from, unsigned count,
50086 +                           shift_direction where_is_free_space,
50087 +                           unsigned free_space);
50088 +
50089 +       int (*create_hook) (const coord_t *, void *);
50090 +       /* do whatever is necessary to do when @count units starting
50091 +          from @from-th one are removed from the tree */
50092 +       /* FIXME-VS: this is used to be here for, in particular,
50093 +          extents and items of internal type to free blocks they point
50094 +          to at the same time with removing items from a
50095 +          tree. Problems start, however, when dealloc_block fails due
50096 +          to some reason. Item gets removed, but blocks it pointed to
50097 +          are not freed. It is not clear how to fix this for items of
50098 +          internal type because a need to remove internal item may
50099 +          appear in the middle of balancing, and there is no way to
50100 +          undo changes made. OTOH, if space allocator involves
50101 +          balancing to perform dealloc_block - this will probably
50102 +          break balancing due to deadlock issues
50103 +        */
50104 +       int (*kill_hook) (const coord_t *, pos_in_node_t from,
50105 +                         pos_in_node_t count, struct carry_kill_data *);
50106 +       int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
50107 +                          znode * _node);
50108 +
50109 +       /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
50110 +          including boundaries. When units are cut from item beginning - move space which gets freed to head of
50111 +          item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
50112 +          item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
50113 +          @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
50114 +        */
50115 +       int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
50116 +                         struct carry_cut_data *,
50117 +                         reiser4_key * smallest_removed,
50118 +                         reiser4_key * new_first_key);
50119 +
50120 +       /* like cut_units, except that these units are removed from the
50121 +          tree, not only from a node */
50122 +       int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
50123 +                          struct carry_kill_data *,
50124 +                          reiser4_key * smallest_removed,
50125 +                          reiser4_key * new_first);
50126 +
50127 +       /* if @key_of_coord == 1 - returned key of coord, otherwise -
50128 +          key of unit is returned. If @coord is not set to certain
50129 +          unit - ERR_PTR(-ENOENT) is returned */
50130 +       reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
50131 +       reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
50132 +       /* estimate how much space is needed for paste @data into item at
50133 +          @coord. if @coord==0 - estimate insertion, otherwise - estimate
50134 +          pasting
50135 +        */
50136 +       int (*estimate) (const coord_t *, const reiser4_item_data *);
50137 +
50138 +       /* converts flow @f to item data. @coord == 0 on insert */
50139 +       int (*item_data_by_flow) (const coord_t *, const flow_t *,
50140 +                                 reiser4_item_data *);
50141 +
50142 +       /*void (*show) (struct seq_file *, coord_t *); */
50143 +
50144 +#if REISER4_DEBUG
50145 +       /* used for debugging, every item should have here the most
50146 +          complete possible check of the consistency of the item that
50147 +          the inventor can construct */
50148 +       int (*check) (const coord_t *, const char **error);
50149 +#endif
50150 +
50151 +};
50152 +
50153 +struct flush_ops {
50154 +       /* return the right or left child of @coord, only if it is in memory */
50155 +       int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
50156 +
50157 +       /* return whether the right or left child of @coord has a non-fake
50158 +          block number. */
50159 +       int (*utmost_child_real_block) (const coord_t *, sideof side,
50160 +                                       reiser4_block_nr *);
50161 +       /* relocate child at @coord to the @block */
50162 +       void (*update) (const coord_t *, const reiser4_block_nr *);
50163 +       /* count unformatted nodes per item for leaf relocation policy, etc. */
50164 +       int (*scan) (flush_scan * scan);
50165 +       /* convert item by flush */
50166 +       int (*convert) (flush_pos_t * pos);
50167 +       /* backward mapping from jnode offset to a key.  */
50168 +       int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
50169 +};
50170 +
50171 +/* operations specific to the directory item */
50172 +struct dir_entry_iops {
50173 +       /* extract stat-data key from directory entry at @coord and place it
50174 +          into @key. */
50175 +       int (*extract_key) (const coord_t *, reiser4_key * key);
50176 +       /* update object key in item. */
50177 +       int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
50178 +       /* extract name from directory entry at @coord and return it */
50179 +       char *(*extract_name) (const coord_t *, char *buf);
50180 +       /* extract file type (DT_* stuff) from directory entry at @coord and
50181 +          return it */
50182 +       unsigned (*extract_file_type) (const coord_t *);
50183 +       int (*add_entry) (struct inode * dir,
50184 +                         coord_t *, lock_handle *,
50185 +                         const struct dentry * name,
50186 +                         reiser4_dir_entry_desc * entry);
50187 +       int (*rem_entry) (struct inode * dir, const struct qstr * name,
50188 +                         coord_t *, lock_handle *,
50189 +                         reiser4_dir_entry_desc * entry);
50190 +       int (*max_name_len) (const struct inode * dir);
50191 +};
50192 +
50193 +/* operations specific to the items regular (unix) file metadata is built of */
50194 +struct file_iops{
50195 +       ssize_t (*write) (struct file *, struct inode *,
50196 +                         const char __user *, size_t, loff_t *pos);
50197 +       int (*read) (struct file *, flow_t *, hint_t *);
50198 +       int (*readpage) (void *, struct page *);
50199 +       int (*get_block) (const coord_t *, sector_t, sector_t *);
50200 +       /*
50201 +        * key of first byte which is not addressed by the item @coord is set
50202 +        * to.
50203 +        * For example, for extent item with the key
50204 +        *
50205 +        * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
50206 +        *
50207 +        * ->append_key is
50208 +        *
50209 +        * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
50210 +        */
50211 +       reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
50212 +
50213 +       void (*init_coord_extension) (uf_coord_t *, loff_t);
50214 +};
50215 +
50216 +/* operations specific to items of stat data type */
50217 +struct sd_iops {
50218 +       int (*init_inode) (struct inode * inode, char *sd, int len);
50219 +       int (*save_len) (struct inode * inode);
50220 +       int (*save) (struct inode * inode, char **area);
50221 +};
50222 +
50223 +/* operations specific to internal item */
50224 +struct internal_iops{
50225 +       /* all that tree traversal wants to know from an internal item is
50226 +          where to go next. */
50227 +       void (*down_link) (const coord_t * coord,
50228 +                          const reiser4_key * key, reiser4_block_nr * block);
50229 +       /* check that given internal item contains given pointer. */
50230 +       int (*has_pointer_to) (const coord_t * coord,
50231 +                              const reiser4_block_nr * block);
50232 +};
50233 +
50234 +struct item_plugin {
50235 +       /* generic fields */
50236 +       plugin_header h;
50237 +       /* methods common for all item types */
50238 +       struct balance_ops b; /* balance operations */
50239 +       struct flush_ops f;   /* flush operates on items via these methods */
50240 +
50241 +       /* methods specific to particular type of item */
50242 +       union {
50243 +               struct dir_entry_iops dir;
50244 +               struct      file_iops file;
50245 +               struct        sd_iops sd;
50246 +               struct  internal_iops internal;
50247 +       } s;
50248 +};
50249 +
50250 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
50251 +
50252 +static inline item_id item_id_by_plugin(item_plugin * plugin)
50253 +{
50254 +       return plugin->h.id;
50255 +}
50256 +
50257 +static inline char get_iplugid(item_plugin * iplug)
50258 +{
50259 +       assert("nikita-2838", iplug != NULL);
50260 +       assert("nikita-2839", iplug->h.id < 0xff);
50261 +       return (char)item_id_by_plugin(iplug);
50262 +}
50263 +
50264 +extern unsigned long znode_times_locked(const znode * z);
50265 +
50266 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50267 +{
50268 +       assert("nikita-2837", coord != NULL);
50269 +       assert("nikita-2838", iplug != NULL);
50270 +       coord->iplugid = get_iplugid(iplug);
50271 +       ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50272 +}
50273 +
50274 +static inline item_plugin *coord_iplug(const coord_t * coord)
50275 +{
50276 +       assert("nikita-2833", coord != NULL);
50277 +       assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50278 +       assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50279 +       return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50280 +                                           coord->iplugid);
50281 +}
50282 +
50283 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50284 +                               const reiser4_item_data *);
50285 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50286 +extern int item_is_extent(const coord_t *);
50287 +extern int item_is_tail(const coord_t *);
50288 +extern int item_is_statdata(const coord_t * item);
50289 +extern int item_is_ctail(const coord_t *);
50290 +
50291 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50292 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50293 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50294 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50295 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50296 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50297 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50298 +                                         reiser4_key * key);
50299 +extern void obtain_item_plugin(const coord_t * coord);
50300 +
50301 +#if defined(REISER4_DEBUG)
50302 +extern int znode_is_loaded(const znode * node);
50303 +#endif
50304 +
50305 +/* return plugin of item at @coord */
50306 +static inline item_plugin *item_plugin_by_coord(const coord_t *
50307 +                                               coord /* coord to query */ )
50308 +{
50309 +       assert("nikita-330", coord != NULL);
50310 +       assert("nikita-331", coord->node != NULL);
50311 +       assert("nikita-332", znode_is_loaded(coord->node));
50312 +
50313 +       if (unlikely(!coord_is_iplug_set(coord)))
50314 +               obtain_item_plugin(coord);
50315 +       return coord_iplug(coord);
50316 +}
50317 +
50318 +/* this returns true if item is of internal type */
50319 +static inline int item_is_internal(const coord_t * item)
50320 +{
50321 +       assert("vs-483", coord_is_existing_item(item));
50322 +       return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50323 +}
50324 +
50325 +extern void item_body_by_coord_hard(coord_t * coord);
50326 +extern void *item_body_by_coord_easy(const coord_t * coord);
50327 +#if REISER4_DEBUG
50328 +extern int item_body_is_valid(const coord_t * coord);
50329 +#endif
50330 +
50331 +/* return pointer to the body of the item at @coord */
50332 +static inline void *item_body_by_coord(const coord_t *
50333 +                                      coord /* coord to query */ )
50334 +{
50335 +       assert("nikita-324", coord != NULL);
50336 +       assert("nikita-325", coord->node != NULL);
50337 +       assert("nikita-326", znode_is_loaded(coord->node));
50338 +       /* the _hard variant takes a non-const coord, hence the cast below */
50339 +       if (coord->offset == INVALID_OFFSET)
50340 +               item_body_by_coord_hard((coord_t *) coord);
50341 +       assert("nikita-3201", item_body_is_valid(coord));
50342 +       assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50343 +       return item_body_by_coord_easy(coord);
50344 +}
50345 +
50346 +/* __REISER4_ITEM_H__ */
50347 +#endif
50348 +/* Make Linus happy.
50349 +   Local variables:
50350 +   c-indentation-style: "K&R"
50351 +   mode-name: "LC"
50352 +   c-basic-offset: 8
50353 +   tab-width: 8
50354 +   fill-column: 120
50355 +   scroll-step: 1
50356 +   End:
50357 +*/
50358 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/Makefile linux-2.6.35/fs/reiser4/plugin/item/Makefile
50359 --- linux-2.6.35.orig/fs/reiser4/plugin/item/Makefile   1970-01-01 01:00:00.000000000 +0100
50360 +++ linux-2.6.35/fs/reiser4/plugin/item/Makefile        2010-08-04 15:44:57.000000000 +0200
50361 @@ -0,0 +1,18 @@
50362 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
50363 +# objects linked together into the composite item_plugins.o
50364 +item_plugins-objs :=           \
50365 +       item.o                  \
50366 +       static_stat.o           \
50367 +       sde.o                   \
50368 +       cde.o                   \
50369 +       blackbox.o              \
50370 +       internal.o              \
50371 +       tail.o                  \
50372 +       ctail.o                 \
50373 +       extent.o                \
50374 +       extent_item_ops.o       \
50375 +       extent_file_ops.o       \
50376 +       extent_flush_ops.o
50377 +
50378 +
50379 +
50380 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/sde.c linux-2.6.35/fs/reiser4/plugin/item/sde.c
50381 --- linux-2.6.35.orig/fs/reiser4/plugin/item/sde.c      1970-01-01 01:00:00.000000000 +0100
50382 +++ linux-2.6.35/fs/reiser4/plugin/item/sde.c   2010-08-04 15:44:57.000000000 +0200
50383 @@ -0,0 +1,190 @@
50384 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50385 +
50386 +/* Directory entry implementation */
50387 +#include "../../forward.h"
50388 +#include "../../debug.h"
50389 +#include "../../dformat.h"
50390 +#include "../../kassign.h"
50391 +#include "../../coord.h"
50392 +#include "sde.h"
50393 +#include "item.h"
50394 +#include "../plugin.h"
50395 +#include "../../znode.h"
50396 +#include "../../carry.h"
50397 +#include "../../tree.h"
50398 +#include "../../inode.h"
50399 +
50400 +#include <linux/fs.h>          /* for struct inode */
50401 +#include <linux/dcache.h>      /* for struct dentry */
50402 +#include <linux/quotaops.h>
50403 +
50404 +/* ->extract_key() method of simple directory item plugin. */
50405 +int extract_key_de(const coord_t *coord /* coord of item */ ,
50406 +                  reiser4_key *key /* resulting key */ )
50407 +{
50408 +       directory_entry_format *entry;
50409 +
50410 +       assert("nikita-1458", coord != NULL);
50411 +       assert("nikita-1459", key != NULL);
50412 +       entry = item_body_by_coord(coord);
50413 +       assert("nikita-1158",
50414 +              item_length_by_coord(coord) >= (int)sizeof(*entry));
50415 +       return extract_key_from_id(&entry->id, key);
50416 +}
50417 +
50418 +/* Rewrite the object id stored in the entry at @coord with one built from
50419 +   @key. Returns 0 on success or the error from build_obj_key_id(). */
50420 +int update_key_de(const coord_t * coord, const reiser4_key * key,
50421 +                 lock_handle * lh UNUSED_ARG)
50422 +{
50423 +       directory_entry_format *dent;
50424 +       obj_key_id obj_id;
50425 +       int result;
50426 +
50427 +       assert("nikita-2342", coord != NULL);
50428 +       assert("nikita-2343", key != NULL);
50429 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50430 +       result = build_obj_key_id(key, &obj_id);
50431 +       if (result == 0) {
50432 +               dent->id = obj_id;
50433 +               znode_make_dirty(coord->node);
50434 +       }
50435 +       return result;
50436 +}
50437 +
50438 +/* Name of the entry at @coord: a long name is read from @dent->name, "."
50439 +   is a constant, any other name is extracted from the unit key into @buf. */
50440 +char *extract_dent_name(const coord_t *coord, directory_entry_format *dent,
50441 +                       char *buf)
50442 +{
50443 +       reiser4_key unit_key;
50444 +
50445 +       unit_key_by_coord(coord, &unit_key);
50446 +       if (get_key_type(&unit_key) != KEY_FILE_NAME_MINOR)
50447 +               reiser4_print_address("oops", znode_get_block(coord->node));
50448 +       if (is_longname_key(&unit_key))
50449 +               return (char *)dent->name;
50450 +       if (is_dot_key(&unit_key))
50451 +               return (char *)".";
50452 +       return extract_name_from_key(&unit_key, buf);
50453 +}
50454 +
50455 +/* ->extract_name() method of simple directory item plugin: passes the item
50456 +   body at @coord on to extract_dent_name(). */
50457 +char *extract_name_de(const coord_t *coord /* coord of item */ , char *buf)
50458 +{
50459 +       directory_entry_format *body;
50460 +
50461 +       assert("nikita-1460", coord != NULL);
50462 +       body = item_body_by_coord(coord);
50463 +       return extract_dent_name(coord, body, buf);
50464 +}
50465 +
50466 +/* ->extract_file_type() method of simple directory item plugin.
50467 +   Always answers DT_UNKNOWN. */
50468 +unsigned
50469 +extract_file_type_de(const coord_t *coord UNUSED_ARG /* coord of item */ )
50470 +{
50471 +       assert("nikita-1764", coord != NULL);
50472 +
50473 +       /* The file type is not stored in the directory entry yet, but see
50474 +          the comments at kassign.h:obj_key_id. */
50475 +       return DT_UNKNOWN;
50476 +}
50477 +
50478 +/* Insert a directory entry named by @de into @dir at @coord. The item size is
50479 +   charged to @dir's quota first, and the charge is released again when
50480 +   insert_by_coord() fails. Returns 0, -EDQUOT or the insertion error. */
50481 +int add_entry_de(struct inode *dir /* directory of item */ ,
50482 +                coord_t * coord /* coord of item */ ,
50483 +                lock_handle * lh /* insertion lock handle */ ,
50484 +                const struct dentry *de /* name to add */ ,
50485 +                reiser4_dir_entry_desc * entry /* new entry parameters */ )
50486 +{
50487 +       reiser4_item_data data;
50488 +       directory_entry_format *dent;
50489 +       const char *name;
50490 +       int result, len, longname;
50491 +
50492 +       name = de->d_name.name;
50493 +       len = de->d_name.len;
50494 +       assert("nikita-1163", strlen(name) == len);
50495 +       longname = is_longname(name, len);
50496 +       data.length = sizeof *dent;
50497 +       if (longname)
50498 +               data.length += len + 1;
50499 +       data.data = NULL;
50500 +       data.user = 0;
50501 +       data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50502 +
50503 +       /* NOTE-NIKITA quota plugin */
50504 +       if (dquot_alloc_space_nodirty(dir, data.length))
50505 +               return -EDQUOT;
50506 +       result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50507 +       if (result != 0) {
50508 +               /* nothing was inserted: give the charged space back */
50509 +               dquot_free_space_nodirty(dir, data.length);
50510 +               return result;
50511 +       }
50512 +
50513 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50514 +       build_inode_key_id(entry->obj, &dent->id);
50515 +       if (longname) {
50516 +               memcpy(dent->name, name, len);
50517 +               put_unaligned(0, &dent->name[len]);
50518 +       }
50519 +       return 0;
50520 +}
50521 +
50522 +/* Remove the directory entry item at @coord from @dir; on success its length
50523 +   is released from @dir's quota. Returns -EIO, removing nothing, when @dir
50524 +   accounts for fewer bytes than the item occupies. */
50525 +int rem_entry_de(struct inode *dir /* directory of item */ ,
50526 +                const struct qstr *name UNUSED_ARG,
50527 +                coord_t * coord /* coord of item */ ,
50528 +                lock_handle * lh UNUSED_ARG /* lock handle for removal */ ,
50529 +                reiser4_dir_entry_desc * entry UNUSED_ARG /* removed entry */ )
50530 +{
50531 +       coord_t shadow;
50532 +       int result;
50533 +       int length;
50534 +
50535 +       length = item_length_by_coord(coord);
50536 +       if (inode_get_bytes(dir) < length) {
50537 +               warning("nikita-2627", "Dir is broke: %llu: %llu",
50538 +                       (unsigned long long)get_inode_oid(dir),
50539 +                       (unsigned long long)inode_get_bytes(dir));
50540 +
50541 +               return RETERR(-EIO);
50542 +       }
50543 +
50544 +       /* kill_node_content() is handed two coords here. The original note
50545 +          (written about cut_node()) says they must be _different_
50546 +          objects, because they are modified without respect to
50547 +          possible aliasing. To work around this, create temporary copy
50548 +          of @coord. */
50549 +       coord_dup(&shadow, coord);
50550 +       result =
50551 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50552 +       if (result == 0) {
50553 +               /* NOTE-NIKITA quota plugin */
50554 +               dquot_free_space_nodirty(dir, length);
50555 +       }
50556 +       return result;
50557 +}
50558 +/* max item size less entry header less 2 (TODO confirm what the 2 covers) */
50559 +int max_name_len_de(const struct inode *dir)
50560 +{
50561 +       return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50562 +               sizeof(directory_entry_format) - 2;
50563 +}
50564 +
50565 +/* Make Linus happy.
50566 +   Local variables:
50567 +   c-indentation-style: "K&R"
50568 +   mode-name: "LC"
50569 +   c-basic-offset: 8
50570 +   tab-width: 8
50571 +   fill-column: 120
50572 +   End:
50573 +*/
50574 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/sde.h linux-2.6.35/fs/reiser4/plugin/item/sde.h
50575 --- linux-2.6.35.orig/fs/reiser4/plugin/item/sde.h      1970-01-01 01:00:00.000000000 +0100
50576 +++ linux-2.6.35/fs/reiser4/plugin/item/sde.h   2010-08-04 15:44:57.000000000 +0200
50577 @@ -0,0 +1,66 @@
50578 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50579 +
50580 +/* Directory entry. */
50581 +
50582 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50583 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50584 +
50585 +#include "../../forward.h"
50586 +#include "../../dformat.h"
50587 +#include "../../kassign.h"
50588 +#include "../../key.h"
50589 +
50590 +#include <linux/fs.h>
50591 +#include <linux/dcache.h>      /* for struct dentry */
50592 +
50593 +typedef struct directory_entry_format {
50594 +       /* key of object stat-data. It's not necessary to store whole
50595 +          key here, because it's always key of stat-data, so minor
50596 +          packing locality and offset can be omitted here. But this
50597 +          relies on particular key allocation scheme for stat-data, so,
50598 +          for extensibility's sake, whole key can be stored here.
50599 +
50600 +          We store key as array of bytes, because we don't want 8-byte
50601 +          alignment of dir entries.
50602 +        */
50603 +       obj_key_id id;
50604 +       /* file name, NUL-terminated; add_entry_de() stores it for long names only */
50605 +       d8 name[0];
50606 +} directory_entry_format;
50607 +
50608 +void print_de(const char *prefix, coord_t * coord);
50609 +int extract_key_de(const coord_t * coord, reiser4_key * key);
50610 +int update_key_de(const coord_t * coord, const reiser4_key * key,
50611 +                 lock_handle * lh);
50612 +char *extract_name_de(const coord_t * coord, char *buf);
50613 +unsigned extract_file_type_de(const coord_t * coord);
50614 +int add_entry_de(struct inode *dir, coord_t * coord,
50615 +                lock_handle * lh, const struct dentry *name,
50616 +                reiser4_dir_entry_desc * entry);
50617 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50618 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
50619 +int max_name_len_de(const struct inode *dir);
50620 +
50621 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50622 +
50623 +char *extract_dent_name(const coord_t * coord,
50624 +                       directory_entry_format * dent, char *buf);
50625 +
50626 +#if REISER4_LARGE_KEY
50627 +#define DE_NAME_BUF_LEN (24)
50628 +#else
50629 +#define DE_NAME_BUF_LEN (16)
50630 +#endif
50631 +
50632 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50633 +#endif
50634 +
50635 +/* Make Linus happy.
50636 +   Local variables:
50637 +   c-indentation-style: "K&R"
50638 +   mode-name: "LC"
50639 +   c-basic-offset: 8
50640 +   tab-width: 8
50641 +   fill-column: 120
50642 +   End:
50643 +*/
50644 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.35/fs/reiser4/plugin/item/static_stat.c
50645 --- linux-2.6.35.orig/fs/reiser4/plugin/item/static_stat.c      1970-01-01 01:00:00.000000000 +0100
50646 +++ linux-2.6.35/fs/reiser4/plugin/item/static_stat.c   2010-08-04 15:44:57.000000000 +0200
50647 @@ -0,0 +1,1114 @@
50648 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50649 +
50650 +/* stat data manipulation. */
50651 +
50652 +#include "../../forward.h"
50653 +#include "../../super.h"
50654 +#include "../../vfs_ops.h"
50655 +#include "../../inode.h"
50656 +#include "../../debug.h"
50657 +#include "../../dformat.h"
50658 +#include "../object.h"
50659 +#include "../plugin.h"
50660 +#include "../plugin_header.h"
50661 +#include "static_stat.h"
50662 +#include "item.h"
50663 +
50664 +#include <linux/types.h>
50665 +#include <linux/fs.h>
50666 +
50667 +/* see static_stat.h for explanation */
50668 +
50669 +/* Stat-data cursor helper used while dumping/loading inode/plugin state:
50670 +    moves *@area forward by @size_of bytes and takes them off *@length. */
50671 +static void move_on(int *length /* space remaining in stat-data */ ,
50672 +                   char **area /* current coord in stat data */ ,
50673 +                   int size_of /* how many bytes to move forward */ )
50674 +{
50675 +       assert("nikita-615", length != NULL);
50676 +       assert("nikita-616", area != NULL);
50677 +
50678 +       *area = *area + size_of;
50679 +       *length = *length - size_of;
50680 +
50681 +       /* the remaining space must not have gone negative */
50682 +       assert("nikita-617", *length >= 0);
50683 +}
50684 +
50685 +/* Stat-data loading helper: warns that the stat-data of @inode has less space
50686 +    than @where needs (disk corruption, per the original note); -EINVAL. */
50687 +static int not_enough_space(struct inode *inode /* object being processed */ ,
50688 +                           const char *where /* error message */ )
50689 +{
50690 +       unsigned long long oid;
50691 +
50692 +       assert("nikita-618", inode != NULL);
50693 +       oid = (unsigned long long)get_inode_oid(inode);
50694 +       warning("nikita-619",
50695 +               "Not enough space in %llu while loading %s", oid, where);
50696 +       return RETERR(-EINVAL);
50697 +}
50698 +
50699 +/* Stat-data loading helper for an invalid plugin id @id found in @inode:
50700 +    issues a warning and returns -EINVAL. */
50701 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50702 +                         struct inode *inode /* object being processed */ )
50703 +{
50704 +       unsigned long long oid = (unsigned long long)get_inode_oid(inode);
50705 +
50706 +       warning("nikita-620", "Unknown plugin %i in %llu", id, oid);
50707 +       return RETERR(-EINVAL);
50708 +}
50709 +
50710 +/* Installed as ->init_inode() method of
50711 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50712 +    Copies data from on-disk stat-data @sd of @len bytes into @inode, walking
50713 +    the extension bitmask. Returns 0, -EINVAL or an extension's error. */
50714 +/* was sd_load */
50715 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50716 +                        char *sd /* stat-data body */ ,
50717 +                        int len /* length of stat-data */ )
50718 +{
50719 +       int result;
50720 +       int bit;
50721 +       int chunk;
50722 +       __u16 mask;
50723 +       __u64 bigmask;
50724 +       reiser4_stat_data_base *sd_base;
50725 +       reiser4_inode *state;
50726 +
50727 +       assert("nikita-625", inode != NULL);
50728 +       assert("nikita-626", sd != NULL);
50729 +
50730 +       result = 0;
50731 +       sd_base = (reiser4_stat_data_base *) sd;
50732 +       state = reiser4_inode_data(inode);
50733 +       mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50734 +       bigmask = mask;
50735 +       reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50736 +
50737 +       move_on(&len, &sd, sizeof *sd_base);
50738 +       for (bit = 0, chunk = 0;
50739 +            mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50740 +            ++bit, mask >>= 1) {
50741 +               if (((bit + 1) % 16) != 0) {
50742 +                       /* handle extension */
50743 +                       sd_ext_plugin *sdplug;
50744 +
50745 +                       if (bit >= LAST_SD_EXTENSION) {
50746 +                               warning("vpf-1904",
50747 +                                       "No such extension %i in inode %llu",
50748 +                                       bit,
50749 +                                       (unsigned long long)
50750 +                                       get_inode_oid(inode));
50751 +
50752 +                               result = RETERR(-EINVAL);
50753 +                               break;
50754 +                       }
50755 +
50756 +                       sdplug = sd_ext_plugin_by_id(bit);
50757 +                       if (sdplug == NULL) {
50758 +                               warning("nikita-627",
50759 +                                       "No such extension %i in inode %llu",
50760 +                                       bit,
50761 +                                       (unsigned long long)
50762 +                                       get_inode_oid(inode));
50763 +
50764 +                               result = RETERR(-EINVAL);
50765 +                               break;
50766 +                       }
50767 +                       if (mask & 1) {
50768 +                               assert("nikita-628", sdplug->present);
50769 +                               /* alignment is not supported in node layout
50770 +                                  plugin yet.
50771 +                                  result = align( inode, &len, &sd,
50772 +                                  sdplug -> alignment );
50773 +                                  if( result != 0 )
50774 +                                  return result; */
50775 +                               result = sdplug->present(inode, &sd, &len);
50776 +                       } else if (sdplug->absent != NULL)
50777 +                               result = sdplug->absent(inode);
50778 +                       if (result)
50779 +                               break;
50780 +                       /* else, we are looking at the last bit in 16-bit
50781 +                          portion of bitmask */
50782 +               } else if (mask & 1) {
50783 +                       /* next portion of bitmask */
50784 +                       if (len < (int)sizeof(d16)) {
50785 +                               warning("nikita-629",
50786 +                                       "No space for bitmap in inode %llu",
50787 +                                       (unsigned long long)
50788 +                                       get_inode_oid(inode));
50789 +
50790 +                               result = RETERR(-EINVAL);
50791 +                               break;
50792 +                       }
50793 +                       mask = le16_to_cpu(get_unaligned((d16 *)sd));
50794 +                       bigmask <<= 16;
50795 +                       bigmask |= mask;
50796 +                       move_on(&len, &sd, sizeof(d16));
50797 +                       ++chunk;
50798 +                       if (chunk == 3) {
50799 +                               if (!(mask & 0x8000)) {
50800 +                                       /* clear last bit */
50801 +                                       mask &= ~0x8000;
50802 +                                       continue;
50803 +                               }
50804 +                               /* too much */
50805 +                               warning("nikita-630",
50806 +                                       "Too many extensions in %llu",
50807 +                                       (unsigned long long)
50808 +                                       get_inode_oid(inode));
50809 +
50810 +                               result = RETERR(-EINVAL);
50811 +                               break;
50812 +                       }
50813 +               } else
50814 +                       /* bitmask exhausted */
50815 +                       break;
50816 +       }
50817 +       state->extmask = bigmask;
50818 +       /* common initialisations */
50819 +       if (len > (int)(bit / 16 * sizeof(d16))) {
50820 +               /* alignment in save_len_static_sd() is taken into account
50821 +                  -edward. Compared as int: sizeof would make it unsigned. */
50822 +               warning("nikita-631", "unused space in inode %llu",
50823 +                       (unsigned long long)get_inode_oid(inode));
50824 +       }
50825 +
50826 +       return result;
50827 +}
50828 +
50829 +/* Estimates size of stat-data required to store @inode: the base part, the
50830 +    ->save_len() of every extension set in the inode's extmask, and one d16
50831 +    for each full group of 16 mask bits scanned.
50832 +    Installed as ->save_len() method of
50833 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50834 +/* was sd_len */
50835 +int save_len_static_sd(struct inode *inode /* object being processed */ )
50836 +{
50837 +       unsigned int total;
50838 +       __u64 pending;
50839 +       int bit;
50840 +
50841 +       assert("nikita-632", inode != NULL);
50842 +
50843 +       total = sizeof(reiser4_stat_data_base);
50844 +       pending = reiser4_inode_data(inode)->extmask;
50845 +       for (bit = 0; pending != 0; ++bit, pending >>= 1) {
50846 +               sd_ext_plugin *sdplug;
50847 +
50848 +               if (!(pending & 1))
50849 +                       continue;
50850 +               sdplug = sd_ext_plugin_by_id(bit);
50851 +               assert("nikita-633", sdplug != NULL);
50852 +               /* no alignment support: sdplug->alignment is not applied */
50853 +               total += sdplug->save_len(inode);
50854 +       }
50855 +
50856 +       /* room for the 16-bit continuation portions of the mask (see
50857 +          save_static_sd()) */
50858 +       total += bit / 16 * sizeof(d16);
50859 +       return total;
50860 +}
50861 +
50862 +/* Saves @inode into stat-data at *@area: the base part carrying the low 16
50863 +    bits of the extension mask, then each extension set in the mask, in bit
50864 +    order. *@area is moved past the base part and past every mask portion
50865 +    written here; extension ->save() methods are handed @area themselves.
50866 +    Installed as ->save() method of
50867 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50868 +/* was sd_save */
50869 +int save_static_sd(struct inode *inode /* object being processed */ ,
50870 +                  char **area /* where to save stat-data */ )
50871 +{
50872 +       int result;
50873 +       __u64 emask;
50874 +       int bit;
50875 +       reiser4_stat_data_base *sd_base;
50876 +
50877 +       assert("nikita-634", inode != NULL);
50878 +       assert("nikita-635", area != NULL);
50879 +
50880 +       result = 0;
50881 +       emask = reiser4_inode_data(inode)->extmask;
50882 +       sd_base = (reiser4_stat_data_base *) * area;
50883 +       put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
50884 +       *area += sizeof *sd_base;
50885 +
50886 +       for (bit = 0; emask != 0; ++bit, emask >>= 1) {
50887 +               if (emask & 1) {
50888 +                       if ((bit + 1) % 16 != 0) {
50889 +                               sd_ext_plugin *sdplug;
50890 +
50891 +                               sdplug = sd_ext_plugin_by_id(bit);
50892 +                               assert("nikita-636", sdplug != NULL);
50893 +                               /* no alignment support yet:
50894 +                                  sdplug->alignment is not applied */
50895 +                               result = sdplug->save(inode, area);
50896 +                               if (result)
50897 +                                       break;
50898 +                       } else {
50899 +                               /* last bit of a 16-bit portion is set: the
50900 +                                  next portion of the mask is stored here */
50901 +                               put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
50902 +                                             (d16 *)(*area));
50903 +                               *area += sizeof(d16);
50904 +                       }
50905 +               }
50906 +       }
50907 +       return result;
50908 +}
50909 +
50910 +/* stat-data extension handling functions. */
50911 +
50912 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
50913 +                        char **area /* position in stat-data */ ,
50914 +                        int *len /* remaining length */ )
50915 +{
50916 +       reiser4_light_weight_stat *lw;
50917 +
50918 +       if (*len < (int)sizeof(reiser4_light_weight_stat))
50919 +               return not_enough_space(inode, "lw sd");
50920 +
50921 +       lw = (reiser4_light_weight_stat *) * area;
50922 +       inode->i_mode = le16_to_cpu(get_unaligned(&lw->mode));
50923 +       inode->i_nlink = le32_to_cpu(get_unaligned(&lw->nlink));
50924 +       inode->i_size = le64_to_cpu(get_unaligned(&lw->size));
50925 +       /* S_IFREG | S_IFIFO marks a partially converted file: drop FIFO bit */
50926 +       if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
50927 +               inode->i_mode &= ~S_IFIFO;
50928 +               warning("", "partially converted file is encountered");
50929 +               reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
50930 +       }
50931 +       move_on(len, area, sizeof *lw);
50932 +       return 0;
50933 +}
50934 +/* size of the light-weight extension is fixed; @inode is not examined */
50935 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG       /* object being
50936 +                                                                * processed */ )
50937 +{
50938 +       return sizeof(reiser4_light_weight_stat);
50939 +}
50940 +
50941 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
50942 +                     char **area /* position in stat-data */ )
50943 +{
50944 +       reiser4_light_weight_stat *lw;
50945 +       mode_t fifo_mark = 0;
50946 +
50947 +       assert("nikita-2705", inode != NULL);
50948 +       assert("nikita-2706", area != NULL);
50949 +       assert("nikita-2707", *area != NULL);
50950 +
50951 +       lw = (reiser4_light_weight_stat *) * area;
50952 +       /* a REISER4_PART_MIXED inode gets S_IFIFO or-ed into its saved mode */
50953 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
50954 +               fifo_mark = S_IFIFO;
50955 +       put_unaligned(cpu_to_le16(inode->i_mode | fifo_mark), &lw->mode);
50956 +       put_unaligned(cpu_to_le32(inode->i_nlink), &lw->nlink);
50957 +       put_unaligned(cpu_to_le64((__u64) inode->i_size), &lw->size);
50958 +       *area += sizeof *lw;
50959 +       return 0;
50960 +}
50961 +
50962 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
50963 +                          char **area /* position in stat-data */ ,
50964 +                          int *len /* remaining length */ )
50965 +{
50966 +       reiser4_unix_stat *ux;
50967 +
50968 +       assert("nikita-637", inode != NULL);
50969 +       assert("nikita-638", area != NULL);
50970 +       assert("nikita-639", *area != NULL);
50971 +       assert("nikita-640", len != NULL);
50972 +       assert("nikita-641", *len > 0);
50973 +
50974 +       if (*len < (int)sizeof(reiser4_unix_stat))
50975 +               return not_enough_space(inode, "unix sd");
50976 +
50977 +       ux = (reiser4_unix_stat *) * area;
50978 +       inode->i_uid = le32_to_cpu(get_unaligned(&ux->uid));
50979 +       inode->i_gid = le32_to_cpu(get_unaligned(&ux->gid));
50980 +       inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&ux->atime));
50981 +       inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&ux->mtime));
50982 +       inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&ux->ctime));
50983 +       /* u.rdev is read for block/char devices, u.bytes for anything else */
50984 +       if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
50985 +               inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&ux->u.bytes)));
50986 +       else
50987 +               inode->i_rdev = le64_to_cpu(get_unaligned(&ux->u.rdev));
50988 +       move_on(len, area, sizeof *ux);
50989 +       return 0;
50990 +}
50991 +
50992 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
50993 +{
50994 +       /* defaults: current time, super block's default ids, bytes = size */
50995 +       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
50996 +       inode->i_uid = get_super_private(inode->i_sb)->default_uid;
50997 +       inode->i_gid = get_super_private(inode->i_sb)->default_gid;
50998 +       inode_set_bytes(inode, inode->i_size);
50999 +       /* mark inode as lightweight, so that caller (lookup_common) will
51000 +          complete initialisation by copying [ug]id from a parent. */
51001 +       reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
51002 +       return 0;
51003 +}
51003 +/* size of the unix extension is fixed; @inode is not examined */
51004 +/* Audited by: green(2002.06.14) */
51005 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG     /* object being
51006 +                                                                * processed */ )
51007 +{
51008 +       return sizeof(reiser4_unix_stat);
51009 +}
51010 +
51011 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
51012 +                       char **area /* position in stat-data */ )
51013 +{
51014 +       reiser4_unix_stat *ux;
51015 +
51016 +       assert("nikita-642", inode != NULL);
51017 +       assert("nikita-643", area != NULL);
51018 +       assert("nikita-644", *area != NULL);
51019 +       /* only the seconds of each timestamp go here, cut down to 32 bits */
51020 +       ux = (reiser4_unix_stat *) * area;
51021 +       put_unaligned(cpu_to_le32(inode->i_uid), &ux->uid);
51022 +       put_unaligned(cpu_to_le32(inode->i_gid), &ux->gid);
51023 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &ux->atime);
51024 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &ux->mtime);
51025 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &ux->ctime);
51026 +       if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
51027 +               put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &ux->u.bytes);
51028 +       else
51029 +               put_unaligned(cpu_to_le64(inode->i_rdev), &ux->u.rdev);
51030 +       *area += sizeof *ux;
51031 +       return 0;
51032 +}
51033 +
51034 +static int
51035 +present_large_times_sd(struct inode *inode /* object being processed */ ,
51036 +                      char **area /* position in stat-data */ ,
51037 +                      int *len /* remaining length */ )
51038 +{
51039 +       reiser4_large_times_stat *lt;
51040 +
51041 +       if (*len < (int)sizeof(reiser4_large_times_stat))
51042 +               return not_enough_space(inode, "large times sd");
51043 +
51044 +       /* only the nanosecond parts of the timestamps are loaded here */
51045 +       lt = (reiser4_large_times_stat *) * area;
51046 +       inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&lt->atime));
51047 +       inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&lt->mtime));
51048 +       inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&lt->ctime));
51049 +
51050 +       move_on(len, area, sizeof *lt);
51051 +       return 0;
51052 +}
51053 +/* size of the large-times extension is fixed; @inode is not examined */
51054 +static int
51055 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
51056 +                       /* object being processed */ )
51057 +{
51058 +       return sizeof(reiser4_large_times_stat);
51059 +}
51060 +
51061 +static int
51062 +save_large_times_sd(struct inode *inode /* object being processed */ ,
51063 +                   char **area /* position in stat-data */ )
51064 +{
51065 +       reiser4_large_times_stat *lt;
51066 +
51067 +       assert("nikita-2817", inode != NULL);
51068 +       assert("nikita-2818", area != NULL);
51069 +       assert("nikita-2819", *area != NULL);
51070 +
51071 +       lt = (reiser4_large_times_stat *) * area;
51072 +       /* only the nanosecond parts of the timestamps are stored here */
51073 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &lt->atime);
51074 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &lt->mtime);
51075 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &lt->ctime);
51076 +
51077 +       *area += sizeof *lt;
51078 +       return 0;
51079 +}
51080 +
51081 +/* symlink stat data extension */
51082 +
51083 +/* attach to inode->i_private a newly allocated, 0-terminated copy of @len bytes of @target; 0 or -ENOMEM */
51084 +static int
51085 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
51086 +{
51087 +       assert("vs-845", inode->i_private == NULL);
51088 +       assert("vs-846", !reiser4_inode_get_flag(inode,
51089 +                                                REISER4_GENERIC_PTR_USED));
51090 +       /* FIXME-VS: this is prone to deadlock. Not more than other similar
51091 +          places, though */
51092 +       inode->i_private = kmalloc((size_t) len + 1,
51093 +                                  reiser4_ctx_gfp_mask_get());
51094 +       if (!inode->i_private)
51095 +               return RETERR(-ENOMEM);
51096 +       /* @target need not be 0-terminated: exactly @len bytes are copied, then 0 is appended */
51097 +       memcpy((char *)(inode->i_private), target, (size_t) len);
51098 +       ((char *)(inode->i_private))[len] = 0;
51099 +       reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
51100 +       return 0;
51101 +}
51102 +
51103 +/* this is called on read_inode: check that a 0-terminated target of i_size
51104 +   bytes fits into the rest of the item and attach a copy of it to the inode */
51105 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
51106 +{
51107 +       int result;
51108 +       int length;
51109 +       reiser4_symlink_stat *sd;
51110 +       /* returns 0, -EIO (no terminating 0), -ENOMEM or the not_enough_space() result */
51111 +       length = (int)inode->i_size;
51112 +       /*
51113 +        * *len is number of bytes in stat data item from *area to the end of
51114 +        * item. It must be not less than size of symlink + 1 for ending 0
51115 +        */
51116 +       if (length < 0 || length >= *len)
51117 +               return not_enough_space(inode, "symlink");
51118 +       /* length < *len is guaranteed above, so this read stays inside the item */
51119 +       if (*(*area + length) != 0) {
51120 +               warning("vs-840", "Symlink is not zero terminated");
51121 +               return RETERR(-EIO);
51122 +       }
51123 +
51124 +       sd = (reiser4_symlink_stat *) * area;
51125 +       result = symlink_target_to_inode(inode, sd->body, length);
51126 +       /* the extension is consumed even when the copy above failed */
51127 +       move_on(len, area, length + 1);
51128 +       return result;
51129 +}
51130 +
51131 +static int save_len_symlink_sd(struct inode *inode) /* bytes written by save_symlink_sd() */
51132 +{
51133 +       return inode->i_size + 1; /* symlink body plus the terminating 0 */
51134 +}
51135 +
51136 +/* this is called on create and update stat data. On create the target is copied both into
51137 +   the inode and into stat data; on update only @area is moved. Returns 0 or -ENOMEM */
51138 +static int save_symlink_sd(struct inode *inode, char **area)
51139 +{
51140 +       int result;
51141 +       int length;
51142 +       reiser4_symlink_stat *sd;
51143 +
51144 +       length = (int)inode->i_size;
51145 +       /* inode->i_size must be set already */
51146 +       assert("vs-841", length);
51147 +
51148 +       result = 0;
51149 +       sd = (reiser4_symlink_stat *) * area;
51150 +       if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
51151 +               const char *target;
51152 +               /* flag not set: i_private presumably still holds the creator's string — confirm */
51153 +               target = (const char *)(inode->i_private);
51154 +               inode->i_private = NULL;
51155 +               /* symlink_target_to_inode() asserts i_private == NULL, hence the reset above */
51156 +               result = symlink_target_to_inode(inode, target, length);
51157 +
51158 +               /* copy symlink to stat data */
51159 +               memcpy(sd->body, target, (size_t) length);
51160 +               (*area)[length] = 0;
51161 +       } else {
51162 +               /* there is nothing to do in update but move area */
51163 +               assert("vs-844",
51164 +                      !memcmp(inode->i_private, sd->body,
51165 +                              (size_t) length + 1));
51166 +       }
51167 +       /* body plus terminating 0, matching save_len_symlink_sd() */
51168 +       *area += (length + 1);
51169 +       return result;
51170 +}
51171 +
51172 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
51173 +                           char **area /* position in stat-data */ ,
51174 +                           int *len /* remaining length */ )
51175 +{
51176 +       assert("nikita-645", inode != NULL);
51177 +       assert("nikita-646", area != NULL);
51178 +       assert("nikita-647", *area != NULL);
51179 +       assert("nikita-648", len != NULL);
51180 +       assert("nikita-649", *len > 0);
51181 +       /* returns 0, or the result of not_enough_space() if the item is too short */
51182 +       if (*len >= (int)sizeof(reiser4_flags_stat)) {
51183 +               reiser4_flags_stat *sd;
51184 +               /* i_flags is taken directly from the little-endian 32-bit on-disk field */
51185 +               sd = (reiser4_flags_stat *) * area;
51186 +               inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
51187 +               move_on(len, area, sizeof *sd);
51188 +               return 0;
51189 +       } else /* NOTE(review): the label below does not mention "flags" */
51190 +               return not_enough_space(inode, "generation and attrs");
51191 +}
51192 +
51193 +/* Audited by: green(2002.06.14). Returns the number of bytes save_flags_sd() writes */
51194 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG    /* object being
51195 +                                                                * processed */ )
51196 +{
51197 +       return sizeof(reiser4_flags_stat); /* fixed size, @inode is not consulted */
51198 +}
51199 +
51200 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
51201 +                        char **area /* position in stat-data */ )
51202 +{
51203 +       reiser4_flags_stat *sd;
51204 +
51205 +       assert("nikita-650", inode != NULL);
51206 +       assert("nikita-651", area != NULL);
51207 +       assert("nikita-652", *area != NULL);
51208 +       /* write i_flags as a little-endian 32-bit value and advance *area; always returns 0 */
51209 +       sd = (reiser4_flags_stat *) * area;
51210 +       put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
51211 +       *area += sizeof *sd;
51212 +       return 0;
51213 +}
51214 +
51215 +static int absent_plugin_sd(struct inode *inode); /* defined further down in this file */
51216 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
51217 +                            char **area /* position in stat-data */ ,
51218 +                            int *len /* remaining length */,
51219 +                            int is_pset /* 1 if plugin set, 0 if heir set. */)
51220 +{
51221 +       reiser4_plugin_stat *sd;
51222 +       reiser4_plugin *plugin;
51223 +       reiser4_inode *info;
51224 +       int i;
51225 +       __u16 mask;
51226 +       int result;
51227 +       int num_of_plugins;
51228 +       /* layout parsed here: a reiser4_plugin_stat header followed by plugins_no slots */
51229 +       assert("nikita-653", inode != NULL);
51230 +       assert("nikita-654", area != NULL);
51231 +       assert("nikita-655", *area != NULL);
51232 +       assert("nikita-656", len != NULL);
51233 +       assert("nikita-657", *len > 0);
51234 +
51235 +       if (*len < (int)sizeof(reiser4_plugin_stat))
51236 +               return not_enough_space(inode, "plugin");
51237 +
51238 +       sd = (reiser4_plugin_stat *) * area;
51239 +       info = reiser4_inode_data(inode);
51240 +       /* one bit per set member already seen; used below to reject duplicates */
51241 +       mask = 0;
51242 +       num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
51243 +       move_on(len, area, sizeof *sd);
51244 +       result = 0;
51245 +       for (i = 0; i < num_of_plugins; ++i) {
51246 +               reiser4_plugin_slot *slot;
51247 +               reiser4_plugin_type type;
51248 +               pset_member memb;
51249 +               /* a whole slot must still be left in the item */
51250 +               slot = (reiser4_plugin_slot *) * area;
51251 +               if (*len < (int)sizeof *slot)
51252 +                       return not_enough_space(inode, "additional plugin");
51253 +               /* REISER4_PLUGIN_TYPES is treated below as "no such set member" */
51254 +               memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
51255 +               type = aset_member_to_type_unsafe(memb);
51256 +
51257 +               if (type == REISER4_PLUGIN_TYPES) {
51258 +                       warning("nikita-3502",
51259 +                               "wrong %s member (%i) for %llu", is_pset ?
51260 +                               "pset" : "hset", memb,
51261 +                               (unsigned long long)get_inode_oid(inode));
51262 +                       return RETERR(-EINVAL);
51263 +               }
51264 +               plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
51265 +                                          type, &slot->id);
51266 +               if (plugin == NULL)
51267 +                       return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51268 +
51269 +               /* plugin is loaded into inode, mark this into inode's
51270 +                  bitmask of loaded non-standard plugins */
51271 +               if (!(mask & (1 << memb))) { /* NOTE(review): assumes memb < 16, mask is __u16 — confirm */
51272 +                       mask |= (1 << memb);
51273 +               } else {
51274 +                       warning("nikita-658", "duplicate plugin for %llu",
51275 +                               (unsigned long long)get_inode_oid(inode));
51276 +                       return RETERR(-EINVAL);
51277 +               }
51278 +               move_on(len, area, sizeof *slot);
51279 +               /* load plugin data, if any */
51280 +               if (plugin->h.pops != NULL && plugin->h.pops->load)
51281 +                       result = plugin->h.pops->load(inode, plugin, area, len);
51282 +               else
51283 +                       result = aset_set_unsafe(is_pset ? &info->pset :
51284 +                                                &info->hset, memb, plugin);
51285 +               if (result)
51286 +                       return result;
51287 +       }
51288 +       if (is_pset) {
51289 +               /* if object plugin wasn't loaded from stat-data, guess it by
51290 +                  mode bits */
51291 +               plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51292 +               if (plugin == NULL)
51293 +                       result = absent_plugin_sd(inode);
51294 +               info->plugin_mask = mask;
51295 +       } else
51296 +               info->heir_mask = mask;
51297 +
51298 +       return result;
51299 +}
51300 +
51301 +static int present_pset_sd(struct inode *inode, char **area, int *len) { /* ->present of PLUGIN_STAT */
51302 +       return present_plugin_sd(inode, area, len, 1 /* pset */);
51303 +}
51304 +
51305 +/* Determine object plugin for @inode based on i_mode.
51306 +
51307 +   Many objects in reiser4 file system are controlled by standard object
51308 +   plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51309 +
51310 +   For such files we don't explicitly store plugin id in object stat
51311 +   data. Rather required plugin is guessed from mode bits, where file "type"
51312 +   is encoded (see stat(2)). Returns 0, or -EIO for an unrecognized file type.
51313 +*/
51314 +static int
51315 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51316 +{
51317 +       int fplug_id;
51318 +       int dplug_id;
51319 +       reiser4_inode *info;
51320 +
51321 +       assert("nikita-736", inode != NULL);
51322 +       /* -1 means "no plugin of this kind"; set_plugin() below is then given NULL */
51323 +       dplug_id = fplug_id = -1;
51324 +
51325 +       switch (inode->i_mode & S_IFMT) {
51326 +       case S_IFSOCK:
51327 +       case S_IFBLK:
51328 +       case S_IFCHR:
51329 +       case S_IFIFO:
51330 +               fplug_id = SPECIAL_FILE_PLUGIN_ID;
51331 +               break;
51332 +       case S_IFLNK:
51333 +               fplug_id = SYMLINK_FILE_PLUGIN_ID;
51334 +               break;
51335 +       case S_IFDIR:
51336 +               fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51337 +               dplug_id = HASHED_DIR_PLUGIN_ID;
51338 +               break;
51339 +       default: /* returns before any plugin is set; label order does not affect matching */
51340 +               warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51341 +               return RETERR(-EIO);
51342 +       case S_IFREG:
51343 +               fplug_id = UNIX_FILE_PLUGIN_ID;
51344 +               break;
51345 +       }
51346 +       info = reiser4_inode_data(inode);
51347 +       set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51348 +                  plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51349 +       set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51350 +                  plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51351 +       return 0;
51352 +}
51353 +
51354 +/* Audited by: green(2002.06.14). Plugin extension is absent: guess plugins from i_mode */
51355 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51356 +{
51357 +       int result;
51358 +
51359 +       assert("nikita-659", inode != NULL);
51360 +
51361 +       result = guess_plugin_by_mode(inode);
51362 +       /* if mode is not recognized, guess_plugin_by_mode() warns and returns
51363 +          -EIO without setting any plugin; that error is passed to the caller.
51364 +          (Original note: setup_inode_ops() will call make_bad_inode(); a
51365 +          "bad-file plugin" would be more logical but a bit more complex.) */
51366 +       /* FIXME-VS: activate was called here */
51367 +       return result;
51368 +}
51369 +
51370 +/* helper function for save_len_plugin_sd(): returns @len increased by the space
51371 +    required to save state of given plugin (@len unchanged if nothing is saved) */
51372 +/* Audited by: green(2002.06.14) */
51373 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51374 +                  struct inode *inode /* object being processed */ ,
51375 +                  pset_member memb,
51376 +                  int len, int is_pset)
51377 +{
51378 +       reiser4_inode *info;
51379 +       assert("nikita-661", inode != NULL);
51380 +
51381 +       if (plugin == NULL)
51382 +               return len;
51383 +       /* only members marked in the relevant mask (pset or hset) take space */
51384 +       info = reiser4_inode_data(inode);
51385 +       if (is_pset ?
51386 +           info->plugin_mask & (1 << memb) :
51387 +           info->heir_mask & (1 << memb)) {
51388 +               len += sizeof(reiser4_plugin_slot);
51389 +               if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51390 +                       /*
51391 +                        * non-standard plugin, call method
51392 +                        * commented as it is incompatible with alignment
51393 +                        * policy in save_plug() -edward
51394 +                        *
51395 +                        * len = reiser4_round_up(len,
51396 +                        * plugin->h.pops->alignment);
51397 +                        */
51398 +                       len += plugin->h.pops->save_len(inode, plugin);
51399 +               }
51400 +       }
51401 +       return len;
51402 +}
51403 +
51404 +/* calculate how much space is required to save state of all plugins,
51405 +    associated with inode; 0 if the relevant mask (pset or hset) is empty */
51406 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51407 +                             int is_pset /* 1 for plugin set, 0 for heir set */)
51408 +{
51409 +       int len;
51410 +       int last;
51411 +       reiser4_inode *state;
51412 +       pset_member memb;
51413 +
51414 +       assert("nikita-663", inode != NULL);
51415 +
51416 +       state = reiser4_inode_data(inode);
51417 +
51418 +       /* common case: no non-standard plugins */
51419 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51420 +               return 0;
51421 +       len = sizeof(reiser4_plugin_stat);
51422 +       last = PSET_LAST;
51423 +       /* header first, then whatever len_for() adds for each member */
51424 +       for (memb = 0; memb < last; ++memb) {
51425 +             len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51426 +                           inode, memb, len, is_pset);
51427 +       }
51428 +       assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51429 +       return len;
51430 +}
51431 +
51432 +static int save_len_pset_sd(struct inode *inode) { /* ->save_len of PLUGIN_STAT */
51433 +       return save_len_plugin_sd(inode, 1 /* pset */);
51434 +}
51435 +
51436 +/* helper function for save_plugin_sd(): save plugin, associated with
51437 +    inode. Returns 0, or the result of the plugin's own ->save() method */
51438 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51439 +                    struct inode *inode /* object being processed */ ,
51440 +                    int memb /* what element of pset is saved */ ,
51441 +                    char **area /* position in stat-data */ ,
51442 +                    int *count /* incremented if plugin were actually saved. */,
51443 +                    int is_pset /* 1 for plugin set, 0 for heir set */)
51444 +{
51445 +       reiser4_plugin_slot *slot;
51446 +       int fake_len;
51447 +       int result;
51448 +
51449 +       assert("nikita-665", inode != NULL);
51450 +       assert("nikita-666", area != NULL);
51451 +       assert("nikita-667", *area != NULL);
51452 +
51453 +       if (plugin == NULL)
51454 +               return 0;
51455 +       /* members not marked in the relevant mask are not written at all */
51456 +       if (is_pset ?
51457 +           !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51458 +           !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51459 +               return 0;
51460 +       slot = (reiser4_plugin_slot *) * area;
51461 +       put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51462 +       put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51463 +       fake_len = (int)0xffff; /* dummy length for move_on(); its value is not used afterwards */
51464 +       move_on(&fake_len, area, sizeof *slot);
51465 +       ++*count;
51466 +       result = 0;
51467 +       if (plugin->h.pops != NULL) {
51468 +               if (plugin->h.pops->save != NULL)
51469 +                       result = plugin->h.pops->save(inode, plugin, area);
51470 +       }
51471 +       return result;
51472 +}
51473 +
51474 +/* save state of all non-standard plugins associated with inode; 0 or first save_plug() error */
51475 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51476 +                         char **area /* position in stat-data */,
51477 +                         int is_pset /* 1 for pset, 0 for hset */)
51478 +{
51479 +       int fake_len;
51480 +       int result = 0;
51481 +       int num_of_plugins;
51482 +       reiser4_plugin_stat *sd;
51483 +       reiser4_inode *state;
51484 +       pset_member memb;
51485 +
51486 +       assert("nikita-669", inode != NULL);
51487 +       assert("nikita-670", area != NULL);
51488 +       assert("nikita-671", *area != NULL);
51489 +
51490 +       state = reiser4_inode_data(inode);
51491 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51492 +               return 0;
51493 +       sd = (reiser4_plugin_stat *) * area;
51494 +       fake_len = (int)0xffff; /* dummy length for move_on(); its value is not used afterwards */
51495 +       move_on(&fake_len, area, sizeof *sd);
51496 +       /* the header is skipped now and filled in after the loop, once the count is known */
51497 +       num_of_plugins = 0;
51498 +       for (memb = 0; memb < PSET_LAST; ++memb) {
51499 +               result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51500 +                                           memb),
51501 +                                  inode, memb, area, &num_of_plugins, is_pset);
51502 +               if (result != 0)
51503 +                       break;
51504 +       }
51505 +       /* written even when the loop stopped early on an error */
51506 +       put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51507 +       return result;
51508 +}
51509 +
51510 +static int save_pset_sd(struct inode *inode, char **area) { /* ->save of PLUGIN_STAT */
51511 +       return save_plugin_sd(inode, area, 1 /* pset */);
51512 +}
51513 +
51514 +static int present_hset_sd(struct inode *inode, char **area, int *len) { /* ->present of HEIR_STAT */
51515 +       return present_plugin_sd(inode, area, len, 0 /* hset */);
51516 +}
51517 +
51518 +static int save_len_hset_sd(struct inode *inode) { /* ->save_len of HEIR_STAT */
51519 +       return save_len_plugin_sd(inode, 0 /* hset */);
51520 +}
51521 +
51522 +static int save_hset_sd(struct inode *inode, char **area) { /* ->save of HEIR_STAT */
51523 +       return save_plugin_sd(inode, area, 0 /* hset */);
51524 +}
51525 +
51526 +/* helper function for present_crypto_sd().
51527 +   Extract crypto info from stat-data and attach it to inode; 0 or allocation error */
51528 +static int extract_crypto_info (struct inode * inode,
51529 +                               reiser4_crypto_stat * sd)
51530 +{
51531 +       struct reiser4_crypto_info * info;
51532 +       assert("edward-11", !inode_crypto_info(inode));
51533 +       assert("edward-1413",
51534 +              !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51535 +       /* create and attach a crypto-stat without secret key loaded */
51536 +       info = reiser4_alloc_crypto_info(inode);
51537 +       if (IS_ERR(info))
51538 +               return PTR_ERR(info);
51539 +       info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51540 +       memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); /* caller checks length */
51541 +       reiser4_attach_crypto_info(inode, info);
51542 +       reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51543 +       return 0;
51544 +}
51545 +
51546 +/* crypto stat-data extension */
51547 +
51548 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
51549 +{
51550 +       int result;
51551 +       reiser4_crypto_stat *sd;
51552 +       digest_plugin *dplug = inode_digest_plugin(inode);
51553 +       /* loads key size and key id; returns 0, not_enough_space() or extract_crypto_info() error */
51554 +       assert("edward-06", dplug != NULL);
51555 +       assert("edward-684", dplug->fipsize);
51556 +       assert("edward-07", area != NULL);
51557 +       assert("edward-08", *area != NULL);
51558 +       assert("edward-09", len != NULL);
51559 +       assert("edward-10", *len > 0);
51560 +
51561 +       /* *len is number of bytes in stat data item from *area to the end of
51562 +          item. It must cover the fixed part plus the fipsize-byte key id */
51563 +       if (*len < (int)(sizeof(reiser4_crypto_stat) + dplug->fipsize)) {
51564 +               return not_enough_space(inode, "crypto-sd");
51565 +       }
51566 +       assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51567 +
51568 +       sd = (reiser4_crypto_stat *) * area;
51569 +       result = extract_crypto_info(inode, sd);
51570 +       move_on(len, area, sizeof(*sd) + dplug->fipsize);
51571 +
51572 +       return result;
51573 +}
51574 +
51575 +static int save_len_crypto_sd(struct inode *inode) /* bytes skipped by save_crypto_sd() */
51576 +{
51577 +       return sizeof(reiser4_crypto_stat) + /* fixed part, then the key id */
51578 +               inode_digest_plugin(inode)->fipsize;
51579 +}
51580 +
51581 +static int save_crypto_sd(struct inode *inode, char **area)
51582 +{
51583 +       int result = 0; /* never changed below: this function always returns 0 */
51584 +       reiser4_crypto_stat *sd;
51585 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
51586 +       digest_plugin *dplug = inode_digest_plugin(inode);
51587 +
51588 +       assert("edward-12", dplug != NULL);
51589 +       assert("edward-13", area != NULL);
51590 +       assert("edward-14", *area != NULL);
51591 +       assert("edward-15", info != NULL);
51592 +       assert("edward-1414", info->keyid != NULL);
51593 +       assert("edward-1415", info->keysize != 0);
51594 +       assert("edward-76", reiser4_inode_data(inode) != NULL);
51595 +       /* data is written only while REISER4_CRYPTO_STAT_LOADED is not yet set */
51596 +       if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51597 +               /* file is just created */
51598 +               sd = (reiser4_crypto_stat *) *area;
51599 +               /* copy everything but private key to the disk stat-data */
51600 +               put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51601 +               memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51602 +               reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51603 +       }
51604 +       *area += (sizeof(*sd) + dplug->fipsize); /* sizeof does not evaluate sd, which may be unset here */
51605 +       return result;
51606 +}
51607 +
51608 +static int eio(struct inode *inode, char **area, int *len) /* ->present stub, arguments unused */
51609 +{
51610 +       return RETERR(-EIO); /* used as ->present of CAPABILITIES_STAT in sd_ext_plugins[] */
51611 +}
51612 +
51613 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { /* one entry per extension, indexed by its id */
51614 +       [LIGHT_WEIGHT_STAT] = {
51615 +               .h = {
51616 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51617 +                       .id = LIGHT_WEIGHT_STAT,
51618 +                       .pops = NULL,
51619 +                       .label = "light-weight sd",
51620 +                       .desc = "sd for light-weight files",
51621 +                       .linkage = {NULL,NULL}
51622 +               },
51623 +               .present = present_lw_sd,
51624 +               .absent = NULL,
51625 +               .save_len = save_len_lw_sd,
51626 +               .save = save_lw_sd,
51627 +               .alignment = 8
51628 +       },
51629 +       [UNIX_STAT] = {
51630 +               .h = {
51631 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51632 +                       .id = UNIX_STAT,
51633 +                       .pops = NULL,
51634 +                       .label = "unix-sd",
51635 +                       .desc = "unix stat-data fields",
51636 +                       .linkage = {NULL,NULL}
51637 +               },
51638 +               .present = present_unix_sd,
51639 +               .absent = absent_unix_sd,
51640 +               .save_len = save_len_unix_sd,
51641 +               .save = save_unix_sd,
51642 +               .alignment = 8
51643 +       },
51644 +       [LARGE_TIMES_STAT] = {
51645 +               .h = {
51646 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51647 +                       .id = LARGE_TIMES_STAT,
51648 +                       .pops = NULL,
51649 +                       .label = "64time-sd",
51650 +                       .desc = "nanosecond resolution for times",
51651 +                       .linkage = {NULL,NULL}
51652 +               },
51653 +               .present = present_large_times_sd,
51654 +               .absent = NULL,
51655 +               .save_len = save_len_large_times_sd,
51656 +               .save = save_large_times_sd,
51657 +               .alignment = 8
51658 +       },
51659 +       [SYMLINK_STAT] = {
51660 +               /* stat data of symlink has this extension */
51661 +               .h = {
51662 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51663 +                       .id = SYMLINK_STAT,
51664 +                       .pops = NULL,
51665 +                       .label = "symlink-sd",
51666 +                       .desc =
51667 +                       "stat data is appended with symlink name",
51668 +                       .linkage = {NULL,NULL}
51669 +               },
51670 +               .present = present_symlink_sd,
51671 +               .absent = NULL,
51672 +               .save_len = save_len_symlink_sd,
51673 +               .save = save_symlink_sd,
51674 +               .alignment = 8
51675 +       },
51676 +       [PLUGIN_STAT] = {
51677 +               .h = {
51678 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51679 +                       .id = PLUGIN_STAT,
51680 +                       .pops = NULL,
51681 +                       .label = "plugin-sd",
51682 +                       .desc = "plugin stat-data fields",
51683 +                       .linkage = {NULL,NULL}
51684 +               },
51685 +               .present = present_pset_sd,
51686 +               .absent = absent_plugin_sd, /* no extension stored: plugins are guessed from i_mode */
51687 +               .save_len = save_len_pset_sd,
51688 +               .save = save_pset_sd,
51689 +               .alignment = 8
51690 +       },
51691 +       [HEIR_STAT] = {
51692 +               .h = {
51693 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51694 +                       .id = HEIR_STAT,
51695 +                       .pops = NULL,
51696 +                       .label = "heir-plugin-sd",
51697 +                       .desc = "heir plugin stat-data fields",
51698 +                       .linkage = {NULL,NULL}
51699 +               },
51700 +               .present = present_hset_sd,
51701 +               .absent = NULL,
51702 +               .save_len = save_len_hset_sd,
51703 +               .save = save_hset_sd,
51704 +               .alignment = 8
51705 +       },
51706 +       [FLAGS_STAT] = {
51707 +               .h = {
51708 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51709 +                       .id = FLAGS_STAT,
51710 +                       .pops = NULL,
51711 +                       .label = "flags-sd",
51712 +                       .desc = "inode bit flags",
51713 +                       .linkage = {NULL, NULL}
51714 +               },
51715 +               .present = present_flags_sd,
51716 +               .absent = NULL,
51717 +               .save_len = save_len_flags_sd,
51718 +               .save = save_flags_sd,
51719 +               .alignment = 8
51720 +       },
51721 +       [CAPABILITIES_STAT] = {
51722 +               .h = {
51723 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51724 +                       .id = CAPABILITIES_STAT,
51725 +                       .pops = NULL,
51726 +                       .label = "capabilities-sd",
51727 +                       .desc = "capabilities",
51728 +                       .linkage = {NULL, NULL}
51729 +               },
51730 +               .present = eio, /* loading this extension always fails with -EIO */
51731 +               .absent = NULL,
51732 +               .save_len = save_len_flags_sd, /* NOTE(review): reuses the flags-sd methods — confirm intended */
51733 +               .save = save_flags_sd,
51734 +               .alignment = 8
51735 +       },
51736 +       [CRYPTO_STAT] = {
51737 +               .h = {
51738 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51739 +                       .id = CRYPTO_STAT,
51740 +                       .pops = NULL,
51741 +                       .label = "crypto-sd",
51742 +                       .desc = "secret key size and id",
51743 +                       .linkage = {NULL, NULL}
51744 +               },
51745 +               .present = present_crypto_sd,
51746 +               .absent = NULL,
51747 +               .save_len = save_len_crypto_sd,
51748 +               .save = save_crypto_sd,
51749 +               .alignment = 8
51750 +       }
51751 +};
51752 +
51753 +/* Make Linus happy.
51754 +   Local variables:
51755 +   c-indentation-style: "K&R"
51756 +   mode-name: "LC"
51757 +   c-basic-offset: 8
51758 +   tab-width: 8
51759 +   fill-column: 120
51760 +   End:
51761 +*/
51762 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.35/fs/reiser4/plugin/item/static_stat.h
51763 --- linux-2.6.35.orig/fs/reiser4/plugin/item/static_stat.h      1970-01-01 01:00:00.000000000 +0100
51764 +++ linux-2.6.35/fs/reiser4/plugin/item/static_stat.h   2010-08-04 15:44:57.000000000 +0200
51765 @@ -0,0 +1,224 @@
51766 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51767 +
51768 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51769 +
51770 +In the case where each file has at least the fields needed by the
51771 +stat() syscall, it is more compact to store those fields in this
51772 +struct.
51773 +
51774 +If this item does not exist, then all stats are dynamically resolved.
51775 +At the moment, we either resolve all stats dynamically or all of them
51776 +statically.  If you think this is not fully optimal, and the rest of
51777 +reiser4 is working, then fix it...:-)
51778 +
51779 +*/
51780 +
51781 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51782 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51783 +
51784 +#include "../../forward.h"
51785 +#include "../../dformat.h"
51786 +
51787 +#include <linux/fs.h>          /* for struct inode */
51788 +
51789 +/* Stat data layout: goals and implementation.
51790 +
51791 +   We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51792 +   them, including not having semantic metadata attached to them.
51793 +
51794 +   There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51795 +   want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51796 +   sized structure because the statically sized structure knows without recording it what the names and lengths of the
51797 +   attributes are.
51798 +
51799 +   This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51800 +   attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51801 +   file in their use of file attributes.
51802 +
51803 +   Yet this compromise deserves to be compromised a little.
51804 +
51805 +   We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51806 +   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
51807 +
51808 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
51809 +   from parent directory (as uid, gid) or initialised to some sane values.
51810 +
51811 +   To capitalize on existing code infrastructure, extensions are
51812 +   implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51813 +   Each stat-data extension plugin implements four methods:
51814 +
51815 +    ->present() called by sd_load() when this extension is found in stat-data
51816 +    ->absent() called by sd_load() when this extension is not found in stat-data
51817 +    ->save_len() called by sd_len() to calculate total length of stat-data
51818 +    ->save() called by sd_save() to store extension data into stat-data
51819 +
51820 +    Implementation is in fs/reiser4/plugin/item/static_stat.c
51821 +*/
51822 +
51823 +/* stat-data extensions (bits of the extension bitmask). Please order this by presumed frequency of use */
51824 +typedef enum {
51825 +       /* support for light-weight files */
51826 +       LIGHT_WEIGHT_STAT,
51827 +       /* data required to implement unix stat(2) call. Layout is in
51828 +          reiser4_unix_stat. If this is not present, file is light-weight */
51829 +       UNIX_STAT,
51830 +       /* this contains additional set of 32bit [amc]time fields to implement
51831 +          nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
51832 +          of this extension is governed by 32bittimes mount option. */
51833 +       LARGE_TIMES_STAT,
51834 +       /* stat data has link name included */
51835 +       SYMLINK_STAT,
51836 +       /* on-disk slots of non-standard plugins for main plugin table
51837 +          (@reiser4_inode->pset), that is, plugins that cannot be deduced
51838 +          from file mode bits, for example, aggregation, interpolation etc. */
51839 +       PLUGIN_STAT,
51840 +       /* this extension contains persistent inode flags. These flags are
51841 +          single bits: immutable, append-only, etc. Layout is in
51842 +          reiser4_flags_stat. */
51843 +       FLAGS_STAT,
51844 +       /* this extension contains capabilities sets, associated with this
51845 +          file. Layout is in reiser4_capabilities_stat */
51846 +       CAPABILITIES_STAT,
51847 +       /* this extension contains size and public id of the secret key.
51848 +          Layout is in reiser4_crypto_stat */
51849 +       CRYPTO_STAT,
51850 +       /* on-disk slots of non-default plugins for inheritance, which
51851 +          are extracted to special plugin table (@reiser4_inode->hset).
51852 +          By default, children of the object will inherit plugins from
51853 +          its main plugin table (pset). */
51854 +       HEIR_STAT,
51855 +       LAST_SD_EXTENSION,
51856 +       /*
51857 +        * init_inode_static_sd() iterates over extension mask until all
51858 +        * non-zero bits are processed. This means that neither ->present()
51859 +        * nor ->absent() methods will be called for stat-data extensions that
51860 +        * go after last present extension. But for some basic extensions, we want
51861 +        * either ->absent() or ->present() method to be called, because these
51862 +        * extensions set up something in inode even when they are not
51863 +        * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
51864 +        * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
51865 +        * ->present(), or ->absent() method will be called, independently of
51866 +        * what other extensions are present.
51867 +        */
51868 +       LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
51869 +} sd_ext_bits;
51870 +
51871 +/* minimal stat-data. This allows to support light-weight files. */
51872 +typedef struct reiser4_stat_data_base {
51873 +       /*  0 */ __le16 extmask;
51874 +       /*  2 */
51875 +} PACKED reiser4_stat_data_base;
51876 +
51877 +typedef struct reiser4_light_weight_stat {
51878 +       /*  0 */ __le16 mode;
51879 +       /*  2 */ __le32 nlink;
51880 +       /*  6 */ __le64 size;
51881 +       /* size in bytes */
51882 +       /* 14 */
51883 +} PACKED reiser4_light_weight_stat;
51884 +
51885 +typedef struct reiser4_unix_stat {
51886 +       /* owner id */
51887 +       /*  0 */ __le32 uid;
51888 +       /* group id */
51889 +       /*  4 */ __le32 gid;
51890 +       /* access time */
51891 +       /*  8 */ __le32 atime;
51892 +       /* modification time */
51893 +       /* 12 */ __le32 mtime;
51894 +       /* change time */
51895 +       /* 16 */ __le32 ctime;
51896 +       union {
51897 +               /* minor:major for device files */
51898 +               /* 20 */ __le64 rdev;
51899 +               /* bytes used by file */
51900 +               /* 20 */ __le64 bytes;
51901 +       } u;
51902 +       /* 28 */
51903 +} PACKED reiser4_unix_stat;
51904 +
51905 +/* symlink stored as part of inode */
51906 +typedef struct reiser4_symlink_stat {
51907 +       char body[0];
51908 +} PACKED reiser4_symlink_stat;
51909 +
51910 +typedef struct reiser4_plugin_slot {
51911 +       /*  0 */ __le16 pset_memb;
51912 +       /*  2 */ __le16 id;
51913 +       /*  4 *//* here plugin stores its persistent state */
51914 +} PACKED reiser4_plugin_slot;
51915 +
51916 +/* stat-data extension for files with non-standard plugin. */
51917 +typedef struct reiser4_plugin_stat {
51918 +       /* number of additional plugins, associated with this object */
51919 +       /*  0 */ __le16 plugins_no;
51920 +       /*  2 */ reiser4_plugin_slot slot[0];
51921 +       /*  2 */
51922 +} PACKED reiser4_plugin_stat;
51923 +
51924 +/* stat-data extension for inode flags. Currently it is just fixed-width 32
51925 + * bit mask. If need arise, this can be replaced with variable width
51926 + * bitmask. */
51927 +typedef struct reiser4_flags_stat {
51928 +       /*  0 */ __le32 flags;
51929 +       /*  4 */
51930 +} PACKED reiser4_flags_stat;
51931 +
51932 +typedef struct reiser4_capabilities_stat {
51933 +       /*  0 */ __le32 effective;
51934 +       /*  4 */ __le32 permitted;
51935 +       /*  8 */
51936 +} PACKED reiser4_capabilities_stat;
51937 +
51938 +typedef struct reiser4_cluster_stat {
51939 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster shift */
51940 +       /* 0 */ d8 cluster_shift;
51941 +       /* 1 */
51942 +} PACKED reiser4_cluster_stat;
51943 +
51944 +typedef struct reiser4_crypto_stat {
51945 +       /* secret key size, bits */
51946 +       /*  0 */ d16 keysize;
51947 +       /* secret key id */
51948 +       /*  2 */ d8 keyid[0];
51949 +       /* 2 */
51950 +} PACKED reiser4_crypto_stat;
51951 +
51952 +typedef struct reiser4_large_times_stat {
51953 +       /* access time */
51954 +       /* 0 */ d32 atime;
51955 +       /* modification time */
51956 +       /* 4 */ d32 mtime;
51957 +       /* change time */
51958 +       /* 8 */ d32 ctime;
51959 +       /* 12 */
51960 +} PACKED reiser4_large_times_stat;
51961 +
51962 +/* this structure is filled by sd_item_stat */
51963 +typedef struct sd_stat {
51964 +       int dirs;
51965 +       int files;
51966 +       int others;
51967 +} sd_stat;
51968 +
51969 +/* plugin->item.common.* */
51970 +extern void print_sd(const char *prefix, coord_t * coord);
51971 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
51972 +
51973 +/* plugin->item.s.sd.* */
51974 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
51975 +extern int save_len_static_sd(struct inode *inode);
51976 +extern int save_static_sd(struct inode *inode, char **area);
51977 +
51978 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
51979 +#endif
51980 +
51981 +/* Make Linus happy.
51982 +   Local variables:
51983 +   c-indentation-style: "K&R"
51984 +   mode-name: "LC"
51985 +   c-basic-offset: 8
51986 +   tab-width: 8
51987 +   fill-column: 120
51988 +   End:
51989 +*/
51990 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/tail.c linux-2.6.35/fs/reiser4/plugin/item/tail.c
51991 --- linux-2.6.35.orig/fs/reiser4/plugin/item/tail.c     1970-01-01 01:00:00.000000000 +0100
51992 +++ linux-2.6.35/fs/reiser4/plugin/item/tail.c  2010-08-04 15:44:57.000000000 +0200
51993 @@ -0,0 +1,807 @@
51994 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51995 +
51996 +#include "item.h"
51997 +#include "../../inode.h"
51998 +#include "../../page_cache.h"
51999 +#include "../../carry.h"
52000 +#include "../../vfs_ops.h"
52001 +
52002 +#include <linux/quotaops.h>
52003 +#include <asm/uaccess.h>
52004 +#include <linux/swap.h>
52005 +#include <linux/writeback.h>
52006 +
52007 +/* plugin->u.item.b.max_key_inside: set @key to the key of item @coord with its offset replaced by that of reiser4_max_key() */
52008 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
52009 +{
52010 +       item_key_by_coord(coord, key);
52011 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
52012 +       return key;
52013 +}
52014 +
52015 +/* plugin->u.item.b.can_contain_key: 1 if item @coord has plugin data->iplug and locality and objectid of @key, else 0 */
52016 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
52017 +                        const reiser4_item_data *data)
52018 +{
52019 +       reiser4_key item_key;
52020 +
52021 +       if (item_plugin_by_coord(coord) != data->iplug)
52022 +               return 0;
52023 +
52024 +       item_key_by_coord(coord, &item_key);
52025 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
52026 +           get_key_objectid(key) != get_key_objectid(&item_key))
52027 +               return 0;
52028 +
52029 +       return 1;
52030 +}
52031 +
52032 +/* plugin->u.item.b.mergeable
52033 +   first item is of tail type; returns 1 if @p2 is a tail item of the same object starting right after @p1, else 0 */
52034 +/* Audited by: green(2002.06.14) */
52035 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
52036 +{
52037 +       reiser4_key key1, key2;
52038 +
52039 +       assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
52040 +                                        UNIX_FILE_METADATA_ITEM_TYPE));
52041 +       assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
52042 +
52043 +       if (item_id_by_coord(p2) != FORMATTING_ID) {
52044 +               /* second item is of another type */
52045 +               return 0;
52046 +       }
52047 +
52048 +       item_key_by_coord(p1, &key1);
52049 +       item_key_by_coord(p2, &key2);
52050 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
52051 +           get_key_objectid(&key1) != get_key_objectid(&key2)
52052 +           || get_key_type(&key1) != get_key_type(&key2)) {
52053 +               /* items of different objects or of different key types */
52054 +               return 0;
52055 +       }
52056 +       if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
52057 +               /* items are not adjacent: @p2 does not start where @p1 ends */
52058 +               return 0;
52059 +       }
52060 +       return 1;
52061 +}
52062 +
52063 +/* plugin->u.item.b.print
52064 +   plugin->u.item.b.check */
52065 +
52066 +/* plugin->u.item.b.nr_units: number of units of a tail item is its length as returned by item_length_by_coord() */
52067 +pos_in_node_t nr_units_tail(const coord_t * coord)
52068 +{
52069 +       return item_length_by_coord(coord);
52070 +}
52071 +
52072 +/* plugin->u.item.b.lookup: position @coord inside its tail item according to the offset of @key */
52073 +lookup_result
52074 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
52075 +{
52076 +       reiser4_key item_key;
52077 +       __u64 lookuped, offset;
52078 +       unsigned nr_units;
52079 +
52080 +       item_key_by_coord(coord, &item_key);
52081 +       offset = get_key_offset(item_key_by_coord(coord, &item_key));
52082 +       nr_units = nr_units_tail(coord);
52083 +
52084 +       /* key we are looking for must be greater than key of item @coord */
52085 +       assert("vs-416", keygt(key, &item_key));
52086 +
52087 +       /* offset we are looking for */
52088 +       lookuped = get_key_offset(key);
52089 +
52090 +       if (lookuped >= offset && lookuped < offset + nr_units) {
52091 +               /* byte we are looking for is in this item */
52092 +               coord->unit_pos = lookuped - offset;
52093 +               coord->between = AT_UNIT;
52094 +               return CBK_COORD_FOUND;
52095 +       }
52096 +
52097 +       /* offset is not in this item: set coord after last unit; result then depends on @bias only */
52098 +       coord->unit_pos = nr_units - 1;
52099 +       coord->between = AFTER_UNIT;
52100 +       return bias ==
52101 +           FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
52102 +}
52103 +
52104 +/* plugin->u.item.b.paste: fill data->length bytes added to item @coord from data->data, or with zeros if it is NULL */
52105 +int
52106 +paste_tail(coord_t *coord, reiser4_item_data *data,
52107 +          carry_plugin_info *info UNUSED_ARG)
52108 +{
52109 +       unsigned old_item_length;
52110 +       char *item;
52111 +
52112 +       /* length the item had before resizing has been performed */
52113 +       old_item_length = item_length_by_coord(coord) - data->length;
52114 +
52115 +       /* tail items never get pasted in the middle */
52116 +       assert("vs-363",
52117 +              (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
52118 +              (coord->unit_pos == old_item_length - 1 &&
52119 +               coord->between == AFTER_UNIT) ||
52120 +              (coord->unit_pos == 0 && old_item_length == 0
52121 +               && coord->between == AT_UNIT));
52122 +
52123 +       item = item_body_by_coord(coord);
52124 +       if (coord->unit_pos == 0)
52125 +               /* make space for pasted data when pasting at the beginning of
52126 +                  the item */
52127 +               memmove(item + data->length, item, old_item_length);
52128 +
52129 +       if (coord->between == AFTER_UNIT)
52130 +               coord->unit_pos++;
52131 +
52132 +       if (data->data) {
52133 +               assert("vs-554", data->user == 0 || data->user == 1);
52134 +               if (data->user) {
52135 +                       assert("nikita-3035", reiser4_schedulable());
52136 +                       /* copy from user space */
52137 +                       if (__copy_from_user(item + coord->unit_pos,
52138 +                                            (const char __user *)data->data,
52139 +                                            (unsigned)data->length))
52140 +                               return RETERR(-EFAULT);
52141 +               } else
52142 +                       /* copy from kernel space */
52143 +                       memcpy(item + coord->unit_pos, data->data,
52144 +                              (unsigned)data->length);
52145 +       } else {
52146 +               memset(item + coord->unit_pos, 0, (unsigned)data->length);
52147 +       }
52148 +       return 0;
52149 +}
52150 +
52151 +/* plugin->u.item.b.fast_paste */
52152 +
52153 +/* plugin->u.item.b.can_shift
52154 +   number of units is returned via return value, number of bytes via @size. For
52155 +   tail items they coincide */
52156 +int
52157 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
52158 +              znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
52159 +              unsigned *size, unsigned want)
52160 +{
52161 +       /* make sure that we do not want to shift more than we have */
52162 +       assert("vs-364", want > 0
52163 +              && want <= (unsigned)item_length_by_coord(source));
52164 +
52165 +       *size = min(want, free_space);
52166 +       return *size;
52167 +}
52168 +
52169 +/* plugin->u.item.b.copy_units: copy @count bytes of @source, starting at @from, into already expanded @target */
52170 +void
52171 +copy_units_tail(coord_t * target, coord_t * source,
52172 +               unsigned from, unsigned count,
52173 +               shift_direction where_is_free_space,
52174 +               unsigned free_space UNUSED_ARG)
52175 +{
52176 +       /* make sure that item @target is expanded already */
52177 +       assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
52178 +       assert("vs-370", free_space >= count);
52179 +
52180 +       if (where_is_free_space == SHIFT_LEFT) {
52181 +               /* append item @target with @count first bytes of @source */
52182 +               assert("vs-365", from == 0);
52183 +
52184 +               memcpy((char *)item_body_by_coord(target) +
52185 +                      item_length_by_coord(target) - count,
52186 +                      (char *)item_body_by_coord(source), count);
52187 +       } else {
52188 +               /* target item is moved to right already */
52189 +               reiser4_key key;
52190 +
52191 +               assert("vs-367",
52192 +                      (unsigned)item_length_by_coord(source) == from + count);
52193 +
52194 +               memcpy((char *)item_body_by_coord(target),
52195 +                      (char *)item_body_by_coord(source) + from, count);
52196 +
52197 +               /* new units are inserted before first unit in an item,
52198 +                  therefore, we have to update item key */
52199 +               item_key_by_coord(source, &key);
52200 +               set_key_offset(&key, get_key_offset(&key) + from);
52201 +
52202 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
52203 +                                                                  NULL /*info */);
52204 +       }
52205 +}
52206 +
52207 +/* plugin->u.item.b.create_hook */
52208 +
52209 +/* item_plugin->b.kill_hook
52210 +   this is called when @count units starting from @from-th one are going to be removed
52211 +   passes start = item offset + @from and end = start + @count to fake_kill_hook_tail(); always returns 0 */
52212 +int
52213 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
52214 +              pos_in_node_t count, struct carry_kill_data *kdata)
52215 +{
52216 +       reiser4_key key;
52217 +       loff_t start, end;
52218 +
52219 +       assert("vs-1577", kdata);
52220 +       assert("vs-1579", kdata->inode);
52221 +
52222 +       item_key_by_coord(coord, &key);
52223 +       start = get_key_offset(&key) + from;
52224 +       end = start + count;
52225 +       fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
52226 +       return 0;
52227 +}
52228 +
52229 +/* plugin->u.item.b.shift_hook */
52230 +
52231 +/* helper for kill_units_tail and cut_units_tail; returns the number of units in [@from, @to] */
52232 +static int
52233 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52234 +              reiser4_key * smallest_removed, reiser4_key * new_first)
52235 +{
52236 +       pos_in_node_t count;
52237 +
52238 +       /* this method is only called to remove part of item */
52239 +       assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
52240 +       /* tail items are never cut from the middle of an item */
52241 +       assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
52242 +       assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
52243 +
52244 +       count = to - from + 1;
52245 +
52246 +       if (smallest_removed) {
52247 +               /* store smallest key removed */
52248 +               item_key_by_coord(coord, smallest_removed);
52249 +               set_key_offset(smallest_removed,
52250 +                              get_key_offset(smallest_removed) + from);
52251 +       }
52252 +       if (new_first) {
52253 +               /* head of item is cut */
52254 +               assert("vs-1529", from == 0);
52255 +
52256 +               item_key_by_coord(coord, new_first);
52257 +               set_key_offset(new_first,
52258 +                              get_key_offset(new_first) + from + count);
52259 +       }
52260 +
52261 +       if (REISER4_DEBUG)
52262 +               memset((char *)item_body_by_coord(coord) + from, 0, count);
52263 +       return count;
52264 +}
52265 +
52266 +/* plugin->u.item.b.cut_units: forwards to do_cut_or_kill(); @cdata is not used */
52267 +int
52268 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52269 +              struct carry_cut_data *cdata UNUSED_ARG,
52270 +              reiser4_key * smallest_removed, reiser4_key * new_first)
52271 +{
52272 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52273 +}
52274 +
52275 +/* plugin->u.item.b.kill_units: calls kill_hook_tail() for units [@from, @to], then do_cut_or_kill() */
52276 +int
52277 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52278 +               struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52279 +               reiser4_key * new_first)
52280 +{
52281 +       kill_hook_tail(coord, from, to - from + 1, kdata);
52282 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52283 +}
52284 +
52285 +/* plugin->u.item.b.unit_key: set @key to the item key with its offset advanced by coord->unit_pos */
52286 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52287 +{
52288 +       assert("vs-375", coord_is_existing_unit(coord));
52289 +
52290 +       item_key_by_coord(coord, key);
52291 +       set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52292 +
52293 +       return key;
52294 +}
52295 +
52296 +/* plugin->u.item.b.estimate
52297 +   plugin->u.item.b.item_data_by_flow */
52298 +
52299 +/* tail readpage function. It is called from readpage_tail(). */
52300 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52301 +{
52302 +       tap_t tap;
52303 +       int result;
52304 +       coord_t coord;
52305 +       lock_handle lh;
52306 +       int count, mapped;
52307 +       struct inode *inode;
52308 +       char *pagedata;
52309 +
52310 +       /* work on copies of the passed coord and lock handle so that the tap does not move the originals */
52311 +       init_lh(&lh);
52312 +       copy_lh(&lh, uf_coord->lh);
52313 +       inode = page->mapping->host;
52314 +       coord_dup(&coord, &uf_coord->coord);
52315 +
52316 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52317 +
52318 +       if ((result = reiser4_tap_load(&tap)))
52319 +               goto out_tap_done;
52320 +
52321 +       /* lookup until page is filled up. */
52322 +       for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52323 +               /* number of bytes to be copied to page */
52324 +               count = item_length_by_coord(&coord) - coord.unit_pos;
52325 +               if (count > PAGE_CACHE_SIZE - mapped)
52326 +                       count = PAGE_CACHE_SIZE - mapped;
52327 +
52328 +               /* map @page into address space and get data address */
52329 +               pagedata = kmap_atomic(page, KM_USER0);
52330 +
52331 +               /* copy tail item to page */
52332 +               memcpy(pagedata + mapped,
52333 +                      ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52334 +                      count);
52335 +               mapped += count;
52336 +
52337 +               flush_dcache_page(page);
52338 +
52339 +               /* unmap page from address space */
52340 +               kunmap_atomic(pagedata, KM_USER0);
52341 +
52342 +               /* Getting next tail item. */
52343 +               if (mapped < PAGE_CACHE_SIZE) {
52344 +                       /*
52345 +                        * unlock page in order to avoid keeping it locked
52346 +                        * during tree lookup, which takes long term locks
52347 +                        */
52348 +                       unlock_page(page);
52349 +
52350 +                       /* getting right neighbour. */
52351 +                       result = go_dir_el(&tap, RIGHT_SIDE, 0);
52352 +
52353 +                       /* lock page back */
52354 +                       lock_page(page);
52355 +                       if (PageUptodate(page)) {
52356 +                               /*
52357 +                                * another thread read the page, we have
52358 +                                * nothing to do
52359 +                                */
52360 +                               result = 0;
52361 +                               goto out_unlock_page;
52362 +                       }
52363 +
52364 +                       if (result) {
52365 +                               if (result == -E_NO_NEIGHBOR) {
52366 +                                       /*
52367 +                                        * right neighbor is not a formatted
52368 +                                        * node
52369 +                                        */
52370 +                                       result = 0;
52371 +                                       goto done;
52372 +                               } else {
52373 +                                       goto out_tap_relse; /* NOTE(review): @page stays locked here - confirm */
52374 +                               }
52375 +                       } else {
52376 +                               if (!inode_file_plugin(inode)->
52377 +                                   owns_item(inode, &coord)) {
52378 +                                       /* item of another file is found */
52379 +                                       result = 0;
52380 +                                       goto done;
52381 +                               }
52382 +                       }
52383 +               }
52384 +       }
52385 +
52386 + done:
52387 +       if (mapped != PAGE_CACHE_SIZE)
52388 +               zero_user_segment(page, mapped, PAGE_CACHE_SIZE);
52389 +       SetPageUptodate(page);
52390 + out_unlock_page:
52391 +       unlock_page(page);
52392 + out_tap_relse:
52393 +       reiser4_tap_relse(&tap);
52394 + out_tap_done:
52395 +       reiser4_tap_done(&tap);
52396 +       return result;
52397 +}
52398 +
52399 +/*
52400 +   plugin->s.file.readpage
52401 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52402 +   or
52403 +   filemap_fault->reiser4_readpage->readpage_unix_file->readpage_tail
52404 +
52405 +   At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
52406 +   item. */
52407 +int readpage_tail(void *vp, struct page *page)
52408 +{
52409 +       uf_coord_t *uf_coord = vp;
52410 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
52411 +       ON_DEBUG(reiser4_key key);
52412 +
52413 +       assert("umka-2515", PageLocked(page));
52414 +       assert("umka-2516", !PageUptodate(page));
52415 +       assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52416 +       assert("umka-2518", page->mapping && page->mapping->host);
52417 +
52418 +       assert("umka-2519", znode_is_loaded(coord->node));
52419 +       assert("umka-2520", item_is_tail(coord));
52420 +       assert("umka-2521", coord_is_existing_unit(coord));
52421 +       assert("umka-2522", znode_is_rlocked(coord->node));
52422 +       assert("umka-2523",
52423 +              page->mapping->host->i_ino ==
52424 +              get_key_objectid(item_key_by_coord(coord, &key)));
52425 +
52426 +       return do_readpage_tail(uf_coord, page);
52427 +}
52428 +
52429 +/**
52430 + * overwrite_tail - overwrite part of a tail item with user data
52431 + * @flow: flow carrying the user-space buffer (flow->data) and its length
52432 + * @coord: existing unit of a tail item in a write locked node
52433 + *
52434 + * Overwrites tail item or its part by user data. Returns number of bytes
52435 + * written or error code.
52436 + */
52437 +static int overwrite_tail(flow_t *flow, coord_t *coord)
52438 +{
52439 +       unsigned count;
52440 +
52441 +       assert("vs-570", flow->user == 1);
52442 +       assert("vs-946", flow->data);
52443 +       assert("vs-947", coord_is_existing_unit(coord));
52444 +       assert("vs-948", znode_is_write_locked(coord->node));
52445 +       assert("nikita-3036", reiser4_schedulable());
52446 +
52447 +       count = item_length_by_coord(coord) - coord->unit_pos;
52448 +       if (count > flow->length)
52449 +               count = flow->length;
52450 +
52451 +       if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52452 +                            (const char __user *)flow->data, count))
52453 +               return RETERR(-EFAULT);
52454 +
52455 +       znode_make_dirty(coord->node);
52456 +       return count;
52457 +}
52458 +
52459 +/**
52460 + * insert_first_tail - insert the first tail item of a file
52461 + * @inode: inode of the file being written; quota is charged against it
52462 + * @flow: data to insert; turned into a hole if its key offset is not 0
52463 + * @coord: coord passed on to reiser4_insert_flow()
52464 + * @lh: lock handle passed on to reiser4_insert_flow()
52465 + *
52466 + * Returns number of bytes written or error code.
52467 + */
52468 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52469 +                                coord_t *coord, lock_handle *lh)
52470 +{
52471 +       int result;
52472 +       loff_t to_write;
52473 +       struct unix_file_info *uf_info;
52474 +
52475 +       if (get_key_offset(&flow->key) != 0) {
52476 +               /*
52477 +                * file is empty and we have to write not to the beginning of
52478 +                * file. Create a hole at the beginning of file. On success
52479 +                * insert_flow returns 0 as number of written bytes which is
52480 +                * what we have to return on padding a file with holes
52481 +                */
52482 +               flow->data = NULL;
52483 +               flow->length = get_key_offset(&flow->key);
52484 +               set_key_offset(&flow->key, 0);
52485 +               /*
52486 +                * holes in files built of tails are stored just like if there
52487 +                * were real data which are all zeros. Therefore we have to
52488 +                * allocate quota here as well
52489 +                */
52490 +               if (dquot_alloc_space_nodirty(inode, flow->length))
52491 +                       return RETERR(-EDQUOT);
52492 +               result = reiser4_insert_flow(coord, lh, flow);
52493 +               if (flow->length)
52494 +                       dquot_free_space_nodirty(inode, flow->length);
52495 +
52496 +               uf_info = unix_file_inode_data(inode);
52497 +
52498 +               /*
52499 +                * first item insertion is only possible when writing to empty
52500 +                * file or performing tail conversion
52501 +                */
52502 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52503 +                           (reiser4_inode_get_flag(inode,
52504 +                                                   REISER4_PART_MIXED) &&
52505 +                            reiser4_inode_get_flag(inode,
52506 +                                                   REISER4_PART_IN_CONV))));
52507 +               /* if file was empty - update its state */
52508 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52509 +                       uf_info->container = UF_CONTAINER_TAILS;
52510 +               return result;
52511 +       }
52512 +
52513 +       /* check quota before inserting data */
52514 +       if (dquot_alloc_space_nodirty(inode, flow->length))
52515 +               return RETERR(-EDQUOT);
52516 +
52517 +       to_write = flow->length;
52518 +       result = reiser4_insert_flow(coord, lh, flow);
52519 +       if (flow->length)
52520 +               dquot_free_space_nodirty(inode, flow->length);
52521 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
52522 +}
52523 +
52524 +/**
52525 + * append_tail - append data or a hole after the tail item at @coord
52526 + * @inode: inode of the file being written; quota is charged against it
52527 + * @flow: data to append; turned into a hole if its key is not the append key
52528 + * @coord: coord of the tail item, passed on to reiser4_insert_flow()
52529 + * @lh: lock handle passed on to reiser4_insert_flow()
52530 + *
52531 + * Returns number of bytes written or error code.
52532 + */
52533 +static ssize_t append_tail(struct inode *inode,
52534 +                          flow_t *flow, coord_t *coord, lock_handle *lh)
52535 +{
52536 +       int result;
52537 +       reiser4_key append_key;
52538 +       loff_t to_write;
52539 +
52540 +       if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52541 +               flow->data = NULL; /* flow key differs from append key: insert a hole up to the flow key */
52542 +               flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52543 +               set_key_offset(&flow->key, get_key_offset(&append_key));
52544 +               /*
52545 +                * holes in files built of tails are stored just like if there
52546 +                * were real data which are all zeros. Therefore we have to
52547 +                * allocate quota here as well
52548 +                */
52549 +               if (dquot_alloc_space_nodirty(inode, flow->length))
52550 +                       return RETERR(-EDQUOT);
52551 +               result = reiser4_insert_flow(coord, lh, flow);
52552 +               if (flow->length)
52553 +                       dquot_free_space_nodirty(inode, flow->length);
52554 +               return result;
52555 +       }
52556 +
52557 +       /* check quota before appending data */
52558 +       if (dquot_alloc_space_nodirty(inode, flow->length))
52559 +               return RETERR(-EDQUOT);
52560 +
52561 +       to_write = flow->length;
52562 +       result = reiser4_insert_flow(coord, lh, flow);
52563 +       if (flow->length)
52564 +               dquot_free_space_nodirty(inode, flow->length);
52565 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
52566 +}
52567 +
52568 +/**
52569 + * write_extent_reserve_space - reserve space for a tail write (despite the name)
52570 + * @inode: inode of the file to be written; selects the tree to estimate for
52571 + *
52572 + * Estimates and reserves space which may be required for writing one flow to a
52573 + * file
52574 + */
52575 +static int write_extent_reserve_space(struct inode *inode)
52576 +{
52577 +       __u64 count;
52578 +       reiser4_tree *tree;
52579 +
52580 +       /*
52581 +        * to write one flow to a file by tails we have to reserve disk space for:
52582 +        *
52583 +        * 1. find_file_item may have to insert empty node to the tree (empty
52584 +        * leaf node between two extent items). This requires 1 block and
52585 +        * number of blocks which are necessary to perform insertion of an
52586 +        * internal item into twig level.
52587 +        *
52588 +        * 2. flow insertion
52589 +        *
52590 +        * 3. stat data update
52591 +        */
52592 +       tree = reiser4_tree_by_inode(inode);
52593 +       count = estimate_one_insert_item(tree) +
52594 +               estimate_insert_flow(tree->height) +
52595 +               estimate_one_insert_item(tree);
52596 +       grab_space_enable();
52597 +       return reiser4_grab_space(count, 0 /* flags */);
52598 +}
52599 +
52600 +#define PAGE_PER_FLOW 4
52601 +
52602 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
52603 +{
52604 +       /* touch at most PAGE_PER_FLOW pages, one page-sized step at a time */
52605 +       const size_t limit = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52606 +       size_t left = (count > limit) ? limit : count;
52607 +       loff_t done;
52608 +
52609 +       for (done = 0; left > 0; ) {
52610 +               int chunk = PAGE_CACHE_SIZE;
52611 +
52612 +               if (left < chunk)
52613 +                       chunk = left;
52614 +               fault_in_pages_readable(buf + done, chunk);
52615 +               left -= chunk;
52616 +               done += chunk;
52617 +       }
52618 +       return done;
52619 +}
52620 +
52621 +/**
52622 + * reiser4_write_tail - write method of tail item plugin
52623 + * @file: file to write to
52624 + * @inode: inode of the file being written, must not be NULL
52625 + * @buf: address of user-space buffer
52626 + * @count: number of bytes to write
52627 + * @pos: position in file to write to
52628 + *
52629 + * One call writes at most PAGE_PER_FLOW pages of @buf.
52630 + * Returns number of written bytes or error code.
52631 + */
52632 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52633 +                          const char __user *buf, size_t count, loff_t *pos)
52634 +{
52635 +       struct hint hint;
52636 +       int result;
52637 +       flow_t flow;
52638 +       coord_t *coord;
52639 +       lock_handle *lh;
52640 +       znode *loaded;
52641 +
52642 +       assert("edward-1548", inode != NULL);
52643 +
52644 +       if (write_extent_reserve_space(inode))
52645 +               return RETERR(-ENOSPC);
52646 +
52647 +       result = load_file_hint(file, &hint);
52648 +       BUG_ON(result != 0);
52649 +
52650 +       flow.length = faultin_user_pages(buf, count);
52651 +       flow.user = 1;
52652 +       memcpy(&flow.data, &buf, sizeof(buf));
52653 +       flow.op = WRITE_OP;
52654 +       key_by_inode_and_offset_common(inode, *pos, &flow.key);
52655 +
52656 +       result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52657 +       if (IS_CBKERR(result))
52658 +               return result;
52659 +
52660 +       coord = &hint.ext_coord.coord;
52661 +       lh = hint.ext_coord.lh;
52662 +
52663 +       result = zload(coord->node);
52664 +       BUG_ON(result != 0);
52665 +       loaded = coord->node;
52666 +
52667 +       if (coord->between == AFTER_UNIT) {
52668 +               /* append with data or hole */
52669 +               result = append_tail(inode, &flow, coord, lh);
52670 +       } else if (coord->between == AT_UNIT) {
52671 +               /* overwrite */
52672 +               result = overwrite_tail(&flow, coord);
52673 +       } else {
52674 +               /* no items of this file yet. insert data or hole */
52675 +               result = insert_first_tail(inode, &flow, coord, lh);
52676 +       }
52677 +       zrelse(loaded);
52678 +       if (result < 0) {
52679 +               done_lh(lh);
52680 +               return result;
52681 +       }
52682 +
52683 +       /* the coord extension is invalidated, so the hint is never sealed
52684 +        * here: it is always unset before being saved back to @file */
52685 +       hint.ext_coord.valid = 0;
52686 +       reiser4_unset_hint(&hint);
52687 +
52688 +       save_file_hint(file, &hint);
52689 +       return result;
52690 +}
52691 +
52692 +#if REISER4_DEBUG
52693 +/* debug only: does @coord address the byte @key points to? */
52694 +static int
52695 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52696 +{
52697 +       reiser4_key item_key;   /* scratch; holds the item key after vs-1355 */
52698 +
52699 +       assert("vs-1356", coord_is_existing_unit(coord));
52700 +       assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52701 +       assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
52702 +       return get_key_offset(key) ==
52703 +           get_key_offset(&item_key) + coord->unit_pos;
52704 +
52705 +}
52706 +
52707 +#endif
52708 +
52709 +/* plugin->u.item.s.file.read: copy bytes of the tail item at @hint to user */
52710 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52711 +{
52712 +       unsigned count;
52713 +       int item_length;
52714 +       coord_t *coord;
52715 +       uf_coord_t *uf_coord;
52716 +
52717 +       uf_coord = &hint->ext_coord;
52718 +       coord = &uf_coord->coord;
52719 +
52720 +       assert("vs-571", f->user == 1);
52721 +       assert("vs-571", f->data);
52722 +       assert("vs-967", coord && coord->node);
52723 +       assert("vs-1117", znode_is_rlocked(coord->node));
52724 +       assert("vs-1118", znode_is_loaded(coord->node));
52725 +
52726 +       assert("nikita-3037", reiser4_schedulable());
52727 +       assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52728 +
52729 +       /* number of bytes left in the item from unit_pos, capped by the flow */
52730 +       item_length = item_length_by_coord(coord);
52731 +       count = item_length_by_coord(coord) - coord->unit_pos;
52732 +       if (count > f->length)
52733 +               count = f->length;
52734 +
52735 +       /* the user page must already be faulted in, so that no major page
52736 +        * fault occurs here while the long-term lock is held */
52737 +       if (__copy_to_user((char __user *)f->data,
52738 +                          ((char *)item_body_by_coord(coord) + coord->unit_pos),
52739 +                          count))
52740 +               return RETERR(-EFAULT);
52741 +
52742 +       /* probably mark_page_accessed() should only be called if
52743 +        * coord->unit_pos is zero. */
52744 +       mark_page_accessed(znode_page(coord->node));
52745 +       move_flow_forward(f, count);
52746 +
52747 +       coord->unit_pos += count;
52748 +       if (item_length == coord->unit_pos) {   /* whole item consumed */
52749 +               coord->unit_pos--;
52750 +               coord->between = AFTER_UNIT;
52751 +       }
52752 +       reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52753 +       return 0;
52754 +}
52755 +
52756 +/*
52757 +   plugin->u.item.s.file.append_key
52758 +   key of the first byte following the last byte addressed by this item
52759 +*/
52760 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52761 +{
52762 +       item_key_by_coord(coord, key);
52763 +       set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
52764 +       return key;
52765 +}
52766 +
52767 +/* plugin->u.item.s.file.init_coord_extension: only marks @uf_coord valid */
52768 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52769 +{
52770 +       uf_coord->valid = 1;
52771 +}
52772 +
52773 +/*
52774 +  plugin->u.item.s.file.get_block
52775 +*/
52776 +int
52777 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52778 +{
52779 +       assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52780 +
52781 +       /* A node that has not obtained its real block number yet reports
52782 +        * block 0: let us avoid upsetting users with some cosmic numbers
52783 +        * beyond the device capacity. */
52784 +       if (!reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52785 +               *block = *znode_get_block(coord->node);
52786 +       else
52787 +               *block = 0;
52788 +       return 0;
52789 +}
52790 +
52791 +/*
52792 + * Local variables:
52793 + * c-indentation-style: "K&R"
52794 + * mode-name: "LC"
52795 + * c-basic-offset: 8
52796 + * tab-width: 8
52797 + * fill-column: 79
52798 + * scroll-step: 1
52799 + * End:
52800 + */
52801 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/item/tail.h linux-2.6.35/fs/reiser4/plugin/item/tail.h
52802 --- linux-2.6.35.orig/fs/reiser4/plugin/item/tail.h     1970-01-01 01:00:00.000000000 +0100
52803 +++ linux-2.6.35/fs/reiser4/plugin/item/tail.h  2010-08-04 15:44:57.000000000 +0200
52804 @@ -0,0 +1,56 @@
52805 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52806 +
52807 +#if !defined( __REISER4_TAIL_H__ )
52808 +#define __REISER4_TAIL_H__
52809 +
52810 +struct tail_coord_extension {
52811 +       int not_used;   /* placeholder; nothing in the tail code shown reads it */
52812 +};
52813 +
52814 +struct cut_list;
52815 +
52816 +/* plugin->u.item.b.* */
52817 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52818 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52819 +                        const reiser4_item_data *);
52820 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
52821 +pos_in_node_t nr_units_tail(const coord_t *);
52822 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52823 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52824 +int can_shift_tail(unsigned free_space, coord_t * source,
52825 +                  znode * target, shift_direction, unsigned *size,
52826 +                  unsigned want);
52827 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52828 +                    unsigned count, shift_direction, unsigned free_space);
52829 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52830 +                  struct carry_kill_data *);
52831 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52832 +                  struct carry_cut_data *, reiser4_key * smallest_removed,
52833 +                  reiser4_key * new_first);
52834 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52835 +                   struct carry_kill_data *, reiser4_key * smallest_removed,
52836 +                   reiser4_key * new_first);
52837 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52838 +
52839 +/* plugin->u.item.s.* */
52840 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52841 +                          const char __user *buf, size_t count, loff_t *pos);
52842 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52843 +int readpage_tail(void *vp, struct page *page);
52844 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52845 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52846 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52847 +
52848 +/* __REISER4_TAIL_H__ */
52849 +#endif
52850 +
52851 +/* Make Linus happy.
52852 +   Local variables:
52853 +   c-indentation-style: "K&R"
52854 +   mode-name: "LC"
52855 +   c-basic-offset: 8
52856 +   tab-width: 8
52857 +   fill-column: 120
52858 +   scroll-step: 1
52859 +   End:
52860 +*/
52861 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/Makefile linux-2.6.35/fs/reiser4/plugin/Makefile
52862 --- linux-2.6.35.orig/fs/reiser4/plugin/Makefile        1970-01-01 01:00:00.000000000 +0100
52863 +++ linux-2.6.35/fs/reiser4/plugin/Makefile     2010-08-04 15:44:57.000000000 +0200
52864 @@ -0,0 +1,26 @@
52865 +obj-$(CONFIG_REISER4_FS) += plugins.o
52866 +
52867 +plugins-objs :=                        \
52868 +       plugin.o                \
52869 +       plugin_set.o            \
52870 +       object.o                \
52871 +       inode_ops.o             \
52872 +       inode_ops_rename.o      \
52873 +       file_ops.o              \
52874 +       file_ops_readdir.o      \
52875 +       file_plugin_common.o    \
52876 +       dir_plugin_common.o     \
52877 +       digest.o                \
52878 +       hash.o                  \
52879 +       fibration.o             \
52880 +       tail_policy.o           \
52881 +       regular.o
52882 +
52883 +obj-$(CONFIG_REISER4_FS) += item/
52884 +obj-$(CONFIG_REISER4_FS) += file/
52885 +obj-$(CONFIG_REISER4_FS) += dir/
52886 +obj-$(CONFIG_REISER4_FS) += node/
52887 +obj-$(CONFIG_REISER4_FS) += compress/
52888 +obj-$(CONFIG_REISER4_FS) += space/
52889 +obj-$(CONFIG_REISER4_FS) += disk_format/
52890 +obj-$(CONFIG_REISER4_FS) += security/
52891 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/node/Makefile linux-2.6.35/fs/reiser4/plugin/node/Makefile
52892 --- linux-2.6.35.orig/fs/reiser4/plugin/node/Makefile   1970-01-01 01:00:00.000000000 +0100
52893 +++ linux-2.6.35/fs/reiser4/plugin/node/Makefile        2010-08-04 15:44:57.000000000 +0200
52894 @@ -0,0 +1,5 @@
52895 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
52896 +
52897 +node_plugins-objs :=   \
52898 +       node.o          \
52899 +       node40.o
52900 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/node/node40.c linux-2.6.35/fs/reiser4/plugin/node/node40.c
52901 --- linux-2.6.35.orig/fs/reiser4/plugin/node/node40.c   1970-01-01 01:00:00.000000000 +0100
52902 +++ linux-2.6.35/fs/reiser4/plugin/node/node40.c        2010-08-04 15:44:57.000000000 +0200
52903 @@ -0,0 +1,2924 @@
52904 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52905 +
52906 +#include "../../debug.h"
52907 +#include "../../key.h"
52908 +#include "../../coord.h"
52909 +#include "../plugin_header.h"
52910 +#include "../item/item.h"
52911 +#include "node.h"
52912 +#include "node40.h"
52913 +#include "../plugin.h"
52914 +#include "../../jnode.h"
52915 +#include "../../znode.h"
52916 +#include "../../pool.h"
52917 +#include "../../carry.h"
52918 +#include "../../tap.h"
52919 +#include "../../tree.h"
52920 +#include "../../super.h"
52921 +#include "../../reiser4.h"
52922 +
52923 +#include <asm/uaccess.h>
52924 +#include <linux/types.h>
52925 +#include <linux/prefetch.h>
52926 +
52927 +/* leaf 40 format:
52928 +
52929 +  [node header | item 0, item 1, .., item N-1 |  free space | item_head N-1, .. item_head 1, item head 0 ]
52930 +   plugin_id (16)                                                key
52931 +   free_space (16)                                               pluginid (16)
52932 +   free_space_start (16)                                         offset (16)
52933 +   level (8)
52934 +   num_items (16)
52935 +   magic (32)
52936 +   flush_time (32)
52937 +*/
52938 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs.  Change to "ReIs". */
52939 +/* magic number that is stored in ->magic field of node header */
52940 +static const __u32 REISER4_NODE_MAGIC = 0x52344653;    /* (*(__u32 *)"R4FS"); */
52941 +
52942 +static int prepare_for_update(znode * left, znode * right,
52943 +                             carry_plugin_info * info);
52944 +
52945 +/* the node40 header occupies the very beginning of the node's data */
52946 +static inline node40_header *node40_node_header(const znode * node     /* node to
52947 +                                                                        * query */ )
52948 +{
52949 +       assert("nikita-567", node != NULL);
52950 +       assert("nikita-568", znode_page(node) != NULL);
52951 +       assert("nikita-569", zdata(node) != NULL);
52952 +       return (node40_header *) zdata(node);
52953 +}
52954 +
52955 +/* get/set node40_header fields: little-endian, accessed as unaligned */
52956 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
52957 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
52958 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
52959 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
52960 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
52961 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
52962 +
52963 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
52964 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
52965 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
52966 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
52967 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
52968 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
52969 +
52970 +/* plugin field of node header should be read/set by
52971 +   plugin_by_disk_id/save_disk_plugin */
52972 +
52973 +/* item headers sit at the end of node; @pos counts back from the last slot */
52974 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
52975 +{
52976 +       return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
52977 +}
52978 +
52979 +/* header of the item @coord points at: headers are counted backwards from
52980 + * the end of the node, so item 0 owns the last header slot */
52981 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
52982 +{
52983 +       char *node_end = zdata(coord->node) + znode_size(coord->node);
52984 +
52985 +       return (item_header40 *) node_end - coord->item_pos - 1;
52986 +}
52987 +
52988 +/* get/set the little-endian, possibly unaligned offset of item_header40 */
52989 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
52990 +
52991 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
52992 +
52993 +/* plugin field of item header should be read/set by
52994 +   plugin_by_disk_id/save_disk_plugin */
52995 +
52996 +/* plugin methods */
52997 +
52998 +/* plugin->u.node.item_overhead: one item_header40 per item, args unused.
52999 +   look for description of this method in plugin/node/node.h */
53000 +size_t
53001 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
53002 +{
53003 +       return sizeof(item_header40);
53004 +}
53005 +
53006 +/* plugin->u.node.free_space: free_space field read from the node header.
53007 +   look for description of this method in plugin/node/node.h */
53008 +size_t free_space_node40(znode * node)
53009 +{
53010 +       assert("nikita-577", node != NULL);
53011 +       assert("nikita-578", znode_is_loaded(node));
53012 +       assert("nikita-579", zdata(node) != NULL);
53013 +
53014 +       return nh40_get_free_space(node40_node_header(node));
53015 +}
53016 +
53017 +/* private inline version of num_of_items_node40() for use in this file. This
53018 +   is necessary, because address of num_of_items_node40() is taken and it is
53019 +   never inlined as a result. */
53020 +static inline short node40_num_of_items_internal(const znode * node)
53021 +{
53022 +       return nh40_get_num_items(node40_node_header(node));
53023 +}
53024 +
53025 +#if REISER4_DEBUG
53026 +static inline void check_num_items(const znode * node)
53027 +{
53028 +       assert("nikita-2749",   /* header count must match the znode's copy */
53029 +              node40_num_of_items_internal(node) == node->nr_items);
53030 +       assert("nikita-2746", znode_is_write_locked(node));
53031 +}
53032 +#else /* !REISER4_DEBUG */
53033 +#define check_num_items(node) noop
53034 +#endif /* REISER4_DEBUG */
53035 +
53036 +/* plugin->u.node.num_of_items: item count as stored in the node header.
53037 +   look for description of this method in plugin/node/node.h */
53038 +int num_of_items_node40(const znode * node)
53039 +{
53040 +       return node40_num_of_items_internal(node);
53041 +}
53042 +
53043 +static void
53044 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
53045 +{
53046 +       assert("nikita-2751", node != NULL);
53047 +       assert("nikita-2750", nh == node40_node_header(node));
53048 +       /* update the header field and the znode's cached counter together */
53049 +       check_num_items(node);
53050 +       nh40_set_num_items(nh, value);
53051 +       node->nr_items = value;
53052 +       check_num_items(node);
53053 +}
53054 +
53055 +/* plugin->u.node.item_by_coord: address of the item body @coord points at.
53056 +   look for description of this method in plugin/node/node.h */
53057 +char *item_by_coord_node40(const coord_t * coord)
53058 +{
53059 +       unsigned body_offset;
53060 +
53061 +       /* caller must hand in a coord that is set to an existing item */
53062 +       assert("nikita-596", coord != NULL);
53063 +       assert("vs-255", coord_is_existing_item(coord));
53064 +
53065 +       /* the item header stores the body offset from the node start */
53066 +       body_offset = ih40_get_offset(node40_ih_at_coord(coord));
53067 +
53068 +       return zdata(coord->node) + body_offset;
53069 +}
53070 +
53071 +/* plugin->u.node.length_by_coord: body length of the item at @coord.
53072 +   look for description of this method in plugin/node/node.h */
53073 +int length_by_coord_node40(const coord_t * coord)
53074 +{
53075 +       item_header40 *ih;
53076 +       int body_end;
53077 +
53078 +       /* @coord is set to existing item */
53079 +       assert("vs-256", coord != NULL);
53080 +       assert("vs-257", coord_is_existing_item(coord));
53081 +
53082 +       ih = node40_ih_at_coord(coord);
53083 +       /* an item ends where the next one starts; the last one ends where
53084 +          the free space begins. Next item's header is at ih - 1 */
53085 +       if ((int)coord->item_pos !=
53086 +           node40_num_of_items_internal(coord->node) - 1)
53087 +               body_end = ih40_get_offset(ih - 1);
53088 +       else
53089 +               body_end =
53090 +                   nh40_get_free_space_start(node40_node_header(coord->node));
53091 +       return body_end - ih40_get_offset(ih);
53092 +}
53093 +
53094 +static pos_in_node_t
53095 +node40_item_length(const znode * node, pos_in_node_t item_pos)
53096 +{
53097 +       item_header40 *ih;
53098 +       pos_in_node_t result;
53099 +
53100 +       /* @item_pos must address an existing item of @node */
53101 +       assert("vs-256", node != NULL);
53102 +       assert("vs-257", node40_num_of_items_internal(node) > item_pos);
53103 +
53104 +       ih = node40_ih_at(node, item_pos);
53105 +       if (item_pos == node40_num_of_items_internal(node) - 1)
53106 +               result =        /* last item ends where free space starts */
53107 +                   nh40_get_free_space_start(node40_node_header(node)) -
53108 +                   ih40_get_offset(ih);
53109 +       else    /* otherwise it ends where the next item (ih - 1) starts */
53110 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
53111 +
53112 +       return result;
53113 +}
53114 +
53115 +/* plugin->u.node.plugin_by_coord: item plugin of the item at @coord.
53116 +   look for description of this method in plugin/node/node.h */
53117 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
53118 +{
53119 +       item_header40 *header;
53120 +
53121 +       /* @coord is set to existing item */
53122 +       assert("vs-258", coord != NULL);
53123 +       assert("vs-259", coord_is_existing_item(coord));
53124 +
53125 +       header = node40_ih_at_coord(coord);
53126 +
53127 +       /* This is a time critical call, so NULL is passed instead of the
53128 +          current tree. */
53129 +       return item_plugin_by_disk_id(NULL, &header->plugin_id);
53130 +}
53131 +
53132 +/* plugin->u.node.key_at: copy the key of the item at @coord into @key.
53133 +   look for description of this method in plugin/node/node.h */
53134 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
53135 +{
53136 +       const item_header40 *header;
53137 +
53138 +       /* @coord is set to existing item */
53139 +       assert("nikita-1765", coord_is_existing_item(coord));
53140 +
53141 +       header = node40_ih_at_coord(coord);
53142 +       memcpy(key, &header->key, sizeof(*key));
53143 +       return key;
53144 +}
53145 +
53146 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
53147 +
53148 +#define NODE_INCSTAT(n, counter)                                               \
53149 +       reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
53150 +
53151 +#define NODE_ADDSTAT(n, counter, val)                                          \
53152 +       reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
53153 +
53154 +/* plugin->u.node.lookup: find the item of @node that may contain @key.
53155 +   look for description of this method in plugin/node/node.h */
53156 +node_search_result lookup_node40(znode * node /* node to query */ ,
53157 +                                const reiser4_key * key /* key to look for */ ,
53158 +                                lookup_bias bias /* search bias */ ,
53159 +                                coord_t * coord /* resulting coord */ )
53160 +{
53161 +       int left;
53162 +       int right;
53163 +       int found;
53164 +       int items;
53165 +
53166 +       item_header40 *lefth;
53167 +       item_header40 *righth;
53168 +
53169 +       item_plugin *iplug;
53170 +       item_header40 *bstop;
53171 +       item_header40 *ih;
53172 +       cmp_t order;
53173 +
53174 +       assert("nikita-583", node != NULL);
53175 +       assert("nikita-584", key != NULL);
53176 +       assert("nikita-585", coord != NULL);
53177 +       assert("nikita-2693", znode_is_any_locked(node));
53178 +       cassert(REISER4_SEQ_SEARCH_BREAK > 2);
53179 +
53180 +       items = node_num_items(node);
53181 +
53182 +       if (unlikely(items == 0)) {
53183 +               coord_init_first_unit(coord, node);
53184 +               return NS_NOT_FOUND;
53185 +       }
53186 +
53187 +       /* binary search for item that can contain given key */
53188 +       left = 0;
53189 +       right = items - 1;
53190 +       coord->node = node;
53191 +       coord_clear_iplug(coord);
53192 +       found = 0;
53193 +
53194 +       lefth = node40_ih_at(node, left);
53195 +       righth = node40_ih_at(node, right);
53196 +
53197 +       /* It is known that for small arrays sequential search is on average
53198 +          more efficient than binary. This is because sequential search is
53199 +          coded as tight loop that can be better optimized by compilers and
53200 +          for small array size gain from this optimization makes sequential
53201 +          search the winner. Another, maybe more important, reason for this,
53202 +          is that sequential array is more CPU cache friendly, whereas binary
53203 +          search effectively destroys CPU caching.
53204 +
53205 +          Critical here is the notion of "smallness". Reasonable value of
53206 +          REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
53207 +          fs/reiser4/ulevel/ulevel.c:test_search().
53208 +
53209 +          Don't try to further optimize sequential search by scanning from
53210 +          right to left in attempt to use more efficient loop termination
53211 +          condition (comparison with 0). This doesn't work.
53212 +
53213 +        */
53214 +
53215 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
53216 +               int median;
53217 +               item_header40 *medianh;
53218 +
53219 +               median = (left + right) / 2;
53220 +               medianh = node40_ih_at(node, median);
53221 +
53222 +               assert("nikita-1084", median >= 0);
53223 +               assert("nikita-1085", median < items);
53224 +               switch (keycmp(key, &medianh->key)) {
53225 +               case LESS_THAN:
53226 +                       right = median;
53227 +                       righth = medianh;
53228 +                       break;
53229 +               default:
53230 +                       wrong_return_value("nikita-586", "keycmp"); /* no break */
53231 +               case GREATER_THAN:
53232 +                       left = median;
53233 +                       lefth = medianh;
53234 +                       break;
53235 +               case EQUAL_TO:
53236 +                       do {
53237 +                               --median;
53238 +                               /* headers are ordered from right to left */
53239 +                               ++medianh;
53240 +                       } while (median >= 0 && keyeq(key, &medianh->key));
53241 +                       right = left = median + 1;
53242 +                       ih = lefth = righth = medianh - 1;
53243 +                       found = 1;
53244 +                       break;
53245 +               }
53246 +       }
53247 +       /* sequential scan. Item headers, and, therefore, keys are stored at
53248 +          the rightmost part of a node from right to left. We are trying to
53249 +          access memory from left to right, and hence, scan in _descending_
53250 +          order of item numbers.
53251 +        */
53252 +       if (!found) {
53253 +               for (left = right, ih = righth; left >= 0; ++ih, --left) {
53254 +                       cmp_t comparison;
53255 +
53256 +                       prefetchkey(&(ih + 1)->key);
53257 +                       comparison = keycmp(&ih->key, key);
53258 +                       if (comparison == GREATER_THAN)
53259 +                               continue;
53260 +                       if (comparison == EQUAL_TO) {
53261 +                               found = 1;
53262 +                               do {
53263 +                                       --left;
53264 +                                       ++ih;
53265 +                               } while (left >= 0 && keyeq(&ih->key, key));
53266 +                               ++left;
53267 +                               --ih;
53268 +                       } else {
53269 +                               assert("nikita-1256", comparison == LESS_THAN);
53270 +                       }
53271 +                       break;
53272 +               }
53273 +               if (unlikely(left < 0))
53274 +                       left = 0;
53275 +       }
53276 +
53277 +       assert("nikita-3212", right >= left);
53278 +       assert("nikita-3214",
53279 +              equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53280 +
53281 +       coord_set_item_pos(coord, left);
53282 +       coord->unit_pos = 0;
53283 +       coord->between = AT_UNIT;
53284 +
53285 +       /* key < leftmost key in a node or node is corrupted and keys
53286 +          are not sorted  */
53287 +       bstop = node40_ih_at(node, (unsigned)left);
53288 +       order = keycmp(&bstop->key, key);
53289 +       if (unlikely(order == GREATER_THAN)) {
53290 +               if (unlikely(left != 0)) {
53291 +                       /* screw up */
53292 +                       warning("nikita-587", "Key less than %i key in a node",
53293 +                               left);
53294 +                       reiser4_print_key("key", key);
53295 +                       reiser4_print_key("min", &bstop->key);
53296 +                       print_coord_content("coord", coord);
53297 +                       return RETERR(-EIO);
53298 +               } else {
53299 +                       coord->between = BEFORE_UNIT;
53300 +                       return NS_NOT_FOUND;
53301 +               }
53302 +       }
53303 +       /* left <= key, ok */
53304 +       iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53305 +
53306 +       if (unlikely(iplug == NULL)) {
53307 +               warning("nikita-588", "Unknown plugin %i",
53308 +                       le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53309 +               reiser4_print_key("key", key);
53310 +               print_coord_content("coord", coord);
53311 +               return RETERR(-EIO);
53312 +       }
53313 +
53314 +       coord_set_iplug(coord, iplug);
53315 +
53316 +       /* if exact key from item header was found by the search above, no
53317 +          further checks are necessary. */
53318 +       if (found) {
53319 +               assert("nikita-1259", order == EQUAL_TO);
53320 +               return NS_FOUND;
53321 +       }
53322 +       if (iplug->b.max_key_inside != NULL) {
53323 +               reiser4_key max_item_key;
53324 +
53325 +               /* key > max_item_key --- outside of an item */
53326 +               if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53327 +                       coord->unit_pos = 0;
53328 +                       coord->between = AFTER_ITEM;
53329 +                       /* FIXME-VS: key we are looking for does not fit into
53330 +                          found item. Return NS_NOT_FOUND then. Without that
53331 +                          the following case does not work: there is extent of
53332 +                          file 10000, 10001. File 10000, 10002 has been just
53333 +                          created. When writing to position 0 in that file -
53334 +                          traverse_tree will stop here on twig level. When we
53335 +                          want it to go down to leaf level
53336 +                        */
53337 +                       return NS_NOT_FOUND;
53338 +               }
53339 +       }
53340 +
53341 +       if (iplug->b.lookup != NULL) {
53342 +               return iplug->b.lookup(key, bias, coord);
53343 +       } else {
53344 +               assert("nikita-1260", order == LESS_THAN);
53345 +               coord->between = AFTER_UNIT;
53346 +               return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53347 +       }
53348 +}
53349 +
53350 +#undef NODE_ADDSTAT
53351 +#undef NODE_INCSTAT
53352 +
53353 +/* plugin->u.node.estimate (described in plugin/node/node.h): free space of
53354 +   @node less one item_header40, or 0 when nothing is left past the header */
53355 +size_t estimate_node40(znode * node)
53356 +{
53357 +       size_t result;
53358 +       const size_t ih_size = sizeof(item_header40);
53359 +
53360 +       assert("nikita-597", node != NULL);
53361 +       result = free_space_node40(node);
53362 +       /* size_t is unsigned: "result - ih_size" must not be allowed to wrap */
53363 +       return (result > ih_size) ? result - ih_size : 0;
53364 +}
53365 +
53366 +/* plugin->u.node.check (described in plugin/node/node.h): sanity-check item
53367 +   headers and keys of @node; returns 0 if ok, else -1 with *@error set */
53368 +int check_node40(const znode * node /* node to check */ ,
53369 +                __u32 flags /* check flags */ ,
53370 +                const char **error /* where to store error message */ )
53371 +{
53372 +       int nr_items;
53373 +       int i;
53374 +       reiser4_key prev;
53375 +       unsigned old_offset;
53376 +       tree_level level;
53377 +       coord_t coord;
53378 +       int result;
53379 +
53380 +       assert("nikita-580", node != NULL);
53381 +       assert("nikita-581", error != NULL);
53382 +       assert("nikita-2948", znode_is_loaded(node));
53383 +
53384 +       if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53385 +               return 0;       /* nodes carrying this flag are not checked */
53386 +
53387 +       assert("nikita-582", zdata(node) != NULL);
53388 +
53389 +       nr_items = node40_num_of_items_internal(node);
53390 +       if (nr_items < 0) {
53391 +               *error = "Negative number of items";
53392 +               return -1;
53393 +       }
53394 +       /* @prev: lower bound for the next key; starts at ldkey or the min key */
53395 +       if (flags & REISER4_NODE_DKEYS)
53396 +               prev = *znode_get_ld_key((znode *) node);
53397 +       else
53398 +               prev = *reiser4_min_key();
53399 +
53400 +       old_offset = 0;
53401 +       coord_init_zero(&coord);
53402 +       coord.node = (znode *) node;
53403 +       coord.unit_pos = 0;
53404 +       coord.between = AT_UNIT;
53405 +       level = znode_get_level(node);
53406 +       for (i = 0; i < nr_items; i++) {
53407 +               item_header40 *ih;
53408 +               reiser4_key unit_key;
53409 +               unsigned j;
53410 +
53411 +               ih = node40_ih_at(node, (unsigned)i);
53412 +               coord_set_item_pos(&coord, i);
53413 +               if ((ih40_get_offset(ih) >=
53414 +                    znode_size(node) - nr_items * sizeof(item_header40)) ||
53415 +                   (ih40_get_offset(ih) < sizeof(node40_header))) {
53416 +                       *error = "Offset is out of bounds";
53417 +                       return -1;
53418 +               }
53419 +               if (ih40_get_offset(ih) <= old_offset) {
53420 +                       *error = "Offsets are in wrong order";
53421 +                       return -1;
53422 +               }
53423 +               if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53424 +                       *error = "Wrong offset of first item";
53425 +                       return -1;
53426 +               }
53427 +               old_offset = ih40_get_offset(ih);
53428 +               /* keys must not decrease from item to item or unit to unit */
53429 +               if (keygt(&prev, &ih->key)) {
53430 +                       *error = "Keys are in wrong order";
53431 +                       return -1;
53432 +               }
53433 +               if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53434 +                       *error = "Wrong key of first unit";
53435 +                       return -1;
53436 +               }
53437 +               prev = ih->key;
53438 +               for (j = 0; j < coord_num_units(&coord); ++j) {
53439 +                       coord.unit_pos = j;
53440 +                       unit_key_by_coord(&coord, &unit_key);
53441 +                       if (keygt(&prev, &unit_key)) {
53442 +                               *error = "Unit keys are in wrong order";
53443 +                               return -1;
53444 +                       }
53445 +                       prev = unit_key;
53446 +               }
53447 +               coord.unit_pos = 0;
53448 +               if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53449 +                       *error = "extent on the wrong level";
53450 +                       return -1;
53451 +               }
53452 +               if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53453 +                       *error = "internal item on the wrong level";
53454 +                       return -1;
53455 +               }
53456 +               if (level != LEAF_LEVEL &&
53457 +                   !item_is_internal(&coord) && !item_is_extent(&coord)) {
53458 +                       *error = "wrong item on the internal level";
53459 +                       return -1;
53460 +               }
53461 +               if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53462 +                       *error = "non-internal item on the internal level";
53463 +                       return -1;
53464 +               }
53465 +#if REISER4_DEBUG
53466 +               if (item_plugin_by_coord(&coord)->b.check
53467 +                   && item_plugin_by_coord(&coord)->b.check(&coord, error))
53468 +                       return -1;
53469 +#endif
53470 +               if (i) {
53471 +                       coord_t prev_coord;
53472 +                       /* two neighboring items cannot be mergeable */
53473 +                       coord_dup(&prev_coord, &coord);
53474 +                       coord_prev_item(&prev_coord);
53475 +                       if (are_items_mergeable(&prev_coord, &coord)) {
53476 +                               *error = "mergeable items in one node";
53477 +                               return -1;
53478 +                       }
53479 +
53480 +               }
53481 +       }
53482 +
53483 +       if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53484 +               coord_t coord;  /* shadows the function-level coord */
53485 +               item_plugin *iplug;
53486 +
53487 +               coord_init_last_unit(&coord, node);
53488 +               iplug = item_plugin_by_coord(&coord);
53489 +               if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53490 +                   iplug->s.file.append_key != NULL) {
53491 +                       reiser4_key mkey;
53492 +
53493 +                       iplug->s.file.append_key(&coord, &mkey);
53494 +                       set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53495 +                       read_lock_dk(current_tree);
53496 +                       result = keygt(&mkey, znode_get_rd_key((znode *) node));
53497 +                       read_unlock_dk(current_tree);
53498 +                       if (result) {
53499 +                               *error = "key of rightmost item is too large";
53500 +                               return -1;
53501 +                       }
53502 +               }
53503 +       }
53504 +       if (flags & REISER4_NODE_DKEYS) {
53505 +               read_lock_tree(current_tree);
53506 +               read_lock_dk(current_tree);
53507 +               /* NOTE(review): forced on; !STABLE cases below look unreachable */
53508 +               flags |= REISER4_NODE_TREE_STABLE;
53509 +
53510 +               if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53511 +                       if (flags & REISER4_NODE_TREE_STABLE) {
53512 +                               *error = "Last key is greater than rdkey";
53513 +                               read_unlock_dk(current_tree);
53514 +                               read_unlock_tree(current_tree);
53515 +                               return -1;
53516 +                       }
53517 +               }
53518 +               if (keygt
53519 +                   (znode_get_ld_key((znode *) node),
53520 +                    znode_get_rd_key((znode *) node))) {
53521 +                       *error = "ldkey is greater than rdkey";
53522 +                       read_unlock_dk(current_tree);
53523 +                       read_unlock_tree(current_tree);
53524 +                       return -1;
53525 +               }
53526 +               if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53527 +                   (node->left != NULL) &&
53528 +                   !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53529 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
53530 +                        !keyeq(znode_get_rd_key(node->left),
53531 +                               znode_get_ld_key((znode *) node)))
53532 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53533 +                           keygt(znode_get_rd_key(node->left),
53534 +                                 znode_get_ld_key((znode *) node)))) {
53535 +                       *error = "left rdkey or ldkey is wrong";
53536 +                       read_unlock_dk(current_tree);
53537 +                       read_unlock_tree(current_tree);
53538 +                       return -1;
53539 +               }
53540 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53541 +                   (node->right != NULL) &&
53542 +                   !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53543 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
53544 +                        !keyeq(znode_get_rd_key((znode *) node),
53545 +                               znode_get_ld_key(node->right)))
53546 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53547 +                           keygt(znode_get_rd_key((znode *) node),
53548 +                                 znode_get_ld_key(node->right)))) {
53549 +                       *error = "rdkey or right ldkey is wrong";
53550 +                       read_unlock_dk(current_tree);
53551 +                       read_unlock_tree(current_tree);
53552 +                       return -1;
53553 +               }
53554 +
53555 +               read_unlock_dk(current_tree);
53556 +               read_unlock_tree(current_tree);
53557 +       }
53558 +
53559 +       return 0;
53560 +}
53561 +
53562 +/* plugin->u.node.parse (described in plugin/node/node.h): check level and
53563 +   magic of the node header, cache the item count; result is 0 or -EIO */
53564 +int parse_node40(znode * node /* node to parse */ )
53565 +{
53566 +       node40_header *nh = node40_node_header((znode *) node);
53567 +       d8 disk_level = nh40_get_level(nh);
53568 +       int ret = -EIO;         /* pessimistic default, cleared on success */
53569 +
53570 +       if (unlikely(((__u8) znode_get_level(node)) != disk_level)) {
53571 +               /* level stored in the header disagrees with the znode */
53572 +               warning("nikita-494", "Wrong level found in node: %i != %i",
53573 +                       znode_get_level(node), disk_level);
53574 +       } else if (unlikely(nh40_get_magic(nh) != REISER4_NODE_MAGIC)) {
53575 +               /* header does not carry REISER4_NODE_MAGIC */
53576 +               warning("nikita-495",
53577 +                       "Wrong magic in tree node: want %x, got %x",
53578 +                       REISER4_NODE_MAGIC, nh40_get_magic(nh));
53579 +       } else {
53580 +               /* header passed both checks: remember the number of items */
53581 +               node->nr_items = node40_num_of_items_internal(node);
53582 +               ret = 0;
53583 +       }
53584 +       return RETERR(ret);
53585 +}
53586 +
53587 +/* plugin->u.node.init (described in plugin/node/node.h): write a fresh
53588 +   node40 header describing an empty node into @node; always returns 0 */
53589 +int init_node40(znode * node /* node to initialise */ )
53590 +{
53591 +       node40_header *header;
53592 +
53593 +       assert("nikita-570", node != NULL);
53594 +       assert("nikita-572", zdata(node) != NULL);
53595 +       /* start from an all-zero header, then fill in the non-zero fields */
53596 +       header = node40_node_header(node);
53597 +       memset(header, 0, sizeof(node40_header));
53598 +       nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
53599 +       nh40_set_free_space_start(header, sizeof(node40_header));
53600 +       /* sane hypothesis: 0 in CPU format is 0 in disk format */
53601 +       /* items: 0 (left as zeroed by the memset above) */
53602 +       save_plugin_id(node_plugin_to_plugin(node->nplug),
53603 +                      &header->common_header.plugin_id);
53604 +       nh40_set_level(header, znode_get_level(node));
53605 +       nh40_set_magic(header, REISER4_NODE_MAGIC);
53606 +       node->nr_items = 0;     /* in-memory item count matches the header */
53607 +       nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53608 +
53609 +       /* flags: 0 (likewise left as zeroed by the memset) */
53610 +       return 0;
53611 +}
53612 +
53613 +#ifdef GUESS_EXISTS
53614 +int guess_node40(const znode * node /* node to guess plugin of */ )
53615 +{
53616 +       node40_header *nethack;
53617 +       /* non-zero iff the header has the node40 magic and plugin id NODE40_ID */
53618 +       assert("nikita-1058", node != NULL);
53619 +       nethack = node40_node_header(node);
53620 +       return
53621 +           (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53622 +           (plugin_by_disk_id(znode_get_tree(node),
53623 +                              REISER4_NODE_PLUGIN_TYPE,
53624 +                              &nethack->common_header.plugin_id)->h.id ==
53625 +            NODE40_ID);
53626 +}
53627 +#endif /* GUESS_EXISTS */
53628 +
53629 +/* plugin->u.node.change_item_size (described in plugin/node/node.h): grow
53630 +   (or, for negative @by, shrink) the item at @coord by @by bytes */
53631 +void change_item_size_node40(coord_t * coord, int by)
53632 +{
53633 +       node40_header *nh;
53634 +       item_header40 *ih;
53635 +       char *item_data;
53636 +       int item_length;
53637 +       unsigned i;
53638 +
53639 +       /* make sure that @coord is coord of existing item */
53640 +       assert("vs-210", coord_is_existing_item(coord));
53641 +
53642 +       nh = node40_node_header(coord->node);
53643 +
53644 +       item_data = item_by_coord_node40(coord);
53645 +       item_length = length_by_coord_node40(coord);
53646 +
53647 +       /* move bodies of all items stored after this one by @by bytes */
53648 +       ih = node40_ih_at_coord(coord);
53649 +       memmove(item_data + item_length + by, item_data + item_length,
53650 +               nh40_get_free_space_start(node40_node_header(coord->node)) -
53651 +               (ih40_get_offset(ih) + item_length));
53652 +
53653 +       /* update offsets of moved items */
53654 +       for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53655 +               ih = node40_ih_at(coord->node, i);
53656 +               ih40_set_offset(ih, ih40_get_offset(ih) + by);
53657 +       }
53658 +
53659 +       /* update node header: @by bytes leave (or return to) the free space */
53660 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53661 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53662 +}
53663 +
53664 +static int should_notify_parent(const znode * node)
53665 +{
53666 +       /* FIXME_JMACD: returns !root; same as !znode_is_root(), right? -josh */
53667 +       return !disk_addr_eq(znode_get_block(node),
53668 +                            &znode_get_tree(node)->root_block);
53669 +}
53670 +
53671 +/* plugin->u.node.create_item (described in plugin/node/node.h): insert a new
53672 +   item with @key, described by @data, at @target; always returns 0 */
53673 +int
53674 +create_item_node40(coord_t *target, const reiser4_key *key,
53675 +                  reiser4_item_data *data, carry_plugin_info *info)
53676 +{
53677 +       node40_header *nh;
53678 +       item_header40 *ih;
53679 +       unsigned offset;
53680 +       unsigned i;
53681 +
53682 +       nh = node40_node_header(target->node);
53683 +
53684 +       assert("vs-212", coord_is_between_items(target));
53685 +       /* node must have enough free space */
53686 +       assert("vs-254",
53687 +              free_space_node40(target->node) >=
53688 +              data->length + sizeof(item_header40));
53689 +       assert("vs-1410", data->length >= 0);
53690 +
53691 +       if (coord_set_to_right(target))
53692 +               /* there are no items to the right of @target, so the new
53693 +                  item will be inserted after the last one */
53694 +               coord_set_item_pos(target, nh40_get_num_items(nh));
53695 +
53696 +       if (target->item_pos < nh40_get_num_items(nh)) {
53697 +               /* there are items to be moved to prepare space for new
53698 +                  item */
53699 +               ih = node40_ih_at_coord(target);
53700 +               /* new item will start at this offset */
53701 +               offset = ih40_get_offset(ih);
53702 +
53703 +               memmove(zdata(target->node) + offset + data->length,
53704 +                       zdata(target->node) + offset,
53705 +                       nh40_get_free_space_start(nh) - offset);
53706 +               /* update headers of moved items */
53707 +               for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53708 +                       ih = node40_ih_at(target->node, i);
53709 +                       ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53710 +               }
53711 +
53712 +               /* @ih is set to item header of the last item, move item headers */
53713 +               memmove(ih - 1, ih,
53714 +                       sizeof(item_header40) * (nh40_get_num_items(nh) -
53715 +                                                target->item_pos));
53716 +       } else {
53717 +               /* new item will start at this offset */
53718 +               offset = nh40_get_free_space_start(nh);
53719 +       }
53720 +
53721 +       /* make item header for the new item */
53722 +       ih = node40_ih_at_coord(target);
53723 +       memcpy(&ih->key, key, sizeof(reiser4_key));
53724 +       ih40_set_offset(ih, offset);
53725 +       save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53726 +
53727 +       /* update node header */
53728 +       nh40_set_free_space(nh,
53729 +                           nh40_get_free_space(nh) - data->length -
53730 +                           sizeof(item_header40));
53731 +       nh40_set_free_space_start(nh,
53732 +                                 nh40_get_free_space_start(nh) + data->length);
53733 +       node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53734 +
53735 +       /* FIXME: check how does create_item work when between is set to BEFORE_UNIT */
53736 +       target->unit_pos = 0;
53737 +       target->between = AT_UNIT;
53738 +       coord_clear_iplug(target);
53739 +
53740 +       /* initialize item */
53741 +       if (data->iplug->b.init != NULL) {
53742 +               data->iplug->b.init(target, NULL, data);
53743 +       }
53744 +       /* copy item body: via the plugin's paste method if it has one */
53745 +       if (data->iplug->b.paste != NULL) {
53746 +               data->iplug->b.paste(target, data, info);
53747 +       } else if (data->data != NULL) {
53748 +               if (data->user) {
53749 +                       /* AUDIT: the result of __copy_from_user() is ignored.
53750 +                          Should we not check that the pointer from userspace
53751 +                          was valid and the data bytes were available? How
53752 +                          will we return -EFAULT without this check? */
53753 +                       assert("nikita-3038", reiser4_schedulable());
53754 +                       /* copy data from user space */
53755 +                       __copy_from_user(zdata(target->node) + offset,
53756 +                                        (const char __user *)data->data,
53757 +                                        (unsigned)data->length);
53758 +               } else
53759 +                       /* copy from kernel space */
53760 +                       memcpy(zdata(target->node) + offset, data->data,
53761 +                              (unsigned)data->length);
53762 +       }
53763 +
53764 +       if (target->item_pos == 0) {
53765 +               /* left delimiting key has to be updated */
53766 +               prepare_for_update(NULL, target->node, info);
53767 +       }
53768 +
53769 +       if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53770 +               item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53771 +       }
53772 +
53773 +       return 0;
53774 +}
53775 +
53776 +/* plugin->u.node.update_item_key (described in plugin/node/node.h): store
53777 +   @key in the header of the item at @target */
53778 +void
53779 +update_item_key_node40(coord_t * target, const reiser4_key * key,
53780 +                      carry_plugin_info * info)
53781 +{
53782 +       item_header40 *const header = node40_ih_at_coord(target);
53783 +
53784 +       memcpy(&header->key, key, sizeof(reiser4_key));
53785 +
53786 +       if (target->item_pos != 0)
53787 +               return;
53788 +       /* key of the very first item changed: hand over to prepare_for_update */
53789 +       prepare_for_update(NULL, target->node, info);
53790 +}
53791 +
53792 +/* these bits encode the cut mode; they are OR-ed into cut40_info.mode */
53793 +#define CMODE_TAIL 1   /* an item gets its tail cut */
53794 +#define CMODE_WHOLE 2  /* at least one item is removed completely */
53795 +#define CMODE_HEAD 4   /* an item gets its head cut */
53796 +
53797 +struct cut40_info {
53798 +       int mode;       /* combination of CMODE_* bits, 0 when none is set */
53799 +       pos_in_node_t tail_removed;     /* position of item which gets tail removed */
53800 +       pos_in_node_t first_removed;    /* position of the leftmost item among items removed completely */
53801 +       pos_in_node_t removed_count;    /* number of items removed completely */
53802 +       pos_in_node_t head_removed;     /* position of item which gets head removed */
53803 +
53804 +       pos_in_node_t freed_space_start;        /* node offset where the freed gap begins */
53805 +       pos_in_node_t freed_space_end;  /* node offset where the freed gap ends */
53806 +       pos_in_node_t first_moved;      /* position of first item whose offset compact() updates */
53807 +       pos_in_node_t head_removed_location;    /* new offset for the @head_removed item, if set */
53808 +};
53809 +
53810 +static void init_cinfo(struct cut40_info *cinfo)
53811 +{
53812 +       cinfo->mode = 0;        /* no CMODE_* bit chosen yet */
53813 +       cinfo->tail_removed = MAX_POS_IN_NODE;  /* MAX_POS_IN_NODE == not set */
53814 +       cinfo->first_removed = MAX_POS_IN_NODE;
53815 +       cinfo->removed_count = MAX_POS_IN_NODE;
53816 +       cinfo->head_removed = MAX_POS_IN_NODE;
53817 +       cinfo->freed_space_start = MAX_POS_IN_NODE;
53818 +       cinfo->freed_space_end = MAX_POS_IN_NODE;
53819 +       cinfo->first_moved = MAX_POS_IN_NODE;
53820 +       cinfo->head_removed_location = MAX_POS_IN_NODE;
53821 +}
53822 +
53823 +/* finish cut_node40/kill_node40: close the gap left by the removed data */
53824 +static void compact(znode * node, struct cut40_info *cinfo)
53825 +{
53826 +       node40_header *nh;
53827 +       item_header40 *ih;
53828 +       pos_in_node_t freed;
53829 +       pos_in_node_t pos, nr_items;
53830 +
53831 +       assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53832 +                          cinfo->freed_space_end != MAX_POS_IN_NODE &&
53833 +                          cinfo->first_moved != MAX_POS_IN_NODE));
53834 +       assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53835 +
53836 +       nh = node40_node_header(node);
53837 +       nr_items = nh40_get_num_items(nh);
53838 +
53839 +       /* remove gap made up by removal */
53840 +       memmove(zdata(node) + cinfo->freed_space_start,
53841 +               zdata(node) + cinfo->freed_space_end,
53842 +               nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53843 +
53844 +       /* update item headers of moved items - change their locations */
53845 +       pos = cinfo->first_moved;
53846 +       ih = node40_ih_at(node, pos);
53847 +       if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
53848 +               assert("vs-1580", pos == cinfo->head_removed);
53849 +               ih40_set_offset(ih, cinfo->head_removed_location);
53850 +               pos++;
53851 +               ih--;
53852 +       }
53853 +       /* every remaining moved item shifts down by the size of the gap */
53854 +       freed = cinfo->freed_space_end - cinfo->freed_space_start;
53855 +       for (; pos < nr_items; pos++, ih--) {
53856 +               assert("vs-1581", ih == node40_ih_at(node, pos));
53857 +               ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53858 +       }
53859 +
53860 +       /* free space start moved to left */
53861 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
53862 +
53863 +       if (cinfo->removed_count != MAX_POS_IN_NODE) {
53864 +               /* number of items changed. Remove item headers of those items */
53865 +               ih = node40_ih_at(node, nr_items - 1);
53866 +               memmove(ih + cinfo->removed_count, ih,
53867 +                       sizeof(item_header40) * (nr_items -
53868 +                                                cinfo->removed_count -
53869 +                                                cinfo->first_removed));
53870 +               freed += sizeof(item_header40) * cinfo->removed_count;
53871 +               node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
53872 +       }
53873 +
53874 +       /* total amount of free space increased */
53875 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
53876 +}
53877 +
53878 +int shrink_item_node40(coord_t * coord, int delta)
53879 +{
53880 +       node40_header *nh;
53881 +       item_header40 *ih;
53882 +       pos_in_node_t pos;
53883 +       pos_in_node_t nr_items;
53884 +       char *end;      /* first byte past the body of the item at @coord */
53885 +       znode *node;
53886 +       int off;        /* offset of @end within the node data */
53887 +       /* drop the last @delta bytes of the item at @coord; always returns 0 */
53888 +       assert("nikita-3487", coord != NULL);
53889 +       assert("nikita-3488", delta >= 0);
53890 +
53891 +       node = coord->node;
53892 +       nh = node40_node_header(node);
53893 +       nr_items = nh40_get_num_items(nh);
53894 +
53895 +       ih = node40_ih_at_coord(coord);
53896 +       assert("nikita-3489", delta <= length_by_coord_node40(coord));
53897 +       off = ih40_get_offset(ih) + length_by_coord_node40(coord);
53898 +       end = zdata(node) + off;
53899 +
53900 +       /* remove gap made up by removal */
53901 +       memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
53902 +
53903 +       /* update item headers of moved items - change their locations */
53904 +       pos = coord->item_pos + 1;
53905 +       ih = node40_ih_at(node, pos);
53906 +       for (; pos < nr_items; pos++, ih--) {
53907 +               assert("nikita-3490", ih == node40_ih_at(node, pos));
53908 +               ih40_set_offset(ih, ih40_get_offset(ih) - delta);
53909 +       }
53910 +
53911 +       /* free space start moved to left */
53912 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
53913 +       /* total amount of free space increased */
53914 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
53915 +       /*
53916 +        * This method does _not_ change the number of items. Hence, it cannot
53917 +        * make node empty. Also it doesn't remove items at all, which means
53918 +        * that no keys have to be updated either.
53919 +        */
53920 +       return 0;
53921 +}
53922 +
53923 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
53924 +   of cut. First is when a unit is removed from the middle of an item.  In this case this function returns 1. All the
53925 +   rest fits into second case: 0 or 1 items getting tail cut, 0 or more items removed completely and 0 or 1 items
53926 +   getting head cut. Function returns 0 in this case, with the result described by @cinfo */
53927 +static int
53928 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
53929 +{
53930 +       reiser4_key left_key, right_key;
53931 +       reiser4_key min_from_key, max_to_key;
53932 +       const reiser4_key *from_key, *to_key;
53933 +
53934 +       init_cinfo(cinfo);
53935 +
53936 +       /* calculate minimal key stored in first item of items to be cut (params->from) */
53937 +       item_key_by_coord(params->from, &min_from_key);
53938 +       /* and max key stored in last item of items to be cut (params->to) */
53939 +       max_item_key_by_coord(params->to, &max_to_key);
53940 +
53941 +       /* if cut key range is not defined in input parameters - define it using cut coord range */
53942 +       if (params->from_key == NULL) {
53943 +               assert("vs-1513", params->to_key == NULL);
53944 +               unit_key_by_coord(params->from, &left_key);
53945 +               from_key = &left_key;
53946 +               max_unit_key_by_coord(params->to, &right_key);
53947 +               to_key = &right_key;
53948 +       } else {
53949 +               from_key = params->from_key;
53950 +               to_key = params->to_key;
53951 +       }
53952 +       /* cut confined to a single item vs. cut spanning several items */
53953 +       if (params->from->item_pos == params->to->item_pos) {
53954 +               if (keylt(&min_from_key, from_key)
53955 +                   && keylt(to_key, &max_to_key))
53956 +                       return 1;
53957 +
53958 +               if (keygt(from_key, &min_from_key)) {
53959 +                       /* tail of item is to be cut */
53960 +                       cinfo->tail_removed = params->from->item_pos;
53961 +                       cinfo->mode |= CMODE_TAIL;
53962 +               } else if (keylt(to_key, &max_to_key)) {
53963 +                       /* head of item is to be cut */
53964 +                       cinfo->head_removed = params->from->item_pos;
53965 +                       cinfo->mode |= CMODE_HEAD;
53966 +               } else {
53967 +                       /* item is removed completely */
53968 +                       cinfo->first_removed = params->from->item_pos;
53969 +                       cinfo->removed_count = 1;
53970 +                       cinfo->mode |= CMODE_WHOLE;
53971 +               }
53972 +       } else {
53973 +               cinfo->first_removed = params->from->item_pos + 1;
53974 +               cinfo->removed_count =
53975 +                   params->to->item_pos - params->from->item_pos - 1;
53976 +
53977 +               if (keygt(from_key, &min_from_key)) {
53978 +                       /* first item is not cut completely */
53979 +                       cinfo->tail_removed = params->from->item_pos;
53980 +                       cinfo->mode |= CMODE_TAIL;
53981 +               } else {
53982 +                       cinfo->first_removed--;
53983 +                       cinfo->removed_count++;
53984 +               }
53985 +               if (keylt(to_key, &max_to_key)) {
53986 +                       /* last item is not cut completely */
53987 +                       cinfo->head_removed = params->to->item_pos;
53988 +                       cinfo->mode |= CMODE_HEAD;
53989 +               } else {
53990 +                       cinfo->removed_count++;
53991 +               }
53992 +               if (cinfo->removed_count)
53993 +                       cinfo->mode |= CMODE_WHOLE;
53994 +       }
53995 +
53996 +       return 0;
53997 +}
53998 +
53999 +static void
54000 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
54001 +               carry_kill_data * kdata)
54002 +{
54003 +       coord_t item;
54004 +       item_plugin *plug;
54005 +       pos_in_node_t done;
54006 +
54007 +       /* run the kill_hook, if any, on @count whole items starting at @from */
54008 +       item.node = node;
54009 +       item.unit_pos = 0;
54010 +       item.between = AT_UNIT;
54011 +       for (done = 0; done < count; done++) {
54012 +               coord_set_item_pos(&item, from + done);
54013 +               item.unit_pos = 0;
54014 +               item.between = AT_UNIT;
54015 +               plug = item_plugin_by_coord(&item);
54016 +               if (plug->b.kill_hook == NULL)
54017 +                       continue;
54018 +               plug->b.kill_hook(&item, 0, coord_num_units(&item), kdata);
54019 +       }
54020 +}
54021 +
54022 +/* this is used to kill item partially: forwards to item plugin's kill_units */
54023 +static pos_in_node_t
54024 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
54025 +          reiser4_key * smallest_removed, reiser4_key * new_first_key)
54026 +{
54027 +       struct carry_kill_data *kdata;
54028 +       item_plugin *iplug;
54029 +
54030 +       kdata = data;   /* untyped @data lets prepare_for_compact() call kill_* and cut_* alike */
54031 +       iplug = item_plugin_by_coord(coord);
54032 +
54033 +       assert("vs-1524", iplug->b.kill_units);
54034 +       return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
54035 +                                  new_first_key);
54036 +}
54037 +
54038 +/* call item plugin to kill tail of item: units from coord->unit_pos to last */
54039 +static pos_in_node_t
54040 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
54041 +{
54042 +       struct carry_kill_data *kdata;
54043 +       pos_in_node_t to;
54044 +
54045 +       kdata = data;
54046 +       to = coord_last_unit_pos(coord);
54047 +       return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
54048 +                         NULL);        /* no new_first_key is requested here */
54049 +}
54050 +
54051 +/* call item plugin to kill head of item: units from 0 to coord->unit_pos */
54052 +static pos_in_node_t
54053 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
54054 +         reiser4_key * new_first_key)
54055 +{
54056 +       return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
54057 +                         new_first_key);
54058 +}
54059 +
54060 +/* this is used to cut item partially: forwards to item plugin's cut_units */
54061 +static pos_in_node_t
54062 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
54063 +         reiser4_key * smallest_removed, reiser4_key * new_first_key)
54064 +{
54065 +       carry_cut_data *cdata;
54066 +       item_plugin *iplug;
54067 +
54068 +       cdata = data;   /* untyped @data lets prepare_for_compact() call kill_* and cut_* alike */
54069 +       iplug = item_plugin_by_coord(coord);
54070 +       assert("vs-302", iplug->b.cut_units);
54071 +       return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
54072 +                                 new_first_key);
54073 +}
54074 +
54075 +/* call item plugin to cut tail of item: units from coord->unit_pos to last */
54076 +static pos_in_node_t
54077 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
54078 +{
54079 +       pos_in_node_t to;
54080 +
54081 +       /* take the last unit from @coord itself, as kill_tail() does; @data is
54082 +          only forwarded (prepare_for_compact() passes params->from as @coord) */
54083 +       to = coord_last_unit_pos(coord);
54084 +       return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
54085 +}
54086 +
54087 +/* call item plugin to cut head of item: units from 0 to coord->unit_pos */
54088 +static pos_in_node_t
54089 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
54090 +        reiser4_key * new_first_key)
54091 +{
54092 +       return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
54093 +                        new_first_key);
54094 +}
54095 +
54096 +/* this returns 1 if key of first item changed, 0 - if it did not; @info is not used here */
54097 +static int
54098 +prepare_for_compact(struct cut40_info *cinfo,
54099 +                   const struct cut_kill_params *params, int is_cut,
54100 +                   void *data, carry_plugin_info * info)
54101 +{
54102 +       znode *node;
54103 +       item_header40 *ih;
54104 +       pos_in_node_t freed;    /* value returned by the item cut/kill helper */
54105 +       pos_in_node_t item_pos;
54106 +       coord_t coord;
54107 +       reiser4_key new_first_key;
54108 +       pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
54109 +                                     void *, reiser4_key *, reiser4_key *);
54110 +       pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
54111 +       pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
54112 +                                    reiser4_key *);
54113 +       int retval;             /* 1 when key of first item of the node changes */
54114 +
54115 +       retval = 0;
54116 +
54117 +       node = params->from->node;
54118 +
54119 +       assert("vs-184", node == params->to->node);
54120 +       assert("vs-312", !node_is_empty(node));
54121 +       assert("vs-297",
54122 +              coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
54123 +
54124 +       if (is_cut) {           /* select cut_* or kill_* item helpers */
54125 +               kill_units_f = cut_units;
54126 +               kill_tail_f = cut_tail;
54127 +               kill_head_f = cut_head;
54128 +       } else {
54129 +               kill_units_f = kill_units;
54130 +               kill_tail_f = kill_tail;
54131 +               kill_head_f = kill_head;
54132 +       }
54133 +
54134 +       if (parse_cut(cinfo, params) == 1) {
54135 +               /* cut from the middle of item */
54136 +               freed =
54137 +                   kill_units_f(params->from, params->from->unit_pos,
54138 +                                params->to->unit_pos, data,
54139 +                                params->smallest_removed, NULL);
54140 +
54141 +               item_pos = params->from->item_pos;
54142 +               ih = node40_ih_at(node, item_pos);
54143 +               cinfo->freed_space_start =
54144 +                   ih40_get_offset(ih) + node40_item_length(node,
54145 +                                                            item_pos) - freed;
54146 +               cinfo->freed_space_end = cinfo->freed_space_start + freed;
54147 +               cinfo->first_moved = item_pos + 1;
54148 +       } else {
54149 +               assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
54150 +                                  cinfo->first_removed != MAX_POS_IN_NODE ||
54151 +                                  cinfo->head_removed != MAX_POS_IN_NODE));
54152 +
54153 +               switch (cinfo->mode) {
54154 +               case CMODE_TAIL:
54155 +                       /* one item gets cut partially from its end */
54156 +                       assert("vs-1562",
54157 +                              cinfo->tail_removed == params->from->item_pos);
54158 +
54159 +                       freed =
54160 +                           kill_tail_f(params->from, data,
54161 +                                       params->smallest_removed);
54162 +
54163 +                       item_pos = cinfo->tail_removed;
54164 +                       ih = node40_ih_at(node, item_pos);
54165 +                       cinfo->freed_space_start =
54166 +                           ih40_get_offset(ih) + node40_item_length(node,
54167 +                                                                    item_pos) -
54168 +                           freed;
54169 +                       cinfo->freed_space_end =
54170 +                           cinfo->freed_space_start + freed;
54171 +                       cinfo->first_moved = cinfo->tail_removed + 1;
54172 +                       break;
54173 +
54174 +               case CMODE_WHOLE:
54175 +                       /* one or more items get removed completely */
54176 +                       assert("vs-1563",
54177 +                              cinfo->first_removed == params->from->item_pos);
54178 +                       assert("vs-1564", cinfo->removed_count > 0
54179 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
54180 +
54181 +                       /* call kill hook for all items removed completely */
54182 +                       if (is_cut == 0)
54183 +                               call_kill_hooks(node, cinfo->first_removed,
54184 +                                               cinfo->removed_count, data);
54185 +
54186 +                       item_pos = cinfo->first_removed;
54187 +                       ih = node40_ih_at(node, item_pos);
54188 +
54189 +                       if (params->smallest_removed)
54190 +                               memcpy(params->smallest_removed, &ih->key,
54191 +                                      sizeof(reiser4_key));
54192 +
54193 +                       cinfo->freed_space_start = ih40_get_offset(ih);
54194 +
54195 +                       item_pos += (cinfo->removed_count - 1);
54196 +                       ih -= (cinfo->removed_count - 1);       /* ih steps down as item_pos steps up */
54197 +                       cinfo->freed_space_end =
54198 +                           ih40_get_offset(ih) + node40_item_length(node,
54199 +                                                                    item_pos);
54200 +                       cinfo->first_moved = item_pos + 1;
54201 +                       if (cinfo->first_removed == 0)
54202 +                               /* key of first item of the node changes */
54203 +                               retval = 1;
54204 +                       break;
54205 +
54206 +               case CMODE_HEAD:
54207 +                       /* one item gets cut partially from its head */
54208 +                       assert("vs-1565",
54209 +                              cinfo->head_removed == params->from->item_pos);
54210 +
54211 +                       freed =
54212 +                           kill_head_f(params->to, data,
54213 +                                       params->smallest_removed,
54214 +                                       &new_first_key);
54215 +
54216 +                       item_pos = cinfo->head_removed;
54217 +                       ih = node40_ih_at(node, item_pos);
54218 +                       cinfo->freed_space_start = ih40_get_offset(ih);
54219 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54220 +                       cinfo->first_moved = cinfo->head_removed + 1;
54221 +
54222 +                       /* item head is removed, therefore, item key changed */
54223 +                       coord.node = node;
54224 +                       coord_set_item_pos(&coord, item_pos);
54225 +                       coord.unit_pos = 0;
54226 +                       coord.between = AT_UNIT;
54227 +                       update_item_key_node40(&coord, &new_first_key, NULL);
54228 +                       if (item_pos == 0)
54229 +                               /* key of first item of the node changes */
54230 +                               retval = 1;
54231 +                       break;
54232 +
54233 +               case CMODE_TAIL | CMODE_WHOLE:
54234 +                       /* one item gets cut from its end and one or more items get removed completely */
54235 +                       assert("vs-1566",
54236 +                              cinfo->tail_removed == params->from->item_pos);
54237 +                       assert("vs-1567",
54238 +                              cinfo->first_removed == cinfo->tail_removed + 1);
54239 +                       assert("vs-1564", cinfo->removed_count > 0
54240 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
54241 +
54242 +                       freed =
54243 +                           kill_tail_f(params->from, data,
54244 +                                       params->smallest_removed);
54245 +
54246 +                       item_pos = cinfo->tail_removed;
54247 +                       ih = node40_ih_at(node, item_pos);
54248 +                       cinfo->freed_space_start =
54249 +                           ih40_get_offset(ih) + node40_item_length(node,
54250 +                                                                    item_pos) -
54251 +                           freed;
54252 +
54253 +                       /* call kill hook for all items removed completely */
54254 +                       if (is_cut == 0)
54255 +                               call_kill_hooks(node, cinfo->first_removed,
54256 +                                               cinfo->removed_count, data);
54257 +
54258 +                       item_pos += cinfo->removed_count;
54259 +                       ih -= cinfo->removed_count;     /* ih steps down as item_pos steps up */
54260 +                       cinfo->freed_space_end =
54261 +                           ih40_get_offset(ih) + node40_item_length(node,
54262 +                                                                    item_pos);
54263 +                       cinfo->first_moved = item_pos + 1;
54264 +                       break;
54265 +
54266 +               case CMODE_WHOLE | CMODE_HEAD:
54267 +                       /* one or more items get removed completely and one item gets cut partially from its head */
54268 +                       assert("vs-1568",
54269 +                              cinfo->first_removed == params->from->item_pos);
54270 +                       assert("vs-1564", cinfo->removed_count > 0
54271 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
54272 +                       assert("vs-1569",
54273 +                              cinfo->head_removed ==
54274 +                              cinfo->first_removed + cinfo->removed_count);
54275 +
54276 +                       /* call kill hook for all items removed completely */
54277 +                       if (is_cut == 0)
54278 +                               call_kill_hooks(node, cinfo->first_removed,
54279 +                                               cinfo->removed_count, data);
54280 +
54281 +                       item_pos = cinfo->first_removed;
54282 +                       ih = node40_ih_at(node, item_pos);
54283 +
54284 +                       if (params->smallest_removed)
54285 +                               memcpy(params->smallest_removed, &ih->key,
54286 +                                      sizeof(reiser4_key));
54287 +
54288 +                       freed =
54289 +                           kill_head_f(params->to, data, NULL, &new_first_key);
54290 +
54291 +                       cinfo->freed_space_start = ih40_get_offset(ih);
54292 +
54293 +                       ih = node40_ih_at(node, cinfo->head_removed);
54294 +                       /* this is the most complex case. Item which got head removed and items which are to be moved
54295 +                          intact change their location differently. */
54296 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54297 +                       cinfo->first_moved = cinfo->head_removed;
54298 +                       cinfo->head_removed_location = cinfo->freed_space_start;
54299 +
54300 +                       /* item head is removed, therefore, item key changed */
54301 +                       coord.node = node;
54302 +                       coord_set_item_pos(&coord, cinfo->head_removed);
54303 +                       coord.unit_pos = 0;
54304 +                       coord.between = AT_UNIT;
54305 +                       update_item_key_node40(&coord, &new_first_key, NULL);
54306 +
54307 +                       assert("vs-1579", cinfo->first_removed == 0);
54308 +                       /* key of first item of the node changes */
54309 +                       retval = 1;
54310 +                       break;
54311 +
54312 +               case CMODE_TAIL | CMODE_HEAD:
54313 +                       /* one item gets cut from its end and its neighbor gets cut from its head */
54314 +                       impossible("vs-1576", "this can not happen currently");
54315 +                       break;
54316 +
54317 +               case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54318 +                       impossible("vs-1577", "this can not happen currently");
54319 +                       break;
54320 +               default:
54321 +                       impossible("vs-1578", "unexpected cut mode");
54322 +                       break;
54323 +               }
54324 +       }
54325 +       return retval;
54326 +}
54327 +
54328 +/* plugin->u.node.kill
54329 +   return value is number of items removed completely (0 if removed_count stayed MAX_POS_IN_NODE) */
54330 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54331 +{
54332 +       znode *node;
54333 +       struct cut40_info cinfo;        /* filled by prepare_for_compact(), consumed by compact() */
54334 +       int first_key_changed;
54335 +
54336 +       node = kdata->params.from->node;
54337 +
54338 +       first_key_changed =
54339 +           prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54340 +                               info);
54341 +       compact(node, &cinfo);
54342 +
54343 +       if (info) {
54344 +               /* it is not called by node40_shift, so we have to take care
54345 +                  of changes on upper levels */
54346 +               if (node_is_empty(node)
54347 +                   && !(kdata->flags & DELETE_RETAIN_EMPTY))
54348 +                       /* all contents of node is deleted */
54349 +                       prepare_removal_node40(node, info);
54350 +               else if (first_key_changed) {
54351 +                       prepare_for_update(NULL, node, info);
54352 +               }
54353 +       }
54354 +
54355 +       coord_clear_iplug(kdata->params.from);
54356 +       coord_clear_iplug(kdata->params.to);
54357 +
54358 +       znode_make_dirty(node);
54359 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54360 +}
54361 +
54362 +/* plugin->u.node.cut
54363 +   return value is number of items removed completely (0 if removed_count stayed MAX_POS_IN_NODE) */
54364 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54365 +{
54366 +       znode *node;
54367 +       struct cut40_info cinfo;        /* filled by prepare_for_compact(), consumed by compact() */
54368 +       int first_key_changed;
54369 +
54370 +       node = cdata->params.from->node;
54371 +
54372 +       first_key_changed =
54373 +           prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54374 +                               info);
54375 +       compact(node, &cinfo);
54376 +
54377 +       if (info) {
54378 +               /* it is not called by node40_shift, so we have to take care
54379 +                  of changes on upper levels */
54380 +               if (node_is_empty(node))
54381 +                       /* all contents of node is deleted */
54382 +                       prepare_removal_node40(node, info);
54383 +               else if (first_key_changed) {
54384 +                       prepare_for_update(NULL, node, info);
54385 +               }
54386 +       }
54387 +
54388 +       coord_clear_iplug(cdata->params.from);
54389 +       coord_clear_iplug(cdata->params.to);
54390 +
54391 +       znode_make_dirty(node);
54392 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54393 +}
54394 +
54395 +/* this structure is used by shift method of node40 plugin */
54396 +struct shift_params {
54397 +       shift_direction pend;   /* when @pend == append - we are shifting to
54398 +                                  left, when @pend == prepend - to right */
54399 +       coord_t wish_stop;      /* when shifting to left this is last unit we
54400 +                                  want shifted, when shifting to right - this
54401 +                                  is set to unit we want to start shifting
54402 +                                  from */
54403 +       znode *target;
54404 +       int everything;         /* it is set to 1 if everything we have to shift is
54405 +                                  shifted, 0 - otherwise */
54406 +
54407 +       /* FIXME-VS: get rid of read_stop */
54408 +
54409 +       /* these are set by estimate_shift */
54410 +       coord_t real_stop;      /* this will be set to last unit which will be
54411 +                                  really shifted */
54412 +
54413 +       /* coordinate in source node before operation of unit which becomes
54414 +          first after shift to left or last after shift to right */
54415 +       union {
54416 +               coord_t future_first;
54417 +               coord_t future_last;
54418 +       } u;
54419 +
54420 +       unsigned merging_units; /* number of units of first item which have to
54421 +                                  be merged with last item of target node */
54422 +       unsigned merging_bytes; /* number of bytes in those units */
54423 +
54424 +       unsigned entire;        /* items shifted in their entirety */
54425 +       unsigned entire_bytes;  /* number of bytes in those items */
54426 +
54427 +       unsigned part_units;    /* number of units of partially copied item */
54428 +       unsigned part_bytes;    /* number of bytes in those units */
54429 +
54430 +       unsigned shift_bytes;   /* total number of bytes in items shifted (item
54431 +                                  headers not included) */
54432 +
54433 +};
54434 +/* ask node plugin of @item for its item_overhead() on item->node */
54435 +static int item_creation_overhead(coord_t *item)
54436 +{
54437 +       return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54438 +}
54439 +
54440 +/* how many units are there in @source starting from source->unit_pos
54441 +   but not further than @stop_coord */
54442 +static int
54443 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54444 +{
54445 +       if (pend == SHIFT_LEFT) {
54446 +               assert("vs-181", source->unit_pos == 0);
54447 +       } else {
54448 +               assert("vs-182",
54449 +                      source->unit_pos == coord_last_unit_pos(source));
54450 +       }
54451 +
54452 +       if (source->item_pos != stop_coord->item_pos) {
54453 +               /* @source and @stop_coord are different items: want whole @source */
54454 +               return coord_last_unit_pos(source) + 1;
54455 +       }
54456 +
54457 +       if (pend == SHIFT_LEFT) {
54458 +               return stop_coord->unit_pos + 1;        /* units 0 .. stop_coord->unit_pos */
54459 +       } else {
54460 +               return source->unit_pos - stop_coord->unit_pos + 1;
54461 +       }
54462 +}
54463 +
54464 +/* this calculates what can be copied from @shift->wish_stop.node to
54465 +   @shift->target */
54466 +static void
54467 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54468 +{
54469 +       unsigned target_free_space, size;
54470 +       pos_in_node_t stop_item;        /* item which estimating should not consider */
54471 +       unsigned want;          /* number of units of item we want shifted */
54472 +       coord_t source;         /* item being estimated */
54473 +       item_plugin *iplug;
54474 +
54475 +       /* shifting to left/right starts from first/last units of
54476 +          @shift->wish_stop.node */
54477 +       if (shift->pend == SHIFT_LEFT) {
54478 +               coord_init_first_unit(&source, shift->wish_stop.node);
54479 +       } else {
54480 +               coord_init_last_unit(&source, shift->wish_stop.node);
54481 +       }
54482 +       shift->real_stop = source;
54483 +
54484 +       /* free space in target node */
54485 +       target_free_space = znode_free_space(shift->target);
54486 +
54487 +       shift->everything = 0;
54488 +       if (!node_is_empty(shift->target)) {
54489 +               /* target node is not empty, check for boundary items
54490 +                  mergeability */
54491 +               coord_t to;
54492 +
54493 +               /* item we try to merge @source with */
54494 +               if (shift->pend == SHIFT_LEFT) {
54495 +                       coord_init_last_unit(&to, shift->target);
54496 +               } else {
54497 +                       coord_init_first_unit(&to, shift->target);
54498 +               }
54499 +
54500 +               if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54501 +                                                                     &source) :
54502 +                   are_items_mergeable(&source, &to)) {
54503 +                       /* how many units of @source do we want to merge to
54504 +                          item @to */
54505 +                       want =
54506 +                           wanted_units(&source, &shift->wish_stop,
54507 +                                        shift->pend);
54508 +
54509 +                       /* how many units of @source we can merge to item
54510 +                          @to */
54511 +                       iplug = item_plugin_by_coord(&source);
54512 +                       if (iplug->b.can_shift != NULL)
54513 +                               shift->merging_units =
54514 +                                   iplug->b.can_shift(target_free_space,
54515 +                                                      &source, shift->target,
54516 +                                                      shift->pend, &size,
54517 +                                                      want);
54518 +                       else {
54519 +                               shift->merging_units = 0;
54520 +                               size = 0;
54521 +                       }
54522 +                       shift->merging_bytes = size;
54523 +                       shift->shift_bytes += size;     /* NOTE(review): accumulated, presumably zeroed by caller */
54524 +                       /* update stop coord to be set to last unit of @source
54525 +                          we can merge to @target */
54526 +                       if (shift->merging_units)
54527 +                               /* at least one unit can be shifted */
54528 +                               shift->real_stop.unit_pos =
54529 +                                   (shift->merging_units - source.unit_pos -
54530 +                                    1) * shift->pend;
54531 +                       else {
54532 +                               /* nothing can be shifted */
54533 +                               if (shift->pend == SHIFT_LEFT)
54534 +                                       coord_init_before_first_item(&shift->
54535 +                                                                    real_stop,
54536 +                                                                    source.
54537 +                                                                    node);
54538 +                               else
54539 +                                       coord_init_after_last_item(&shift->
54540 +                                                                  real_stop,
54541 +                                                                  source.node);
54542 +                       }
54543 +                       assert("nikita-2081", shift->real_stop.unit_pos + 1);
54544 +
54545 +                       if (shift->merging_units != want) {
54546 +                               /* we could not copy as many as we want, so,
54547 +                                  there is no reason for estimating any
54548 +                                  longer */
54549 +                               return;
54550 +                       }
54551 +
54552 +                       target_free_space -= size;
54553 +                       coord_add_item_pos(&source, shift->pend);
54554 +               }
54555 +       }
54556 +
54557 +       /* number of item nothing of which we want to shift */
54558 +       stop_item = shift->wish_stop.item_pos + shift->pend;
54559 +
54560 +       /* calculate how many items can be copied into given free
54561 +          space as whole */
54562 +       for (; source.item_pos != stop_item;
54563 +            coord_add_item_pos(&source, shift->pend)) {
54564 +               if (shift->pend == SHIFT_RIGHT)
54565 +                       source.unit_pos = coord_last_unit_pos(&source);
54566 +
54567 +               /* how many units of @source do we want to copy */
54568 +               want = wanted_units(&source, &shift->wish_stop, shift->pend);
54569 +
54570 +               if (want == coord_last_unit_pos(&source) + 1) {
54571 +                       /* we want this item to be copied entirely */
54572 +                       size =
54573 +                           item_length_by_coord(&source) +
54574 +                           item_creation_overhead(&source);
54575 +                       if (size <= target_free_space) {
54576 +                               /* item fits into target node as whole */
54577 +                               target_free_space -= size;
54578 +                               shift->shift_bytes +=
54579 +                                   size - item_creation_overhead(&source);
54580 +                               shift->entire_bytes +=
54581 +                                   size - item_creation_overhead(&source);
54582 +                               shift->entire++;
54583 +
54584 +                               /* update shift->real_stop coord to be set to
54585 +                                  last unit of @source we can merge to
54586 +                                  @target */
54587 +                               shift->real_stop = source;
54588 +                               if (shift->pend == SHIFT_LEFT)
54589 +                                       shift->real_stop.unit_pos =
54590 +                                           coord_last_unit_pos(&shift->
54591 +                                                               real_stop);
54592 +                               else
54593 +                                       shift->real_stop.unit_pos = 0;
54594 +                               continue;
54595 +                       }
54596 +               }
54597 +
54598 +               /* we reach here only for an item which does not fit into
54599 +                  target node in its entirety. This item may be either
54600 +                  partially shifted, or not shifted at all. We will have to
54601 +                  create new item in target node, so decrease amount of free
54602 +                  space by an item creation overhead. We can reach here also
54603 +                  if stop coord is in this item */
54604 +               if (target_free_space >=
54605 +                   (unsigned)item_creation_overhead(&source)) {
54606 +                       target_free_space -= item_creation_overhead(&source);
54607 +                       iplug = item_plugin_by_coord(&source);
54608 +                       if (iplug->b.can_shift) {
54609 +                               shift->part_units = iplug->b.can_shift(target_free_space,
54610 +                                                                      &source,
54611 +                                                                      NULL, /* target */
54612 +                                                                      shift->pend,
54613 +                                                                      &size,
54614 +                                                                      want);
54615 +                       } else {
54616 +                               target_free_space = 0;
54617 +                               shift->part_units = 0;
54618 +                               size = 0;
54619 +                       }
54620 +               } else {
54621 +                       target_free_space = 0;
54622 +                       shift->part_units = 0;
54623 +                       size = 0;
54624 +               }
54625 +               shift->part_bytes = size;
54626 +               shift->shift_bytes += size;
54627 +
54628 +               /* set @shift->real_stop to last unit of @source we can merge
54629 +                  to @shift->target */
54630 +               if (shift->part_units) {
54631 +                       shift->real_stop = source;
54632 +                       shift->real_stop.unit_pos =
54633 +                           (shift->part_units - source.unit_pos -
54634 +                            1) * shift->pend;
54635 +                       assert("nikita-2082", shift->real_stop.unit_pos + 1);
54636 +               }
54637 +
54638 +               if (want != shift->part_units)
54639 +                       /* not everything wanted were shifted */
54640 +                       return;
54641 +               break;
54642 +       }
54643 +
54644 +       shift->everything = 1;
54645 +}
54646 +
54647 +static void
54648 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54649 +          shift_direction dir, unsigned free_space)
54650 +{
54651 +       item_plugin *iplug;     /* plugin of @source; asserted below to match @target's */
54652 +
54653 +       assert("nikita-1463", target != NULL);
54654 +       assert("nikita-1464", source != NULL);
54655 +       assert("nikita-1465", from + count <= coord_num_units(source));
54656 +
54657 +       iplug = item_plugin_by_coord(source);
54658 +       assert("nikita-1468", iplug == item_plugin_by_coord(target));
54659 +       iplug->b.copy_units(target, source, from, count, dir, free_space);
54660 +
54661 +       if (dir == SHIFT_RIGHT) {
54662 +               /* FIXME-VS: this looks not necessary. update_item_key was
54663 +                  called already by copy_units method */
54664 +               reiser4_key split_key;  /* key of unit at @target, written back as the item key */
54665 +
54666 +               assert("nikita-1469", target->unit_pos == 0);
54667 +
54668 +               unit_key_by_coord(target, &split_key);
54669 +               node_plugin_by_coord(target)->update_item_key(target,
54670 +                                                             &split_key, NULL);
54671 +       }
54672 +}
54673 +
54674 +/* copy data from the source node (@shift->wish_stop.node) to @shift->target:
54675 +   on SHIFT_LEFT from the beginning of the source to the end of the target,
54676 +   otherwise from the end of the source to the beginning of the target */
54677 +static void copy(struct shift_params *shift)
54678 +{
54679 +       node40_header *nh;
54680 +       coord_t from;
54681 +       coord_t to;
54682 +       item_header40 *from_ih, *to_ih;
54683 +       int free_space_start;
54684 +       int new_items;
54685 +       unsigned old_items;
54686 +       int old_offset;
54687 +       unsigned i;
54688 +
54689 +       nh = node40_node_header(shift->target);
54690 +       free_space_start = nh40_get_free_space_start(nh);
54691 +       old_items = nh40_get_num_items(nh);
54692 +       new_items = shift->entire + (shift->part_units ? 1 : 0);
54693 +       assert("vs-185",
54694 +              shift->shift_bytes ==
54695 +              shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54696 +
54697 +       from = shift->wish_stop;
54698 +
54699 +       coord_init_first_unit(&to, shift->target);
54700 +
54701 +       /* NOTE:NIKITA->VS not sure what I am doing: if shift->target is empty,
54702 +          to.between is set to EMPTY_NODE above. Looks like we want it
54703 +          to be AT_UNIT.
54704 +
54705 +          Oh, wonders of ->betweenness...
54706 +
54707 +        */
54708 +       to.between = AT_UNIT;
54709 +
54710 +       if (shift->pend == SHIFT_LEFT) {
54711 +               /* copying to left */
54712 +
54713 +               coord_set_item_pos(&from, 0);
54714 +               from_ih = node40_ih_at(from.node, 0);
54715 +
54716 +               coord_set_item_pos(&to,
54717 +                                  node40_num_of_items_internal(to.node) - 1);
54718 +               if (shift->merging_units) {
54719 +                       /* expand last item of @target first, so that plugin
54720 +                          methods will see correct data */
54721 +                       free_space_start += shift->merging_bytes;
54722 +                       nh40_set_free_space_start(nh,
54723 +                                                 (unsigned)free_space_start);
54724 +                       nh40_set_free_space(nh,
54725 +                                           nh40_get_free_space(nh) -
54726 +                                           shift->merging_bytes);
54727 +
54728 +                       /* appending to the last item of @target */
54729 +                       copy_units(&to, &from, 0,       /* starting from 0-th unit */
54730 +                                  shift->merging_units, SHIFT_LEFT,
54731 +                                  shift->merging_bytes);
54732 +                       coord_inc_item_pos(&from);
54733 +                       from_ih--;
54734 +                       coord_inc_item_pos(&to);
54735 +               }
54736 +
54737 +               to_ih = node40_ih_at(shift->target, old_items);
54738 +               if (shift->entire) {
54739 +                       /* copy @entire items entirely */
54740 +
54741 +                       /* copy item headers */
54742 +                       memcpy(to_ih - shift->entire + 1,
54743 +                              from_ih - shift->entire + 1,
54744 +                              shift->entire * sizeof(item_header40));
54745 +                       /* update item header offset */
54746 +                       old_offset = ih40_get_offset(from_ih);
54747 +                       /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
54748 +                       for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54749 +                               ih40_set_offset(to_ih,
54750 +                                               ih40_get_offset(from_ih) -
54751 +                                               old_offset + free_space_start);
54752 +
54753 +                       /* copy item bodies */
54754 +                       memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset,  /*ih40_get_offset (from_ih), */
54755 +                              shift->entire_bytes);
54756 +
54757 +                       coord_add_item_pos(&from, (int)shift->entire);
54758 +                       coord_add_item_pos(&to, (int)shift->entire);
54759 +               }
54760 +
54761 +               nh40_set_free_space_start(nh,
54762 +                                         free_space_start +
54763 +                                         shift->shift_bytes -
54764 +                                         shift->merging_bytes);
54765 +               nh40_set_free_space(nh,
54766 +                                   nh40_get_free_space(nh) -
54767 +                                   (shift->shift_bytes - shift->merging_bytes +
54768 +                                    sizeof(item_header40) * new_items));
54769 +
54770 +               /* update item count in the node header */
54771 +               node40_set_num_items(shift->target, nh, old_items + new_items);
54772 +               assert("vs-170",
54773 +                      nh40_get_free_space(nh) < znode_size(shift->target));
54774 +
54775 +               if (shift->part_units) {
54776 +                       /* copy heading part (first @part_units units) of the
54777 +                          source item as a new last item of @shift->target */
54778 +
54779 +                       /* copy item header of partially copied item */
54780 +                       coord_set_item_pos(&to,
54781 +                                          node40_num_of_items_internal(to.node)
54782 +                                          - 1);
54783 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
54784 +                       ih40_set_offset(to_ih,
54785 +                                       nh40_get_free_space_start(nh) -
54786 +                                       shift->part_bytes);
54787 +                       if (item_plugin_by_coord(&to)->b.init)
54788 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
54789 +                                                                 NULL);
54790 +                       copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54791 +                                  shift->part_bytes);
54792 +               }
54793 +
54794 +       } else {
54795 +               /* copying to right */
54796 +
54797 +               coord_set_item_pos(&from,
54798 +                                  node40_num_of_items_internal(from.node) - 1);
54799 +               from_ih = node40_ih_at_coord(&from);
54800 +
54801 +               coord_set_item_pos(&to, 0);
54802 +
54803 +               /* make room for new data: move old bodies by @shift_bytes */
54804 +               memmove(zdata(to.node) + sizeof(node40_header) +
54805 +                       shift->shift_bytes,
54806 +                       zdata(to.node) + sizeof(node40_header),
54807 +                       free_space_start - sizeof(node40_header));
54808 +               /* update item headers of moved items */
54809 +               to_ih = node40_ih_at(to.node, 0);
54810 +               /* first item gets @merging_bytes longer. Free space appears
54811 +                  at its beginning */
54812 +               if (!node_is_empty(to.node))
54813 +                       ih40_set_offset(to_ih,
54814 +                                       ih40_get_offset(to_ih) +
54815 +                                       shift->shift_bytes -
54816 +                                       shift->merging_bytes);
54817 +
54818 +               for (i = 1; i < old_items; i++)
54819 +                       ih40_set_offset(to_ih - i,
54820 +                                       ih40_get_offset(to_ih - i) +
54821 +                                       shift->shift_bytes);
54822 +
54823 +               /* move item headers to make space for new item headers */
54824 +               memmove(to_ih - old_items + 1 - new_items,
54825 +                       to_ih - old_items + 1,
54826 +                       sizeof(item_header40) * old_items);
54827 +               to_ih -= (new_items - 1);
54828 +
54829 +               nh40_set_free_space_start(nh,
54830 +                                         free_space_start +
54831 +                                         shift->shift_bytes);
54832 +               nh40_set_free_space(nh,
54833 +                                   nh40_get_free_space(nh) -
54834 +                                   (shift->shift_bytes +
54835 +                                    sizeof(item_header40) * new_items));
54836 +
54837 +               /* update item count in the node header */
54838 +               node40_set_num_items(shift->target, nh, old_items + new_items);
54839 +               assert("vs-170",
54840 +                      nh40_get_free_space(nh) < znode_size(shift->target));
54841 +
54842 +               if (shift->merging_units) {
54843 +                       coord_add_item_pos(&to, new_items);
54844 +                       to.unit_pos = 0;
54845 +                       to.between = AT_UNIT;
54846 +                       /* prepend to the old first item of @to.node */
54847 +                       copy_units(&to, &from,
54848 +                                  coord_last_unit_pos(&from) -
54849 +                                  shift->merging_units + 1,
54850 +                                  shift->merging_units, SHIFT_RIGHT,
54851 +                                  shift->merging_bytes);
54852 +                       coord_dec_item_pos(&from);
54853 +                       from_ih++;
54854 +               }
54855 +
54856 +               if (shift->entire) {
54857 +                       /* copy @entire items entirely */
54858 +
54859 +                       /* copy item headers */
54860 +                       memcpy(to_ih, from_ih,
54861 +                              shift->entire * sizeof(item_header40));
54862 +
54863 +                       /* update item header offset */
54864 +                       old_offset =
54865 +                           ih40_get_offset(from_ih + shift->entire - 1);
54866 +                       /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
54867 +                       for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
54868 +                               ih40_set_offset(to_ih,
54869 +                                               ih40_get_offset(from_ih) -
54870 +                                               old_offset +
54871 +                                               sizeof(node40_header) +
54872 +                                               shift->part_bytes);
54873 +                       /* copy item bodies */
54874 +                       coord_add_item_pos(&from, -(int)(shift->entire - 1));
54875 +                       memcpy(zdata(to.node) + sizeof(node40_header) +
54876 +                              shift->part_bytes, item_by_coord_node40(&from),
54877 +                              shift->entire_bytes);
54878 +                       coord_dec_item_pos(&from);
54879 +               }
54880 +
54881 +               if (shift->part_units) {
54882 +                       coord_set_item_pos(&to, 0);
54883 +                       to.unit_pos = 0;
54884 +                       to.between = AT_UNIT;
54885 +                       /* copy trailing part (last @part_units units) of the
54886 +                          source item as a new first item of @shift->target */
54887 +
54888 +                       /* copy item header of partially copied item */
54889 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
54890 +                       ih40_set_offset(to_ih, sizeof(node40_header));
54891 +                       if (item_plugin_by_coord(&to)->b.init)
54892 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
54893 +                                                                 NULL);
54894 +                       copy_units(&to, &from,
54895 +                                  coord_last_unit_pos(&from) -
54896 +                                  shift->part_units + 1, shift->part_units,
54897 +                                  SHIFT_RIGHT, shift->part_bytes);
54898 +               }
54899 +       }
54900 +}
54901 +
54902 +/* remove everything either before (shift to left) or after (shift to right)
54903 +   @shift->real_stop. Number of items removed completely is returned */
54904 +static int delete_copied(struct shift_params *shift)
54905 +{
54906 +       coord_t from;
54907 +       coord_t to;
54908 +       struct carry_cut_data cdata;
54909 +
54910 +       if (shift->pend == SHIFT_LEFT) {
54911 +               /* we were shifting to left, remove everything from the
54912 +                  beginning of @shift->real_stop.node up to
54913 +                  @shift->real_stop */
54914 +               coord_init_first_unit(&from, shift->real_stop.node);
54915 +               to = shift->real_stop;
54916 +
54917 +               /* store old coordinate of unit which will be first after
54918 +                  shift to left */
54919 +               shift->u.future_first = to;
54920 +               coord_next_unit(&shift->u.future_first);
54921 +       } else {
54922 +               /* we were shifting to right, remove everything from
54923 +                  @shift->real_stop up to the end of
54924 +                  @shift->real_stop.node */
54925 +               from = shift->real_stop;
54926 +               coord_init_last_unit(&to, from.node);
54927 +
54928 +               /* store old coordinate of unit which will be last after
54929 +                  shift to right */
54930 +               shift->u.future_last = from;
54931 +               coord_prev_unit(&shift->u.future_last);
54932 +       }
54933 +
54934 +       cdata.params.from = &from;
54935 +       cdata.params.to = &to;
54936 +       cdata.params.from_key = NULL;
54937 +       cdata.params.to_key = NULL;
54938 +       cdata.params.smallest_removed = NULL;
54939 +       return cut_node40(&cdata, NULL);
54940 +}
54941 +
54942 +/* something was moved between @left and @right. Post a COP_UPDATE carry
54943 +   operation to @info so that carry updates the delimiting key between them */
54944 +static int
54945 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
54946 +{
54947 +       carry_op *op;
54948 +       carry_node *cn;
54949 +
54950 +       if (info == NULL)
54951 +               /* nowhere to send operation to: nothing is posted */
54952 +               return 0;
54953 +
54954 +       if (!should_notify_parent(right))
54955 +               return 0;
54956 +
54957 +       op = node_post_carry(info, COP_UPDATE, right, 1);
54958 +       if (IS_ERR(op) || op == NULL)
54959 +               return op ? PTR_ERR(op) : -EIO;
54960 +
54961 +       if (left != NULL) {
54962 +               carry_node *reference;
54963 +
54964 +               if (info->doing)
54965 +                       reference = insert_carry_node(info->doing,
54966 +                                                     info->todo, left);
54967 +               else
54968 +                       reference = op->node;
54969 +               assert("nikita-2992", reference != NULL);
54970 +               cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
54971 +               if (IS_ERR(cn))
54972 +                       return PTR_ERR(cn);
54973 +               cn->parent = 1;
54974 +               cn->node = left;
54975 +               if (ZF_ISSET(left, JNODE_ORPHAN))
54976 +                       cn->left_before = 1;
54977 +               op->u.update.left = cn;
54978 +       } else
54979 +               op->u.update.left = NULL;
54980 +       return 0;
54981 +}
54982 +
54983 +/* plugin->u.node.prepare_removal
54984 +   to delete the pointer to @empty from the tree, add the corresponding carry
54985 +   operation (COP_DELETE) to @info list. Returns 0 or a negative error code */
54986 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
54987 +{
54988 +       carry_op *op;
54989 +       reiser4_tree *tree;
54990 +
54991 +       if (!should_notify_parent(empty))
54992 +               return 0;
54993 +       /* already on a road to Styx: JNODE_HEARD_BANSHEE is set below */
54994 +       if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
54995 +               return 0;
54996 +       op = node_post_carry(info, COP_DELETE, empty, 1);
54997 +       if (IS_ERR(op) || op == NULL)
54998 +               return RETERR(op ? PTR_ERR(op) : -EIO);
54999 +
55000 +       op->u.delete.child = NULL;
55001 +       op->u.delete.flags = 0;
55002 +
55003 +       /* fare thee well: set ld key of @empty and rd key of its left neighbor to rd key of @empty */
55004 +       tree = znode_get_tree(empty);
55005 +       read_lock_tree(tree);
55006 +       write_lock_dk(tree);
55007 +       znode_set_ld_key(empty, znode_get_rd_key(empty));
55008 +       if (znode_is_left_connected(empty) && empty->left)
55009 +               znode_set_rd_key(empty->left, znode_get_rd_key(empty));
55010 +       write_unlock_dk(tree);
55011 +       read_unlock_tree(tree);
55012 +
55013 +       ZF_SET(empty, JNODE_HEARD_BANSHEE);
55014 +       return 0;
55015 +}
55016 +
55017 +/* something was shifted from @insert_coord->node to @shift->target, update
55018 +   @insert_coord accordingly. @removed: number of items shifted entirely */
55019 +static void
55020 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
55021 +            int including_insert_coord)
55022 +{
55023 +       /* item plugin was invalidated by shifting */
55024 +       coord_clear_iplug(insert_coord);
55025 +
55026 +       if (node_is_empty(shift->wish_stop.node)) {
55027 +               assert("vs-242", shift->everything);
55028 +               if (including_insert_coord) {
55029 +                       if (shift->pend == SHIFT_RIGHT) {
55030 +                               /* set @insert_coord before first unit of
55031 +                                  @shift->target node */
55032 +                               coord_init_before_first_item(insert_coord,
55033 +                                                            shift->target);
55034 +                       } else {
55035 +                               /* set @insert_coord after last in target node */
55036 +                               coord_init_after_last_item(insert_coord,
55037 +                                                          shift->target);
55038 +                       }
55039 +               } else {
55040 +                       /* set @insert_coord inside of empty node. There is
55041 +                          only one possible coord within an empty
55042 +                          node. init_first_unit will set that coord */
55043 +                       coord_init_first_unit(insert_coord,
55044 +                                             shift->wish_stop.node);
55045 +               }
55046 +               return;
55047 +       }
55048 +
55049 +       if (shift->pend == SHIFT_RIGHT) {
55050 +               /* there was shifting to right */
55051 +               if (shift->everything) {
55052 +                       /* everything wanted was shifted */
55053 +                       if (including_insert_coord) {
55054 +                               /* @insert_coord is set before first unit of
55055 +                                  @shift->target node */
55056 +                               coord_init_before_first_item(insert_coord,
55057 +                                                            shift->target);
55058 +                               insert_coord->between = BEFORE_UNIT;
55059 +                       } else {
55060 +                               /* @insert_coord is set after last unit of
55061 +                                  @shift->wish_stop.node */
55062 +                               coord_init_last_unit(insert_coord,
55063 +                                                    shift->wish_stop.node);
55064 +                               insert_coord->between = AFTER_UNIT;
55065 +                       }
55066 +               }
55067 +               return;
55068 +       }
55069 +
55070 +       /* there was shifting to left */
55071 +       if (shift->everything) {
55072 +               /* everything wanted was shifted */
55073 +               if (including_insert_coord) {
55074 +                       /* @insert_coord is set after last unit in target node */
55075 +                       coord_init_after_last_item(insert_coord, shift->target);
55076 +               } else {
55077 +                       /* @insert_coord is set before first unit in the same
55078 +                          node */
55079 +                       coord_init_before_first_item(insert_coord,
55080 +                                                    shift->wish_stop.node);
55081 +               }
55082 +               return;
55083 +       }
55084 +
55085 +       /* FIXME-VS: the code below is complicated because with between ==
55086 +          AFTER_ITEM unit_pos is set to 0 */
55087 +
55088 +       if (!removed) {
55089 +               /* no items were shifted entirely */
55090 +               assert("vs-195", shift->merging_units == 0
55091 +                      || shift->part_units == 0);
55092 +
55093 +               if (shift->real_stop.item_pos == insert_coord->item_pos) {
55094 +                       if (shift->merging_units) {
55095 +                               if (insert_coord->between == AFTER_UNIT) {
55096 +                                       assert("nikita-1441",
55097 +                                              insert_coord->unit_pos >=
55098 +                                              shift->merging_units);
55099 +                                       insert_coord->unit_pos -=
55100 +                                           shift->merging_units;
55101 +                               } else if (insert_coord->between == BEFORE_UNIT) {
55102 +                                       assert("nikita-2090",
55103 +                                              insert_coord->unit_pos >
55104 +                                              shift->merging_units);
55105 +                                       insert_coord->unit_pos -=
55106 +                                           shift->merging_units;
55107 +                               }
55108 +
55109 +                               assert("nikita-2083",
55110 +                                      insert_coord->unit_pos + 1);
55111 +                       } else {
55112 +                               if (insert_coord->between == AFTER_UNIT) {
55113 +                                       assert("nikita-1442",
55114 +                                              insert_coord->unit_pos >=
55115 +                                              shift->part_units);
55116 +                                       insert_coord->unit_pos -=
55117 +                                           shift->part_units;
55118 +                               } else if (insert_coord->between == BEFORE_UNIT) {
55119 +                                       assert("nikita-2089",
55120 +                                              insert_coord->unit_pos >
55121 +                                              shift->part_units);
55122 +                                       insert_coord->unit_pos -=
55123 +                                           shift->part_units;
55124 +                               }
55125 +
55126 +                               assert("nikita-2084",
55127 +                                      insert_coord->unit_pos + 1);
55128 +                       }
55129 +               }
55130 +               return;
55131 +       }
55132 +
55133 +       /* we shifted to left and there was not enough space for everything */
55134 +       switch (insert_coord->between) {
55135 +       case AFTER_UNIT:
55136 +       case BEFORE_UNIT:
55137 +               if (shift->real_stop.item_pos == insert_coord->item_pos)
55138 +                       insert_coord->unit_pos -= shift->part_units;    /* fall through */
55139 +       case AFTER_ITEM:
55140 +               coord_add_item_pos(insert_coord, -removed);
55141 +               break;
55142 +       default:
55143 +               impossible("nikita-2087", "not ready");
55144 +       }
55145 +       assert("nikita-2085", insert_coord->unit_pos + 1);
55146 +}
55147 +/* call ->b.shift_hook (if any) of each item of @shift->target the shift touched; returns 0 */
55148 +static int call_shift_hooks(struct shift_params *shift)
55149 +{
55150 +       unsigned i, shifted;
55151 +       coord_t coord;
55152 +       item_plugin *iplug;
55153 +
55154 +       assert("vs-275", !node_is_empty(shift->target));
55155 +
55156 +       /* number of items of the target touched by the shift */
55157 +       shifted =
55158 +           shift->entire + (shift->merging_units ? 1 : 0) +
55159 +           (shift->part_units ? 1 : 0);
55160 +
55161 +       if (shift->pend == SHIFT_LEFT) {
55162 +               /* moved items are at the end */
55163 +               coord_init_last_unit(&coord, shift->target);
55164 +               coord.unit_pos = 0;
55165 +
55166 +               assert("vs-279", shift->pend == 1);
55167 +               for (i = 0; i < shifted; i++) {
55168 +                       unsigned from, count;
55169 +
55170 +                       iplug = item_plugin_by_coord(&coord);
55171 +                       if (i == 0 && shift->part_units) {
55172 +                               assert("vs-277",
55173 +                                      coord_num_units(&coord) ==
55174 +                                      shift->part_units);
55175 +                               count = shift->part_units;
55176 +                               from = 0;
55177 +                       } else if (i == shifted - 1 && shift->merging_units) {
55178 +                               count = shift->merging_units;
55179 +                               from = coord_num_units(&coord) - count;
55180 +                       } else {
55181 +                               count = coord_num_units(&coord);
55182 +                               from = 0;
55183 +                       }
55184 +
55185 +                       if (iplug->b.shift_hook) {
55186 +                               iplug->b.shift_hook(&coord, from, count,
55187 +                                                   shift->wish_stop.node);
55188 +                       }
55189 +                       coord_add_item_pos(&coord, -shift->pend);
55190 +               }
55191 +       } else {
55192 +               /* moved items are at the beginning */
55193 +               coord_init_first_unit(&coord, shift->target);
55194 +
55195 +               assert("vs-278", shift->pend == -1);
55196 +               for (i = 0; i < shifted; i++) {
55197 +                       unsigned from, count;
55198 +
55199 +                       iplug = item_plugin_by_coord(&coord);
55200 +                       if (i == 0 && shift->part_units) {
55201 +                               assert("vs-277",
55202 +                                      coord_num_units(&coord) ==
55203 +                                      shift->part_units);
55204 +                               count = coord_num_units(&coord);
55205 +                               from = 0;
55206 +                       } else if (i == shifted - 1 && shift->merging_units) {
55207 +                               count = shift->merging_units;
55208 +                               from = 0;
55209 +                       } else {
55210 +                               count = coord_num_units(&coord);
55211 +                               from = 0;
55212 +                       }
55213 +
55214 +                       if (iplug->b.shift_hook) {
55215 +                               iplug->b.shift_hook(&coord, from, count,
55216 +                                                   shift->wish_stop.node);
55217 +                       }
55218 +                       coord_add_item_pos(&coord, -shift->pend);
55219 +               }
55220 +       }
55221 +
55222 +       return 0;
55223 +}
55224 +
55225 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor, i.e. @old is not after @shift->real_stop; 0 otherwise */
55226 +static int
55227 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
55228 +{
55229 +       assert("vs-944", shift->real_stop.node == old->node);
55230 +
55231 +       if (shift->real_stop.item_pos < old->item_pos)
55232 +               return 0;
55233 +       if (shift->real_stop.item_pos == old->item_pos) {
55234 +               if (shift->real_stop.unit_pos < old->unit_pos)
55235 +                       return 0;
55236 +       }
55237 +       return 1;
55238 +}
55239 +
55240 +/* shift to right is completed. Return 1 if unit @old was moved to right
55241 +   neighbor, i.e. @old is not before @shift->real_stop; 0 otherwise */
55242 +static int
55243 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
55244 +{
55245 +       assert("vs-944", shift->real_stop.node == old->node);
55246 +
55247 +       if (shift->real_stop.item_pos > old->item_pos)
55248 +               return 0;
55249 +       if (shift->real_stop.item_pos == old->item_pos) {
55250 +               if (shift->real_stop.unit_pos > old->unit_pos)
55251 +                       return 0;
55252 +       }
55253 +       return 1;
55254 +}
55255 +
55256 +/* @old is set in one of the two nodes which took part in the shift described
55257 +   by @shift. Store its position after the shift in @new and return @new */
55258 +static coord_t *adjust_coord2(const struct shift_params *shift,
55259 +                             const coord_t * old, coord_t * new)
55260 +{
55261 +       /* paths which do not return early look the item plugin up again below */
55262 +       coord_clear_iplug(new);
55263 +       new->between = old->between;
55264 +
55265 +       if (old->node == shift->target) {
55266 +               if (shift->pend == SHIFT_LEFT) {
55267 +                       /* coord which is set inside of left neighbor does not
55268 +                          change during shift to left */
55269 +                       coord_dup(new, old);
55270 +                       return new;
55271 +               }
55272 +               new->node = old->node;
55273 +               coord_set_item_pos(new,
55274 +                                  old->item_pos + shift->entire +
55275 +                                  (shift->part_units ? 1 : 0));
55276 +               new->unit_pos = old->unit_pos;
55277 +               if (old->item_pos == 0 && shift->merging_units)
55278 +                       new->unit_pos += shift->merging_units;
55279 +               return new;
55280 +       }
55281 +
55282 +       assert("vs-977", old->node == shift->wish_stop.node);
55283 +       if (shift->pend == SHIFT_LEFT) {
55284 +               if (unit_moved_left(shift, old)) {
55285 +                       /* unit @old moved to left neighbor. Calculate its
55286 +                          coordinate there */
55287 +                       new->node = shift->target;
55288 +                       coord_set_item_pos(new,
55289 +                                          node_num_items(shift->target) -
55290 +                                          shift->entire -
55291 +                                          (shift->part_units ? 1 : 0) +
55292 +                                          old->item_pos);
55293 +
55294 +                       new->unit_pos = old->unit_pos;
55295 +                       if (shift->merging_units) {
55296 +                               coord_dec_item_pos(new);
55297 +                               if (old->item_pos == 0) {
55298 +                                       /* unit_pos only changes if item got
55299 +                                          merged */
55300 +                                       new->unit_pos =
55301 +                                           coord_num_units(new) -
55302 +                                           (shift->merging_units -
55303 +                                            old->unit_pos);
55304 +                               }
55305 +                       }
55306 +               } else {
55307 +                       /* unit @old did not move to left neighbor.
55308 +
55309 +                          Use _nocheck, because @old is outside of its node.
55310 +                        */
55311 +                       coord_dup_nocheck(new, old);
55312 +                       coord_add_item_pos(new,
55313 +                                          -shift->u.future_first.item_pos);
55314 +                       if (new->item_pos == 0)
55315 +                               new->unit_pos -= shift->u.future_first.unit_pos;
55316 +               }
55317 +       } else {
55318 +               if (unit_moved_right(shift, old)) {
55319 +                       /* unit @old moved to right neighbor */
55320 +                       new->node = shift->target;
55321 +                       coord_set_item_pos(new,
55322 +                                          old->item_pos -
55323 +                                          shift->real_stop.item_pos);
55324 +                       new->unit_pos = old->unit_pos;
55325 +                       if (new->item_pos == 0) {
55326 +                               /* unit_pos only changes within the item
55327 +                                  which @real_stop points into */
55328 +                               new->unit_pos -= shift->real_stop.unit_pos;
55329 +                       }
55330 +               } else {
55331 +                       /* unit @old did not move to right neighbor, therefore
55332 +                          it did not change */
55333 +                       coord_dup(new, old);
55334 +               }
55335 +       }
55336 +       coord_set_iplug(new, item_plugin_by_coord(new));
55337 +       return new;
55338 +}
55339 +
55340 +/* called once a shift is complete (data copied to the target and removed from
55341 +   the source): re-point every tap of the current context which is set in
55342 +   either of the two nodes involved */
55343 +static void update_taps(const struct shift_params *shift)
55344 +{
55345 +       coord_t moved;
55346 +       tap_t *tap;
55347 +
55348 +       for_all_taps(tap) {
55349 +               const znode *at = tap->coord->node;
55350 +               /* taps set in nodes not taking part in the shift stay put */
55351 +               if (at == shift->wish_stop.node || at == shift->target)
55352 +                       tap_to_coord(tap, adjust_coord2(shift, tap->coord,
55353 +                                                       &moved));
55354 +       }
55355 +}
55356 +
55357 +#if REISER4_DEBUG
55358 +
55359 +struct shift_check {
55360 +       reiser4_key key;        /* key copied from the item header */
55361 +       __u16 plugin_id;        /* item plugin id read from the item header */
55362 +       union {
55363 +               __u64 bytes;    /* ctail, formatting and extent items */
55364 +               __u64 entries;  /* compound directory items */
55365 +               void *unused;   /* set to NULL for any other item */
55366 +       } u;
55367 +};
55368 +/* debug: snapshot key, plugin id and size of each item of @left and @right */
55369 +void *shift_check_prepare(const znode * left, const znode * right)
55370 +{
55371 +       pos_in_node_t i, nr_items;
55372 +       int mergeable;  /* last item of @left merges with first of @right */
55373 +       struct shift_check *data;
55374 +       item_header40 *ih;
55375 +
55376 +       if (node_is_empty(left) || node_is_empty(right))
55377 +               mergeable = 0;
55378 +       else {
55379 +               coord_t l, r;
55380 +
55381 +               coord_init_last_unit(&l, left);
55382 +               coord_init_first_unit(&r, right);
55383 +               mergeable = are_items_mergeable(&l, &r);
55384 +       }
55385 +       nr_items =
55386 +           node40_num_of_items_internal(left) +
55387 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55388 +       data =
55389 +               kmalloc(sizeof(struct shift_check) * nr_items,
55390 +                       reiser4_ctx_gfp_mask_get());
55391 +       if (data != NULL) {
55392 +               coord_t coord;
55393 +               pos_in_node_t item_pos;
55394 +
55395 +               coord_init_first_unit(&coord, left);
55396 +               i = 0;
55397 +               /* one entry per item of @left */
55398 +               for (item_pos = 0;
55399 +                    item_pos < node40_num_of_items_internal(left);
55400 +                    item_pos++) {
55401 +
55402 +                       coord_set_item_pos(&coord, item_pos);
55403 +                       ih = node40_ih_at_coord(&coord);
55404 +
55405 +                       data[i].key = ih->key;
55406 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55407 +                       switch (data[i].plugin_id) {
55408 +                       case CTAIL_ID:
55409 +                       case FORMATTING_ID:
55410 +                               data[i].u.bytes = coord_num_units(&coord);
55411 +                               break;
55412 +                       case EXTENT_POINTER_ID:
55413 +                               data[i].u.bytes =
55414 +                                       reiser4_extent_size(&coord,
55415 +                                                      coord_num_units(&coord));
55416 +                               break;
55417 +                       case COMPOUND_DIR_ID:
55418 +                               data[i].u.entries = coord_num_units(&coord);
55419 +                               break;
55420 +                       default:
55421 +                               data[i].u.unused = NULL;
55422 +                               break;
55423 +                       }
55424 +                       i++;
55425 +               }
55426 +
55427 +               coord_init_first_unit(&coord, right);
55428 +               /* a mergeable first item of @right is summed into the last entry */
55429 +               if (mergeable) {
55430 +                       assert("vs-1609", i != 0);
55431 +
55432 +                       ih = node40_ih_at_coord(&coord);
55433 +
55434 +                       assert("vs-1589",
55435 +                              data[i - 1].plugin_id ==
55436 +                              le16_to_cpu(get_unaligned(&ih->plugin_id)));
55437 +                       switch (data[i - 1].plugin_id) {
55438 +                       case CTAIL_ID:
55439 +                       case FORMATTING_ID:
55440 +                               data[i - 1].u.bytes += coord_num_units(&coord);
55441 +                               break;
55442 +                       case EXTENT_POINTER_ID:
55443 +                               data[i - 1].u.bytes +=
55444 +                                   reiser4_extent_size(&coord,
55445 +                                               coord_num_units(&coord));
55446 +                               break;
55447 +                       case COMPOUND_DIR_ID:
55448 +                               data[i - 1].u.entries +=
55449 +                                   coord_num_units(&coord);
55450 +                               break;
55451 +                       default:
55452 +                               impossible("vs-1605", "wrong mergeable item");
55453 +                               break;
55454 +                       }
55455 +                       item_pos = 1;   /* first item of @right is done */
55456 +               } else
55457 +                       item_pos = 0;
55458 +               for (; item_pos < node40_num_of_items_internal(right);
55459 +                    item_pos++) {
55460 +
55461 +                       assert("vs-1604", i < nr_items);
55462 +                       coord_set_item_pos(&coord, item_pos);
55463 +                       ih = node40_ih_at_coord(&coord);
55464 +
55465 +                       data[i].key = ih->key;
55466 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55467 +                       switch (data[i].plugin_id) {
55468 +                       case CTAIL_ID:
55469 +                       case FORMATTING_ID:
55470 +                               data[i].u.bytes = coord_num_units(&coord);
55471 +                               break;
55472 +                       case EXTENT_POINTER_ID:
55473 +                               data[i].u.bytes =
55474 +                                   reiser4_extent_size(&coord,
55475 +                                               coord_num_units(&coord));
55476 +                               break;
55477 +                       case COMPOUND_DIR_ID:
55478 +                               data[i].u.entries = coord_num_units(&coord);
55479 +                               break;
55480 +                       default:
55481 +                               data[i].u.unused = NULL;
55482 +                               break;
55483 +                       }
55484 +                       i++;
55485 +               }
55486 +               assert("vs-1606", i == nr_items);
55487 +       }
55488 +       return data;    /* NULL if kmalloc() failed */
55489 +}
55490 +/* debug: check nodes against snapshot @vp of shift_check_prepare(), free @vp */
55491 +void shift_check(void *vp, const znode * left, const znode * right)
55492 +{
55493 +       pos_in_node_t i, nr_items;
55494 +       coord_t coord;
55495 +       __u64 last_bytes;       /* size of last item of @left when mergeable */
55496 +       int mergeable;
55497 +       item_header40 *ih;
55498 +       pos_in_node_t item_pos;
55499 +       struct shift_check *data;
55500 +
55501 +       data = (struct shift_check *)vp;
55502 +       /* shift_check_prepare() could not allocate: nothing to compare with */
55503 +       if (data == NULL)
55504 +               return;
55505 +
55506 +       if (node_is_empty(left) || node_is_empty(right))
55507 +               mergeable = 0;
55508 +       else {
55509 +               coord_t l, r;
55510 +
55511 +               coord_init_last_unit(&l, left);
55512 +               coord_init_first_unit(&r, right);
55513 +               mergeable = are_items_mergeable(&l, &r);
55514 +       }
55515 +
55516 +       nr_items =
55517 +           node40_num_of_items_internal(left) +
55518 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55519 +
55520 +       i = 0;
55521 +       last_bytes = 0;
55522 +
55523 +       coord_init_first_unit(&coord, left);
55524 +
55525 +       for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55526 +            item_pos++) {
55527 +
55528 +               coord_set_item_pos(&coord, item_pos);
55529 +               ih = node40_ih_at_coord(&coord);
55530 +
55531 +               assert("vs-1611", i == item_pos);
55532 +               assert("vs-1590", keyeq(&ih->key, &data[i].key));
55533 +               assert("vs-1591",
55534 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55535 +               if ((i < (node40_num_of_items_internal(left) - 1))
55536 +                   || !mergeable) {
55537 +                       switch (data[i].plugin_id) {
55538 +                       case CTAIL_ID:
55539 +                       case FORMATTING_ID:
55540 +                               assert("vs-1592",
55541 +                                      data[i].u.bytes ==
55542 +                                      coord_num_units(&coord));
55543 +                               break;
55544 +                       case EXTENT_POINTER_ID:
55545 +                               assert("vs-1593",
55546 +                                      data[i].u.bytes ==
55547 +                                      reiser4_extent_size(&coord,
55548 +                                                          coord_num_units
55549 +                                                          (&coord)));
55550 +                               break;
55551 +                       case COMPOUND_DIR_ID:
55552 +                               assert("vs-1594",
55553 +                                      data[i].u.entries ==
55554 +                                      coord_num_units(&coord));
55555 +                               break;
55556 +                       default:
55557 +                               break;
55558 +                       }
55559 +               }
55560 +               if (item_pos == (node40_num_of_items_internal(left) - 1)
55561 +                   && mergeable) {
55562 +                       switch (data[i].plugin_id) {
55563 +                       case CTAIL_ID:
55564 +                       case FORMATTING_ID:
55565 +                               last_bytes = coord_num_units(&coord);
55566 +                               break;
55567 +                       case EXTENT_POINTER_ID:
55568 +                               last_bytes =
55569 +                                   reiser4_extent_size(&coord,
55570 +                                               coord_num_units(&coord));
55571 +                               break;
55572 +                       case COMPOUND_DIR_ID:
55573 +                               last_bytes = coord_num_units(&coord);
55574 +                               break;
55575 +                       default:
55576 +                               impossible("vs-1595", "wrong mergeable item");
55577 +                               break;
55578 +                       }
55579 +               }
55580 +               i++;
55581 +       }
55582 +
55583 +       coord_init_first_unit(&coord, right);
55584 +       if (mergeable) {
55585 +               ih = node40_ih_at_coord(&coord);
55586 +               /* both halves of the boundary item must add up to one entry */
55587 +               assert("vs-1589",
55588 +                      data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55589 +               assert("vs-1608", last_bytes != 0);
55590 +               switch (data[i - 1].plugin_id) {
55591 +               case CTAIL_ID:
55592 +               case FORMATTING_ID:
55593 +                       assert("vs-1596",
55594 +                              data[i - 1].u.bytes ==
55595 +                              last_bytes + coord_num_units(&coord));
55596 +                       break;
55597 +
55598 +               case EXTENT_POINTER_ID:
55599 +                       assert("vs-1597",
55600 +                              data[i - 1].u.bytes ==
55601 +                              last_bytes + reiser4_extent_size(&coord,
55602 +                                                               coord_num_units
55603 +                                                               (&coord)));
55604 +                       break;
55605 +
55606 +               case COMPOUND_DIR_ID:
55607 +                       assert("vs-1598",
55608 +                              data[i - 1].u.entries ==
55609 +                              last_bytes + coord_num_units(&coord));
55610 +                       break;
55611 +               default:
55612 +                       impossible("vs-1599", "wrong mergeable item");
55613 +                       break;
55614 +               }
55615 +               item_pos = 1;   /* first item of @right is already checked */
55616 +       } else
55617 +               item_pos = 0;
55618 +
55619 +       for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55620 +
55621 +               coord_set_item_pos(&coord, item_pos);
55622 +               ih = node40_ih_at_coord(&coord);
55623 +
55624 +               assert("vs-1612", keyeq(&ih->key, &data[i].key));
55625 +               assert("vs-1613",
55626 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55627 +               switch (data[i].plugin_id) {
55628 +               case CTAIL_ID:
55629 +               case FORMATTING_ID:
55630 +                       assert("vs-1600",
55631 +                              data[i].u.bytes == coord_num_units(&coord));
55632 +                       break;
55633 +               case EXTENT_POINTER_ID:
55634 +                       assert("vs-1601",
55635 +                              data[i].u.bytes ==
55636 +                              reiser4_extent_size(&coord,
55637 +                                                  coord_num_units
55638 +                                                  (&coord)));
55639 +                       break;
55640 +               case COMPOUND_DIR_ID:
55641 +                       assert("vs-1602",
55642 +                              data[i].u.entries == coord_num_units(&coord));
55643 +                       break;
55644 +               default:
55645 +                       break;
55646 +               }
55647 +               i++;
55648 +       }
55649 +
55650 +       assert("vs-1603", i == nr_items);
55651 +       kfree(data);
55652 +}
55653 +
55654 +#endif
55655 +
55656 +/* plugin->u.node.shift, see plugin/node/node.h. Returns bytes shifted, or the
55657 +   non-zero result of prepare_for_update() / prepare_removal_node40() */
55658 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
55659 +                int delete_child, /* delete @from->node if it gets empty */
55660 +                int including_stop_coord, carry_plugin_info * info)
55661 +{
55662 +       struct shift_params shift;
55663 +       int result;
55664 +       znode *left, *right;
55665 +       znode *source;
55666 +       int target_empty;
55667 +
55668 +       assert("nikita-2161", coord_check(from));
55669 +
55670 +       memset(&shift, 0, sizeof(shift));
55671 +       shift.pend = pend;
55672 +       shift.wish_stop = *from;
55673 +       shift.target = to;
55674 +
55675 +       assert("nikita-1473", znode_is_write_locked(from->node));
55676 +       assert("nikita-1474", znode_is_write_locked(to));
55677 +
55678 +       source = from->node;
55679 +
55680 +       /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55681 +          shifted */
55682 +       if (pend == SHIFT_LEFT) {
55683 +               result = coord_set_to_left(&shift.wish_stop);
55684 +               left = to;
55685 +               right = from->node;
55686 +       } else {
55687 +               result = coord_set_to_right(&shift.wish_stop);
55688 +               left = from->node;
55689 +               right = to;
55690 +       }
55691 +
55692 +       if (result) {
55693 +               /* move insertion coord even if there is nothing to move */
55694 +               if (including_stop_coord) {
55695 +                       /* move insertion coord (@from) */
55696 +                       if (pend == SHIFT_LEFT) {
55697 +                               /* after last item in target node */
55698 +                               coord_init_after_last_item(from, to);
55699 +                       } else {
55700 +                               /* before first item in target node */
55701 +                               coord_init_before_first_item(from, to);
55702 +                       }
55703 +               }
55704 +
55705 +               if (delete_child && node_is_empty(shift.wish_stop.node))
55706 +                       result =
55707 +                           prepare_removal_node40(shift.wish_stop.node, info);
55708 +               else
55709 +                       result = 0;
55710 +               /* there is nothing to shift */
55711 +               assert("nikita-2078", coord_check(from));
55712 +               return result;
55713 +       }
55714 +
55715 +       target_empty = node_is_empty(to);
55716 +
55717 +       /* when first node plugin with item body compression is implemented,
55718 +          this must be changed to call node specific plugin */
55719 +
55720 +       /* shift->stop_coord is updated to last unit which really will be
55721 +          shifted */
55722 +       estimate_shift(&shift, get_current_context());
55723 +       if (!shift.shift_bytes) {
55724 +               /* we could not shift anything */
55725 +               assert("nikita-2079", coord_check(from));
55726 +               return 0;
55727 +       }
55728 +
55729 +       copy(&shift);
55730 +
55731 +       /* result value of this is important. It is used by adjust_coord below */
55732 +       result = delete_copied(&shift);
55733 +
55734 +       assert("vs-1610", result >= 0);
55735 +       assert("vs-1471",
55736 +              ((reiser4_context *) current->journal_info)->magic ==
55737 +              context_magic);
55738 +
55739 +       /* item which has been moved from one node to another might want to do
55740 +          something on that event. This can be done by item's shift_hook
55741 +          method, which will be now called for every moved items */
55742 +       call_shift_hooks(&shift);
55743 +
55744 +       assert("vs-1472",
55745 +              ((reiser4_context *) current->journal_info)->magic ==
55746 +              context_magic);
55747 +
55748 +       update_taps(&shift);
55749 +
55750 +       assert("vs-1473",
55751 +              ((reiser4_context *) current->journal_info)->magic ==
55752 +              context_magic);
55753 +
55754 +       /* adjust @from pointer in accordance with @including_stop_coord flag
55755 +          and amount of data which was really shifted */
55756 +       adjust_coord(from, &shift, result, including_stop_coord);
55757 +
55758 +       /* items were shifted into empty node. Update delimiting key. An error
55759 +          here must not be overwritten by the update request posted below */
55760 +       result = 0;
55761 +       if (target_empty)
55762 +               result = prepare_for_update(NULL, left, info);
55763 +       /* add update operation to @info, which is the list of operations to
55764 +          be performed on a higher level */
55765 +       if (!result)
55766 +               result = prepare_for_update(left, right, info);
55767 +       if (!result && node_is_empty(source) && delete_child) {
55768 +               /* all contents of @from->node is moved to @to and @from->node
55769 +                  has to be removed from the tree, so, on higher level we
55770 +                  will be removing the pointer to node @from->node */
55771 +               result = prepare_removal_node40(source, info);
55772 +       }
55773 +       assert("nikita-2080", coord_check(from));
55774 +       return result ? result : (int)shift.shift_bytes;
55775 +}
55776 +
55777 +/* plugin->u.node.fast_insert()
55778 +   described in plugin/node/node.h; node40 ignores @coord and returns 1 */
55779 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* not examined */ )
55780 +{
55781 +       return 1;
55782 +}
55783 +
55784 +/* plugin->u.node.fast_paste()
55785 +   described in plugin/node/node.h; node40 ignores @coord and returns 1 */
55786 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* not examined */ )
55787 +{
55788 +       return 1;
55789 +}
55790 +
55791 +/* plugin->u.node.fast_cut()
55792 +   described in plugin/node/node.h; node40 ignores @coord and returns 1 */
55793 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* not examined */ )
55794 +{
55795 +       return 1;
55796 +}
55797 +
55798 +/* plugin->u.node.modify - not defined */
55799 +
55800 +/* plugin->u.node.max_item_size: block size less node and item header */
55801 +int max_item_size_node40(void)
55802 +{
55803 +       return reiser4_get_current_sb()->s_blocksize -
55804 +           (sizeof(node40_header) + sizeof(item_header40));
55805 +}
55806 +
55807 +/* plugin->u.node.set_item_plugin: store @id in item header and in @coord */
55808 +int set_item_plugin_node40(coord_t *coord, item_id id)
55809 +{
55810 +       item_header40 *header = node40_ih_at_coord(coord);
55811 +
55812 +       /* header is PACKED, so the little-endian id is stored unaligned */
55813 +       put_unaligned(cpu_to_le16(id), &header->plugin_id);
55814 +       coord->iplugid = id;
55815 +       return 0;
55816 +}
55817 +
55818 +/*
55819 +   Local variables:
55820 +   c-indentation-style: "K&R"
55821 +   mode-name: "LC"
55822 +   c-basic-offset: 8
55823 +   tab-width: 8
55824 +   fill-column: 120
55825 +   scroll-step: 1
55826 +   End:
55827 +*/
55828 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/node/node40.h linux-2.6.35/fs/reiser4/plugin/node/node40.h
55829 --- linux-2.6.35.orig/fs/reiser4/plugin/node/node40.h   1970-01-01 01:00:00.000000000 +0100
55830 +++ linux-2.6.35/fs/reiser4/plugin/node/node40.h        2010-08-04 15:44:57.000000000 +0200
55831 @@ -0,0 +1,125 @@
55832 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55833 +
55834 +#if !defined( __REISER4_NODE40_H__ )
55835 +#define __REISER4_NODE40_H__
55836 +
55837 +#include "../../forward.h"
55838 +#include "../../dformat.h"
55839 +#include "node.h"
55840 +
55841 +#include <linux/types.h>
55842 +
55843 +/* format of node header for 40 node layouts. Keep bloat out of this struct.  */
55844 +typedef struct node40_header {
55845 +       /* identifier of node plugin. Must be located at the very beginning
55846 +          of a node. */
55847 +       common_node_header common_header;       /* this is 16 bits */
55848 +       /* number of items. Should be first element in the node header,
55849 +          because we haven't yet finally decided whether it shouldn't go into
55850 +          common_header.
55851 +        */
55852 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55853 + * node format at compile time, and it is this one, accesses do not function dereference when
55854 + * accessing these fields (and otherwise they do).  Probably 80% of users will only have one node format at a time throughout the life of reiser4.  */
55855 +       d16 nr_items;
55856 +       /* free space in node measured in bytes */
55857 +       d16 free_space;
55858 +       /* offset to start of free space in node */
55859 +       d16 free_space_start;
55860 +       /* for reiser4_fsck.  When information about what is a free
55861 +          block is corrupted, and we try to recover everything even
55862 +          if marked as freed, then old versions of data may
55863 +          duplicate newer versions, and this field allows us to
55864 +          restore the newer version.  Also useful for when users
55865 +          who don't have the new trashcan installed on their linux distro
55866 +          delete the wrong files and send us desperate emails
55867 +          offering $25 for them back.  */
55868 +       /* NOTE(review): no field follows the comment above; confirm its target */
55869 +       /* magic field we need to tell formatted nodes NIKITA-FIXME-HANS: improve this comment */
55870 +       d32 magic;
55871 +       /* flushstamp is made of mkfs_id and flush_id. mkfs_id is an
55872 +          id generated randomly at mkreiserfs time. So we can just
55873 +          skip all nodes with different mkfs_id. flush_id is d64
55874 +          incrementing counter of writes on disk. It is used for
55875 +          choosing the newest data at fsck time. */
55876 +
55877 +       d32 mkfs_id;
55878 +       d64 flush_id;
55879 +       /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
55880 +          and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
55881 +       d16 flags;
55882 +
55883 +       /* 1 is leaf level, 2 is twig level, root is the numerically
55884 +          largest level */
55885 +       d8 level;
55886 +
55887 +       d8 pad;
55888 +} PACKED node40_header;
55889 +
55890 +/* item headers are not standard across all node layouts, pass
55891 +   pos_in_node to functions instead. PACKED: fields may be unaligned */
55892 +typedef struct item_header40 {
55893 +       /* key of item */
55894 +       /*  0 */ reiser4_key key;
55895 +       /* offset from start of a node measured in 8-byte chunks */
55896 +       /* 24 */ d16 offset;
55897 +       /* 26 */ d16 flags;
55898 +       /* 28 */ d16 plugin_id; /* item plugin id, stored little-endian */
55899 +} PACKED item_header40;
55900 +
55901 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
55902 +size_t free_space_node40(znode * node);
55903 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
55904 +                                lookup_bias bias, coord_t * coord);
55905 +int num_of_items_node40(const znode * node);
55906 +char *item_by_coord_node40(const coord_t * coord);
55907 +int length_by_coord_node40(const coord_t * coord);
55908 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
55909 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
55910 +size_t estimate_node40(znode * node);
55911 +int check_node40(const znode * node, __u32 flags, const char **error);
55912 +int parse_node40(znode * node);
55913 +int init_node40(znode * node);
55914 +#ifdef GUESS_EXISTS
55915 +int guess_node40(const znode * node);
55916 +#endif
55917 +void change_item_size_node40(coord_t * coord, int by);
55918 +int create_item_node40(coord_t * target, const reiser4_key * key,
55919 +                      reiser4_item_data * data, carry_plugin_info * info);
55920 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
55921 +                           carry_plugin_info * info);
55922 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
55923 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
55924 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
55925 +                /* if @from->node becomes
55926 +                   empty - it will be deleted from
55927 +                   the tree if this is set to 1
55928 +                 */
55929 +                int delete_child, int including_stop_coord,
55930 +                carry_plugin_info * info);
55931 +
55932 +int fast_insert_node40(const coord_t * coord);
55933 +int fast_paste_node40(const coord_t * coord);
55934 +int fast_cut_node40(const coord_t * coord);
55935 +int max_item_size_node40(void);
55936 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
55937 +int set_item_plugin_node40(coord_t * coord, item_id id);
55938 +int shrink_item_node40(coord_t * coord, int delta);
55939 +
55940 +#if REISER4_DEBUG
55941 +void *shift_check_prepare(const znode *left, const znode *right);
55942 +void shift_check(void *vp, const znode *left, const znode *right);
55943 +#endif
55944 +
55945 +/* __REISER4_NODE40_H__ */
55946 +#endif
55947 +/*
55948 +   Local variables:
55949 +   c-indentation-style: "K&R"
55950 +   mode-name: "LC"
55951 +   c-basic-offset: 8
55952 +   tab-width: 8
55953 +   fill-column: 120
55954 +   scroll-step: 1
55955 +   End:
55956 +*/
55957 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/node/node.c linux-2.6.35/fs/reiser4/plugin/node/node.c
55958 --- linux-2.6.35.orig/fs/reiser4/plugin/node/node.c     1970-01-01 01:00:00.000000000 +0100
55959 +++ linux-2.6.35/fs/reiser4/plugin/node/node.c  2010-08-04 15:44:57.000000000 +0200
55960 @@ -0,0 +1,131 @@
55961 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55962 +
55963 +/* Node plugin interface.
55964 +
55965 +   Description: The tree provides the abstraction of flows, which it
55966 +   internally fragments into items which it stores in nodes.
55967 +
55968 +   A key_atom is a piece of data bound to a single key.
55969 +
55970 +   For reasonable space efficiency to be achieved it is often
55971 +   necessary to store key_atoms in the nodes in the form of items, where
55972 +   an item is a sequence of key_atoms of the same or similar type. It is
55973 +   more space-efficient, because the item can implement (very)
55974 +   efficient compression of key_atom's bodies using internal knowledge
55975 +   about their semantics, and it can often avoid having a key for each
55976 +   key_atom. Each type of item has specific operations implemented by its
55977 +   item handler (see balance.c).
55978 +
55979 +   Rationale: the rest of the code (specifically balancing routines)
55980 +   accesses leaf level nodes through this interface. This way we can
55981 +   implement various block layouts and even combine various layouts
55982 +   within the same tree. Balancing/allocating algorithms should not
55983 +   care about peculiarities of splitting/merging specific item types,
55984 +   but rather should leave that to the item's item handler.
55985 +
55986 +   Items, including those that provide the abstraction of flows, have
55987 +   the property that if you move them in part or in whole to another
55988 +   node, the balancing code invokes their is_left_mergeable()
55989 +   item_operation to determine if they are mergeable with their new
55990 +   neighbor in the node you have moved them to.  For some items the
55991 +   is_left_mergeable() function always returns null.
55992 +
55993 +   When moving the bodies of items from one node to another:
55994 +
55995 +     if a partial item is shifted to another node the balancing code invokes
55996 +     an item handler method to handle the item splitting.
55997 +
55998 +     if the balancing code needs to merge with an item in the node it
55999 +     is shifting to, it will invoke an item handler method to handle
56000 +     the item merging.
56001 +
56002 +     if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy()
56003 +     adjusting the item headers after the move is done using the node handler.
56004 +*/
56005 +
56006 +#include "../../forward.h"
56007 +#include "../../debug.h"
56008 +#include "../../key.h"
56009 +#include "../../coord.h"
56010 +#include "../plugin_header.h"
56011 +#include "../item/item.h"
56012 +#include "node.h"
56013 +#include "../plugin.h"
56014 +#include "../../znode.h"
56015 +#include "../../tree.h"
56016 +#include "../../super.h"
56017 +#include "../../reiser4.h"
56018 +
56019 +/**
56020 + * leftmost_key_in_node - get the smallest key in node
56021 + * @node: node to take the leftmost key from (asserted to be non-NULL)
56022 + * @key: store result here (asserted to be non-NULL)
56023 + *
56024 + * Stores the leftmost key of @node in @key, or the value of reiser4_max_key() if @node is empty. Returns @key.
56025 + */
56026 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
56027 +{
56028 +       assert("nikita-1634", node != NULL);
56029 +       assert("nikita-1635", key != NULL);
56030 +
56031 +       if (!node_is_empty(node)) {
56032 +               coord_t first_item;
56033 +
56034 +               coord_init_first_unit(&first_item, (znode *) node);
56035 +               item_key_by_coord(&first_item, key);
56036 +       } else
56037 +               *key = *reiser4_max_key();
56038 +       return key;
56039 +}
56040 +
56041 +node_plugin node_plugins[LAST_NODE_ID] = { /* node plugin table; NODE40_ID is the only slot initialised here */
56042 +       [NODE40_ID] = {
56043 +               .h = {
56044 +                       .type_id = REISER4_NODE_PLUGIN_TYPE,
56045 +                       .id = NODE40_ID,
56046 +                       .pops = NULL,
56047 +                       .label = "unified",
56048 +                       .desc = "unified node layout",
56049 +                       .linkage = {NULL, NULL}
56050 +               },
56051 +               .item_overhead = item_overhead_node40,
56052 +               .free_space = free_space_node40,
56053 +               .lookup = lookup_node40,
56054 +               .num_of_items = num_of_items_node40,
56055 +               .item_by_coord = item_by_coord_node40,
56056 +               .length_by_coord = length_by_coord_node40,
56057 +               .plugin_by_coord = plugin_by_coord_node40,
56058 +               .key_at = key_at_node40,
56059 +               .estimate = estimate_node40,
56060 +               .check = check_node40,
56061 +               .parse = parse_node40,
56062 +               .init = init_node40,
56063 +#ifdef GUESS_EXISTS
56064 +               .guess = guess_node40,
56065 +#endif /* GUESS_EXISTS */
56066 +               .change_item_size = change_item_size_node40,
56067 +               .create_item = create_item_node40,
56068 +               .update_item_key = update_item_key_node40,
56069 +               .cut_and_kill = kill_node40,
56070 +               .cut = cut_node40,
56071 +               .shift = shift_node40,
56072 +               .shrink_item = shrink_item_node40,
56073 +               .fast_insert = fast_insert_node40,
56074 +               .fast_paste = fast_paste_node40,
56075 +               .fast_cut = fast_cut_node40,
56076 +               .max_item_size = max_item_size_node40,
56077 +               .prepare_removal = prepare_removal_node40,
56078 +               .set_item_plugin = set_item_plugin_node40
56079 +       }
56080 +};
56081 +
56082 +/*
56083 +   Local variables:
56084 +   c-indentation-style: "K&R"
56085 +   mode-name: "LC"
56086 +   c-basic-offset: 8
56087 +   tab-width: 8
56088 +   fill-column: 120
56089 +   scroll-step: 1
56090 +   End:
56091 +*/
56092 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/node/node.h linux-2.6.35/fs/reiser4/plugin/node/node.h
56093 --- linux-2.6.35.orig/fs/reiser4/plugin/node/node.h     1970-01-01 01:00:00.000000000 +0100
56094 +++ linux-2.6.35/fs/reiser4/plugin/node/node.h  2010-08-04 15:44:57.000000000 +0200
56095 @@ -0,0 +1,272 @@
56096 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56097 +
56098 +/* We need a definition of the default node layout here. */
56099 +
56100 +/* Generally speaking, it is best to have free space in the middle of the
56101 +   node so that two sets of things can grow towards it, and to have the
56102 +   item bodies on the left so that the last one of them grows into free
56103 +   space.  We optimize for the case where we append new items to the end
56104 +   of the node, or grow the last item, because it hurts nothing to so
56105 +   optimize and it is a common special case to do massive insertions in
56106 +   increasing key order (and one of the cases in which a real user is more
56107 +   likely to notice the delay).
56108 +
56109 +   formatted leaf default layout: (leaf1)
56110 +
56111 +   |node header:item bodies:free space:key + pluginid + item offset|
56112 +
56113 +   We grow towards the middle, optimizing layout for the case where we
56114 +   append new items to the end of the node.  The node header is fixed
56115 +   length.  Keys, and item offsets plus pluginids for the items
56116 +   corresponding to them are in increasing key order, and are fixed
56117 +   length.  Item offsets are relative to start of node (16 bits creating
56118 +   a node size limit of 64k, 12 bits might be a better choice....).  Item
56119 +   bodies are in decreasing key order.  Item bodies have a variable size.
56120 +   There is a one to one to one mapping of keys to item offsets to item
56121 +   bodies.  Item offsets consist of pointers to the zeroth byte of the
56122 +   item body.  Item length equals the start of the next item minus the
56123 +   start of this item, except the zeroth item whose length equals the end
56124 +   of the node minus the start of that item (plus a byte).  In other
56125 +   words, the item length is not recorded anywhere, and it does not need
56126 +   to be since it is computable.
56127 +
56128 +   Leaf variable length items and keys layout : (lvar)
56129 +
56130 +   |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
56131 +
56132 +   We grow towards the middle, optimizing layout for the case where we
56133 +   append new items to the end of the node.  The node header is fixed
56134 +   length.  Keys and item offsets for the items corresponding to them are
56135 +   in increasing key order, and keys are variable length.  Item offsets
56136 +   are relative to start of node (16 bits).  Item bodies are in
56137 +   decreasing key order.  Item bodies have a variable size.  There is a
56138 +   one to one to one mapping of keys to item offsets to item bodies.
56139 +   Item offsets consist of pointers to the zeroth byte of the item body.
56140 +   Item length equals the start of the next item's key minus the start of
56141 +   this item, except the zeroth item whose length equals the end of the
56142 +   node minus the start of that item (plus a byte).
56143 +
56144 +   leaf compressed keys layout: (lcomp)
56145 +
56146 +   |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
56147 +
56148 +   We grow towards the middle, optimizing layout for the case where we
56149 +   append new items to the end of the node.  The node header is fixed
56150 +   length.  Keys and item offsets for the items corresponding to them are
56151 +   in increasing key order, and keys are variable length.  The "key
56152 +   inherit" field indicates how much of the key prefix is identical to
56153 +   the previous key (stem compression as described in "Managing
56154 +   Gigabytes" is used).  key_inherit is a one byte integer.  The
56155 +   intra-node searches performed through this layout are linear searches,
56156 +   and this is theorized to not hurt performance much due to the high
56157 +   cost of processor stalls on modern CPUs, and the small number of keys
56158 +   in a single node.  Item offsets are relative to start of node (16
56159 +   bits).  Item bodies are in decreasing key order.  Item bodies have a
56160 +   variable size.  There is a one to one to one mapping of keys to item
56161 +   offsets to item bodies.  Item offsets consist of pointers to the
56162 +   zeroth byte of the item body.  Item length equals the start of the
56163 +   next item minus the start of this item, except the zeroth item whose
56164 +   length equals the end of the node minus the start of that item (plus a
56165 +   byte).  In other words, item length and key length are not recorded
56166 +   anywhere, and they do not need to be since they are computable.
56167 +
56168 +   internal node default layout: (idef1)
56169 +
56170 +   just like leaf1 except that item bodies are either blocknrs of
56171 +   children or extents, and moving them may require updating parent
56172 +   pointers in the nodes that they point to.
56173 +*/
56174 +
56175 +/* There is an inherent 3-way tradeoff between optimizing and
56176 +   exchanging disks between different architectures and code
56177 +   complexity.  This is optimal and simple and inexchangeable.
56178 +   Someone else can do the code for exchanging disks and make it
56179 +   complex. It would not be that hard.  Using other than the PAGE_SIZE
56180 +   might be suboptimal.
56181 +*/
56182 +
56183 +#if !defined( __REISER4_NODE_H__ )
56184 +#define __REISER4_NODE_H__
56185 +
56186 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
56187 +
56188 +#include "../../dformat.h"
56189 +#include "../plugin_header.h"
56190 +
56191 +#include <linux/types.h>
56192 +
56193 +typedef enum { /* return type of the node plugin ->lookup() method */
56194 +       NS_FOUND = 0,
56195 +       NS_NOT_FOUND = -ENOENT /* negated errno value */
56196 +} node_search_result;
56197 +
56198 +/* Maximal possible space overhead for creation of new item in a node */
56199 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
56200 +
56201 +typedef enum { /* single-bit flags; presumably passed as @flags to the node plugin ->check() method - TODO confirm */
56202 +       REISER4_NODE_DKEYS = (1 << 0),
56203 +       REISER4_NODE_TREE_STABLE = (1 << 1)
56204 +} reiser4_node_check_flag;
56205 +
56206 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on stack */
56207 +struct cut_list {
56208 +       coord_t *from;
56209 +       coord_t *to;
56210 +       const reiser4_key *from_key;
56211 +       const reiser4_key *to_key;
56212 +       reiser4_key *smallest_removed;
56213 +       carry_plugin_info *info;
56214 +       __u32 flags;
56215 +       struct inode *inode;    /* this is to pass list of eflushed jnodes down to extent_kill_hook */
56216 +       lock_handle *left;
56217 +       lock_handle *right;
56218 +};
56219 +
56220 +struct carry_cut_data;
56221 +struct carry_kill_data;
56222 +
56223 +/* The responsibility of the node plugin is to store and give access
56224 +   to the sequence of items within the node.  */
56225 +typedef struct node_plugin {
56226 +       /* generic plugin fields */
56227 +       plugin_header h;
56228 +
56229 +       /* calculates the amount of space that will be required to store an
56230 +          item which is in addition to the space consumed by the item body.
56231 +          (the space consumed by the item body can be gotten by calling
56232 +          item->estimate) */
56233 +        size_t(*item_overhead) (const znode * node, flow_t * f);
56234 +
56235 +       /* returns free space by looking into node (i.e., without using
56236 +          znode->free_space). */
56237 +        size_t(*free_space) (znode * node);
56238 +       /* search within the node for the one item which might
56239 +          contain the key, invoking item->search_within to search within
56240 +          that item to see if it is in there */
56241 +        node_search_result(*lookup) (znode * node, const reiser4_key * key,
56242 +                                     lookup_bias bias, coord_t * coord);
56243 +       /* number of items in node */
56244 +       int (*num_of_items) (const znode * node);
56245 +
56246 +       /* store information about item in @coord in @data */
56247 +       /* break into several node ops, don't add any more uses of this before doing so */
56248 +       /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
56249 +       char *(*item_by_coord) (const coord_t * coord);
56250 +       int (*length_by_coord) (const coord_t * coord);
56251 +       item_plugin *(*plugin_by_coord) (const coord_t * coord);
56252 +
56253 +       /* store item key in @key */
56254 +       reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
56255 +       /* conservatively estimate what size of unit can fit
56256 +          into node. This estimation should be performed without
56257 +          actually looking into the node's content (free space is saved in
56258 +          znode). */
56259 +        size_t(*estimate) (znode * node);
56260 +
56261 +       /* performs every consistency check the node plugin author could
56262 +          imagine. Optional. */
56263 +       int (*check) (const znode * node, __u32 flags, const char **error);
56264 +
56265 +       /* Called when node is read into memory and node plugin is
56266 +          already detected. This should read some data into znode (like free
56267 +          space counter) and, optionally, check data consistency.
56268 +        */
56269 +       int (*parse) (znode * node);
56270 +       /* This method is called on a new node to initialise plugin specific
56271 +          data (header, etc.) */
56272 +       int (*init) (znode * node);
56273 +       /* Check whether @node content conforms to this plugin format.
56274 +          Probably only useful after support for old V3.x formats is added.
56275 +          Uncomment after 4.0 only.
56276 +        */
56277 +       /*      int ( *guess )( const znode *node ); */
56278 +#if REISER4_DEBUG
56279 +       void (*print) (const char *prefix, const znode * node, __u32 flags);
56280 +#endif
56281 +       /* change size of @item by @by bytes. @item->node has enough free
56282 +          space. When @by > 0 - free space is appended to end of item. When
56283 +          @by < 0 - item is truncated - it is assumed that last @by bytes of
56284 +          the item are freed already */
56285 +       void (*change_item_size) (coord_t * item, int by);
56286 +
56287 +       /* create new item @length bytes long in coord @target */
56288 +       int (*create_item) (coord_t * target, const reiser4_key * key,
56289 +                           reiser4_item_data * data, carry_plugin_info * info);
56290 +
56291 +       /* update key of item. */
56292 +       void (*update_item_key) (coord_t * target, const reiser4_key * key,
56293 +                                carry_plugin_info * info);
56294 +
56295 +       int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56296 +       int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56297 +
56298 +       /*
56299 +        * shrink item pointed to by @coord by @delta bytes.
56300 +        */
56301 +       int (*shrink_item) (coord_t * coord, int delta);
56302 +
56303 +       /* copy as much as possible but not more than up to @stop from
56304 +          @stop->node to @target. If (pend == append) then data from beginning of
56305 +          @stop->node are copied to the end of @target. If (pend == prepend) then
56306 +          data from the end of @stop->node are copied to the beginning of
56307 +          @target. Copied data are removed from @stop->node. Information
56308 +          about what to do on upper level is stored in @info */
56309 +       int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56310 +                     int delete_node, int including_insert_coord,
56311 +                     carry_plugin_info * info);
56312 +       /* return true if this node allows skipping carry() in some situations
56313 +          (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56314 +          emulation doesn't.
56315 +
56316 +          This will speed up insertions that don't require updates to the
56317 +          parent, by bypassing initialisation of carry() structures. It's
56318 +          believed that majority of insertions will fit there.
56319 +
56320 +        */
56321 +       int (*fast_insert) (const coord_t * coord);
56322 +       int (*fast_paste) (const coord_t * coord);
56323 +       int (*fast_cut) (const coord_t * coord);
56324 +       /* this limits max size of item which can be inserted into a node and
56325 +          number of bytes item in a node may be appended with */
56326 +       int (*max_item_size) (void);
56327 +       int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56328 +       /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
56329 +        * files */
56330 +       int (*set_item_plugin) (coord_t * coord, item_id);
56331 +} node_plugin;
56332 +
56333 +typedef enum {
56334 +       /* standard unified node layout used for both leaf and internal
56335 +          nodes */
56336 +       NODE40_ID,
56337 +       LAST_NODE_ID /* not a layout: terminates the list, so equals the number of ids above */
56338 +} reiser4_node_id;
56339 +
56340 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56341 +#if REISER4_DEBUG
56342 +extern void print_node_content(const char *prefix, const znode * node,
56343 +                              __u32 flags);
56344 +#endif
56345 +
56346 +extern void indent_znode(const znode * node);
56347 +
56348 +typedef struct common_node_header {
56349 +       /*
56350 +        * identifier of node plugin. Must be located at the very beginning of
56351 +        * a node.
56352 +        */
56353 +       __le16 plugin_id;
56354 +} common_node_header;
56355 +
56356 +/* __REISER4_NODE_H__ */
56357 +#endif
56358 +/*
56359 + * Local variables:
56360 + * c-indentation-style: "K&R"
56361 + * mode-name: "LC"
56362 + * c-basic-offset: 8
56363 + * tab-width: 8
56364 + * fill-column: 79
56365 + * scroll-step: 1
56366 + * End:
56367 + */
56368 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/object.c linux-2.6.35/fs/reiser4/plugin/object.c
56369 --- linux-2.6.35.orig/fs/reiser4/plugin/object.c        1970-01-01 01:00:00.000000000 +0100
56370 +++ linux-2.6.35/fs/reiser4/plugin/object.c     2010-08-04 15:44:57.000000000 +0200
56371 @@ -0,0 +1,531 @@
56372 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56373 + * reiser4/README */
56374 +
56375 +/*
56376 + * Examples of object plugins: file, directory, symlink, special file.
56377 + *
56378 + * Plugins associated with inode:
56379 + *
56380 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
56381 + * stat-data. How we store this plugin in in-core inode is not
56382 + * important. Currently pointers are used, another variant is to store offsets
56383 + * and do array lookup on each access.
56384 + *
56385 + * Now, each inode has one selected plugin: object plugin that
56386 + * determines what type of file this object is: directory, regular etc.
56387 + *
56388 + * This main plugin can use other plugins that are thus subordinated to
56389 + * it. Directory instance of object plugin uses hash; regular file
56390 + * instance uses tail policy plugin.
56391 + *
56392 + * Object plugin is either taken from id in stat-data or guessed from
56393 + * i_mode bits. Once it is established we ask it to install its
56394 + * subordinate plugins, by looking again in stat-data or inheriting them
56395 + * from parent.
56396 + *
56397 + * How new inode is initialized during ->read_inode():
56398 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
56399 + *   i_generation, capabilities etc.
56400 + * 2 read plugin id from stat data or try to guess plugin id
56401 + *   from inode->i_mode bits if plugin id is missing.
56402 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56403 + *
56404 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3?  What
56405 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
56406 + *
56407 + * 4 Call ->activate() method of object's plugin. Plugin is either read
56408 + *    from stat-data or guessed from mode bits
56409 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
56410 + *    plugins from parent.
56411 + *
56412 + * Easy induction proves that on last step all plugins of inode would be
56413 + * initialized.
56414 + *
56415 + * When creating new object:
56416 + * 1 obtain object plugin id (see next period)
56417 + * NIKITA-FIXME-HANS: period?
56418 + * 2 ->install() this plugin
56419 + * 3 ->inherit() the rest from the parent
56420 + *
56421 + * We need some examples of creating an object with default and non-default
56422 + * plugin ids.  Nikita, please create them.
56423 + */
56424 +
56425 +#include "../inode.h"
56426 +
56427 +static int _bugop(void) /* stub for method slots that must never be invoked */
56428 +{
56429 +       BUG_ON(1);      /* unconditional: reaching this stub is itself the bug */
56430 +       return 0;
56431 +}
56432 +
56433 +#define bugop ((void *)_bugop) /* cast to void * so it can initialise method pointers of differing signatures */
56434 +
56435 +static int _dummyop(void) /* stub for method slots that need do nothing */
56436 +{
56437 +       return 0;       /* always returns 0 */
56438 +}
56439 +
56440 +#define dummyop ((void *)_dummyop) /* cast to void * so it can initialise method pointers of differing signatures */
56441 +
56442 +static int change_file(struct inode *inode, /* installed as file_plugin_ops.change */
56443 +                      reiser4_plugin * plugin,
56444 +                      pset_member memb)
56445 +{
56446 +       /* cannot change object plugin of already existing object */
56447 +       if (memb == PSET_FILE)
56448 +               return RETERR(-EINVAL);
56449 +
56450 +       /* any other member (e.g. PSET_CREATE): set slot @memb of the inode's pset to @plugin */
56451 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56452 +}
56453 +
56454 +static reiser4_plugin_ops file_plugin_ops = {
56455 +       .change = change_file
56456 +};
56457 +
56458 +static struct inode_operations         null_i_ops = {.create = NULL};
56459 +static struct file_operations          null_f_ops = {.owner = NULL};
56460 +static struct address_space_operations null_a_ops = {.writepage = NULL};
56461 +
56462 +/* VFS methods for regular files */
56463 +static struct inode_operations regular_file_i_ops = {
56464 +       .permission = reiser4_permission_common,
56465 +       .setattr = reiser4_setattr,
56466 +       .getattr = reiser4_getattr_common
56467 +};
56468 +static struct file_operations regular_file_f_ops = {
56469 +       .llseek = generic_file_llseek,
56470 +       .read = reiser4_read_careful,
56471 +       .write = reiser4_write_careful,
56472 +       .aio_read = generic_file_aio_read,
56473 +       .unlocked_ioctl = reiser4_ioctl_careful,
56474 +       .mmap = reiser4_mmap_careful,
56475 +       .open = reiser4_open_careful,
56476 +       .release = reiser4_release_careful,
56477 +       .fsync = reiser4_sync_file_common,
56478 +       .splice_read = generic_file_splice_read,
56479 +       .splice_write = generic_file_splice_write
56480 +};
56481 +static struct address_space_operations regular_file_a_ops = {
56482 +       .writepage = reiser4_writepage,
56483 +       .readpage = reiser4_readpage,
56484 +       .sync_page = block_sync_page,
56485 +       .writepages = reiser4_writepages,
56486 +       .set_page_dirty = reiser4_set_page_dirty,
56487 +       .readpages = reiser4_readpages,
56488 +       .write_begin = reiser4_write_begin_careful,
56489 +       .write_end = reiser4_write_end_careful,
56490 +       .bmap = reiser4_bmap_careful,
56491 +       .invalidatepage = reiser4_invalidatepage,
56492 +       .releasepage = reiser4_releasepage
56493 +};
56494 +
56495 +/* VFS methods for symlink files */
56496 +static struct inode_operations symlink_file_i_ops = {
56497 +       .readlink = generic_readlink,
56498 +       .follow_link = reiser4_follow_link_common,
56499 +       .permission = reiser4_permission_common,
56500 +       .setattr = reiser4_setattr_common,
56501 +       .getattr = reiser4_getattr_common
56502 +};
56503 +
56504 +/* VFS methods for special files */
56505 +static struct inode_operations special_file_i_ops = {
56506 +       .permission = reiser4_permission_common,
56507 +       .setattr = reiser4_setattr_common,
56508 +       .getattr = reiser4_getattr_common
56509 +};
56510 +
56511 +/* VFS methods for directories */
56512 +static struct inode_operations directory_i_ops = {
56513 +       .create = reiser4_create_common,
56514 +       .lookup = reiser4_lookup_common,
56515 +       .link = reiser4_link_common,
56516 +       .unlink = reiser4_unlink_common,
56517 +       .symlink = reiser4_symlink_common,
56518 +       .mkdir = reiser4_mkdir_common,
56519 +       .rmdir = reiser4_unlink_common,
56520 +       .mknod = reiser4_mknod_common,
56521 +       .rename = reiser4_rename_common,
56522 +       .permission = reiser4_permission_common,
56523 +       .setattr = reiser4_setattr_common,
56524 +       .getattr = reiser4_getattr_common
56525 +};
56526 +static struct file_operations directory_f_ops = {
56527 +       .llseek = reiser4_llseek_dir_common,
56528 +       .read = generic_read_dir,
56529 +       .readdir = reiser4_readdir_common,
56530 +       .release = reiser4_release_dir_common,
56531 +       .fsync = reiser4_sync_common
56532 +};
56533 +static struct address_space_operations directory_a_ops = { /* bugop slots hit BUG_ON(1) if called; dummyop returns 0 */
56534 +       .writepage = bugop,
56535 +       .sync_page = bugop,
56536 +       .writepages = dummyop,
56537 +       .set_page_dirty = bugop,
56538 +       .readpages = bugop,
56539 +       .write_begin = bugop,
56540 +       .write_end = bugop,
56541 +       .bmap = bugop,
56542 +       .invalidatepage = bugop,
56543 +       .releasepage = bugop
56544 +};
56545 +
56546 +/*
56547 + * Definitions of object plugins.
56548 + */
56549 +
56550 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56551 +       [UNIX_FILE_PLUGIN_ID] = {
56552 +               .h = {
56553 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56554 +                       .id = UNIX_FILE_PLUGIN_ID,
56555 +                       .groups = (1 << REISER4_REGULAR_FILE),
56556 +                       .pops = &file_plugin_ops,
56557 +                       .label = "reg",
56558 +                       .desc = "regular file",
56559 +                       .linkage = {NULL, NULL},
56560 +               },
56561 +               /*
56562 +                * invariant vfs ops
56563 +                */
56564 +               .inode_ops = &regular_file_i_ops,
56565 +               .file_ops = &regular_file_f_ops,
56566 +               .as_ops = &regular_file_a_ops,
56567 +               /*
56568 +                * private i_ops
56569 +                */
56570 +               .setattr = setattr_unix_file,
56571 +               .open = open_unix_file,
56572 +               .read = read_unix_file,
56573 +               .write = write_unix_file,
56574 +               .ioctl = ioctl_unix_file,
56575 +               .mmap = mmap_unix_file,
56576 +               .release = release_unix_file,
56577 +               /*
56578 +                * private f_ops
56579 +                */
56580 +               .readpage = readpage_unix_file,
56581 +               .readpages = readpages_unix_file,
56582 +               .writepages = writepages_unix_file,
56583 +               .write_begin = write_begin_unix_file,
56584 +               .write_end = write_end_unix_file,
56585 +               /*
56586 +                * private a_ops
56587 +                */
56588 +               .bmap = bmap_unix_file,
56589 +               /*
56590 +                * other private methods
56591 +                */
56592 +               .write_sd_by_inode = write_sd_by_inode_common,
56593 +               .flow_by_inode = flow_by_inode_unix_file,
56594 +               .key_by_inode = key_by_inode_and_offset_common,
56595 +               .set_plug_in_inode = set_plug_in_inode_common,
56596 +               .adjust_to_parent = adjust_to_parent_common,
56597 +               .create_object = reiser4_create_object_common,
56598 +               .delete_object = delete_object_unix_file,
56599 +               .add_link = reiser4_add_link_common,
56600 +               .rem_link = reiser4_rem_link_common,
56601 +               .owns_item = owns_item_unix_file,
56602 +               .can_add_link = can_add_link_common,
56603 +               .detach = dummyop,
56604 +               .bind = dummyop,
56605 +               .safelink = safelink_common,
56606 +               .estimate = {
56607 +                       .create = estimate_create_common,
56608 +                       .update = estimate_update_common,
56609 +                       .unlink = estimate_unlink_common
56610 +               },
56611 +               .init_inode_data = init_inode_data_unix_file,
56612 +               .cut_tree_worker = cut_tree_worker_common,
56613 +               .wire = {
56614 +                       .write = wire_write_common,
56615 +                       .read = wire_read_common,
56616 +                       .get = wire_get_common,
56617 +                       .size = wire_size_common,
56618 +                       .done = wire_done_common
56619 +               }
56620 +       },
56621 +       [DIRECTORY_FILE_PLUGIN_ID] = {
56622 +               .h = {
56623 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56624 +                       .id = DIRECTORY_FILE_PLUGIN_ID,
56625 +                       .groups = (1 << REISER4_DIRECTORY_FILE),
56626 +                       .pops = &file_plugin_ops,
56627 +                       .label = "dir",
56628 +                       .desc = "directory",
56629 +                       .linkage = {NULL, NULL}
56630 +               },
56631 +               .inode_ops = &null_i_ops,
56632 +               .file_ops = &null_f_ops,
56633 +               .as_ops = &null_a_ops,
56634 +
56635 +               .write_sd_by_inode = write_sd_by_inode_common,
56636 +               .flow_by_inode = bugop,
56637 +               .key_by_inode = bugop,
56638 +               .set_plug_in_inode = set_plug_in_inode_common,
56639 +               .adjust_to_parent = adjust_to_parent_common_dir,
56640 +               .create_object = reiser4_create_object_common,
56641 +               .delete_object = reiser4_delete_dir_common,
56642 +               .add_link = reiser4_add_link_common,
56643 +               .rem_link = rem_link_common_dir,
56644 +               .owns_item = owns_item_common_dir,
56645 +               .can_add_link = can_add_link_common,
56646 +               .can_rem_link = can_rem_link_common_dir,
56647 +               .detach = reiser4_detach_common_dir,
56648 +               .bind = reiser4_bind_common_dir,
56649 +               .safelink = safelink_common,
56650 +               .estimate = {
56651 +                       .create = estimate_create_common_dir,
56652 +                       .update = estimate_update_common,
56653 +                       .unlink = estimate_unlink_common_dir
56654 +               },
56655 +               .wire = {
56656 +                       .write = wire_write_common,
56657 +                       .read = wire_read_common,
56658 +                       .get = wire_get_common,
56659 +                       .size = wire_size_common,
56660 +                       .done = wire_done_common
56661 +               },
56662 +               .init_inode_data = init_inode_ordering,
56663 +               .cut_tree_worker = cut_tree_worker_common,
56664 +       },
56665 +       [SYMLINK_FILE_PLUGIN_ID] = {
56666 +               .h = {
56667 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56668 +                       .id = SYMLINK_FILE_PLUGIN_ID,
56669 +                       .groups = (1 << REISER4_SYMLINK_FILE),
56670 +                       .pops = &file_plugin_ops,
56671 +                       .label = "symlink",
56672 +                       .desc = "symbolic link",
56673 +                       .linkage = {NULL,NULL}
56674 +               },
56675 +               .inode_ops = &symlink_file_i_ops,
56676 +               /* inode->i_fop of symlink is initialized
56677 +                  by NULL in setup_inode_ops */
56678 +               .file_ops = &null_f_ops,
56679 +               .as_ops = &null_a_ops,
56680 +
56681 +               .write_sd_by_inode = write_sd_by_inode_common,
56682 +               .set_plug_in_inode = set_plug_in_inode_common,
56683 +               .adjust_to_parent = adjust_to_parent_common,
56684 +               .create_object = reiser4_create_symlink,
56685 +               .delete_object = reiser4_delete_object_common,
56686 +               .add_link = reiser4_add_link_common,
56687 +               .rem_link = reiser4_rem_link_common,
56688 +               .can_add_link = can_add_link_common,
56689 +               .detach = dummyop,
56690 +               .bind = dummyop,
56691 +               .safelink = safelink_common,
56692 +               .estimate = {
56693 +                       .create = estimate_create_common,
56694 +                       .update = estimate_update_common,
56695 +                       .unlink = estimate_unlink_common
56696 +               },
56697 +               .init_inode_data = init_inode_ordering,
56698 +               .cut_tree_worker = cut_tree_worker_common,
56699 +               .destroy_inode = destroy_inode_symlink,
56700 +               .wire = {
56701 +                       .write = wire_write_common,
56702 +                       .read = wire_read_common,
56703 +                       .get = wire_get_common,
56704 +                       .size = wire_size_common,
56705 +                       .done = wire_done_common
56706 +               }
56707 +       },
56708 +       [SPECIAL_FILE_PLUGIN_ID] = {
56709 +               .h = {
56710 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56711 +                       .id = SPECIAL_FILE_PLUGIN_ID,
56712 +                       .groups = (1 << REISER4_SPECIAL_FILE),
56713 +                       .pops = &file_plugin_ops,
56714 +                       .label = "special",
56715 +                       .desc =
56716 +                       "special: fifo, device or socket",
56717 +                       .linkage = {NULL, NULL}
56718 +               },
56719 +               .inode_ops = &special_file_i_ops,
56720 +               /* file_ops of special files (sockets, block, char, fifo) are
56721 +                  initialized by init_special_inode. */
56722 +               .file_ops = &null_f_ops,
56723 +               .as_ops = &null_a_ops,
56724 +
56725 +               .write_sd_by_inode = write_sd_by_inode_common,
56726 +               .set_plug_in_inode = set_plug_in_inode_common,
56727 +               .adjust_to_parent = adjust_to_parent_common,
56728 +               .create_object = reiser4_create_object_common,
56729 +               .delete_object = reiser4_delete_object_common,
56730 +               .add_link = reiser4_add_link_common,
56731 +               .rem_link = reiser4_rem_link_common,
56732 +               .owns_item = owns_item_common,
56733 +               .can_add_link = can_add_link_common,
56734 +               .detach = dummyop,
56735 +               .bind = dummyop,
56736 +               .safelink = safelink_common,
56737 +               .estimate = {
56738 +                       .create = estimate_create_common,
56739 +                       .update = estimate_update_common,
56740 +                       .unlink = estimate_unlink_common
56741 +               },
56742 +               .init_inode_data = init_inode_ordering,
56743 +               .cut_tree_worker = cut_tree_worker_common,
56744 +               .wire = {
56745 +                       .write = wire_write_common,
56746 +                       .read = wire_read_common,
56747 +                       .get = wire_get_common,
56748 +                       .size = wire_size_common,
56749 +                       .done = wire_done_common
56750 +               }
56751 +       },
56752 +       [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56753 +               .h = {
56754 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56755 +                       .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56756 +                       .groups = (1 << REISER4_REGULAR_FILE),
56757 +                       .pops = &file_plugin_ops,
56758 +                       .label = "cryptcompress",
56759 +                       .desc = "cryptcompress file",
56760 +                       .linkage = {NULL, NULL}
56761 +               },
56762 +               .inode_ops = &regular_file_i_ops,
56763 +               .file_ops = &regular_file_f_ops,
56764 +               .as_ops = &regular_file_a_ops,
56765 +
56766 +               .setattr = setattr_cryptcompress,
56767 +               .open = open_cryptcompress,
56768 +               .read = read_cryptcompress,
56769 +               .write = write_cryptcompress,
56770 +               .ioctl = ioctl_cryptcompress,
56771 +               .mmap = mmap_cryptcompress,
56772 +               .release = release_cryptcompress,
56773 +
56774 +               .readpage = readpage_cryptcompress,
56775 +               .readpages = readpages_cryptcompress,
56776 +               .writepages = writepages_cryptcompress,
56777 +               .write_begin = write_begin_cryptcompress,
56778 +               .write_end = write_end_cryptcompress,
56779 +
56780 +               .bmap = bmap_cryptcompress,
56781 +
56782 +               .write_sd_by_inode = write_sd_by_inode_common,
56783 +               .flow_by_inode = flow_by_inode_cryptcompress,
56784 +               .key_by_inode = key_by_inode_cryptcompress,
56785 +               .set_plug_in_inode = set_plug_in_inode_common,
56786 +               .adjust_to_parent = adjust_to_parent_cryptcompress,
56787 +               .create_object = create_object_cryptcompress,
56788 +               .delete_object = delete_object_cryptcompress,
56789 +               .add_link = reiser4_add_link_common,
56790 +               .rem_link = reiser4_rem_link_common,
56791 +               .owns_item = owns_item_common,
56792 +               .can_add_link = can_add_link_common,
56793 +               .detach = dummyop,
56794 +               .bind = dummyop,
56795 +               .safelink = safelink_common,
56796 +               .estimate = {
56797 +                       .create = estimate_create_common,
56798 +                       .update = estimate_update_common,
56799 +                       .unlink = estimate_unlink_common
56800 +               },
56801 +               .init_inode_data = init_inode_data_cryptcompress,
56802 +               .cut_tree_worker = cut_tree_worker_cryptcompress,
56803 +               .destroy_inode = destroy_inode_cryptcompress,
56804 +               .wire = {
56805 +                       .write = wire_write_common,
56806 +                       .read = wire_read_common,
56807 +                       .get = wire_get_common,
56808 +                       .size = wire_size_common,
56809 +                       .done = wire_done_common
56810 +               }
56811 +       }
56812 +};
56813 +
56814 +static int change_dir(struct inode *inode,
56815 +                     reiser4_plugin * plugin,
56816 +                     pset_member memb)
56817 +{
56818 +       /* refuse unconditionally: dir plugin of existing object is fixed */
56819 +       return RETERR(-EINVAL);
56820 +}
56821 +
56822 +static reiser4_plugin_ops dir_plugin_ops = {
56823 +       .change = change_dir /* the only method set; it always fails */
56824 +};
56825 +
56826 +/*
56827 + * definition of directory plugins
56828 + */
56829 +
56830 +dir_plugin dir_plugins[LAST_DIR_ID] = {
56831 +       /* standard hashed directory plugin (keys: build_entry_key_hashed) */
56832 +       [HASHED_DIR_PLUGIN_ID] = {
56833 +               .h = {
56834 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
56835 +                       .id = HASHED_DIR_PLUGIN_ID,
56836 +                       .pops = &dir_plugin_ops,
56837 +                       .label = "dir",
56838 +                       .desc = "hashed directory",
56839 +                       .linkage = {NULL, NULL}
56840 +               },
56841 +               .inode_ops = &directory_i_ops,
56842 +               .file_ops = &directory_f_ops,
56843 +               .as_ops = &directory_a_ops,
56844 +
56845 +               .get_parent = get_parent_common,
56846 +               .is_name_acceptable = is_name_acceptable_common,
56847 +               .build_entry_key = build_entry_key_hashed,
56848 +               .build_readdir_key = build_readdir_key_common,
56849 +               .add_entry = reiser4_add_entry_common,
56850 +               .rem_entry = reiser4_rem_entry_common,
56851 +               .init = reiser4_dir_init_common,
56852 +               .done = reiser4_dir_done_common,
56853 +               .attach = reiser4_attach_common,
56854 +               .detach = reiser4_detach_common,
56855 +               .estimate = {
56856 +                       .add_entry = estimate_add_entry_common,
56857 +                       .rem_entry = estimate_rem_entry_common,
56858 +                       .unlink = dir_estimate_unlink_common
56859 +               }
56860 +       },
56861 +       /* hashed directory for which seekdir/telldir are guaranteed to work;
56862 +        * differs from the entry above only in .h and .build_entry_key */
56863 +       [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
56864 +               .h = {
56865 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
56866 +                       .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
56867 +                       .pops = &dir_plugin_ops,
56868 +                       .label = "dir32",
56869 +                       .desc = "directory hashed with 31 bit hash",
56870 +                       .linkage = {NULL, NULL}
56871 +               },
56872 +               .inode_ops = &directory_i_ops,
56873 +               .file_ops = &directory_f_ops,
56874 +               .as_ops = &directory_a_ops,
56875 +
56876 +               .get_parent = get_parent_common,
56877 +               .is_name_acceptable = is_name_acceptable_common,
56878 +               .build_entry_key = build_entry_key_seekable,
56879 +               .build_readdir_key = build_readdir_key_common,
56880 +               .add_entry = reiser4_add_entry_common,
56881 +               .rem_entry = reiser4_rem_entry_common,
56882 +               .init = reiser4_dir_init_common,
56883 +               .done = reiser4_dir_done_common,
56884 +               .attach = reiser4_attach_common,
56885 +               .detach = reiser4_detach_common,
56886 +               .estimate = {
56887 +                       .add_entry = estimate_add_entry_common,
56888 +                       .rem_entry = estimate_rem_entry_common,
56889 +                       .unlink = dir_estimate_unlink_common
56890 +               }
56891 +       }
56892 +};
56893 +
56894 +/* Make Linus happy.
56895 +   Local variables:
56896 +   c-indentation-style: "K&R"
56897 +   mode-name: "LC"
56898 +   c-basic-offset: 8
56899 +   tab-width: 8
56900 +   fill-column: 120
56901 +   End:
56902 +*/
56903 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/object.h linux-2.6.35/fs/reiser4/plugin/object.h
56904 --- linux-2.6.35.orig/fs/reiser4/plugin/object.h        1970-01-01 01:00:00.000000000 +0100
56905 +++ linux-2.6.35/fs/reiser4/plugin/object.h     2010-08-04 18:11:54.000000000 +0200
56906 @@ -0,0 +1,116 @@
56907 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
56908 + * reiser4/README */
56909 +
56910 +/* Declaration of object plugin functions. */
56911 +
56912 +#if !defined(__FS_REISER4_PLUGIN_OBJECT_H__)
56913 +#define __FS_REISER4_PLUGIN_OBJECT_H__
56914 +
56915 +#include "../type_safe_hash.h"
56916 +
56917 +/* common implementations of inode operations */
56918 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
56919 +                         int mode, struct nameidata *);
56920 +struct dentry *reiser4_lookup_common(struct inode *parent,
56921 +                                     struct dentry *dentry,
56922 +                                     struct nameidata *nameidata);
56923 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
56924 +                       struct dentry *newname);
56925 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
56926 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
56927 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
56928 +                  const char *linkname);
56929 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
56930 +                int mode, dev_t rdev);
56931 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
56932 +                         struct inode *new_dir, struct dentry *new_name);
56933 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
56934 +int reiser4_permission_common(struct inode *, int mask);
56935 +int reiser4_setattr_common(struct dentry *, struct iattr *);
56936 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
56937 +                          struct kstat *);
56938 +
56939 +/* common implementations of file operations */
56940 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
56941 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
56942 +int reiser4_release_dir_common(struct inode *, struct file *);
56943 +int reiser4_sync_common(struct file *, int datasync);
56944 +
56945 +/* file plugin operations: common implementations */
56946 +int write_sd_by_inode_common(struct inode *);
56947 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
56948 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
56949 +                            reiser4_object_create_data *);
56950 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
56951 +                           struct inode *root);
56952 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
56953 +                               struct inode *root);
56954 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
56955 +                                  struct inode *root);
56956 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
56957 +                                reiser4_object_create_data *);
56958 +int reiser4_delete_object_common(struct inode *);
56959 +int reiser4_delete_dir_common(struct inode *);
56960 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
56961 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
56962 +int rem_link_common_dir(struct inode *object, struct inode *parent);
56963 +int owns_item_common(const struct inode *, const coord_t *);
56964 +int owns_item_common_dir(const struct inode *, const coord_t *);
56965 +int can_add_link_common(const struct inode *);
56966 +int can_rem_link_common_dir(const struct inode *);
56967 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
56968 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
56969 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
56970 +reiser4_block_nr estimate_create_common(const struct inode *);
56971 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
56972 +reiser4_block_nr estimate_update_common(const struct inode *);
56973 +reiser4_block_nr estimate_unlink_common(const struct inode *,
56974 +                                       const struct inode *);
56975 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
56976 +                                           const struct inode *);
56977 +char *wire_write_common(struct inode *, char *start);
56978 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
56979 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
56980 +int wire_size_common(struct inode *);
56981 +void wire_done_common(reiser4_object_on_wire *);
56982 +
56983 +/* dir plugin operations: common implementations */
56984 +struct dentry *get_parent_common(struct inode *child);
56985 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
56986 +void build_entry_key_common(const struct inode *,
56987 +                           const struct qstr *qname, reiser4_key *);
56988 +int build_readdir_key_common(struct file *dir, reiser4_key *);
56989 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
56990 +                    reiser4_object_create_data * , reiser4_dir_entry_desc *);
56991 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
56992 +                    reiser4_dir_entry_desc *);
56993 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
56994 +                           reiser4_object_create_data *);
56995 +int reiser4_dir_done_common(struct inode *);
56996 +int reiser4_attach_common(struct inode *child, struct inode *parent);
56997 +int reiser4_detach_common(struct inode *object, struct inode *parent);
56998 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
56999 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
57000 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
57001 +                                           const struct inode *);
57002 +
57003 +/* these are essential parts of common implementations, they are to make
57004 +   customized implementations easier */
57005 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
57006 +
57007 +/* merely useful functions */
57008 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle * ,
57009 +             const reiser4_key * , int silent);
57010 +
57011 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
57012 +#endif
57013 +
57014 +/* Make Linus happy.
57015 +   Local variables:
57016 +   c-indentation-style: "K&R"
57017 +   mode-name: "LC"
57018 +   c-basic-offset: 8
57019 +   tab-width: 8
57020 +   fill-column: 120
57021 +   End:
57022 +*/
57023 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/plugin.c linux-2.6.35/fs/reiser4/plugin/plugin.c
57024 --- linux-2.6.35.orig/fs/reiser4/plugin/plugin.c        1970-01-01 01:00:00.000000000 +0100
57025 +++ linux-2.6.35/fs/reiser4/plugin/plugin.c     2010-08-04 15:44:57.000000000 +0200
57026 @@ -0,0 +1,560 @@
57027 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
57028 + * reiser4/README */
57029 +
57030 +/* Basic plugin infrastructure, lookup etc. */
57031 +
57032 +/* PLUGINS:
57033 +
57034 +   Plugins are internal Reiser4 "modules" or "objects" used to increase
57035 +   extensibility and allow external users to easily adapt reiser4 to
57036 +   their needs.
57037 +
57038 +   Plugins are classified into several disjoint "types". Plugins
57039 +   belonging to the particular plugin type are termed "instances" of
57040 +   this type. Existing types are listed by enum reiser4_plugin_type
57041 +   (see plugin/plugin_header.h)
57042 +
57043 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
57044 +
57045 +   Object (file) plugin determines how given file-system object serves
57046 +   standard VFS requests for read, write, seek, mmap etc. Instances of
57047 +   file plugins are: regular file, directory, symlink. Another example
57048 +   of file plugin is audit plugin, that optionally records accesses to
57049 +   underlying object and forwards requests to it.
57050 +
57051 +   Hash plugins compute hashes used by reiser4 to store and locate
57052 +   files within directories. Instances of hash plugin type are: r5,
57053 +   tea, rupasov.
57054 +
57055 +   Tail plugins (or, more precisely, tail policy plugins) determine
57056 +   when last part of the file should be stored in a formatted item.
57057 +
57058 +   Scope and lookup:
57059 +
57060 +   Each plugin type and each plugin has a label such that the pair
57061 +   (type_label, plugin_label) is unique. This pair is a globally
57062 +   persistent and user-visible plugin identifier. Internally the kernel
57063 +   keeps plugins and plugin types in arrays and uses indices into those
57064 +   arrays as plugin and plugin type identifiers. The file system, in
57065 +   turn, maintains a persistent "dictionary" mapping each plugin label
57066 +   to the numerical identifier stored in file-system objects. That is,
57067 +   the offset into the plugin array of that plugin type is stored as
57068 +   the plugin id in the stat data of the file-system object.
57069 +
57070 +   Internal kernel plugin type identifier (index in plugins[] array) is
57071 +   of type reiser4_plugin_type. Set of available plugin types is
57072 +   currently static, but dynamic loading doesn't seem to pose
57073 +   insurmountable problems.
57074 +
57075 +   Within each type plugins are addressed by the identifiers of type
57076 +   reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
57077 +   Such identifiers are only required to be unique within one type,
57078 +   not globally.
57079 +
57080 +   Thus, plugin in memory is uniquely identified by the pair (type_id,
57081 +   id).
57082 +
57083 +   Usage:
57084 +
57085 +   There exists only one instance of each plugin, but this
57086 +   single instance can be associated with many entities (file-system
57087 +   objects, items, nodes, transactions, file-descriptors etc.). Entity
57088 +   to which plugin of given type is attached is termed (due to the lack
57089 +   of imagination) "subject" of this plugin type and, by abuse of
57090 +   terminology, subject of particular instance of this type to which
57091 +   it's attached currently. For example, inode is subject of object
57092 +   plugin type. Inode representing directory is subject of directory
57093 +   plugin, hash plugin type and some particular instance of hash plugin
57094 +   type. Inode, representing regular file is subject of "regular file"
57095 +   plugin, tail-policy plugin type etc.
57096 +
57097 +   With each subject the plugin possibly stores some state. For example,
57098 +   the state of a directory plugin (instance of object plugin type) is pointer
57099 +   to hash plugin (if directories always use hashing that is).
57100 +
57101 +   Interface:
57102 +
57103 +   In addition to a scalar identifier, each plugin type and plugin
57104 +   proper has a "label": short string and a "description"---longer
57105 +   descriptive string. Labels and descriptions of plugin types are
57106 +   hard-coded into plugins[] array, declared and defined in
57107 +   plugin.c. Label and description of plugin are stored in .label and
57108 +   .desc fields of reiser4_plugin_header respectively. It's possible to
57109 +   locate plugin by the pair of labels.
57110 +
57111 +   Features (not implemented):
57112 +
57113 +    . user-level plugin manipulations:
57114 +      + reiser4("filename/..file_plugin<='audit'");
57115 +      + write(open("filename/..file_plugin"), "audit", 8);
57116 +
57117 +    . user level utilities lsplug and chplug to manipulate plugins.
57118 +      Utilities are not of primary priority. Possibly they will not be
57119 +      working in v4.0
57120 +
57121 +   NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
57122 +   option, do you agree?  I don't think that specifying it at mount time,
57123 +   and then changing it with each mount, is a good model for usage.
57124 +
57125 +    . mount option "plug" to set-up plugins of root-directory.
57126 +      "plug=foo:bar" will set "bar" as default plugin of type "foo".
57127 +
57128 +   Limitations:
57129 +
57130 +    . each plugin type has to provide at least one builtin
57131 +      plugin. This is a technical limitation and it can be lifted in the
57132 +      future.
57133 +
57134 +   TODO:
57135 +
57136 +   New plugin types/plugins:
57137 +   Things we should be able to separately choose to inherit:
57138 +
57139 +   security plugins
57140 +
57141 +   stat data
57142 +
57143 +   file bodies
57144 +
57145 +   file plugins
57146 +
57147 +   dir plugins
57148 +
57149 +    . perm:acl
57150 +
57151 +    . audi---audit plugin intercepting and possibly logging all
57152 +      accesses to object. Requires to put stub functions in file_operations
57153 +      instead of generic_file_*.
57154 +
57155 +NIKITA-FIXME-HANS: why make overflows a plugin?
57156 +    . over---handle hash overflows
57157 +
57158 +    . sqnt---handle different access patterns and instruments read-ahead
57159 +
57160 +NIKITA-FIXME-HANS: describe the line below in more detail.
57161 +
57162 +    . hier---handle inheritance of plugins along file-system hierarchy
57163 +
57164 +   Different kinds of inheritance: on creation vs. on access.
57165 +   Compatible/incompatible plugins.
57166 +   Inheritance for multi-linked files.
57167 +   Layered plugins.
57168 +   Notion of plugin context is abandoned.
57169 +
57170 +Each file is associated
57171 +   with one plugin and dependent plugins (hash, etc.) are stored as
57172 +   main plugin state. Now, if we have plugins used for regular files
57173 +   but not for directories, how such plugins would be inherited?
57174 +    . always store them with directories also
57175 +
57176 +NIKITA-FIXME-HANS: Do the line above.  It is not exclusive of doing
57177 +the line below which is also useful.
57178 +
57179 +    . use inheritance hierarchy, independent of file-system namespace
57180 +*/
57181 +
57182 +#include "../debug.h"
57183 +#include "../dformat.h"
57184 +#include "plugin_header.h"
57185 +#include "item/static_stat.h"
57186 +#include "node/node.h"
57187 +#include "security/perm.h"
57188 +#include "space/space_allocator.h"
57189 +#include "disk_format/disk_format.h"
57190 +#include "plugin.h"
57191 +#include "../reiser4.h"
57192 +#include "../jnode.h"
57193 +#include "../inode.h"
57194 +
57195 +#include <linux/fs.h>          /* for struct super_block  */
57196 +
57197 +/*
57198 + * init_plugins - initialize plugin sub-system.
57199 + * Just call this once on reiser4 startup.
57200 + *
57201 + * Walks plugins[] and, for every slot with a non-NULL label, sets h.id to the
57202 + * slot index, calls the optional ->init() method and links the plugin into the
57203 + * per-type plugins_list. Returns 0, or the first non-zero ->init() result.
57204 + */
57205 +int init_plugins(void)
57206 +{
57207 +       reiser4_plugin_type type_id;
57208 +
57209 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
57210 +               struct reiser4_plugin_type_data *ptype;
57211 +               int i;
57212 +
57213 +               ptype = &plugins[type_id];
57214 +               assert("nikita-3508", ptype->label != NULL);
57215 +               assert("nikita-3509", ptype->type_id == type_id);
57216 +
57217 +               INIT_LIST_HEAD(&ptype->plugins_list);
57218 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term
57219 + * builtin. */
57220 +               for (i = 0; i < ptype->builtin_num; ++i) {
57221 +                       reiser4_plugin *plugin;
57222 +
57223 +                       plugin = plugin_at(ptype, i);
57224 +
57225 +                       if (plugin->h.label == NULL)
57226 +                               /* uninitialized slot: not linked into list */
57227 +                               continue;
57228 +                       assert("nikita-3445", plugin->h.type_id == type_id);
57229 +                       plugin->h.id = i;
57230 +                       if (plugin->h.pops != NULL &&
57231 +                           plugin->h.pops->init != NULL) {
57232 +                               int result;
57233 +
57234 +                               result = plugin->h.pops->init(plugin);
57235 +                               if (result != 0)
57236 +                                       return result;
57237 +                       }
57238 +                       INIT_LIST_HEAD(&plugin->h.linkage);
57239 +                       list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
57240 +               }
57241 +       }
57242 +       return 0;
57243 +}
57244 +
57245 +/* true if @type is a valid plugin type id, i.e. below REISER4_PLUGIN_TYPES */
57246 +int is_plugin_type_valid(reiser4_plugin_type type)
57247 +{
57248 +       /* "type" is unsigned, so no comparison with 0 is
57249 +          necessary */
57250 +       return (type < REISER4_PLUGIN_TYPES);
57251 +}
57252 +
57253 +/* true if @id is below builtin_num of @type; @type itself is only asserted */
57254 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
57255 +{
57256 +       assert("nikita-1653", is_plugin_type_valid(type));
57257 +       return id < plugins[type].builtin_num;
57258 +}
57259 +
57260 +/* return plugin by its @type and @id, or NULL (after a warning) if either
57261 +   of them is out of range.
57262 +   Both arguments are checked for validity: this is supposed to be called
57263 +   from user-level.
57264 +
57265 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57266 +user space, and passed to the filesystem by use of method files? Your
57267 +comment really confused me on the first reading....
57268 +NOTE(review): a builtin slot with NULL label passes the id check - confirm.
57269 +*/
57270 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57271 +                                                                * unchecked */,
57272 +                                   reiser4_plugin_id id        /* plugin id,
57273 +                                                                * unchecked */)
57274 +{
57275 +       if (is_plugin_type_valid(type)) {
57276 +               if (is_plugin_id_valid(type, id))
57277 +                       return plugin_at(&plugins[type], id);
57278 +               else
57279 +                       /* id out of bounds */
57280 +                       warning("nikita-2913",
57281 +                               "Invalid plugin id: [%i:%i]", type, id);
57282 +       } else
57283 +               /* type_id out of bounds */
57284 +               warning("nikita-2914", "Invalid type_id: %i", type);
57285 +       return NULL;
57286 +}
57287 +
57288 +/**
57289 + * save_plugin_id - store plugin id in disk format
57290 + * @plugin: plugin to convert
57291 + * @area: where to store result
57292 + *
57293 + * Puts 16-bit little endian id of @plugin at @area; always returns 0.
57294 + */
57295 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57296 +                  d16 * area/* where to store result */)
57297 +{
57298 +       assert("nikita-1261", plugin != NULL);
57299 +       assert("nikita-1262", area != NULL);
57300 +
57301 +       put_unaligned(cpu_to_le16(plugin->h.id), area);
57302 +       return 0;
57303 +}
57304 +
57305 +/* head of the list of all plugins of @type; validity of @type is only asserted */
57306 +struct list_head *get_plugin_list(reiser4_plugin_type type)
57307 +{
57308 +       assert("nikita-1056", is_plugin_type_valid(type));
57309 +       return &plugins[type].plugins_list;
57310 +}
57311 +
57312 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
57313 +{
57314 +       struct dentry *rootdir;
57315 +       reiser4_inode *root;
57316 +
57317 +       assert("edward-1443", memb != PSET_FILE);
57318 +
57319 +       rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57320 +       if (rootdir != NULL) { /* no root dentry: mask is left untouched */
57321 +               root = reiser4_inode_data(rootdir->d_inode);
57322 +               /*
57323 +                * set bit @memb in plugin_mask if this member differs from
57324 +                * the one of root directory, or @info is root; else clear it
57325 +                */
57326 +               if (aset_get(info->pset, memb) !=
57327 +                   aset_get(root->pset, memb) ||
57328 +                   info == root)
57329 +                       info->plugin_mask |= (1 << memb);
57330 +               else
57331 +                       info->plugin_mask &= ~(1 << memb);
57332 +       }
57333 +}
57334 +
57335 +/* Install plugin set member @memb into the pset of @self unless it is already
57336 +   set.  The plugin comes from @ancestor (its hset first, then its pset) or, if
57337 +   @ancestor is NULL, from get_default_plugin().  Returns 0 if already set, else set_plugin()'s result. */
57338 +int grab_plugin_pset(struct inode *self,
57339 +                    struct inode *ancestor,
57340 +                    pset_member memb)
57341 +{
57342 +       reiser4_plugin *plug;
57343 +       reiser4_inode *info;
57344 +       int result = 0;
57345 +
57346 +       /* Do not grab if initialised already. */
57347 +       info = reiser4_inode_data(self);
57348 +       if (aset_get(info->pset, memb) != NULL)
57349 +               return 0;
57350 +       if (ancestor) {
57351 +               reiser4_inode *parent;
57352 +
57353 +               parent = reiser4_inode_data(ancestor);
57354 +               plug = aset_get(parent->hset, memb) ? :
57355 +                       aset_get(parent->pset, memb); /* GNU "?:": hset entry if non-NULL, else pset entry */
57356 +       } else
57357 +               plug = get_default_plugin(memb);
57358 +       /* NOTE(review): s_root is dereferenced unchecked below when @ancestor is set - confirm it cannot be NULL here */
57359 +       result = set_plugin(&info->pset, memb, plug);
57360 +       if (result == 0) { /* mask update is skipped only when @ancestor is given and @self is the root inode */
57361 +               if (!ancestor || self->i_sb->s_root->d_inode != self)
57362 +                       update_pset_mask(info, memb);
57363 +       }
57364 +       return result;
57365 +}
57366 +
57367 +/* Take missing pset members from root inode */
57368 +int finish_pset(struct inode *inode)
57369 +{
57370 +       reiser4_plugin *plug;
57371 +       reiser4_inode *root;
57372 +       reiser4_inode *info;
57373 +       pset_member memb;
57374 +       int result = 0;
57375 +
57376 +       root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57377 +       info = reiser4_inode_data(inode);
57378 +
57379 +       assert("edward-1455", root != NULL);
57380 +       assert("edward-1456", info != NULL);
57381 +
57382 +       /* file and directory plugins are already initialized. */
57383 +       for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57384 +
57385 +               /* Do not grab if initialised already. */
57386 +               if (aset_get(info->pset, memb) != NULL)
57387 +                       continue;
57388 +
57389 +               plug = aset_get(root->pset, memb);
57390 +               result = set_plugin(&info->pset, memb, plug);
57391 +               if (result != 0)
57392 +                       break;
57393 +       }
57394 +       if (result != 0) {
57395 +               warning("nikita-3447",
57396 +                       "Cannot set up plugins for %lli",
57397 +                       (unsigned long long)
57398 +                       get_inode_oid(inode));
57399 +       }
57400 +       return result;
57401 +}
57402 +
57403 +int force_plugin_pset(struct inode *self, pset_member memb,
57404 +                     reiser4_plugin * plug)
57405 +{
57406 +       reiser4_inode *info;
57407 +       int result = 0;
57408 +       /* Set pset member @memb of non-root inode @self to @plug and refresh plugin_mask. */
57409 +       if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57410 +               /* No root dentry, or changing pset in the root object: refused. */
57411 +               return RETERR(-EINVAL);
57412 +       }
57413 +       /* Prefer the plugin's own ->change() hook; otherwise store @plug in the pset directly. */
57414 +       info = reiser4_inode_data(self);
57415 +       if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57416 +               result = plug->h.pops->change(self, plug, memb);
57417 +       else
57418 +               result = aset_set_unsafe(&info->pset, memb, plug);
57419 +       if (result == 0) {
57420 +               __u16 oldmask = info->plugin_mask;
57421 +
57422 +               update_pset_mask(info, memb);
57423 +               if (oldmask != info->plugin_mask) /* mask changed; presumably the stat-data length must be recomputed - confirm */
57424 +                       reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57425 +       }
57426 +       return result;
57427 +}
57428 +
57429 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57430 +       /* C99 designated initializers: one entry per plugin type, placed at the index given by its type id */
57431 +       [REISER4_FILE_PLUGIN_TYPE] = {
57432 +               .type_id = REISER4_FILE_PLUGIN_TYPE,
57433 +               .label = "file",
57434 +               .desc = "Object plugins",
57435 +               .builtin_num = sizeof_array(file_plugins),
57436 +               .builtin = file_plugins,
57437 +               .plugins_list = {NULL, NULL},
57438 +               .size = sizeof(file_plugin)
57439 +       },
57440 +       [REISER4_DIR_PLUGIN_TYPE] = {
57441 +               .type_id = REISER4_DIR_PLUGIN_TYPE,
57442 +               .label = "dir",
57443 +               .desc = "Directory plugins",
57444 +               .builtin_num = sizeof_array(dir_plugins),
57445 +               .builtin = dir_plugins,
57446 +               .plugins_list = {NULL, NULL},
57447 +               .size = sizeof(dir_plugin)
57448 +       },
57449 +       [REISER4_HASH_PLUGIN_TYPE] = {
57450 +               .type_id = REISER4_HASH_PLUGIN_TYPE,
57451 +               .label = "hash",
57452 +               .desc = "Directory hashes",
57453 +               .builtin_num = sizeof_array(hash_plugins),
57454 +               .builtin = hash_plugins,
57455 +               .plugins_list = {NULL, NULL},
57456 +               .size = sizeof(hash_plugin)
57457 +       },
57458 +       [REISER4_FIBRATION_PLUGIN_TYPE] = {
57459 +               .type_id =
57460 +               REISER4_FIBRATION_PLUGIN_TYPE,
57461 +               .label = "fibration",
57462 +               .desc = "Directory fibrations",
57463 +               .builtin_num = sizeof_array(fibration_plugins),
57464 +               .builtin = fibration_plugins,
57465 +               .plugins_list = {NULL, NULL},
57466 +               .size = sizeof(fibration_plugin)
57467 +       },
57468 +       [REISER4_CIPHER_PLUGIN_TYPE] = {
57469 +               .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57470 +               .label = "cipher",
57471 +               .desc = "Cipher plugins",
57472 +               .builtin_num = sizeof_array(cipher_plugins),
57473 +               .builtin = cipher_plugins,
57474 +               .plugins_list = {NULL, NULL},
57475 +               .size = sizeof(cipher_plugin)
57476 +       },
57477 +       [REISER4_DIGEST_PLUGIN_TYPE] = {
57478 +               .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57479 +               .label = "digest",
57480 +               .desc = "Digest plugins",
57481 +               .builtin_num = sizeof_array(digest_plugins),
57482 +               .builtin = digest_plugins,
57483 +               .plugins_list = {NULL, NULL},
57484 +               .size = sizeof(digest_plugin)
57485 +       },
57486 +       [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57487 +               .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57488 +               .label = "compression",
57489 +               .desc = "Compression plugins",
57490 +               .builtin_num = sizeof_array(compression_plugins),
57491 +               .builtin = compression_plugins,
57492 +               .plugins_list = {NULL, NULL},
57493 +               .size = sizeof(compression_plugin)
57494 +       },
57495 +       [REISER4_FORMATTING_PLUGIN_TYPE] = {
57496 +               .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57497 +               .label = "formatting",
57498 +               .desc = "Tail inlining policies",
57499 +               .builtin_num = sizeof_array(formatting_plugins),
57500 +               .builtin = formatting_plugins,
57501 +               .plugins_list = {NULL, NULL},
57502 +               .size = sizeof(formatting_plugin)
57503 +       },
57504 +       [REISER4_PERM_PLUGIN_TYPE] = {
57505 +               .type_id = REISER4_PERM_PLUGIN_TYPE,
57506 +               .label = "perm",
57507 +               .desc = "Permission checks",
57508 +               .builtin_num = sizeof_array(perm_plugins),
57509 +               .builtin = perm_plugins,
57510 +               .plugins_list = {NULL, NULL},
57511 +               .size = sizeof(perm_plugin)
57512 +       },
57513 +       [REISER4_ITEM_PLUGIN_TYPE] = {
57514 +               .type_id = REISER4_ITEM_PLUGIN_TYPE,
57515 +               .label = "item",
57516 +               .desc = "Item handlers",
57517 +               .builtin_num = sizeof_array(item_plugins),
57518 +               .builtin = item_plugins,
57519 +               .plugins_list = {NULL, NULL},
57520 +               .size = sizeof(item_plugin)
57521 +       },
57522 +       [REISER4_NODE_PLUGIN_TYPE] = {
57523 +               .type_id = REISER4_NODE_PLUGIN_TYPE,
57524 +               .label = "node",
57525 +               .desc = "node layout handlers",
57526 +               .builtin_num = sizeof_array(node_plugins),
57527 +               .builtin = node_plugins,
57528 +               .plugins_list = {NULL, NULL},
57529 +               .size = sizeof(node_plugin)
57530 +       },
57531 +       [REISER4_SD_EXT_PLUGIN_TYPE] = {
57532 +               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57533 +               .label = "sd_ext",
57534 +               .desc = "Parts of stat-data",
57535 +               .builtin_num = sizeof_array(sd_ext_plugins),
57536 +               .builtin = sd_ext_plugins,
57537 +               .plugins_list = {NULL, NULL},
57538 +               .size = sizeof(sd_ext_plugin)
57539 +       },
57540 +       [REISER4_FORMAT_PLUGIN_TYPE] = {
57541 +               .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57542 +               .label = "disk_layout",
57543 +               .desc = "defines filesystem on disk layout",
57544 +               .builtin_num = sizeof_array(format_plugins),
57545 +               .builtin = format_plugins,
57546 +               .plugins_list = {NULL, NULL},
57547 +               .size = sizeof(disk_format_plugin)
57548 +       },
57549 +       [REISER4_JNODE_PLUGIN_TYPE] = {
57550 +               .type_id = REISER4_JNODE_PLUGIN_TYPE,
57551 +               .label = "jnode",
57552 +               .desc = "defines kind of jnode",
57553 +               .builtin_num = sizeof_array(jnode_plugins),
57554 +               .builtin = jnode_plugins,
57555 +               .plugins_list = {NULL, NULL},
57556 +               .size = sizeof(jnode_plugin)
57557 +       },
57558 +       [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57559 +               .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57560 +               .label = "compression_mode",
57561 +               .desc = "Defines compression mode",
57562 +               .builtin_num = sizeof_array(compression_mode_plugins),
57563 +               .builtin = compression_mode_plugins,
57564 +               .plugins_list = {NULL, NULL},
57565 +               .size = sizeof(compression_mode_plugin)
57566 +       },
57567 +       [REISER4_CLUSTER_PLUGIN_TYPE] = {
57568 +               .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57569 +               .label = "cluster",
57570 +               .desc = "Defines cluster size",
57571 +               .builtin_num = sizeof_array(cluster_plugins),
57572 +               .builtin = cluster_plugins,
57573 +               .plugins_list = {NULL, NULL},
57574 +               .size = sizeof(cluster_plugin)
57575 +       }
57576 +};
57577 +
57578 +/*
57579 + * Local variables:
57580 + * c-indentation-style: "K&R"
57581 + * mode-name: "LC"
57582 + * c-basic-offset: 8
57583 + * tab-width: 8
57584 + * fill-column: 120
57585 + * End:
57586 + */
57587 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/plugin.h linux-2.6.35/fs/reiser4/plugin/plugin.h
57588 --- linux-2.6.35.orig/fs/reiser4/plugin/plugin.h        1970-01-01 01:00:00.000000000 +0100
57589 +++ linux-2.6.35/fs/reiser4/plugin/plugin.h     2010-08-04 15:44:57.000000000 +0200
57590 @@ -0,0 +1,942 @@
57591 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
57592 + * reiser4/README */
57593 +
57594 +/* Basic plugin data-types.
57595 +   see fs/reiser4/plugin/plugin.c for details */
57596 +
57597 +#if !defined(__FS_REISER4_PLUGIN_TYPES_H__)
57598 +#define __FS_REISER4_PLUGIN_TYPES_H__
57599 +
57600 +#include "../forward.h"
57601 +#include "../debug.h"
57602 +#include "../dformat.h"
57603 +#include "../key.h"
57604 +#include "compress/compress.h"
57605 +#include "crypto/cipher.h"
57606 +#include "plugin_header.h"
57607 +#include "item/static_stat.h"
57608 +#include "item/internal.h"
57609 +#include "item/sde.h"
57610 +#include "item/cde.h"
57611 +#include "item/item.h"
57612 +#include "node/node.h"
57613 +#include "node/node40.h"
57614 +#include "security/perm.h"
57615 +#include "fibration.h"
57616 +
57617 +#include "space/bitmap.h"
57618 +#include "space/space_allocator.h"
57619 +
57620 +#include "disk_format/disk_format40.h"
57621 +#include "disk_format/disk_format.h"
57622 +
57623 +#include <linux/fs.h>          /* for struct super_block, address_space  */
57624 +#include <linux/mm.h>          /* for struct page */
57625 +#include <linux/buffer_head.h> /* for struct buffer_head */
57626 +#include <linux/dcache.h>      /* for struct dentry */
57627 +#include <linux/types.h>
57628 +#include <linux/crypto.h>
57629 +
57630 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
57631 +
57632 +/*
57633 + * File plugin.  Defines the set of methods that file plugins implement, some
57634 + * of which are optional.
57635 + *
57636 + * A file plugin offers to the caller an interface for IO (writing to and/or
57637 + * reading from) to what the caller sees as one sequence of bytes.  An IO to it
57638 + * may affect more than one physical sequence of bytes, or no physical sequence
57639 + * of bytes, it may affect sequences of bytes offered by other file plugins to
57640 + * the semantic layer, and the file plugin may invoke other plugins and
57641 + * delegate work to them, but its interface is structured for offering the
57642 + * caller the ability to read and/or write what the caller sees as being a
57643 + * single sequence of bytes.
57644 + *
57645 + * The file plugin must present a sequence of bytes to the caller, but it does
57646 + * not necessarily have to store a sequence of bytes, it does not necessarily
57647 + * have to support efficient tree traversal to any offset in the sequence of
57648 + * bytes (tail and extent items, whose keys contain offsets, do however provide
57649 + * efficient non-sequential lookup of any offset in the sequence of bytes).
57650 + *
57651 + * Directory plugins provide methods for selecting file plugins by resolving a
57652 + * name for them.
57653 + *
57654 + * The functionality other filesystems call an attribute, and rigidly tie
57655 + * together, we decompose into orthogonal selectable features of files.  Using
57656 + * the terminology we will define next, an attribute is a perhaps constrained,
57657 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
57658 + * which might be grandparent-major-packed, and whose parent has a deletion
57659 + * method that deletes it.
57660 + *
57661 + * File plugins can implement constraints.
57662 + *
57663 + * Files can be of variable length (e.g. regular unix files), or of static
57664 + * length (e.g. static sized attributes).
57665 + *
57666 + * An object may have many sequences of bytes, and many file plugins, but, it
57667 + * has exactly one objectid.  It is usually desirable that an object has a
57668 + * deletion method which deletes every item with that objectid.  Items cannot
57669 + * in general be found by just their objectids.  This means that an object must
57670 + * have either a method built into its deletion plugin method for knowing what
57671 + * items need to be deleted, or links stored with the object that provide the
57672 + * plugin with a method for finding those items.  Deleting a file within an
57673 + * object may or may not have the effect of deleting the entire object,
57674 + * depending on the file plugin's deletion method.
57675 + *
57676 + * LINK TAXONOMY:
57677 + *
57678 + * Many objects have a reference count, and when the reference count reaches 0
57679 + * the object's deletion method is invoked.  Some links embody a reference
57680 + * count increase ("countlinks"), and others do not ("nocountlinks").
57681 + *
57682 + * Some links are bi-directional links ("bilinks"), and some are
57683 + * uni-directional ("unilinks").
57684 + *
57685 + * Some links are between parts of the same object ("intralinks"), and some are
57686 + * between different objects ("interlinks").
57687 + *
57688 + * PACKING TAXONOMY:
57689 + *
57690 + * Some items of an object are stored with a major packing locality based on
57691 + * their object's objectid (e.g. unix directory items in plan A), and these are
57692 + * called "self-major-packed".
57693 + *
57694 + * Some items of an object are stored with a major packing locality based on
57695 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57696 + * and these are called "parent-major-packed".
57697 + *
57698 + * Some items of an object are stored with a major packing locality based on
57699 + * their semantic grandparent, and these are called "grandparent-major-packed".
57700 + * Now carefully notice that we run into trouble with key length if we have to
57701 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
57702 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57703 + * a 24 byte key.  One of these fields must be sacrificed if an item is to be
57704 + * grandparent-major-packed, and which to sacrifice is left to the item author
57705 + * choosing to make the item grandparent-major-packed.  You cannot make tail
57706 + * items and extent items grandparent-major-packed, though you could make them
57707 + * self-major-packed (usually they are parent-major-packed).
57708 + *
57709 + * In the case of ACLs (which are composed of fixed length ACEs which consist
57710 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
57711 + * to not have an offset field in the ACE item key, and to allow duplicate keys
57712 + * for ACEs.  Thus, the set of ACEs for a given file is found by looking for a
57713 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57714 + * a directory together), the minor packing locality of ACE, the objectid of
57715 + * the file, and 0.
57716 + *
57717 + * IO involves moving data from one location to another, which means that two
57718 + * locations must be specified, source and destination.
57719 + *
57720 + * This source and destination can be in the filesystem, or they can be a
57721 + * pointer in the user process address space plus a byte count.
57722 + *
57723 + * If both source and destination are in the filesystem, then at least one of
57724 + * them must be representable as a pure stream of bytes (which we call a flow,
57725 + * and define as a struct containing a key, a data pointer, and a length).
57726 + * This may mean converting one of them into a flow.  We provide a generic
57727 + * cast_into_flow() method, which will work for any plugin supporting
57728 + * read_flow(), though it is inefficiently implemented in that it temporarily
57729 + * stores the flow in a buffer (Question: what to do with huge flows that
57730 + * cannot fit into memory?  Answer: we must not convert them all at once.)
57731 + *
57732 + * Performing a write requires resolving the write request into a flow defining
57733 + * the source, and a method that performs the write, and a key that defines
57734 + * where in the tree the write is to go.
57735 + *
57736 + * Performing a read requires resolving the read request into a flow defining
57737 + * the target, and a method that performs the read, and a key that defines
57738 + * where in the tree the read is to come from.
57739 + *
57740 + * There will exist file plugins which have no pluginid stored on the disk for
57741 + * them, and which are only invoked by other plugins.
57742 + */
57743 +
57744 +/* Version of the plugin library.  Increment it with each newly
57745 +   contributed pair (plugin type, plugin id).
57746 +   NOTE: make sure there is a release of reiser4progs
57747 +   with the corresponding version number. */
57748 +#define PLUGIN_LIBRARY_VERSION 0
57749 +
57750 + /* Enumeration of fields within plugin_set; update_pset_mask() also uses these values as bit numbers in plugin_mask */
57751 +typedef enum {
57752 +       PSET_FILE,
57753 +       PSET_DIR,               /* PSET_FILE and PSET_DIR should be first
57754 +                                * elements: inode.c:read_inode() depends on
57755 +                                * this. */
57756 +       PSET_PERM,
57757 +       PSET_FORMATTING,
57758 +       PSET_HASH,
57759 +       PSET_FIBRATION,
57760 +       PSET_SD,
57761 +       PSET_DIR_ITEM,
57762 +       PSET_CIPHER,
57763 +       PSET_DIGEST,
57764 +       PSET_COMPRESSION,
57765 +       PSET_COMPRESSION_MODE,
57766 +       PSET_CLUSTER,
57767 +       PSET_CREATE,
57768 +       PSET_LAST /* not a member: number of members, used as the loop bound in finish_pset() */
57769 +} pset_member;
57770 +
57771 +/* Ids of the built-in file plugins */
57772 +typedef enum {
57773 +       /* regular file */
57774 +       UNIX_FILE_PLUGIN_ID,
57775 +       /* directory */
57776 +       DIRECTORY_FILE_PLUGIN_ID,
57777 +       /* symlink */
57778 +       SYMLINK_FILE_PLUGIN_ID,
57779 +       /* for objects completely handled by the VFS: fifos, devices,
57780 +          sockets  */
57781 +       SPECIAL_FILE_PLUGIN_ID,
57782 +       /* regular cryptcompress file */
57783 +       CRYPTCOMPRESS_FILE_PLUGIN_ID,
57784 +       /* Number of file plugins, not a plugin id itself. Used as the
57785 +          size of arrays that hold file plugins (e.g. file_plugins[]). */
57786 +       LAST_FILE_PLUGIN_ID
57787 +} reiser4_file_id;
57788 +
57789 +typedef struct file_plugin {
57790 +
57791 +       /* generic fields */
57792 +       plugin_header h;
57793 +
57794 +       /* VFS methods.
57795 +        * Must be invariant with respect to plugin conversion.
57796 +        * It can be achieved by using "common" methods, which
57797 +        * are the same for all plugins that take participation in
57798 +        * conversion, or by using "generic" or "careful" methods,
57799 +        * which provide automatic redirection to proper private
57800 +        * plugin methods ("careful" are the same as "generic",
57801 +        * but with protection of pset and other disk structures
57802 +        * from being rebuilt during conversion.
57803 +        */
57804 +       struct inode_operations * inode_ops;
57805 +       struct file_operations * file_ops;
57806 +       struct address_space_operations * as_ops;
57807 +       /**
57808 +        * Private methods. These are optional. If used they will allow you
57809 +        * to minimize the amount of code needed to implement a deviation
57810 +        * from some other method that also uses them.
57811 +        */
57812 +       /*
57813 +        * private inode_ops
57814 +        */
57815 +       int (*setattr)(struct dentry *, struct iattr *);
57816 +       /*
57817 +        * private file_ops
57818 +        */
57819 +       /* do whatever is necessary to do when object is opened */
57820 +       int (*open) (struct inode *inode, struct file *file);
57821 +       ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57822 +                       loff_t *off);
57823 +       /* write as much as possible bytes from nominated @write_amount
57824 +        * before plugin scheduling is occurred. Save scheduling state
57825 +        * in @cont */
57826 +       ssize_t (*write) (struct file *, const char __user *buf,
57827 +                         size_t write_amount, loff_t * off,
57828 +                         struct dispatch_context * cont);
57829 +       int (*ioctl) (struct inode *inode, struct file *filp,
57830 +                     unsigned int cmd, unsigned long arg);
57831 +       int (*mmap) (struct file *, struct vm_area_struct *);
57832 +       int (*release) (struct inode *, struct file *);
57833 +       /*
57834 +        * private a_ops
57835 +        */
57836 +       int (*readpage) (struct file *file, struct page *page);
57837 +       int (*readpages)(struct file *file, struct address_space *mapping,
57838 +                         struct list_head *pages, unsigned nr_pages);
57839 +       int (*writepages)(struct address_space *mapping,
57840 +                         struct writeback_control *wbc);
57841 +       int (*write_begin)(struct file *file, struct page *page,
57842 +                         unsigned from, unsigned to);
57843 +       int (*write_end)(struct file *file, struct page *page,
57844 +                         unsigned from, unsigned to);
57845 +       sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57846 +       /* other private methods */
57847 +       /* save inode cached stat-data onto disk. It was called
57848 +          reiserfs_update_sd() in 3.x */
57849 +       int (*write_sd_by_inode) (struct inode *);
57850 +       /*
57851 +        * Construct flow into @flow according to user-supplied data.
57852 +        *
57853 +        * This is used by read/write methods to construct a flow to
57854 +        * write/read. ->flow_by_inode() is plugin method, rather than single
57855 +        * global implementation, because key in a flow used by plugin may
57856 +        * depend on data in a @buf.
57857 +        *
57858 +        * NIKITA-FIXME-HANS: please create statistics on what functions are
57859 +        * dereferenced how often for the mongo benchmark.  You can supervise
57860 +        * Elena doing this for you if that helps.  Email me the list of the
57861 +        * top 10, with their counts, and an estimate of the total number of
57862 +        * CPU cycles spent dereferencing as a percentage of CPU cycles spent
57863 +        * processing (non-idle processing).  If the total percent is, say,
57864 +        * less than 1%, it will make our coding discussions much easier, and
57865 +        * keep me from questioning whether functions like the below are too
57866 +        * frequently called to be dereferenced.  If the total percent is more
57867 +        * than 1%, perhaps private methods should be listed in a "required"
57868 +        * comment at the top of each plugin (with stern language about how if
57869 +        * the comment is missing it will not be accepted by the maintainer),
57870 +        * and implemented using macros not dereferenced functions.  How about
57871 +        * replacing this whole private methods part of the struct with a
57872 +        * thorough documentation of what the standard helper functions are for
57873 +        * use in constructing plugins?  I think users have been asking for
57874 +        * that, though not in so many words.
57875 +        */
57876 +       int (*flow_by_inode) (struct inode *, const char __user *buf,
57877 +                             int user, loff_t size,
57878 +                             loff_t off, rw_op op, flow_t *);
57879 +       /*
57880 +        * Return the key used to retrieve an offset of a file. It is used by
57881 +        * default implementation of ->flow_by_inode() method
57882 +        * (common_build_flow()) and, among other things, to get to the extent
57883 +        * from jnode of unformatted node.
57884 +        */
57885 +       int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
57886 +
57887 +       /* NIKITA-FIXME-HANS: this comment is not as clear to others as you
57888 +        * think.... */
57889 +       /*
57890 +        * set the plugin for a file.  Called during file creation in creat()
57891 +        * but not reiser4() unless an inode already exists for the file.
57892 +        */
57893 +       int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
57894 +                                 reiser4_object_create_data *);
57895 +
57896 +       /* NIKITA-FIXME-HANS: comment and name seem to say different things,
57897 +        * are you setting up the object itself also or just adjusting the
57898 +        * parent?.... */
57899 +       /* set up plugins for new @object created in @parent. @root is root
57900 +          directory. */
57901 +       int (*adjust_to_parent) (struct inode *object, struct inode *parent,
57902 +                                struct inode *root);
57903 +       /*
57904 +        * this does whatever is necessary to do when object is created. For
57905 +        * instance, for unix files stat data is inserted. It is supposed to be
57906 +        * called by create of struct inode_operations.
57907 +        */
57908 +       int (*create_object) (struct inode *object, struct inode *parent,
57909 +                             reiser4_object_create_data *);
57910 +       /*
57911 +        * this method should check REISER4_NO_SD and set REISER4_NO_SD on
57912 +        * success. Deletion of an object usually includes removal of items
57913 +        * building file body (for directories this is removal of "." and "..")
57914 +        * and removal of stat-data item.
57915 +        */
57916 +       int (*delete_object) (struct inode *);
57917 +
57918 +       /* add link from @parent to @object */
57919 +       int (*add_link) (struct inode *object, struct inode *parent);
57920 +
57921 +       /* remove link from @parent to @object */
57922 +       int (*rem_link) (struct inode *object, struct inode *parent);
57923 +
57924 +       /*
57925 +        * return true if item addressed by @coord belongs to @inode.  This is
57926 +        * used by read/write to properly slice flow into items in presence of
57927 +        * multiple key assignment policies, because items of a file are not
57928 +        * necessarily contiguous in a key space, for example, in a plan-b.
57929 +        */
57930 +       int (*owns_item) (const struct inode *, const coord_t *);
57931 +
57932 +       /* checks whether yet another hard links to this object can be
57933 +          added  */
57934 +       int (*can_add_link) (const struct inode *);
57935 +
57936 +       /* checks whether hard links to this object can be removed */
57937 +       int (*can_rem_link) (const struct inode *);
57938 +
57939 +       /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
57940 +          detach of directory plugin to remove ".." */
57941 +       int (*detach) (struct inode *child, struct inode *parent);
57942 +
57943 +       /* called when @child was just looked up in the @parent. It is not
57944 +          empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
57945 +          directory plugin */
57946 +       int (*bind) (struct inode *child, struct inode *parent);
57947 +
57948 +       /* process safe-link during mount */
57949 +       int (*safelink) (struct inode *object, reiser4_safe_link_t link,
57950 +                        __u64 value);
57951 +
57952 +       /* The couple of estimate methods for all file operations */
57953 +       struct {
57954 +               reiser4_block_nr(*create) (const struct inode *);
57955 +               reiser4_block_nr(*update) (const struct inode *);
57956 +               reiser4_block_nr(*unlink) (const struct inode *,
57957 +                                          const struct inode *);
57958 +       } estimate;
57959 +
57960 +       /*
57961 +        * reiser4 specific part of inode has a union of structures which are
57962 +        * specific to a plugin. This method is called when inode is read
57963 +        * (read_inode) and when file is created (common_create_child) so that
57964 +        * file plugin could initialize its inode data
57965 +        */
57966 +       void (*init_inode_data) (struct inode *, reiser4_object_create_data * ,
57967 +                                int);
57968 +
57969 +       /*
57970 +        * This method performs progressive deletion of items and whole nodes
57971 +        * from right to left.
57972 +        *
57973 +        * @tap: the point deletion process begins from,
57974 +        * @from_key: the beginning of the deleted key range,
57975 +        * @to_key: the end of the deleted key range,
57976 +        * @smallest_removed: the smallest removed key,
57977 +        *
57978 +        * @return: 0 if success, error code otherwise, -E_REPEAT means that
57979 +        * long cut_tree operation was interrupted for allowing atom commit .
57980 +        */
57981 +       int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
57982 +                               const reiser4_key * to_key,
57983 +                               reiser4_key * smallest_removed, struct inode *,
57984 +                               int, int *);
57985 +
57986 +       /* called from ->destroy_inode() */
57987 +       void (*destroy_inode) (struct inode *);
57988 +
57989 +       /*
57990 +        * methods to serialize object identity. This is used, for example, by
57991 +        * reiser4_{en,de}code_fh().
57992 +        */
57993 +       struct {
57994 +               /* store object's identity at @area */
57995 +               char *(*write) (struct inode *inode, char *area);
57996 +               /* parse object from wire to the @obj */
57997 +               char *(*read) (char *area, reiser4_object_on_wire * obj);
57998 +               /* given object identity in @obj, find or create its dentry */
57999 +               struct dentry *(*get) (struct super_block *s,
58000 +                                      reiser4_object_on_wire * obj);
58001 +               /* how many bytes ->wire.write() consumes */
58002 +               int (*size) (struct inode *inode);
58003 +               /* finish with object identity */
58004 +               void (*done) (reiser4_object_on_wire * obj);
58005 +       } wire;
58006 +} file_plugin;
58007 +
58008 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58009 +
58010 +struct reiser4_object_on_wire {
58011 +       file_plugin *plugin;
58012 +       union {
58013 +               struct {
58014 +                       obj_key_id key_id;
58015 +               } std;
58016 +               void *generic;
58017 +       } u;
58018 +};
58019 +
58020 +/* builtin dir-plugins */
58021 +typedef enum {
58022 +       HASHED_DIR_PLUGIN_ID,
58023 +       SEEKABLE_HASHED_DIR_PLUGIN_ID,
58024 +       LAST_DIR_ID
58025 +} reiser4_dir_id;
58026 +
58027 +typedef struct dir_plugin {
58028 +       /* generic fields */
58029 +       plugin_header h;
58030 +
58031 +       struct inode_operations * inode_ops;
58032 +       struct file_operations * file_ops;
58033 +       struct address_space_operations * as_ops;
58034 +
58035 +       /*
58036 +        * private methods: These are optional.  If used they will allow you to
58037 +        * minimize the amount of code needed to implement a deviation from
58038 +        * some other method that uses them.  You could logically argue that
58039 +        * they should be a separate type of plugin.
58040 +        */
58041 +
58042 +       struct dentry *(*get_parent) (struct inode *childdir);
58043 +
58044 +       /*
58045 +        * check whether "name" is acceptable name to be inserted into this
58046 +        * object. Optionally implemented by directory-like objects.  Can check
58047 +        * for maximal length, reserved symbols etc
58048 +        */
58049 +       int (*is_name_acceptable) (const struct inode *inode, const char *name,
58050 +                                  int len);
58051 +
58052 +       void (*build_entry_key) (const struct inode *dir /* directory where
58053 +                                                         * entry is (or will
58054 +                                                         * be) in.*/ ,
58055 +                                const struct qstr *name /* name of file
58056 +                                                         * referenced by this
58057 +                                                         * entry */ ,
58058 +                                reiser4_key * result   /* resulting key of
58059 +                                                        * directory entry */ );
58060 +       int (*build_readdir_key) (struct file *dir, reiser4_key * result);
58061 +       int (*add_entry) (struct inode *object, struct dentry *where,
58062 +                         reiser4_object_create_data * data,
58063 +                         reiser4_dir_entry_desc * entry);
58064 +       int (*rem_entry) (struct inode *object, struct dentry *where,
58065 +                         reiser4_dir_entry_desc * entry);
58066 +
58067 +       /*
58068 +        * initialize directory structure for newly created object. For normal
58069 +        * unix directories, insert dot and dotdot.
58070 +        */
58071 +       int (*init) (struct inode *object, struct inode *parent,
58072 +                    reiser4_object_create_data * data);
58073 +
58074 +       /* destroy directory */
58075 +       int (*done) (struct inode *child);
58076 +
58077 +       /* called when @subdir was just looked up in the @dir */
58078 +       int (*attach) (struct inode *subdir, struct inode *dir);
58079 +       int (*detach) (struct inode *subdir, struct inode *dir);
58080 +
58081 +       struct {
58082 +               reiser4_block_nr(*add_entry) (const struct inode *);
58083 +               reiser4_block_nr(*rem_entry) (const struct inode *);
58084 +               reiser4_block_nr(*unlink) (const struct inode *,
58085 +                                          const struct inode *);
58086 +       } estimate;
58087 +} dir_plugin;
58088 +
58089 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58090 +
58091 +typedef struct formatting_plugin {
58092 +       /* generic fields */
58093 +       plugin_header h;
58094 +       /* returns non-zero iff file's tail has to be stored
58095 +          in a direct item. */
58096 +       int (*have_tail) (const struct inode *inode, loff_t size);
58097 +} formatting_plugin;
58098 +
58099 +typedef struct hash_plugin {
58100 +       /* generic fields */
58101 +       plugin_header h;
58102 +       /* computes hash of the given name */
58103 +        __u64(*hash) (const unsigned char *name, int len);
58104 +} hash_plugin;
58105 +
58106 +typedef struct cipher_plugin {
58107 +       /* generic fields */
58108 +       plugin_header h;
58109 +       struct crypto_blkcipher * (*alloc) (void);
58110 +       void (*free) (struct crypto_blkcipher *tfm);
58111 +       /* Offset translator. For each offset this returns (k * offset), where
58112 +          k (k >= 1) is an expansion factor of the cipher algorithm.
58113 +          For all symmetric algorithms k == 1. For asymmetric algorithms (which
58114 +          inflate data) offset translation guarantees that all units of a disk
58115 +          cluster have keys smaller than those of the next cluster.
58116 +        */
58117 +        loff_t(*scale) (struct inode *inode, size_t blocksize, loff_t src);
58118 +       /* Cipher algorithms can accept data only by chunks of cipher block
58119 +          size. This method is to align any flow up to cipher block size when
58120 +          we pass it to cipher algorithm. To align means to append padding of
58121 +          special format specific to the cipher algorithm */
58122 +       int (*align_stream) (__u8 *tail, int clust_size, int blocksize);
58123 +       /* low-level key manager (check, install, etc..) */
58124 +       int (*setkey) (struct crypto_tfm *tfm, const __u8 *key,
58125 +                      unsigned int keylen);
58126 +       /* main text processing procedures */
58127 +       void (*encrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
58128 +       void (*decrypt) (__u32 *expkey, __u8 *dst, const __u8 *src);
58129 +} cipher_plugin;
58130 +
58131 +typedef struct digest_plugin {
58132 +       /* generic fields */
58133 +       plugin_header h;
58134 +       /* fingerprint size in bytes */
58135 +       int fipsize;
58136 +       struct crypto_hash * (*alloc) (void);
58137 +       void (*free) (struct crypto_hash *tfm);
58138 +} digest_plugin;
58139 +
58140 +typedef struct compression_plugin {
58141 +       /* generic fields */
58142 +       plugin_header h;
58143 +       int (*init) (void);
58144 +       /* the maximum number of bytes the size of the "compressed" data can
58145 +        * exceed the uncompressed data. */
58146 +       int (*overrun) (unsigned src_len);
58147 +        coa_t(*alloc) (tfm_action act);
58148 +       void (*free) (coa_t coa, tfm_action act);
58149 +       /* minimal size of the flow we still try to compress */
58150 +       int (*min_size_deflate) (void);
58151 +        __u32(*checksum) (char *data, __u32 length);
58152 +       /* main transform procedures */
58153 +       void (*compress) (coa_t coa, __u8 *src_first, size_t src_len,
58154 +                         __u8 *dst_first, size_t *dst_len);
58155 +       void (*decompress) (coa_t coa, __u8 *src_first, size_t src_len,
58156 +                           __u8 *dst_first, size_t *dst_len);
58157 +} compression_plugin;
58158 +
58159 +typedef struct compression_mode_plugin {
58160 +       /* generic fields */
58161 +       plugin_header h;
58162 +       /* this is called when estimating compressibility
58163 +          of a logical cluster by its content */
58164 +       int (*should_deflate) (struct inode *inode, cloff_t index);
58165 +       /* this is called when results of compression should be saved */
58166 +       int (*accept_hook) (struct inode *inode, cloff_t index);
58167 +       /* this is called when results of compression should be discarded */
58168 +       int (*discard_hook) (struct inode *inode, cloff_t index);
58169 +} compression_mode_plugin;
58170 +
58171 +typedef struct cluster_plugin {
58172 +       /* generic fields */
58173 +       plugin_header h;
58174 +       int shift;
58175 +} cluster_plugin;
58176 +
58177 +typedef struct sd_ext_plugin {
58178 +       /* generic fields */
58179 +       plugin_header h;
58180 +       int (*present) (struct inode *inode, char **area, int *len);
58181 +       int (*absent) (struct inode *inode);
58182 +       int (*save_len) (struct inode *inode);
58183 +       int (*save) (struct inode *inode, char **area);
58184 +       /* alignment requirement for this stat-data part */
58185 +       int alignment;
58186 +} sd_ext_plugin;
58187 +
58188 +/* this plugin contains methods to allocate objectid for newly created files,
58189 +   to deallocate objectid when file gets removed, to report number of used and
58190 +   free objectids */
58191 +typedef struct oid_allocator_plugin {
58192 +       /* generic fields */
58193 +       plugin_header h;
58194 +       int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
58195 +                                  __u64 oids);
58196 +       /* used to report statfs->f_files */
58197 +        __u64(*oids_used) (reiser4_oid_allocator * map);
58198 +       /* get next oid to use */
58199 +        __u64(*next_oid) (reiser4_oid_allocator * map);
58200 +       /* used to report statfs->f_ffree */
58201 +        __u64(*oids_free) (reiser4_oid_allocator * map);
58202 +       /* allocate new objectid */
58203 +       int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
58204 +       /* release objectid */
58205 +       int (*release_oid) (reiser4_oid_allocator * map, oid_t);
58206 +       /* how many pages to reserve in transaction for allocation of new
58207 +          objectid */
58208 +       int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
58209 +       /* how many pages to reserve in transaction for freeing of an
58210 +          objectid */
58211 +       int (*oid_reserve_release) (reiser4_oid_allocator * map);
58212 +       void (*print_info) (const char *, reiser4_oid_allocator *);
58213 +} oid_allocator_plugin;
58214 +
58215 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
58216 +   are any) locations, etc */
58217 +typedef struct disk_format_plugin {
58218 +       /* generic fields */
58219 +       plugin_header h;
58220 +       /* replay journal, initialize super_info_data, etc */
58221 +       int (*init_format) (struct super_block *, void *data);
58222 +
58223 +       /* key of root directory stat data */
58224 +       const reiser4_key * (*root_dir_key) (const struct super_block *);
58225 +
58226 +       int (*release) (struct super_block *);
58227 +       jnode * (*log_super) (struct super_block *);
58228 +       int (*check_open) (const struct inode *object);
58229 +       int (*version_update) (struct super_block *);
58230 +} disk_format_plugin;
58231 +
58232 +struct jnode_plugin {
58233 +       /* generic fields */
58234 +       plugin_header h;
58235 +       int (*init) (jnode * node);
58236 +       int (*parse) (jnode * node);
58237 +       struct address_space *(*mapping) (const jnode * node);
58238 +       unsigned long (*index) (const jnode * node);
58239 +       jnode * (*clone) (jnode * node);
58240 +};
58241 +
58242 +/* plugin instance.                                                         */
58243 +/*                                                                          */
58244 +/* This is "wrapper" union for all types of plugins. Most of the code uses  */
58245 +/* plugins of particular type (file_plugin, dir_plugin, etc.)  rather than  */
58246 +/* operates with pointers to reiser4_plugin. This union is only used in     */
58247 +/* some generic code in plugin/plugin.c that operates on all                */
58248 +/* plugins. Technically speaking purpose of this union is to add type       */
58249 +/* safety to said generic code: each plugin type (file_plugin, for          */
58250 +/* example), contains plugin_header as its first member. This first member  */
58251 +/* is located at the same place in memory as .h member of                   */
58252 +/* reiser4_plugin. Generic code, obtains pointer to reiser4_plugin and      */
58253 +/* looks in the .h which is header of plugin type located in union. This    */
58254 +/* allows to avoid type-casts.                                              */
58255 +union reiser4_plugin {
58256 +       /* generic fields */
58257 +       plugin_header h;
58258 +       /* file plugin */
58259 +       file_plugin file;
58260 +       /* directory plugin */
58261 +       dir_plugin dir;
58262 +       /* hash plugin, used by directory plugin */
58263 +       hash_plugin hash;
58264 +       /* fibration plugin used by directory plugin */
58265 +       fibration_plugin fibration;
58266 +       /* cipher transform plugin, used by file plugin */
58267 +       cipher_plugin cipher;
58268 +       /* digest transform plugin, used by file plugin */
58269 +       digest_plugin digest;
58270 +       /* compression transform plugin, used by file plugin */
58271 +       compression_plugin compression;
58272 +       /* tail plugin, used by file plugin */
58273 +       formatting_plugin formatting;
58274 +       /* permission plugin */
58275 +       perm_plugin perm;
58276 +       /* node plugin */
58277 +       node_plugin node;
58278 +       /* item plugin */
58279 +       item_plugin item;
58280 +       /* stat-data extension plugin */
58281 +       sd_ext_plugin sd_ext;
58282 +       /* disk layout plugin */
58283 +       disk_format_plugin format;
58284 +       /* object id allocator plugin */
58285 +       oid_allocator_plugin oid_allocator;
58286 +       /* plugin for different jnode types */
58287 +       jnode_plugin jnode;
58288 +       /* compression mode plugin, used by object plugin */
58289 +       compression_mode_plugin compression_mode;
58290 +       /* cluster plugin, used by object plugin */
58291 +       cluster_plugin clust;
58292 +       /* place-holder for new plugin types that can be registered
58293 +          dynamically, and used by other dynamically loaded plugins.  */
58294 +       void *generic;
58295 +};
58296 +
58297 +struct reiser4_plugin_ops {
58298 +       /* called when plugin is initialized */
58299 +       int (*init) (reiser4_plugin * plugin);
58300 +       /* called when plugin is unloaded */
58301 +       int (*done) (reiser4_plugin * plugin);
58302 +       /* load given plugin from disk */
58303 +       int (*load) (struct inode *inode,
58304 +                    reiser4_plugin * plugin, char **area, int *len);
58305 +       /* how much space is required to store this plugin's state
58306 +          in stat-data */
58307 +       int (*save_len) (struct inode *inode, reiser4_plugin * plugin);
58308 +       /* save persistent plugin-data to disk */
58309 +       int (*save) (struct inode *inode, reiser4_plugin * plugin,
58310 +                    char **area);
58311 +       /* alignment requirement for on-disk state of this plugin
58312 +          in number of bytes */
58313 +       int alignment;
58314 +       /* install itself into given inode. This can return error
58315 +          (e.g., you cannot change hash of non-empty directory). */
58316 +       int (*change) (struct inode *inode, reiser4_plugin * plugin,
58317 +                      pset_member memb);
58318 +       /* NOTE(review): presumably makes @inode inherit @plugin from @parent;
58319 +          the text here was a verbatim copy of ->change's comment. Confirm. */
58320 +       int (*inherit) (struct inode *inode, struct inode *parent,
58321 +                       reiser4_plugin * plugin);
58322 +};
58323 +
58324 +/* functions implemented in fs/reiser4/plugin/plugin.c */
58325 +
58326 +/* stores plugin reference in reiser4-specific part of inode */
58327 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58328 +extern int init_plugins(void);
58329 +
58330 +/* builtin plugins */
58331 +
58332 +/* builtin hash-plugins */
58333 +
58334 +typedef enum {
58335 +       RUPASOV_HASH_ID,
58336 +       R5_HASH_ID,
58337 +       TEA_HASH_ID,
58338 +       FNV1_HASH_ID,
58339 +       DEGENERATE_HASH_ID,
58340 +       LAST_HASH_ID
58341 +} reiser4_hash_id;
58342 +
58343 +/* builtin cipher plugins */
58344 +
58345 +typedef enum {
58346 +       NONE_CIPHER_ID,
58347 +       LAST_CIPHER_ID
58348 +} reiser4_cipher_id;
58349 +
58350 +/* builtin digest plugins */
58351 +
58352 +typedef enum {
58353 +       SHA256_32_DIGEST_ID,
58354 +       LAST_DIGEST_ID
58355 +} reiser4_digest_id;
58356 +
58357 +/* builtin compression mode plugins */
58358 +typedef enum {
58359 +       NONE_COMPRESSION_MODE_ID,
58360 +       LATTD_COMPRESSION_MODE_ID,
58361 +       ULTIM_COMPRESSION_MODE_ID,
58362 +       FORCE_COMPRESSION_MODE_ID,
58363 +       CONVX_COMPRESSION_MODE_ID,
58364 +       LAST_COMPRESSION_MODE_ID
58365 +} reiser4_compression_mode_id;
58366 +
58367 +/* builtin cluster plugins */
58368 +typedef enum {
58369 +       CLUSTER_64K_ID,
58370 +       CLUSTER_32K_ID,
58371 +       CLUSTER_16K_ID,
58372 +       CLUSTER_8K_ID,
58373 +       CLUSTER_4K_ID,
58374 +       LAST_CLUSTER_ID
58375 +} reiser4_cluster_id;
58376 +
58377 +/* builtin tail-plugins */
58378 +
58379 +typedef enum {
58380 +       NEVER_TAILS_FORMATTING_ID,
58381 +       ALWAYS_TAILS_FORMATTING_ID,
58382 +       SMALL_FILE_FORMATTING_ID,
58383 +       LAST_TAIL_FORMATTING_ID
58384 +} reiser4_formatting_id;
58385 +
58386 +/* data type used to pack parameters that we pass to vfs object creation
58387 +   function create_object() */
58388 +struct reiser4_object_create_data {
58389 +       /* plugin to control created object */
58390 +       reiser4_file_id id;
58391 +       /* mode of regular file, directory or special file */
58392 +/* what happens if some other sort of perm plugin is in use? */
58393 +       int mode;
58394 +       /* rdev of special file */
58395 +       dev_t rdev;
58396 +       /* symlink target */
58397 +       const char *name;
58398 +       /* add here something for non-standard objects you invent, like
58399 +          query for interpolation file etc. */
58400 +
58401 +       struct reiser4_crypto_info *crypto;
58402 +
58403 +       struct inode *parent;
58404 +       struct dentry *dentry;
58405 +};
58406 +
58407 +/* description of directory entry being created/destroyed/sought for
58408 +
58409 +   It is passed down to the directory plugin and farther to the
58410 +   directory item plugin methods. Creation of new directory is done in
58411 +   several stages: first we search for an entry with the same name, then
58412 +   create new one. reiser4_dir_entry_desc is used to store some information
58413 +   collected at some stage of this process and required later: key of
58414 +   item that we want to insert/delete and pointer to an object that will
58415 +   be bound by the new directory entry. Probably some more fields will
58416 +   be added there.
58417 +
58418 +*/
58419 +struct reiser4_dir_entry_desc {
58420 +       /* key of directory entry */
58421 +       reiser4_key key;
58422 +       /* object bound by this entry. */
58423 +       struct inode *obj;
58424 +};
58425 +
58426 +#define MAX_PLUGIN_TYPE_LABEL_LEN  32
58427 +#define MAX_PLUGIN_PLUG_LABEL_LEN  32
58428 +
58429 +#define PLUGIN_BY_ID(TYPE, ID, FIELD)                                  \
58430 +static inline TYPE *TYPE ## _by_id(reiser4_plugin_id id)               \
58431 +{                                                                      \
58432 +       reiser4_plugin *plugin = plugin_by_id(ID, id);                  \
58433 +       return plugin ? &plugin->FIELD : NULL;                          \
58434 +}                                                                      \
58435 +static inline TYPE *TYPE ## _by_disk_id(reiser4_tree * tree, d16 *id)  \
58436 +{                                                                      \
58437 +       reiser4_plugin *plugin = plugin_by_disk_id(tree, ID, id);       \
58438 +       return plugin ? &plugin->FIELD : NULL;                          \
58439 +}                                                                      \
58440 +static inline TYPE *TYPE ## _by_unsafe_id(reiser4_plugin_id id)                \
58441 +{                                                                      \
58442 +       reiser4_plugin *plugin = plugin_by_unsafe_id(ID, id);           \
58443 +       return plugin ? &plugin->FIELD : NULL;                          \
58444 +}                                                                      \
58445 +static inline reiser4_plugin* TYPE ## _to_plugin(TYPE* plugin)         \
58446 +{                                                                      \
58447 +       return (reiser4_plugin *) plugin;                               \
58448 +}                                                                      \
58449 +static inline reiser4_plugin_id TYPE ## _id(TYPE* plugin)              \
58450 +{                                                                      \
58451 +       return TYPE ## _to_plugin(plugin)->h.id;                        \
58452 +}                                                                      \
58453 +typedef struct { int foo; } TYPE ## _plugin_dummy
58454 +
58455 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58456 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58457 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58458 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58459 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58460 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58461 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58462 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58463 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58464 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58465 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58466 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58467 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58468 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58469 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58470 +            compression_mode);
58471 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
58472 +
58473 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58474 +
58475 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58476 +/* iterate @plugin over the list returned by get_plugin_list(@ptype) */
58477 +#define for_all_plugins(ptype, plugin)                                                 \
58478 +for ((plugin) = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage);   \
58479 +     get_plugin_list(ptype) != &(plugin)->h.linkage;                                   \
58480 +     (plugin) = list_entry((plugin)->h.linkage.next, reiser4_plugin, h.linkage))
58481 +
58482 +
58483 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor,
58484 +                           pset_member memb);
58485 +extern int force_plugin_pset(struct inode *self, pset_member memb,
58486 +                            reiser4_plugin *plug);
58487 +extern int finish_pset(struct inode *inode);
58488 +
58489 +/* defined in fs/reiser4/plugin/object.c */
58490 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58491 +/* defined in fs/reiser4/plugin/object.c */
58492 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58493 +/* defined in fs/reiser4/plugin/item/static_stat.c */
58494 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58495 +/* defined in fs/reiser4/plugin/hash.c */
58496 +extern hash_plugin hash_plugins[LAST_HASH_ID];
58497 +/* defined in fs/reiser4/plugin/fibration.c */
58498 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58499 +/* defined in fs/reiser4/plugin/crypt.c */
58500 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58501 +/* defined in fs/reiser4/plugin/digest.c */
58502 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58503 +/* defined in fs/reiser4/plugin/compress/compress.c */
58504 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58505 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58506 +extern compression_mode_plugin
58507 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58508 +/* defined in fs/reiser4/plugin/cluster.c */
58509 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58510 +/* defined in fs/reiser4/plugin/tail.c */
58511 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58512 +/* defined in fs/reiser4/plugin/security/security.c */
58513 +extern perm_plugin perm_plugins[LAST_PERM_ID];
58514 +/* defined in fs/reiser4/plugin/item/item.c */
58515 +extern item_plugin item_plugins[LAST_ITEM_ID];
58516 +/* defined in fs/reiser4/plugin/node/node.c */
58517 +extern node_plugin node_plugins[LAST_NODE_ID];
58518 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58519 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58520 +
58521 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
58522 +#endif
58523 +
58524 +/* Make Linus happy.
58525 +   Local variables:
58526 +   c-indentation-style: "K&R"
58527 +   mode-name: "LC"
58528 +   c-basic-offset: 8
58529 +   tab-width: 8
58530 +   fill-column: 120
58531 +   End:
58532 +*/
58533 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.35/fs/reiser4/plugin/plugin_header.h
58534 --- linux-2.6.35.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 01:00:00.000000000 +0100
58535 +++ linux-2.6.35/fs/reiser4/plugin/plugin_header.h      2010-08-04 15:44:57.000000000 +0200
58536 @@ -0,0 +1,149 @@
58537 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58538 +
58539 +/* plugin header. Data structures required by all plugin types. */
58540 +
58541 +#if !defined(__PLUGIN_HEADER_H__)
58542 +#define __PLUGIN_HEADER_H__
58543 +
58544 +/* plugin data-types and constants */
58545 +
58546 +#include "../debug.h"
58547 +#include "../dformat.h"
58548 +
58549 +/* The list of Reiser4 interfaces */
58550 +typedef enum {
58551 +       REISER4_FILE_PLUGIN_TYPE,             /* manage VFS objects */
58552 +       REISER4_DIR_PLUGIN_TYPE,              /* manage directories */
58553 +       REISER4_ITEM_PLUGIN_TYPE,             /* manage items */
58554 +       REISER4_NODE_PLUGIN_TYPE,             /* manage formatted nodes */
58555 +       REISER4_HASH_PLUGIN_TYPE,             /* hash methods */
58556 +       REISER4_FIBRATION_PLUGIN_TYPE,        /* directory fibrations */
58557 +       REISER4_FORMATTING_PLUGIN_TYPE,       /* dispatching policy */
58558 +       REISER4_PERM_PLUGIN_TYPE,             /* stub (vacancy) */
58559 +       REISER4_SD_EXT_PLUGIN_TYPE,           /* manage stat-data extensions */
58560 +       REISER4_FORMAT_PLUGIN_TYPE,           /* disk format specifications */
58561 +       REISER4_JNODE_PLUGIN_TYPE,            /* manage in-memory headers */
58562 +       REISER4_CIPHER_PLUGIN_TYPE,           /* cipher transform methods */
58563 +       REISER4_DIGEST_PLUGIN_TYPE,           /* digest transform methods */
58564 +       REISER4_COMPRESSION_PLUGIN_TYPE,      /* compression methods */
58565 +       REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* dispatching policies */
58566 +       REISER4_CLUSTER_PLUGIN_TYPE,          /* manage logical clusters */
58567 +       REISER4_PLUGIN_TYPES
58568 +} reiser4_plugin_type;
58569 +
58570 +/* Supported plugin groups */
58571 +typedef enum {
58572 +       REISER4_DIRECTORY_FILE,
58573 +       REISER4_REGULAR_FILE,
58574 +       REISER4_SYMLINK_FILE,
58575 +       REISER4_SPECIAL_FILE,
58576 +} file_plugin_group;
58577 +
58578 +struct reiser4_plugin_ops;
58579 +/* generic plugin operations, supported by each
58580 +    plugin type. */
58581 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58582 +
58583 +/* the common part of all plugin instances. */
58584 +typedef struct plugin_header {
58585 +       /* plugin type */
58586 +       reiser4_plugin_type type_id;
58587 +       /* id of this plugin */
58588 +       reiser4_plugin_id id;
58589 +       /* bitmask of groups the plugin belongs to. */
58590 +       reiser4_plugin_groups groups;
58591 +       /* plugin operations */
58592 +       reiser4_plugin_ops *pops;
58593 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and
58594 + * defined. */
58595 +       /* short label of this plugin */
58596 +       const char *label;
58597 +       /* descriptive string.. */
58598 +       const char *desc;
58599 +       /* list linkage */
58600 +       struct list_head linkage;
58601 +} plugin_header;
58602 +/* non-zero iff bit number @group is set in @plug's h.groups bitmask */
58603 +#define plugin_of_group(plug, group) ((plug)->h.groups & (1 << (group)))
58604 +
58605 +/* PRIVATE INTERFACES */
58606 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in
58607 + * plugin_header? */
58608 +/* plugin type representation. */
58609 +struct reiser4_plugin_type_data {
58610 +       /* internal plugin type identifier. Should coincide with
58611 +          index of this item in plugins[] array. */
58612 +       reiser4_plugin_type type_id;
58613 +       /* short symbolic label of this plugin type. Should be no longer
58614 +          than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58615 +       const char *label;
58616 +       /* plugin type description longer than .label */
58617 +       const char *desc;
58618 +
58619 +/* NIKITA-FIXME-HANS: define built-in */
58620 +       /* number of built-in plugin instances of this type */
58621 +       int builtin_num;
58622 +       /* array of built-in plugins */
58623 +       void *builtin;
58624 +       struct list_head plugins_list;
58625 +       size_t size;
58626 +};
58627 +
58628 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58629 +
58630 +int is_plugin_type_valid(reiser4_plugin_type type);
58631 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58632 +
58633 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data *ptype,
58634 +                                       int i)
58635 +{
58636 +       char *builtin;
58637 +       /* char pointer: entry @i of ->builtin lies i * ptype->size bytes in */
58638 +       builtin = ptype->builtin;
58639 +       return (reiser4_plugin *) (builtin + i * ptype->size);
58640 +}
58641 +
58642 +/* look up built-in plugin number @id of plugin type @type */
58643 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58644 +                                          reiser4_plugin_id id)
58645 +{
58646 +       assert("nikita-1651", is_plugin_type_valid(type));
58647 +       assert("nikita-1652", is_plugin_id_valid(type, id));
58648 +       return plugin_at(plugins + type, id);
58649 +}
58650 +
58651 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58652 +                                          reiser4_plugin_id id);
58653 +
58654 +/**
58655 + * plugin_by_disk_id - get reiser4_plugin
58656 + * @type_id: plugin type id
58657 + * @plugin_id: pointer to plugin id in disk (little-endian) format
58658 + *
58659 + * Returns reiser4_plugin by plugin type id and plugin id. @tree is unused.
58660 + */
58661 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58662 +                                               reiser4_plugin_type type_id,
58663 +                                               __le16 *plugin_id)
58664 +{
58665 +       /*
58666 +        * what we should do properly is to maintain within each file-system a
58667 +        * dictionary that maps on-disk plugin ids to "universal" ids. This
58668 +        * dictionary will be resolved on mount time, so that this function
58669 +        * will perform just one additional array lookup.
58670 +        */
58671 +       return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58672 +}
58673 +
58674 +/* __PLUGIN_HEADER_H__ */
58675 +#endif
58676 +
58677 +/*
58678 + * Local variables:
58679 + * c-indentation-style: "K&R"
58680 + * mode-name: "LC"
58681 + * c-basic-offset: 8
58682 + * tab-width: 8
58683 + * fill-column: 79
58684 + * End:
58685 + */
58686 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.35/fs/reiser4/plugin/plugin_set.c
58687 --- linux-2.6.35.orig/fs/reiser4/plugin/plugin_set.c    1970-01-01 01:00:00.000000000 +0100
58688 +++ linux-2.6.35/fs/reiser4/plugin/plugin_set.c 2010-08-04 15:44:57.000000000 +0200
58689 @@ -0,0 +1,380 @@
58690 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58691 + * reiser4/README */
58692 +/* This file contains Reiser4 plugin set operations */
58693 +
58694 +/* plugin sets
58695 + *
58696 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58697 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58698 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58699 + * set of plugins (so called pset) is described by structure plugin_set (see
58700 + * plugin/plugin_set.h), which contains pointers to all required plugins.
58701 + *
58702 + * Children can inherit some pset members from their parent, however sometimes
58703 + * it is useful to specify members different from parent ones. Since object's
58704 + * pset can not be easily changed without fatal consequences, we use for this
58705 + * purpose another special plugin table (so called hset, or heir set) described
58706 + * by the same structure.
58707 + *
58708 + * An inode only stores pointers to its pset and hset. Different inodes with
58709 + * the same set of pset (hset) members point to the same pset (hset). This is
58710 + * achieved by storing psets and hsets in a global hash table. Races are avoided
58711 + * by the simple (and efficient so far) solution of never recycling psets, even
58712 + * when the last inode pointing to one is destroyed.
58713 + */
58714 +
58715 +#include "../debug.h"
58716 +#include "../super.h"
58717 +#include "plugin_set.h"
58718 +
58719 +#include <linux/slab.h>
58720 +#include <linux/stddef.h>
58721 +
58722 +/* slab for plugin sets */
58723 +static struct kmem_cache *plugin_set_slab;
58724 +
58725 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58726 +       [0 ... 7] = __SPIN_LOCK_UNLOCKED(plugin_set_lock)       /* lockdep-aware */
58727 +};
58728 +
58729 +/* hash table support */
58730 +
58731 +#define PS_TABLE_SIZE (32)
58732 +
58733 +static inline plugin_set *cast_to(const unsigned long *a)
58734 +{
58735 +       return container_of(a, plugin_set, hashval); /* @a is &set->hashval */
58736 +}
58737 +
58738 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58739 +{
58740 +       plugin_set *set1;
58741 +       plugin_set *set2;
58742 +
58743 +       /* every field must be listed here; ->link is sized but not compared */
58744 +       cassert(sizeof *set1 ==
58745 +               sizeof set1->hashval +
58746 +               sizeof set1->link +
58747 +               sizeof set1->file +
58748 +               sizeof set1->dir +
58749 +               sizeof set1->perm +
58750 +               sizeof set1->formatting +
58751 +               sizeof set1->hash +
58752 +               sizeof set1->fibration +
58753 +               sizeof set1->sd +
58754 +               sizeof set1->dir_item +
58755 +               sizeof set1->cipher +
58756 +               sizeof set1->digest +
58757 +               sizeof set1->compression +
58758 +               sizeof set1->compression_mode +
58759 +               sizeof set1->cluster +
58760 +               sizeof set1->create);
58761 +
58762 +       set1 = cast_to(a1);
58763 +       set2 = cast_to(a2);
58764 +       return  /* non-zero iff hashval and all plugin pointers are equal */
58765 +           set1->hashval == set2->hashval &&
58766 +           set1->file == set2->file &&
58767 +           set1->dir == set2->dir &&
58768 +           set1->perm == set2->perm &&
58769 +           set1->formatting == set2->formatting &&
58770 +           set1->hash == set2->hash &&
58771 +           set1->fibration == set2->fibration &&
58772 +           set1->sd == set2->sd &&
58773 +           set1->dir_item == set2->dir_item &&
58774 +           set1->cipher == set2->cipher &&
58775 +           set1->digest == set2->digest &&
58776 +           set1->compression == set2->compression &&
58777 +           set1->compression_mode == set2->compression_mode &&
58778 +           set1->cluster == set2->cluster &&
58779 +           set1->create == set2->create;
58780 +}
58781 +
58782 +#define HASH_FIELD(hash, set, field)           \
58783 +({     /* add pointer (set)->field, shifted right by 2, to (hash) */   \
58784 +       (hash) += (unsigned long)(set)->field >> 2;     \
58785 +})
58786 +
58787 +static inline unsigned long calculate_hash(const plugin_set * set)
58788 +{
58789 +       /* sum of all member plugin pointers, masked to [0, PS_TABLE_SIZE) */
58790 +       unsigned long sum = 0;
58791 +
58792 +       HASH_FIELD(sum, set, file);
58793 +       HASH_FIELD(sum, set, dir);
58794 +       HASH_FIELD(sum, set, perm);
58795 +       HASH_FIELD(sum, set, formatting);
58796 +       HASH_FIELD(sum, set, hash);
58797 +       HASH_FIELD(sum, set, fibration);
58798 +       HASH_FIELD(sum, set, sd);
58799 +       HASH_FIELD(sum, set, dir_item);
58800 +       HASH_FIELD(sum, set, cipher);
58801 +       HASH_FIELD(sum, set, digest);
58802 +       HASH_FIELD(sum, set, compression);
58803 +       HASH_FIELD(sum, set, compression_mode);
58804 +       HASH_FIELD(sum, set, cluster);
58805 +       HASH_FIELD(sum, set, create);
58806 +       return sum & (PS_TABLE_SIZE - 1);
58807 +}
58808 +
58809 +static inline unsigned long
58810 +pshash(ps_hash_table * table, const unsigned long *a)
58811 +{
58812 +       return *a;      /* key is the precomputed hashval; @table is unused */
58813 +}
58814 +
58815 +/* The hash table definition; KMALLOC/KFREE are defined only around it */
58816 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58817 +#define KFREE(ptr, size) kfree(ptr)
58818 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58819 +                     pseq);
58820 +#undef KFREE
58821 +#undef KMALLOC
58822 +
58823 +static ps_hash_table ps_table;
58824 +static plugin_set empty_set = {        /* pset with no plugin assigned */
58825 +       .hashval = 0,
58826 +       .file = NULL,
58827 +       .dir = NULL,
58828 +       .perm = NULL,
58829 +       .formatting = NULL,
58830 +       .hash = NULL,
58831 +       .fibration = NULL,
58832 +       .sd = NULL,
58833 +       .dir_item = NULL,
58834 +       .cipher = NULL,
58835 +       .digest = NULL,
58836 +       .compression = NULL,
58837 +       .compression_mode = NULL,
58838 +       .cluster = NULL,
58839 +       .create = NULL,
58840 +       .link = {NULL}
58841 +};
58842 +
58843 +plugin_set *plugin_set_get_empty(void)
58844 +{
58845 +       return &empty_set;      /* shared, statically allocated instance */
58846 +}
58847 +
58848 +void plugin_set_put(plugin_set * set)
58849 +{      /* no-op: psets are never recycled (see comment at top of file) */
58850 +}
58851 +
58852 +static inline unsigned long *pset_field(plugin_set * set, int offset)
58853 +{      /* address of the member located @offset bytes into @set */
58854 +       return (unsigned long *)(((char *)set) + offset);
58855 +}
58856 +
58857 +/* Make *@set point to the shared pset that equals the current one except that
58858 + * the member at byte @offset holds @val; a new pset is allocated and hashed if
58859 + * no such pset exists yet. Returns 0, or RETERR(-ENOMEM) if allocation fails. */
58860 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
58861 +                           const int offset)
58862 +{
58863 +       spinlock_t *lock;
58864 +       plugin_set replica;
58865 +       plugin_set *twin, *psal;
58866 +
58867 +       assert("nikita-2902", set != NULL);
58868 +       assert("nikita-2904", *set != NULL);
58869 +
58870 +       if (unlikely(*pset_field(*set, offset) == val))
58871 +               return 0;
58872 +
58873 +       /* hashed psets are shared between inodes: edit a private copy */
58874 +       replica = **set;
58875 +       *pset_field(&replica, offset) = val;
58876 +       replica.hashval = calculate_hash(&replica);
58877 +       rcu_read_lock();
58878 +       twin = ps_hash_find(&ps_table, &replica.hashval);
58879 +       if (unlikely(twin == NULL)) {
58880 +               rcu_read_unlock();
58881 +               psal = kmem_cache_alloc(plugin_set_slab,
58882 +                                       reiser4_ctx_gfp_mask_get());
58883 +               if (psal == NULL)
58884 +                       return RETERR(-ENOMEM);
58885 +               *psal = replica;
58886 +               lock = &plugin_set_lock[replica.hashval & 7];
58887 +               spin_lock(lock);        /* look again: an equal pset may have been added */
58888 +               twin = ps_hash_find(&ps_table, &replica.hashval);
58889 +               if (likely(twin == NULL)) {
58890 +                       *set = psal;
58891 +                       ps_hash_insert_rcu(&ps_table, psal);
58892 +               } else {
58893 +                       *set = twin;
58894 +                       kmem_cache_free(plugin_set_slab, psal);
58895 +               }
58896 +               spin_unlock(lock);
58897 +       } else {
58898 +               rcu_read_unlock();
58899 +               *set = twin;
58900 +       }
58901 +       return 0;
58902 +}
58903 +
58904 +static struct {
58905 +       int offset;     /* offsetof() the member within plugin_set */
58906 +       reiser4_plugin_groups groups;   /* if non-zero, plugin must share a group bit */
58907 +       reiser4_plugin_type type;       /* plugin type expected for the member */
58908 +} pset_descr[PSET_LAST] = {
58909 +       [PSET_FILE] = {
58910 +               .offset = offsetof(plugin_set, file),
58911 +               .type = REISER4_FILE_PLUGIN_TYPE,
58912 +               .groups = 0
58913 +       },
58914 +       [PSET_DIR] = {
58915 +               .offset = offsetof(plugin_set, dir),
58916 +               .type = REISER4_DIR_PLUGIN_TYPE,
58917 +               .groups = 0
58918 +       },
58919 +       [PSET_PERM] = {
58920 +               .offset = offsetof(plugin_set, perm),
58921 +               .type = REISER4_PERM_PLUGIN_TYPE,
58922 +               .groups = 0
58923 +       },
58924 +       [PSET_FORMATTING] = {
58925 +               .offset = offsetof(plugin_set, formatting),
58926 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
58927 +               .groups = 0
58928 +       },
58929 +       [PSET_HASH] = {
58930 +               .offset = offsetof(plugin_set, hash),
58931 +               .type = REISER4_HASH_PLUGIN_TYPE,
58932 +               .groups = 0
58933 +       },
58934 +       [PSET_FIBRATION] = {
58935 +               .offset = offsetof(plugin_set, fibration),
58936 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
58937 +               .groups = 0
58938 +       },
58939 +       [PSET_SD] = {
58940 +               .offset = offsetof(plugin_set, sd),
58941 +               .type = REISER4_ITEM_PLUGIN_TYPE,
58942 +               .groups = (1 << STAT_DATA_ITEM_TYPE)
58943 +       },
58944 +       [PSET_DIR_ITEM] = {
58945 +               .offset = offsetof(plugin_set, dir_item),
58946 +               .type = REISER4_ITEM_PLUGIN_TYPE,
58947 +               .groups = (1 << DIR_ENTRY_ITEM_TYPE)
58948 +       },
58949 +       [PSET_CIPHER] = {
58950 +               .offset = offsetof(plugin_set, cipher),
58951 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
58952 +               .groups = 0
58953 +       },
58954 +       [PSET_DIGEST] = {
58955 +               .offset = offsetof(plugin_set, digest),
58956 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
58957 +               .groups = 0
58958 +       },
58959 +       [PSET_COMPRESSION] = {
58960 +               .offset = offsetof(plugin_set, compression),
58961 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
58962 +               .groups = 0
58963 +       },
58964 +       [PSET_COMPRESSION_MODE] = {
58965 +               .offset = offsetof(plugin_set, compression_mode),
58966 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58967 +               .groups = 0
58968 +       },
58969 +       [PSET_CLUSTER] = {
58970 +               .offset = offsetof(plugin_set, cluster),
58971 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
58972 +               .groups = 0
58973 +       },
58974 +       [PSET_CREATE] = {
58975 +               .offset = offsetof(plugin_set, create),
58976 +               .type = REISER4_FILE_PLUGIN_TYPE,
58977 +               .groups = (1 << REISER4_REGULAR_FILE)
58978 +       }
58979 +};
58980 +
58981 +#define DEFINE_PSET_OPS(PREFIX)                                                       \
58982 +       reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb)   \
58983 +{                                                                             \
58984 +       if (memb > PSET_LAST)                                                  \
58985 +               return REISER4_PLUGIN_TYPES;                                   \
58986 +       return pset_descr[memb].type;                                          \
58987 +}                                                                             \
58988 +                                                                              \
58989 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb,                  \
58990 +                    reiser4_plugin * plugin)                                  \
58991 +{                                                                             \
58992 +       assert("nikita-3492", set != NULL);                                    \
58993 +       assert("nikita-3493", *set != NULL);                                   \
58994 +       assert("nikita-3494", plugin != NULL);                                 \
58995 +       assert("nikita-3495", 0 <= memb && memb < PSET_LAST);                  \
58996 +       assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type);     \
58997 +                                                                              \
58998 +       if (pset_descr[memb].groups)                                           \
58999 +               if (!(pset_descr[memb].groups & plugin->h.groups))             \
59000 +                       return -EINVAL;                                        \
59001 +                                                                              \
59002 +       return plugin_set_field(set,                                           \
59003 +                       (unsigned long)plugin, pset_descr[memb].offset);       \
59004 +}                                                                             \
59005 +                                                                              \
59006 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb)              \
59007 +{                                                                             \
59008 +       assert("nikita-3497", set != NULL);                                    \
59009 +       assert("nikita-3498", 0 <= memb && memb < PSET_LAST);                  \
59010 +                                                                              \
59011 +       return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
59012 +}
59013 +
59014 +DEFINE_PSET_OPS(aset);
59015 +
59016 +/* like aset_set_unsafe(), but with no checks on @memb or @plugin */
59017 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
59018 +{
59019 +       return plugin_set_field(set, (unsigned long)plugin, pset_descr[memb].offset);
59020 +}
59021 +
59022 +/**
59023 + * init_plugin_set - create plugin set cache and hash table
59024 + *
59025 + * Initializes the hash table of plugin_set-s and their slab cache as part of
59026 + * reiser4 module initialization. Returns 0 on success, an error otherwise.
59027 + */
59028 +int init_plugin_set(void)
59029 +{
59030 +       int result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
59031 +
59032 +       if (result != 0)
59033 +               return result;
59034 +       plugin_set_slab = kmem_cache_create("plugin_set", sizeof(plugin_set),
59035 +                                           0, SLAB_HWCACHE_ALIGN, NULL);
59036 +       if (plugin_set_slab == NULL) {
59037 +               /* do not leak the hash table when the slab cannot be created */
59038 +               ps_hash_done(&ps_table);
59039 +               return RETERR(-ENOMEM);
59040 +       }
59041 +       return 0;
59042 +}
59043 +
59044 +/**
59045 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
59046 + *
59047 + * Frees every plugin_set still hashed, then destroys the slab cache and the
59048 + * hash table. This is called on reiser4 module unloading or system shutdown.
59049 + */
59050 +void done_plugin_set(void)
59051 +{
59052 +       plugin_set *cur, *next;
59053 +
59054 +       for_all_in_htable(&ps_table, ps, cur, next) {
59055 +               ps_hash_remove(&ps_table, cur);
59056 +               kmem_cache_free(plugin_set_slab, cur);
59057 +       }
59058 +       destroy_reiser4_cache(&plugin_set_slab);
59059 +       ps_hash_done(&ps_table);
59060 +}
59060 +
59061 +/*
59062 + * Local variables:
59063 + * c-indentation-style: "K&R"
59064 + * mode-name: "LC"
59065 + * c-basic-offset: 8
59066 + * tab-width: 8
59067 + * fill-column: 120
59068 + * End:
59069 + */
59070 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.35/fs/reiser4/plugin/plugin_set.h
59071 --- linux-2.6.35.orig/fs/reiser4/plugin/plugin_set.h    1970-01-01 01:00:00.000000000 +0100
59072 +++ linux-2.6.35/fs/reiser4/plugin/plugin_set.h 2010-08-04 15:44:57.000000000 +0200
59073 @@ -0,0 +1,78 @@
59074 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59075 + * reiser4/README */
59076 +
59077 +/* Reiser4 plugin set definition.
59078 +   See fs/reiser4/plugin/plugin_set.c for details */
59079 +
59080 +#if !defined(__PLUGIN_SET_H__)
59081 +#define __PLUGIN_SET_H__
59082 +
59083 +#include "../type_safe_hash.h"
59084 +#include "plugin.h"
59085 +
59086 +#include <linux/rcupdate.h>
59087 +
59088 +struct plugin_set;
59089 +typedef struct plugin_set plugin_set;
59090 +
59091 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
59092 +
59093 +struct plugin_set {
59094 +       unsigned long hashval;  /* key of this set in the pset hash table */
59095 +       /* plugin of file */
59096 +       file_plugin *file;
59097 +       /* plugin of dir */
59098 +       dir_plugin *dir;
59099 +       /* perm plugin for this file */
59100 +       perm_plugin *perm;
59101 +       /* tail policy plugin. Only meaningful for regular files */
59102 +       formatting_plugin *formatting;
59103 +       /* hash plugin. Only meaningful for directories. */
59104 +       hash_plugin *hash;
59105 +       /* fibration plugin. Only meaningful for directories. */
59106 +       fibration_plugin *fibration;
59107 +       /* plugin of stat-data */
59108 +       item_plugin *sd;
59109 +       /* plugin of items a directory is built of */
59110 +       item_plugin *dir_item;
59111 +       /* cipher plugin */
59112 +       cipher_plugin *cipher;
59113 +       /* digest plugin */
59114 +       digest_plugin *digest;
59115 +       /* compression plugin */
59116 +       compression_plugin *compression;
59117 +       /* compression mode plugin */
59118 +       compression_mode_plugin *compression_mode;
59119 +       /* cluster plugin */
59120 +       cluster_plugin *cluster;
59121 +       /* this specifies file plugin of regular children.
59122 +          only meaningful for directories */
59123 +       file_plugin *create;
59124 +       ps_hash_link link;      /* linkage into the pset hash table */
59125 +};
59126 +
59127 +extern plugin_set *plugin_set_get_empty(void);
59128 +extern void plugin_set_put(plugin_set * set);
59129 +
59130 +extern int init_plugin_set(void);
59131 +extern void done_plugin_set(void);
59132 +
59133 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
59134 +extern int set_plugin(plugin_set ** set, pset_member memb,
59135 +                     reiser4_plugin * plugin);
59136 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
59137 +                          reiser4_plugin * plugin);
59138 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
59139 +
59140 +/* __PLUGIN_SET_H__ */
59141 +#endif
59142 +
59143 +/* Make Linus happy.
59144 +   Local variables:
59145 +   c-indentation-style: "K&R"
59146 +   mode-name: "LC"
59147 +   c-basic-offset: 8
59148 +   tab-width: 8
59149 +   fill-column: 120
59150 +   End:
59151 +*/
59152 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/security/Makefile linux-2.6.35/fs/reiser4/plugin/security/Makefile
59153 --- linux-2.6.35.orig/fs/reiser4/plugin/security/Makefile       1970-01-01 01:00:00.000000000 +0100
59154 +++ linux-2.6.35/fs/reiser4/plugin/security/Makefile    2010-08-04 15:44:57.000000000 +0200
59155 @@ -0,0 +1,4 @@
59156 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
59157 +
59158 +security_plugins-objs :=       \
59159 +       perm.o
59160 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/security/perm.c linux-2.6.35/fs/reiser4/plugin/security/perm.c
59161 --- linux-2.6.35.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 01:00:00.000000000 +0100
59162 +++ linux-2.6.35/fs/reiser4/plugin/security/perm.c      2010-08-04 15:44:57.000000000 +0200
59163 @@ -0,0 +1,33 @@
59164 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59165 +
59166 +/*
59167 + * This file contains implementation of permission plugins.
59168 + * See the comments in perm.h
59169 + */
59170 +
59171 +#include "../plugin.h"
59172 +#include "../plugin_header.h"
59173 +#include "../../debug.h"
59174 +
59175 +perm_plugin perm_plugins[LAST_PERM_ID] = {     /* indexed by reiser4_perm_id */
59176 +       [NULL_PERM_ID] = {
59177 +               .h = {
59178 +                       .type_id = REISER4_PERM_PLUGIN_TYPE,
59179 +                       .id = NULL_PERM_ID,
59180 +                       .pops = NULL,
59181 +                       .label = "null",
59182 +                       .desc = "stub permission plugin",
59183 +                       .linkage = {NULL, NULL}
59184 +               }
59185 +       }
59186 +};
59187 +
59188 +/*
59189 + * Local variables:
59190 + * c-indentation-style: "K&R"
59191 + * mode-name: "LC"
59192 + * c-basic-offset: 8
59193 + * tab-width: 8
59194 + * fill-column: 79
59195 + * End:
59196 + */
59197 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/security/perm.h linux-2.6.35/fs/reiser4/plugin/security/perm.h
59198 --- linux-2.6.35.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 01:00:00.000000000 +0100
59199 +++ linux-2.6.35/fs/reiser4/plugin/security/perm.h      2010-08-04 15:44:57.000000000 +0200
59200 @@ -0,0 +1,38 @@
59201 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59202 +
59203 +/* Perm (short for "permissions") plugins common stuff. */
59204 +
59205 +#if !defined( __REISER4_PERM_H__ )
59206 +#define __REISER4_PERM_H__
59207 +
59208 +#include "../../forward.h"
59209 +#include "../plugin_header.h"
59210 +
59211 +#include <linux/types.h>
59212 +
59213 +/* Definition of permission plugin */
59214 +/* NIKITA-FIXME-HANS: define what this is targeted for.
59215 +   It does not seem to be intended for use with sys_reiser4.  Explain. */
59216 +
59217 +/* NOTE-EDWARD: This seems to be intended for the deprecated sys_reiser4.
59218 +   Consider it a temporary "seam" and a reserved pset member.
59219 +   If you have something useful to add, then rename this plugin and add it here */
59220 +typedef struct perm_plugin {
59221 +       /* generic plugin fields; currently the only content of a perm plugin */
59222 +       plugin_header h;
59223 +} perm_plugin;
59224 +
59225 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id; /* LAST_PERM_ID sizes perm_plugins[] */
59226 +
59227 +/* __REISER4_PERM_H__ */
59228 +#endif
59229 +
59230 +/* Make Linus happy.
59231 +   Local variables:
59232 +   c-indentation-style: "K&R"
59233 +   mode-name: "LC"
59234 +   c-basic-offset: 8
59235 +   tab-width: 8
59236 +   fill-column: 120
59237 +   End:
59238 +*/
59239 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.35/fs/reiser4/plugin/space/bitmap.c
59240 --- linux-2.6.35.orig/fs/reiser4/plugin/space/bitmap.c  1970-01-01 01:00:00.000000000 +0100
59241 +++ linux-2.6.35/fs/reiser4/plugin/space/bitmap.c       2010-08-04 15:44:57.000000000 +0200
59242 @@ -0,0 +1,1585 @@
59243 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
59244 +
59245 +#include "../../debug.h"
59246 +#include "../../dformat.h"
59247 +#include "../../txnmgr.h"
59248 +#include "../../jnode.h"
59249 +#include "../../block_alloc.h"
59250 +#include "../../tree.h"
59251 +#include "../../super.h"
59252 +#include "../plugin.h"
59253 +#include "space_allocator.h"
59254 +#include "bitmap.h"
59255 +
59256 +#include <linux/types.h>
59257 +#include <linux/fs.h>          /* for struct super_block  */
59258 +#include <linux/mutex.h>
59259 +#include <asm/div64.h>
59260 +
59261 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
59262 + * blocks
59263 +
59264 +   A useful optimization of reiser4 bitmap handling would be dynamic bitmap
59265 +   blocks loading/unloading which is different from v3.x where all bitmap
59266 +   blocks are loaded at mount time.
59267 +
59268 +   To implement bitmap blocks unloading we need to count bitmap block usage
59269 +   and detect currently unused blocks allowing them to be unloaded. It is not
59270 +   a simple task since we allow several threads to modify one bitmap block
59271 +   simultaneously.
59272 +
59273 +   Briefly speaking, the following schema is proposed: we count in special
59274 +   variable associated with each bitmap block. That is for counting of block
59275 +   alloc/dealloc operations on that bitmap block. With a deferred block
59276 +   deallocation feature of reiser4 all those operation will be represented in
59277 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
59278 +   nodes.
59279 +
59280 +   So, we increment usage counter for each new node allocated or deleted, and
59281 +   decrement it at atom commit one time for each node from the dirty/deleted
59282 +   atom's list.  Of course, freshly allocated node deletion and node reusing
59283 +   from atom deleted (if we do so) list should decrement bitmap usage counter
59284 +   also.
59285 +
59286 +   This schema seems to be working but that reference counting is
59287 +   not easy to debug. I think we should agree with Hans and do not implement
59288 +   it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
59289 +
59290 +   For simplicity, bitmap nodes (both commit and working bitmap blocks) are
59291 +   either all loaded into memory at fs mount time, or each one is loaded on
59292 +   first access to it; the "dont_load_bitmap" mount option controls whether
59293 +   bitmap nodes are loaded at mount time. Dynamic unloading of bitmap nodes
59294 +   is currently not supported. */
59295 +
59296 +#define CHECKSUM_SIZE    4     /* bytes at the start of a bitmap block holding its checksum */
59297 +
59298 +#define BYTES_PER_LONG   (sizeof(long))
59299 +
59300 +#if BITS_PER_LONG == 64
59301 +#  define LONG_INT_SHIFT (6)   /* log2(BITS_PER_LONG) */
59302 +#else
59303 +#  define LONG_INT_SHIFT (5)   /* log2(32); presumably BITS_PER_LONG == 32 here */
59304 +#endif
59305 +
59306 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)    /* masks a bit index within one long */
59307 +
59308 +typedef unsigned long ulong_t;
59309 +
59310 +#define bmap_size(blocksize)       ((blocksize) - CHECKSUM_SIZE)       /* block bytes left after checksum */
59311 +#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)        /* the same, in bits */
59312 +
59313 +/* Block allocation/deallocation is done through these bitmap node objects,
59314 +   which are kept in an array allocated at fs mount. */
59315 +struct bitmap_node {
59316 +       struct mutex mutex;     /* long term lock object */
59317 +
59318 +       jnode *wjnode;          /* j-nodes for WORKING ... */
59319 +       jnode *cjnode;          /* ... and COMMIT bitmap blocks */
59320 +
59321 +       bmap_off_t first_zero_bit;      /* for skip_busy option implementation */
59322 +
59323 +       atomic_t loaded;        /* a flag which shows that bnode is loaded
59324 +                                * already */
59325 +};
59326 +
59327 +/* bitmap payload of the WORKING block, i.e. its data past the checksum field */
59328 +static inline char *bnode_working_data(struct bitmap_node *bnode)
59329 +{
59330 +       char *data = jdata(bnode->wjnode);
59331 +
59332 +       assert("zam-429", data != NULL);
59333 +
59334 +       return &data[CHECKSUM_SIZE];
59335 +}
59336 +
59337 +/* bitmap payload of the COMMIT block, i.e. its data past the checksum field */
59338 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59339 +{
59340 +       char *data = jdata(bnode->cjnode);
59341 +
59342 +       assert("zam-430", data != NULL);
59343 +
59344 +       return &data[CHECKSUM_SIZE];
59345 +}
59346 +
59347 +/* read the little-endian 32-bit value stored at the start of the COMMIT block */
59348 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59349 +{
59350 +       char *data = jdata(bnode->cjnode);
59351 +
59352 +       assert("vpf-261", data != NULL);
59353 +
59354 +       return le32_to_cpu(get_unaligned((d32 *)data));
59355 +}
59356 +
59357 +/* store @crc, little-endian, at the start of the COMMIT block */
59358 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59359 +{
59360 +       char *data = jdata(bnode->cjnode);
59361 +
59362 +       assert("vpf-261", data != NULL);
59363 +
59364 +       put_unaligned(cpu_to_le32(crc), (d32 *)data);
59365 +}
59366 +
59367 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59368 + * written the code, does this added abstraction still have */
59369 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
59370 + * reiser4_space_allocator structure) */
59371 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
59372 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
59373 + * someday?". What they about?  If there is a reason to have a union, it should
59374 + * be a union, if not, it should not be a union.  "..might be someday" means no
59375 + * reason. */
59376 +struct bitmap_allocator_data {
59377 +       /* array of bitmap nodes for direct access; see get_bnode(super, i) */
59378 +       struct bitmap_node *bitmap;
59379 +};
59380 +
59381 +#define get_barray(super) \
59382 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59383 +
59384 +#define get_bnode(super, i) (get_barray(super) + (i))  /* address of the i-th bitmap node */
59385 +
59386 +/* allocate a jnode and set it up as JNODE_BITMAP; NULL if jalloc() fails */
59387 +static jnode *bnew(void)
59388 +{
59389 +       jnode *bitmap_jnode;
59390 +
59391 +       bitmap_jnode = jalloc();
59392 +       if (bitmap_jnode != NULL)
59393 +               jnode_init(bitmap_jnode, current_tree, JNODE_BITMAP);
59394 +       return bitmap_jnode;
59395 +}
59396 +
59397 +/* this file contains:
59398 +   - bitmap based implementation of space allocation plugin
59399 +   - all the helper functions like set bit, find_first_zero_bit, etc */
59400 +
59401 +/* Audited by: green(2002.06.12) */
59402 +/* index of the first clear bit of @word at or above @start_bit, or
59403 + * BITS_PER_LONG if every bit from @start_bit up is set */
59404 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59405 +{
59406 +       ulong_t probe = 1UL << start_bit;
59407 +       int pos;
59408 +
59409 +       for (pos = start_bit; (word & probe) != 0; probe <<= 1)
59410 +               if (++pos >= BITS_PER_LONG)
59411 +                       break;
59412 +
59413 +       return pos;
59414 +}
59415 +
59416 +#include <linux/bitops.h>
59417 +
59418 +#if BITS_PER_LONG == 64
59419 +
59420 +#define OFF(addr)  (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)     /* bits from BASE(addr) to addr */
59421 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))      /* addr rounded down to a long boundary */
59422 +
59423 +static inline void reiser4_set_bit(int nr, void *addr)
59424 +{      /* set bit @nr counted from @addr, addressed via the long-aligned base */
59425 +       ext2_set_bit(nr + OFF(addr), BASE(addr));
59426 +}
59427 +
59428 +static inline void reiser4_clear_bit(int nr, void *addr)
59429 +{      /* clear bit @nr counted from @addr, addressed via the long-aligned base */
59430 +       ext2_clear_bit(nr + OFF(addr), BASE(addr));
59431 +}
59432 +
59433 +static inline int reiser4_test_bit(int nr, void *addr)
59434 +{
59435 +       return ext2_test_bit(nr + OFF(addr), BASE(addr));
59436 +}
59437 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59438 +                                            int offset)
59439 +{
59440 +       int off = OFF(addr);
59441 +
59442 +       return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59443 +                                      offset + off) - off;
59444 +}
59445 +
59446 +#else
59447 +
59448 +#define reiser4_set_bit(nr, addr)    ext2_set_bit(nr, addr)
59449 +#define reiser4_clear_bit(nr, addr)  ext2_clear_bit(nr, addr)
59450 +#define reiser4_test_bit(nr, addr)  ext2_test_bit(nr, addr)
59451 +
59452 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59453 +ext2_find_next_zero_bit(addr, maxoffset, offset)
59454 +#endif
59455 +
59456 +/* Search for a set bit in the bit array [@start_offset, @max_offset), offsets
59457 + * are counted from @addr. Returns the offset of the first set bit found (may
59458 + * exceed @max_offset within the last word; clamp it), else @max_offset. */
59459 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59460 +                                             bmap_off_t start_offset)
59461 +{
59462 +       ulong_t *base = addr;
59463 +       /* start_offset is in bits, convert it to word index within bitmap. */
59464 +       int word_nr = start_offset >> LONG_INT_SHIFT;
59465 +       /* bit number within the word. */
59466 +       int bit_nr = start_offset & LONG_INT_MASK;
59467 +       int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59468 +
59469 +       assert("zam-387", max_offset != 0);
59470 +
59471 +       /* Unaligned @start_offset case.  */
59472 +       if (bit_nr != 0) {
59473 +               bmap_nr_t nr;
59474 +
59475 +               nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59476 +
59477 +               if (nr < BITS_PER_LONG)
59478 +                       return (word_nr << LONG_INT_SHIFT) + nr;
59479 +
59480 +               ++word_nr;
59481 +       }
59482 +
59483 +       /* Fast scan through aligned words. */
59484 +       while (word_nr <= max_word_nr) {
59485 +               if (base[word_nr] != 0) {
59486 +                       return (word_nr << LONG_INT_SHIFT)
59487 +                           + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59488 +               }
59489 +
59490 +               ++word_nr;
59491 +       }
59492 +
59493 +       return max_offset;
59494 +}
59495 +
59496 +#if BITS_PER_LONG == 64
59497 +
59498 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59499 +                                           bmap_off_t start_offset)
59500 +{
59501 +       bmap_off_t off = OFF(addr);
59502 +
59503 +       return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59504 +                                          start_offset + off) - off;
59505 +}
59506 +
59507 +#else
59508 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59509 +  __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59510 +#endif
59511 +
59512 +/* Highest set bit at or below @start_bit in @word; BITS_PER_LONG if none. */
59513 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59514 +{
59515 +       ulong_t bit_mask;
59516 +       int nr = start_bit;
59517 +
59518 +       assert("zam-965", start_bit < BITS_PER_LONG);
59519 +       assert("zam-966", start_bit >= 0);
59520 +
59521 +       bit_mask = (1UL << nr);
59522 +
59523 +       while (bit_mask != 0) {
59524 +               if (bit_mask & word)
59525 +                       return nr;
59526 +               bit_mask >>= 1;
59527 +               nr--;
59528 +       }
59529 +       return BITS_PER_LONG;
59530 +}
59531 +
59532 +/* Search bitmap for a set bit in backward direction from the end to the
59533 + * beginning of given region
59534 + *
59535 + * @result: result offset of the last set bit
59536 + * @addr:   base memory address,
59537 + * @low_off:  low end of the search region, edge bit included into the region,
59538 + * @high_off: high end of the search region, edge bit included into the region,
59539 + *
59540 + * @return: 0 - set bit was found, -1 otherwise.
59541 + */
59542 +static int
59543 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59544 +                         bmap_off_t high_off)
59545 +{
59546 +       ulong_t *base = addr;
59547 +       int last_word;
59548 +       int first_word;
59549 +       int last_bit;
59550 +       int nr;
59551 +
59552 +       assert("zam-962", high_off >= low_off);
59553 +
59554 +       last_word = high_off >> LONG_INT_SHIFT;
59555 +       last_bit = high_off & LONG_INT_MASK;
59556 +       first_word = low_off >> LONG_INT_SHIFT;
59557 +
59558 +       if (last_bit < BITS_PER_LONG) {
59559 +               nr = find_last_set_bit_in_word(base[last_word], last_bit);
59560 +               if (nr < BITS_PER_LONG) {
59561 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
59562 +                       return 0;
59563 +               }
59564 +               --last_word;
59565 +       }
59566 +       while (last_word >= first_word) {
59567 +               if (base[last_word] != 0x0) {
59568 +                       last_bit =
59569 +                           find_last_set_bit_in_word(base[last_word],
59570 +                                                     BITS_PER_LONG - 1);
59571 +                       assert("zam-972", last_bit < BITS_PER_LONG);
59572 +                       *result = (last_word << LONG_INT_SHIFT) + last_bit;
59573 +                       return 0;
59574 +               }
59575 +               --last_word;
59576 +       }
59577 +
59578 +       return -1;              /* set bit not found */
59579 +}
59580 +
59581 +/* Search bitmap for a clear bit in backward direction from the end to the
59582 + * beginning of given region */
59583 +static int
59584 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59585 +                          bmap_off_t high_off)
59586 +{
59587 +       ulong_t *base = addr;
59588 +       int last_word;
59589 +       int first_word;
59590 +       int last_bit;
59591 +       int nr;
59592 +
59593 +       last_word = high_off >> LONG_INT_SHIFT;
59594 +       last_bit = high_off & LONG_INT_MASK;
59595 +       first_word = low_off >> LONG_INT_SHIFT;
59596 +
59597 +       if (last_bit < BITS_PER_LONG) {
59598 +               nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59599 +               if (nr < BITS_PER_LONG) {
59600 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
59601 +                       return 0;
59602 +               }
59603 +               --last_word;
59604 +       }
59605 +       while (last_word >= first_word) {
59606 +               if (base[last_word] != (ulong_t) (-1)) {
59607 +                       *result = (last_word << LONG_INT_SHIFT) +
59608 +                           find_last_set_bit_in_word(~base[last_word],
59609 +                                                     BITS_PER_LONG - 1);
59610 +                       return 0;
59611 +               }
59612 +               --last_word;
59613 +       }
59614 +
59615 +       return -1;              /* zero bit not found */
59616 +}
59617 +
59618 +/* Audited by: green(2002.06.12) */
59619 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59620 +{
59621 +       int first_byte;
59622 +       int last_byte;
59623 +
59624 +       unsigned char first_byte_mask = 0xFF;
59625 +       unsigned char last_byte_mask = 0xFF;
59626 +
59627 +       assert("zam-410", start < end);
59628 +
59629 +       first_byte = start >> 3;
59630 +       last_byte = (end - 1) >> 3;
59631 +
59632 +       if (last_byte > first_byte + 1)
59633 +               memset(addr + first_byte + 1, 0,
59634 +                      (size_t) (last_byte - first_byte - 1));
59635 +
59636 +       first_byte_mask >>= 8 - (start & 0x7);
59637 +       last_byte_mask <<= ((end - 1) & 0x7) + 1;
59638 +
59639 +       if (first_byte == last_byte) {
59640 +               addr[first_byte] &= (first_byte_mask | last_byte_mask);
59641 +       } else {
59642 +               addr[first_byte] &= first_byte_mask;
59643 +               addr[last_byte] &= last_byte_mask;
59644 +       }
59645 +}
59646 +
59647 +/* Audited by: green(2002.06.12) */
59648 +/* ZAM-FIXME-HANS: comment this */
59649 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59650 +{
59651 +       int first_byte;
59652 +       int last_byte;
59653 +
59654 +       unsigned char first_byte_mask = 0xFF;
59655 +       unsigned char last_byte_mask = 0xFF;
59656 +
59657 +       assert("zam-386", start < end);
59658 +
59659 +       first_byte = start >> 3;
59660 +       last_byte = (end - 1) >> 3;
59661 +
59662 +       if (last_byte > first_byte + 1)
59663 +               memset(addr + first_byte + 1, 0xFF,
59664 +                      (size_t) (last_byte - first_byte - 1));
59665 +
59666 +       first_byte_mask <<= start & 0x7;
59667 +       last_byte_mask >>= 7 - ((end - 1) & 0x7);
59668 +
59669 +       if (first_byte == last_byte) {
59670 +               addr[first_byte] |= (first_byte_mask & last_byte_mask);
59671 +       } else {
59672 +               addr[first_byte] |= first_byte_mask;
59673 +               addr[last_byte] |= last_byte_mask;
59674 +       }
59675 +}
59676 +
59677 +#define ADLER_BASE    65521
59678 +#define ADLER_NMAX    5552
59679 +
59680 +/* Calculates the adler32 checksum for the data pointed by `data` of the
59681 +    length `len`. This function was originally taken from zlib, version 1.1.3,
59682 +    July 9th, 1998.
59683 +
59684 +    Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59685 +
59686 +    This software is provided 'as-is', without any express or implied
59687 +    warranty.  In no event will the authors be held liable for any damages
59688 +    arising from the use of this software.
59689 +
59690 +    Permission is granted to anyone to use this software for any purpose,
59691 +    including commercial applications, and to alter it and redistribute it
59692 +    freely, subject to the following restrictions:
59693 +
59694 +    1. The origin of this software must not be misrepresented; you must not
59695 +       claim that you wrote the original software. If you use this software
59696 +       in a product, an acknowledgment in the product documentation would be
59697 +       appreciated but is not required.
59698 +    2. Altered source versions must be plainly marked as such, and must not be
59699 +       misrepresented as being the original software.
59700 +    3. This notice may not be removed or altered from any source distribution.
59701 +
59702 +    Jean-loup Gailly        Mark Adler
59703 +    jloup@gzip.org          madler@alumni.caltech.edu
59704 +
59705 +    The above comment applies only to the reiser4_adler32 function.
59706 +*/
59707 +
59708 +__u32 reiser4_adler32(char *data, __u32 len)
59709 +{
59710 +       unsigned char *t = data;
59711 +       __u32 s1 = 1;
59712 +       __u32 s2 = 0;
59713 +       int k;
59714 +
59715 +       while (len > 0) {
59716 +               k = len < ADLER_NMAX ? len : ADLER_NMAX;
59717 +               len -= k;
59718 +
59719 +               while (k--) {
59720 +                       s1 += *t++;
59721 +                       s2 += s1;
59722 +               }
59723 +
59724 +               s1 %= ADLER_BASE;
59725 +               s2 %= ADLER_BASE;
59726 +       }
59727 +       return (s2 << 16) | s1;
59728 +}
59729 +
59730 +#define sb_by_bnode(bnode) \
59731 +       ((struct super_block *)jnode_get_tree((bnode)->wjnode)->super)
59732 +
59733 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59734 +{
59735 +       return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59736 +}
59737 +
59738 +static int
59739 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59740 +{
59741 +       if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59742 +               bmap_nr_t bmap;
59743 +
59744 +               bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59745 +
59746 +               warning("vpf-263",
59747 +                       "Checksum for the bitmap block %llu is incorrect",
59748 +                       bmap);
59749 +
59750 +               return RETERR(-EIO);
59751 +       }
59752 +
59753 +       return 0;
59754 +}
59755 +
59756 +#define REISER4_CHECK_BMAP_CRC (0)
59757 +
59758 +#if REISER4_CHECK_BMAP_CRC
59759 +static int bnode_check_crc(const struct bitmap_node *bnode)
59760 +{
59761 +       /* raw block size: bnode_calc_crc() applies bmap_size() itself */
59762 +       return bnode_check_adler32(bnode, sb_by_bnode(bnode)->s_blocksize);
59763 +}
59764 +
59765 +/* REISER4_CHECK_BMAP_CRC */
59766 +#else
59767 +
59768 +#define bnode_check_crc(bnode) (0)
59769 +
59770 +/* REISER4_CHECK_BMAP_CRC */
59771 +#endif
59772 +
59773 +/* Recalculates the adler32 checksum for only 1 byte change.
59774 +    adler - previous adler checksum
59775 +    old_data, data - old, new byte values.
59776 +    tail == (chunk - offset) : length, checksum was calculated for, - offset of
59777 +    the changed byte within this chunk.
59778 +    This function can be used for checksum calculation optimisation.
59779 +*/
59780 +
59781 +static __u32
59782 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59783 +              __u32 tail)
59784 +{
59785 +       __u32 delta = data - old_data + 2 * ADLER_BASE;
59786 +       __u32 s1 = adler & 0xffff;
59787 +       __u32 s2 = (adler >> 16) & 0xffff;
59788 +
59789 +       s1 = (delta + s1) % ADLER_BASE;
59790 +       s2 = (delta * tail + s2) % ADLER_BASE;
59791 +
59792 +       return (s2 << 16) | s1;
59793 +}
59794 +
59795 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
59796 +
59797 +/**
59798 + * get_nr_bmap - calculate number of bitmap blocks
59799 + * @super: super block with initialized blocksize and block count
59800 + *
59801 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
59802 + * maintain free disk space. It assumes that each bitmap addresses the same
59803 + * number of blocks which is calculated by the bmap_bit_count macro defined
59804 + * above. Number of blocks in the filesystem has to be initialized in reiser4
59805 + * private data of super block already so that it can be obtained via
59806 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
59807 + * is not power of 2 because 4 bytes are used for checksum. Therefore, we have
59808 + * to use special function to divide and modulo 64bits filesystem block
59809 + * counters.
59810 + *
59811 + * Example: suppose filesystem have 32768 blocks. Blocksize is 4096. Each bitmap
59812 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
59813 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
59814 + */
59815 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
59816 +{
59817 +       u64 quotient;
59818 +
59819 +       assert("zam-393", reiser4_block_count(super) != 0);
59820 +
59821 +       quotient = reiser4_block_count(super) - 1;
59822 +       do_div(quotient, bmap_bit_count(super->s_blocksize));
59823 +       return quotient + 1;
59824 +}
59825 +
59826 +/**
59827 + * parse_blocknr - calculate bitmap number and offset in it by block number
59828 + * @block: pointer to block number to calculate location in bitmap of
59829 + * @bmap: pointer where to store bitmap block number
59830 + * @offset: pointer where to store offset within bitmap block
59831 + *
59832 + * Calculates location of bit which is responsible for allocation/freeing of
59833 + * block @*block. That location is represented by bitmap block number and offset
59834 + * within that bitmap block.
59835 + */
59836 +static void
59837 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59838 +             bmap_off_t *offset)
59839 +{
59840 +       struct super_block *super = get_current_context()->super;
59841 +       u64 quotient = *block;
59842 +
59843 +       *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59844 +       *bmap = quotient;
59845 +
59846 +       assert("zam-433", *bmap < get_nr_bmap(super));
59847 +       assert("", *offset < bmap_bit_count(super->s_blocksize));
59848 +}
59849 +
59850 +#if REISER4_DEBUG
59851 +/* Audited by: green(2002.06.12) */
59852 +static void
59853 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59854 +{
59855 +       struct super_block *sb = reiser4_get_current_sb();
59856 +
59857 +       assert("zam-436", sb != NULL);
59858 +
59859 +       assert("zam-455", start != NULL);
59860 +       assert("zam-437", *start != 0);
59861 +       assert("zam-541", !reiser4_blocknr_is_fake(start));
59862 +       assert("zam-441", *start < reiser4_block_count(sb));
59863 +
59864 +       if (len != NULL) {
59865 +               assert("zam-438", *len != 0);
59866 +               assert("zam-442", *start + *len <= reiser4_block_count(sb));
59867 +       }
59868 +}
59869 +
59870 +static void check_bnode_loaded(const struct bitmap_node *bnode)
59871 +{
59872 +       assert("zam-485", bnode != NULL);
59873 +       assert("zam-483", jnode_page(bnode->wjnode) != NULL);
59874 +       assert("zam-484", jnode_page(bnode->cjnode) != NULL);
59875 +       assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
59876 +       assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
59877 +}
59878 +
59879 +#else
59880 +
59881 +#  define check_block_range(start, len) do { /* nothing */} while(0)
59882 +#  define check_bnode_loaded(bnode)     do { /* nothing */} while(0)
59883 +
59884 +#endif
59885 +
59886 +/* modify bnode->first_zero_bit (if we free bits before); bnode should be
59887 +   spin-locked */
59888 +static inline void
59889 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
59890 +{
59891 +       if (offset < bnode->first_zero_bit)
59892 +               bnode->first_zero_bit = offset;
59893 +}
59894 +
59895 +/* return a physical disk address for logical bitmap number @bmap */
59896 +/* FIXME-VS: this is somehow related to disk layout? */
59897 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
59898 + * per block allocation so that performance is not affected.  Probably this
59899 + * whole file should be considered part of the disk layout plugin, and other
59900 + * disk layouts can use other defines and efficiency will not be significantly
59901 + * affected.  */
59902 +
59903 +#define REISER4_FIRST_BITMAP_BLOCK \
59904 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
59905 +
59906 +/* Audited by: green(2002.06.12) */
59907 +static void
59908 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
59909 +                  reiser4_block_nr * bnr)
59910 +{
59911 +
59912 +       assert("zam-390", bmap < get_nr_bmap(super));
59913 +
59914 +#ifdef CONFIG_REISER4_BADBLOCKS
59915 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
59916 +       /* Check if the diskmap have this already, first. */
59917 +       if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
59918 +               return;         /* Found it in diskmap */
59919 +#endif
59920 +       /* FIXME_ZAM: before discussing of disk layouts and disk format
59921 +          plugins I implement bitmap location scheme which is close to scheme
59922 +          used in reiser 3.6 */
59923 +       if (bmap == 0) {
59924 +               *bnr = REISER4_FIRST_BITMAP_BLOCK;
59925 +       } else {
59926 +               *bnr = bmap * bmap_bit_count(super->s_blocksize);
59927 +       }
59928 +}
59929 +
59930 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
59931 +/* Audited by: green(2002.06.12) */
59932 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
59933 +{
59934 +       *bnr =
59935 +           (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
59936 +                               REISER4_BITMAP_BLOCKS_STATUS_VALUE);
59937 +}
59938 +
59939 +/* bnode structure initialization */
59940 +static void
59941 +init_bnode(struct bitmap_node *bnode,
59942 +          struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
59943 +{
59944 +       memset(bnode, 0, sizeof(struct bitmap_node));
59945 +
59946 +       mutex_init(&bnode->mutex);
59947 +       atomic_set(&bnode->loaded, 0);
59948 +}
59949 +
59950 +static void release(jnode * node)
59951 +{
59952 +       jrelse(node);
59953 +       JF_SET(node, JNODE_HEARD_BANSHEE);
59954 +       jput(node);
59955 +}
59956 +
59957 +/* This function is for internal bitmap.c use because it assumes that jnode is
59958 +   under full control of this thread */
59959 +static void done_bnode(struct bitmap_node *bnode)
59960 +{
59961 +       if (bnode) {
59962 +               atomic_set(&bnode->loaded, 0);
59963 +               if (bnode->wjnode != NULL)
59964 +                       release(bnode->wjnode);
59965 +               if (bnode->cjnode != NULL)
59966 +                       release(bnode->cjnode);
59967 +               bnode->wjnode = bnode->cjnode = NULL;
59968 +       }
59969 +}
59970 +
59971 +/* ZAM-FIXME-HANS: comment this.  Called only by load_and_lock_bnode()*/
59972 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
59973 +                        jnode **wjnode_ret)
59974 +{
59975 +       struct super_block *super;
59976 +       jnode *cjnode;
59977 +       jnode *wjnode;
59978 +       bmap_nr_t bmap;
59979 +       int ret;
59980 +
59981 +       super = reiser4_get_current_sb();
59982 +
59983 +       *wjnode_ret = wjnode = bnew();
59984 +       if (wjnode == NULL) {
59985 +               *cjnode_ret = NULL;
59986 +               return RETERR(-ENOMEM);
59987 +       }
59988 +
59989 +       *cjnode_ret = cjnode = bnew();
59990 +       if (cjnode == NULL)
59991 +               return RETERR(-ENOMEM);
59992 +
59993 +       bmap = bnode - get_bnode(super, 0);
59994 +
59995 +       get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
59996 +       get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
59997 +
59998 +       jref(cjnode);
59999 +       jref(wjnode);
60000 +
60001 +       /* load commit bitmap */
60002 +       ret = jload_gfp(cjnode, GFP_NOFS, 1);
60003 +
60004 +       if (ret)
60005 +               goto error;
60006 +
60007 +       /* allocate memory for working bitmap block. Note that for
60008 +        * bitmaps jinit_new() doesn't actually modify node content,
60009 +        * so parallel calls to this are ok. */
60010 +       ret = jinit_new(wjnode, GFP_NOFS);
60011 +
60012 +       if (ret != 0) {
60013 +               jrelse(cjnode);
60014 +               goto error;
60015 +       }
60016 +
60017 +       return 0;
60018 +
60019 +      error:
60020 +       jput(cjnode);
60021 +       jput(wjnode);
60022 +       *wjnode_ret = *cjnode_ret = NULL;
60023 +       return ret;
60024 +
60025 +}
60026 +
60027 +/* Check the bnode data on read. */
60028 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
60029 +{
60030 +       void *data;
60031 +       int ret;
60032 +
60033 +       /* Check CRC */
60034 +       ret = bnode_check_adler32(bnode, blksize);
60035 +
60036 +       if (ret) {
60037 +               return ret;
60038 +       }
60039 +
60040 +       data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
60041 +
60042 +       /* Check the very first bit -- it must be busy. */
60043 +       if (!reiser4_test_bit(0, data)) {
60044 +               warning("vpf-1362", "The allocator block %llu is not marked "
60045 +                       "as used.", (unsigned long long)bnode->cjnode->blocknr);
60046 +
60047 +               return -EINVAL;
60048 +       }
60049 +
60050 +       return 0;
60051 +}
60052 +
60053 +/* load bitmap blocks "on-demand"; on success (0) bnode->mutex is held */
60054 +static int load_and_lock_bnode(struct bitmap_node *bnode)
60055 +{
60056 +       int ret;
60057 +
60058 +       jnode *cjnode;
60059 +       jnode *wjnode;
60060 +
60061 +       assert("nikita-3040", reiser4_schedulable());
60062 +
60063 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
60064 + * need to be atomic, right? Just leave a comment that if bitmaps were
60065 + * unloadable, this would need to be atomic.  */
60066 +       if (atomic_read(&bnode->loaded)) {
60067 +               /* bitmap is already loaded, nothing to do */
60068 +               check_bnode_loaded(bnode);
60069 +               mutex_lock(&bnode->mutex);
60070 +               assert("nikita-2827", atomic_read(&bnode->loaded));
60071 +               return 0;
60072 +       }
60073 +
60074 +       ret = prepare_bnode(bnode, &cjnode, &wjnode);
60075 +       if (ret == 0) {
60076 +               mutex_lock(&bnode->mutex);
60077 +
60078 +               if (!atomic_read(&bnode->loaded)) {
60079 +                       assert("nikita-2822", cjnode != NULL);
60080 +                       assert("nikita-2823", wjnode != NULL);
60081 +                       assert("nikita-2824", jnode_is_loaded(cjnode));
60082 +                       assert("nikita-2825", jnode_is_loaded(wjnode));
60083 +
60084 +                       bnode->wjnode = wjnode;
60085 +                       bnode->cjnode = cjnode;
60086 +
60087 +                       ret = check_struct_bnode(bnode, current_blocksize);
60088 +                       if (!ret) {
60089 +                               cjnode = wjnode = NULL;
60090 +                               atomic_set(&bnode->loaded, 1);
60091 +                               /* working bitmap is initialized by on-disk
60092 +                                * commit bitmap. This should be performed
60093 +                                * under mutex. */
60094 +                               memcpy(bnode_working_data(bnode),
60095 +                                      bnode_commit_data(bnode),
60096 +                                      bmap_size(current_blocksize));
60097 +                       } else {
60098 +                               /* unpublish rejected jnodes under mutex */
60099 +                               bnode->wjnode = bnode->cjnode = NULL;
60100 +                               mutex_unlock(&bnode->mutex);
60101 +                       }
60102 +               } else
60103 +                       /* race: someone already loaded bitmap while we were
60104 +                        * busy initializing data. */
60105 +                       check_bnode_loaded(bnode);
60106 +       }
60107 +
60108 +       /* Drop jnodes that were not handed over to @bnode.  bnode->wjnode and
60109 +        * bnode->cjnode are not touched: a racing loader may own them. */
60110 +       if (wjnode != NULL)
60111 +               release(wjnode);
60112 +       if (cjnode != NULL)
60113 +               release(cjnode);
60114 +       return ret;
60115 +}
60116 +
60117 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
60118 +{
60119 +       check_bnode_loaded(bnode);
60120 +       mutex_unlock(&bnode->mutex);
60121 +}
60122 +
60123 +/* This function does all block allocation work but only for one bitmap
60124 +   block.*/
60125 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
60126 +   block responsibility zone boundaries. This had no sense in v3.6 but may
60127 +   have it in v4.x */
60128 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
60129 +static int
60130 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
60131 +                         bmap_off_t max_offset, int min_len, int max_len)
60132 +{
60133 +       struct super_block *super = get_current_context()->super;
60134 +       struct bitmap_node *bnode = get_bnode(super, bmap);
60135 +
60136 +       char *data;
60137 +
60138 +       bmap_off_t search_end;
60139 +       bmap_off_t start;
60140 +       bmap_off_t end;
60141 +
60142 +       int set_first_zero_bit = 0;
60143 +
60144 +       int ret;
60145 +
60146 +       assert("zam-364", min_len > 0);
60147 +       assert("zam-365", max_len >= min_len);
60148 +       assert("zam-366", *offset <= max_offset);
60149 +
60150 +       ret = load_and_lock_bnode(bnode);
60151 +
60152 +       if (ret)
60153 +               return ret;
60154 +
60155 +       data = bnode_working_data(bnode);
60156 +
60157 +       start = *offset;
60158 +
60159 +       if (bnode->first_zero_bit >= start) {
60160 +               start = bnode->first_zero_bit;
60161 +               set_first_zero_bit = 1;
60162 +       }
60163 +
60164 +       while (start + min_len < max_offset) {
60165 +
60166 +               start =
60167 +                   reiser4_find_next_zero_bit((long *)data, max_offset, start);
60168 +               if (set_first_zero_bit) {
60169 +                       bnode->first_zero_bit = start;
60170 +                       set_first_zero_bit = 0;
60171 +               }
60172 +               if (start >= max_offset)
60173 +                       break;
60174 +
60175 +               search_end = LIMIT(start + max_len, max_offset);
60176 +               end =
60177 +                   reiser4_find_next_set_bit((long *)data, search_end, start);
60178 +               if (end >= start + min_len) {
60179 +                       /* we can't trust find_next_set_bit result if set bit
60180 +                          was not found, result may be bigger than
60181 +                          max_offset */
60182 +                       if (end > search_end)
60183 +                               end = search_end;
60184 +
60185 +                       ret = end - start;
60186 +                       *offset = start;
60187 +
60188 +                       reiser4_set_bits(data, start, end);
60189 +
60190 +                       /* FIXME: we may advance first_zero_bit if [start,
60191 +                          end] region overlaps the first_zero_bit point */
60192 +
60193 +                       break;
60194 +               }
60195 +
60196 +               start = end + 1;
60197 +       }
60198 +
60199 +       release_and_unlock_bnode(bnode);
60200 +
60201 +       return ret;
60202 +}
60203 +
60204 +static int
60205 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
60206 +                          bmap_off_t end_offset, int min_len, int max_len)
60207 +{
60208 +       struct super_block *super = get_current_context()->super;
60209 +       struct bitmap_node *bnode = get_bnode(super, bmap);
60210 +       char *data;
60211 +       bmap_off_t start;
60212 +       int ret;
60213 +       /* look downward from *start_offset for min_len..max_len zero bits */
60214 +       assert("zam-958", min_len > 0);
60215 +       assert("zam-959", max_len >= min_len);
60216 +       assert("zam-960", *start_offset >= end_offset);
60217 +       /* a load failure is returned as is, without releasing the bnode */
60218 +       ret = load_and_lock_bnode(bnode);
60219 +       if (ret)
60220 +               return ret;
60221 +
60222 +       data = bnode_working_data(bnode);
60223 +       start = *start_offset;
60224 +
60225 +       while (1) {
60226 +               bmap_off_t end, search_end;
60227 +
60228 +               /* Find the beginning of the zero filled region */
60229 +               if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
60230 +                       break;
60231 +               /* Are there at least `min_len' bits from `start' down to
60232 +                * `end_offset'?  */
60233 +               if (start < end_offset + min_len - 1)
60234 +                       break;
60235 +
60236 +               /* Do not search all the way to `end_offset' when more than
60237 +                * `max_len' bits lie in between: `max_len' are enough. */
60238 +               if (end_offset + max_len - 1 < start)
60239 +                       search_end = start - max_len + 1;
60240 +               else
60241 +                       search_end = end_offset;
60242 +
60243 +               if (reiser4_find_last_set_bit(&end, data, search_end, start))
60244 +                       end = search_end;
60245 +               else
60246 +                       end++;
60247 +               /* accept the run if start - end + 1 >= min_len */
60248 +               if (end + min_len <= start + 1) {
60249 +                       if (end < search_end)
60250 +                               end = search_end;
60251 +                       ret = start - end + 1;
60252 +                       *start_offset = end;    /* `end' is lowest offset */
60253 +                       assert("zam-987",
60254 +                              reiser4_find_next_set_bit(data, start + 1,
60255 +                                                        end) >= start + 1);
60256 +                       reiser4_set_bits(data, end, start + 1);
60257 +                       break;
60258 +               }
60259 +
60260 +               if (end <= end_offset)
60261 +                       /* left search boundary reached. */
60262 +                       break;
60263 +               start = end - 1;
60264 +       }
60265 +       /* ret: run length, or 0 (left by the successful load) if none found */
60266 +       release_and_unlock_bnode(bnode);
60267 +       return ret;
60268 +}
60269 +
60270 +/* allocate contiguous range of blocks in bitmap (from @start up to @end) */
60271 +static int bitmap_alloc_forward(reiser4_block_nr * start,
60272 +                               const reiser4_block_nr * end, int min_len,
60273 +                               int max_len)
60274 +{
60275 +       bmap_nr_t bmap, end_bmap;
60276 +       bmap_off_t offset, end_offset;
60277 +       int len;
60278 +
60279 +       reiser4_block_nr tmp;
60280 +
60281 +       struct super_block *super = get_current_context()->super;
60282 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60283 +
60284 +       parse_blocknr(start, &bmap, &offset);
60285 +       /* @end is exclusive: locate block @end - 1, then step one bit past */
60286 +       tmp = *end - 1;
60287 +       parse_blocknr(&tmp, &end_bmap, &end_offset);
60288 +       ++end_offset;
60289 +
60290 +       assert("zam-358", end_bmap >= bmap);
60291 +       assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60292 +       /* bitmap blocks before the last one are searched up to max_offset */
60293 +       for (; bmap < end_bmap; bmap++, offset = 0) {
60294 +               len =
60295 +                   search_one_bitmap_forward(bmap, &offset, max_offset,
60296 +                                             min_len, max_len);
60297 +               if (len != 0)
60298 +                       goto out;
60299 +       }
60300 +       /* the last bitmap block is searched only up to end_offset */
60301 +       len =
60302 +           search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60303 +                                     max_len);
60304 +      out:
60305 +       *start = bmap * max_offset + offset;
60306 +       return len;
60307 +}
60308 +
60309 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
60310 + * backward direction) */
60311 +static int bitmap_alloc_backward(reiser4_block_nr * start,
60312 +                                const reiser4_block_nr * end, int min_len,
60313 +                                int max_len)
60314 +{
60315 +       bmap_nr_t bmap, end_bmap;
60316 +       bmap_off_t offset, end_offset;
60317 +       int len;
60318 +       struct super_block *super = get_current_context()->super;
60319 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60320 +
60321 +       parse_blocknr(start, &bmap, &offset);
60322 +       parse_blocknr(end, &end_bmap, &end_offset);
60323 +       /* @start is the upper and @end the lower bound of the search */
60324 +       assert("zam-961", end_bmap <= bmap);
60325 +       assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60326 +       /* bitmap blocks above the lowest one are searched down to offset 0 */
60327 +       for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60328 +               len =
60329 +                   search_one_bitmap_backward(bmap, &offset, 0, min_len,
60330 +                                              max_len);
60331 +               if (len != 0)
60332 +                       goto out;
60333 +       }
60334 +       /* the lowest bitmap block is searched only down to end_offset */
60335 +       len =
60336 +           search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60337 +                                      max_len);
60338 +      out:
60339 +       *start = bmap * max_offset + offset;
60340 +       return len;
60341 +}
60342 +
60343 +/* forward-direction part of plugin->u.space_allocator.alloc_blocks() */
60344 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60345 +                               reiser4_block_nr *start, reiser4_block_nr *len)
60346 +{
60347 +       struct super_block *super = get_current_context()->super;
60348 +       int actual_len;
60349 +
60350 +       reiser4_block_nr search_start;
60351 +       reiser4_block_nr search_end;
60352 +       /* returns 0 and fills *start and *len, -ENOSPC, or a search error */
60353 +       assert("zam-398", super != NULL);
60354 +       assert("zam-412", hint != NULL);
60355 +       assert("zam-397", hint->blk <= reiser4_block_count(super));
60356 +       /* upper bound: block count, or LIMIT() of hint->blk + max_dist */
60357 +       if (hint->max_dist == 0)
60358 +               search_end = reiser4_block_count(super);
60359 +       else
60360 +               search_end =
60361 +                   LIMIT(hint->blk + hint->max_dist,
60362 +                         reiser4_block_count(super));
60363 +
60364 +       /* We use @hint -> blk as a search start and search from it to the end
60365 +          of the disk or in given region if @hint -> max_dist is not zero */
60366 +       search_start = hint->blk;
60367 +
60368 +       actual_len =
60369 +           bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60370 +       /* NOTE(review): the first pass has already rewritten search_start */
60371 +       /* There is only one bitmap search if the first pass returned non-zero,
60372 +          max_dist was specified or search_start is 0. We also do one pass
60373 +          for scanning bitmap in backward direction. */
60374 +       if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60375 +               /* next step is a scanning from 0 to search_start */
60376 +               search_end = search_start;
60377 +               search_start = 0;
60378 +               actual_len =
60379 +                   bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60380 +       }
60381 +       if (actual_len == 0)
60382 +               return RETERR(-ENOSPC);
60383 +       if (actual_len < 0)
60384 +               return RETERR(actual_len);
60385 +       *len = actual_len;
60386 +       *start = search_start;
60387 +       return 0;
60388 +}
60389 +
60390 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60391 +                                reiser4_block_nr * start,
60392 +                                reiser4_block_nr * len)
60393 +{
60394 +       reiser4_block_nr search_start;
60395 +       reiser4_block_nr search_end;
60396 +       int actual_len;
60397 +       /* @super is declared inside ON_DEBUG() and read by assertions only */
60398 +       ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60399 +
60400 +       assert("zam-969", super != NULL);
60401 +       assert("zam-970", hint != NULL);
60402 +       assert("zam-971", hint->blk <= reiser4_block_count(super));
60403 +       /* lower bound: 0, or max_dist blocks below the hint */
60404 +       search_start = hint->blk;
60405 +       if (hint->max_dist == 0 || search_start <= hint->max_dist)
60406 +               search_end = 0;
60407 +       else
60408 +               search_end = search_start - hint->max_dist;
60409 +       /* one backward pass; 0 + *start/*len, -ENOSPC or a search error */
60410 +       actual_len =
60411 +           bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60412 +       if (actual_len == 0)
60413 +               return RETERR(-ENOSPC);
60414 +       if (actual_len < 0)
60415 +               return RETERR(actual_len);
60416 +       *len = actual_len;
60417 +       *start = search_start;
60418 +       return 0;
60419 +}
60420 +
60421 +/* plugin->u.space_allocator.alloc_blocks(); @allocator is not used here */
60422 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60423 +                               reiser4_blocknr_hint * hint, int needed,
60424 +                               reiser4_block_nr * start, reiser4_block_nr * len)
60425 +{
60426 +       if (hint->backward)     /* the hint selects the search direction */
60427 +               return alloc_blocks_backward(hint, needed, start, len);
60428 +       return alloc_blocks_forward(hint, needed, start, len);
60429 +}
60430 +
60431 +/* plugin->u.space_allocator.dealloc_blocks(). */
60432 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
60433 +   nodes deletion is deferred until transaction commit.  However, deallocation
60434 +   of temporary objects like wandered blocks and transaction commit records
60435 +   requires immediate node deletion from WORKING BITMAP.*/
60436 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60437 +                                  reiser4_block_nr start, reiser4_block_nr len)
60438 +{
60439 +       struct super_block *super = reiser4_get_current_sb();
60440 +
60441 +       bmap_nr_t bmap;
60442 +       bmap_off_t offset;
60443 +
60444 +       struct bitmap_node *bnode;
60445 +       int ret;
60446 +
60447 +       assert("zam-468", len != 0);
60448 +       check_block_range(&start, &len);
60449 +
60450 +       parse_blocknr(&start, &bmap, &offset);
60451 +       /* the whole range must lie within a single bitmap block */
60452 +       assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60453 +
60454 +       bnode = get_bnode(super, bmap);
60455 +
60456 +       assert("zam-470", bnode != NULL);
60457 +
60458 +       ret = load_and_lock_bnode(bnode);
60459 +       assert("zam-481", ret == 0);
60460 +       /* NOTE(review): a load failure is only asserted, never handled */
60461 +       reiser4_clear_bits(bnode_working_data(bnode), offset,
60462 +                          (bmap_off_t) (offset + len));
60463 +
60464 +       adjust_first_zero_bit(bnode, offset);
60465 +
60466 +       release_and_unlock_bnode(bnode);
60467 +}
60468 +
60469 +/* plugin->u.space_allocator.check_blocks(); empty unless REISER4_DEBUG */
60470 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60471 +                                const reiser4_block_nr * len, int desired)
60472 +{
60473 +#if REISER4_DEBUG
60474 +       struct super_block *super = reiser4_get_current_sb();
60475 +
60476 +       bmap_nr_t bmap;
60477 +       bmap_off_t start_offset;
60478 +       bmap_off_t end_offset;
60479 +
60480 +       struct bitmap_node *bnode;
60481 +       int ret;
60482 +
60483 +       assert("zam-622", len != NULL);
60484 +       check_block_range(start, len);
60485 +       parse_blocknr(start, &bmap, &start_offset);
60486 +       /* the checked range must lie within a single bitmap block */
60487 +       end_offset = start_offset + *len;
60488 +       assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60489 +
60490 +       bnode = get_bnode(super, bmap);
60491 +
60492 +       assert("nikita-2215", bnode != NULL);
60493 +
60494 +       ret = load_and_lock_bnode(bnode);
60495 +       assert("zam-626", ret == 0);
60496 +
60497 +       assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60498 +       /* @desired != 0: no zero bit in the range; else: no set bit in it */
60499 +       if (desired) {
60500 +               assert("zam-623",
60501 +                      reiser4_find_next_zero_bit(bnode_working_data(bnode),
60502 +                                                 end_offset, start_offset)
60503 +                      >= end_offset);
60504 +       } else {
60505 +               assert("zam-624",
60506 +                      reiser4_find_next_set_bit(bnode_working_data(bnode),
60507 +                                                end_offset, start_offset)
60508 +                      >= end_offset);
60509 +       }
60510 +
60511 +       release_and_unlock_bnode(bnode);
60512 +#endif
60513 +}
60514 +
60515 +/* add @node to @atom's overwrite set unless @node already has an atom */
60516 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60517 +{
60518 +       assert("zam-546", atom != NULL);
60519 +       assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60520 +       assert("zam-548", node != NULL);
60521 +       /* the atom lock is taken before the jnode lock */
60522 +       spin_lock_atom(atom);
60523 +       spin_lock_jnode(node);
60524 +       /* a jnode with no atom gets JNODE_OVRWR and joins the ovrwr list */
60525 +       if (node->atom == NULL) {
60526 +               JF_SET(node, JNODE_OVRWR);
60527 +               insert_into_atom_ovrwr_list(atom, node);
60528 +       } else {
60529 +               assert("zam-549", node->atom == atom);
60530 +       }
60531 +
60532 +       spin_unlock_jnode(node);
60533 +       spin_unlock_atom(atom);
60534 +}
60535 +
60536 +/* blocknr_set_iterator() actor: clears the bits of one delete set entry in
60537 +   the COMMIT bitmap and adds @len (or 1) to the counter @data points to */
60538 +static int
60539 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60540 +                         const reiser4_block_nr * len, void *data)
60541 +{
60542 +       /* returns 0, or the error of the bnode load or of the CRC check */
60543 +       bmap_nr_t bmap;
60544 +       bmap_off_t offset;
60545 +       int ret;
60546 +
60547 +       long long *blocks_freed_p = data;
60548 +
60549 +       struct bitmap_node *bnode;
60550 +
60551 +       struct super_block *sb = reiser4_get_current_sb();
60552 +
60553 +       check_block_range(start, len);
60554 +
60555 +       parse_blocknr(start, &bmap, &offset);
60556 +
60557 +       /* FIXME-ZAM: we assume that all block ranges are allocated by this
60558 +          bitmap-based allocator and each block range can't go over a zone of
60559 +          responsibility of one bitmap block; same assumption is used in
60560 +          other journal hooks in bitmap code. */
60561 +       bnode = get_bnode(sb, bmap);
60562 +       assert("zam-448", bnode != NULL);
60563 +
60564 +       /* it is safe to unlock atom which is in ASTAGE_PRE_COMMIT */
60565 +       assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60566 +       ret = load_and_lock_bnode(bnode);
60567 +       if (ret)
60568 +               return ret;
60569 +
60570 +       /* put bnode into atom's overwrite set */
60571 +       cond_add_to_overwrite_set(atom, bnode->cjnode);
60572 +
60573 +       data = bnode_commit_data(bnode);
60574 +       /* a failed CRC check must still release the bnode locked above */
60575 +       ret = bnode_check_crc(bnode);
60576 +       if (ret != 0)
60577 +               goto out;
60578 +
60579 +       if (len != NULL) {
60580 +               /* FIXME-ZAM: a check that all bits are set should be there */
60581 +               assert("zam-443",
60582 +                      offset + *len <= bmap_bit_count(sb->s_blocksize));
60583 +               reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60584 +
60585 +               (*blocks_freed_p) += *len;
60586 +       } else {
60587 +               reiser4_clear_bit(offset, data);
60588 +               (*blocks_freed_p)++;
60589 +       }
60590 +
60591 +       bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60592 +      out:
60593 +       release_and_unlock_bnode(bnode);
60594 +       /* ret is still 0 here unless the CRC check failed */
60595 +       return ret;
60596 +}
60597 +
60598 +/* plugin->u.space_allocator.pre_commit_hook(). */
60599 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
60600 +   rest is done by transaction manager (allocate wandered locations for COMMIT
60601 +   BITMAP blocks, copy COMMIT BITMAP blocks data). */
60602 +/* Only one instance of this function can be running at one given time, because
60603 +   only one transaction can be committed at a time, therefore it is safe to
60604 +   access some global variables without any locking */
60605 +
60606 +int reiser4_pre_commit_hook_bitmap(void)
60607 +{
60608 +       struct super_block *super = reiser4_get_current_sb();
60609 +       txn_atom *atom;
60610 +
60611 +       long long blocks_freed = 0;
60612 +       /* returns 0, or the error of a failed bnode load or CRC check */
60613 +       atom = get_current_atom_locked();
60614 +       assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60615 +       spin_unlock_atom(atom);
60616 +
60617 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
60618 +                                * mark corresponded bits in COMMIT BITMAP as used */
60619 +               struct list_head *head = ATOM_CLEAN_LIST(atom);
60620 +               jnode *node = list_entry(head->next, jnode, capture_link);
60621 +
60622 +               while (head != &node->capture_link) {
60623 +                       /* we detect freshly allocated jnodes */
60624 +                       if (JF_ISSET(node, JNODE_RELOC)) {
60625 +                               int ret;
60626 +                               bmap_nr_t bmap;
60627 +
60628 +                               bmap_off_t offset;
60629 +                               bmap_off_t index;
60630 +                               struct bitmap_node *bn;
60631 +                               __u32 size = bmap_size(super->s_blocksize);
60632 +                               __u32 crc;
60633 +                               char byte;
60634 +
60635 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60636 +                               assert("zam-460",
60637 +                                      !reiser4_blocknr_is_fake(&node->blocknr));
60638 +
60639 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
60640 +                               bn = get_bnode(super, bmap);
60641 +                               /* index of the byte that holds the bit */
60642 +                               index = offset >> 3;
60643 +                               assert("vpf-276", index < size);
60644 +
60645 +                               ret = bnode_check_crc(bn);
60646 +                               if (ret != 0)
60647 +                                       return ret;
60648 +                               check_bnode_loaded(bn);
60649 +                               ret = load_and_lock_bnode(bn);
60650 +                               if (ret != 0)
60651 +                                       return ret;
60652 +                               byte = *(bnode_commit_data(bn) + index);
60653 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
60654 +                               /* recompute the CRC from the old and new byte */
60655 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
60656 +                                                    *(bnode_commit_data(bn) +
60657 +                                                      index),
60658 +                                                    size - index);
60659 +                               bnode_set_commit_crc(bn, crc);
60660 +
60661 +                               release_and_unlock_bnode(bn);
60662 +
60663 +                               ret = bnode_check_crc(bn);
60664 +                               if (ret != 0)
60665 +                                       return ret;
60666 +
60667 +                               /* working of this depends on how it inserts
60668 +                                  new j-node into clean list, because we are
60669 +                                  scanning the same list now. It is OK, if
60670 +                                  insertion is done to the list front */
60671 +                               cond_add_to_overwrite_set(atom, bn->cjnode);
60672 +                       }
60673 +
60674 +                       node = list_entry(node->capture_link.next, jnode, capture_link);
60675 +               }
60676 +       }
60677 +       /* NOTE(review): the iterator's return value is not checked here */
60678 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60679 +                            &blocks_freed, 0);
60680 +
60681 +       blocks_freed -= atom->nr_blocks_allocated;
60682 +
60683 +       {
60684 +               reiser4_super_info_data *sbinfo;
60685 +
60686 +               sbinfo = get_super_private(super);
60687 +
60688 +               spin_lock_reiser4_super(sbinfo);
60689 +               sbinfo->blocks_free_committed += blocks_freed;
60690 +               spin_unlock_reiser4_super(sbinfo);
60691 +       }
60692 +
60693 +       return 0;
60694 +}
60695 +
60696 +/* plugin->u.space_allocator.init_allocator
60697 +    constructor of reiser4_space_allocator object. It is called on fs mount */
60698 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60699 +                                 struct super_block *super, void *arg)
60700 +{
60701 +       struct bitmap_allocator_data *data = NULL;
60702 +       bmap_nr_t bitmap_blocks_nr;
60703 +       bmap_nr_t i;
60704 +       /* returns 0, -ENOMEM, or the error of a failed bitmap block load */
60705 +       assert("nikita-3039", reiser4_schedulable());
60706 +
60707 +       /* getting memory for bitmap allocator private data holder */
60708 +       data =
60709 +               kmalloc(sizeof(struct bitmap_allocator_data),
60710 +                       reiser4_ctx_gfp_mask_get());
60711 +
60712 +       if (data == NULL)
60713 +               return RETERR(-ENOMEM);
60714 +
60715 +       /* allocation and initialization for the array of bnodes */
60716 +       bitmap_blocks_nr = get_nr_bmap(super);
60717 +
60718 +       /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
60719 +          which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
60720 +          may I never meet someone who still uses the ia32 architecture when
60721 +          storage devices of that size enter the market, and wants to use ia32
60722 +          with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
60723 +          probably, another dynamic data structure should replace a static
60724 +          array of bnodes. */
60725 +       /* hence the array of bnodes comes from reiser4_vmalloc(), not kmalloc */
60726 +       data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60727 +       if (data->bitmap == NULL) {
60728 +               kfree(data);
60729 +               return RETERR(-ENOMEM);
60730 +       }
60731 +
60732 +       for (i = 0; i < bitmap_blocks_nr; i++)
60733 +               init_bnode(data->bitmap + i, super, i);
60734 +
60735 +       allocator->u.generic = data;
60736 +
60737 +#if REISER4_DEBUG
60738 +       get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60739 +#endif
60740 +
60741 +       /* Load all bitmap blocks at mount time. */
60742 +       if (!test_bit
60743 +           (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60744 +               __u64 start_time, elapsed_time;
60745 +               struct bitmap_node *bnode;
60746 +               int ret;
60747 +
60748 +               if (REISER4_DEBUG)
60749 +                       printk(KERN_INFO "loading reiser4 bitmap...");
60750 +               start_time = jiffies;
60751 +               /* a load failure tears the whole allocator down again */
60752 +               for (i = 0; i < bitmap_blocks_nr; i++) {
60753 +                       bnode = data->bitmap + i;
60754 +                       ret = load_and_lock_bnode(bnode);
60755 +                       if (ret) {
60756 +                               reiser4_destroy_allocator_bitmap(allocator,
60757 +                                                                super);
60758 +                               return ret;
60759 +                       }
60760 +                       release_and_unlock_bnode(bnode);
60761 +               }
60762 +
60763 +               elapsed_time = jiffies - start_time;
60764 +               if (REISER4_DEBUG)
60765 +                       printk("...done (%llu jiffies)\n",
60766 +                              (unsigned long long)elapsed_time);
60767 +       }
60768 +
60769 +       return 0;
60770 +}
60771 +
60772 +/* plugin->u.space_allocator.destroy_allocator
60773 +   destructor. It is called on fs unmount */
60774 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60775 +                                    struct super_block *super)
60776 +{
60777 +       bmap_nr_t bitmap_blocks_nr;
60778 +       bmap_nr_t i;
60779 +
60780 +       struct bitmap_allocator_data *data = allocator->u.generic;
60781 +
60782 +       assert("zam-414", data != NULL);
60783 +       assert("zam-376", data->bitmap != NULL);
60784 +
60785 +       bitmap_blocks_nr = get_nr_bmap(super);
60786 +
60787 +       for (i = 0; i < bitmap_blocks_nr; i++) {
60788 +               struct bitmap_node *bnode = data->bitmap + i;
60789 +
60790 +               mutex_lock(&bnode->mutex);
60791 +
60792 +#if REISER4_DEBUG
60793 +               if (atomic_read(&bnode->loaded)) {
60794 +                       jnode *wj = bnode->wjnode;
60795 +                       jnode *cj = bnode->cjnode;
60796 +
60797 +                       assert("zam-480", jnode_page(cj) != NULL);
60798 +                       assert("zam-633", jnode_page(wj) != NULL);
60799 +
60800 +                       assert("zam-634",
60801 +                              memcmp(jdata(wj), jdata(wj),
60802 +                                     bmap_size(super->s_blocksize)) == 0);
60803 +
60804 +               }
60805 +#endif
60806 +               done_bnode(bnode);
60807 +               mutex_unlock(&bnode->mutex);
60808 +       }
60809 +
60810 +       vfree(data->bitmap);
60811 +       kfree(data);
60812 +
60813 +       allocator->u.generic = NULL;
60814 +
60815 +       return 0;
60816 +}
60817 +
60818 +/*
60819 + * Local variables:
60820 + * c-indentation-style: "K&R"
60821 + * mode-name: "LC"
60822 + * c-basic-offset: 8
60823 + * tab-width: 8
60824 + * fill-column: 79
60825 + * scroll-step: 1
60826 + * End:
60827 + */
60828 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.35/fs/reiser4/plugin/space/bitmap.h
60829 --- linux-2.6.35.orig/fs/reiser4/plugin/space/bitmap.h  1970-01-01 01:00:00.000000000 +0100
60830 +++ linux-2.6.35/fs/reiser4/plugin/space/bitmap.h       2010-08-04 15:44:57.000000000 +0200
60831 @@ -0,0 +1,47 @@
60832 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60833 +
60834 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60835 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60836 +
60837 +#include "../../dformat.h"
60838 +#include "../../block_alloc.h"
60839 +
60840 +#include <linux/types.h>       /* for __u??  */
60841 +#include <linux/fs.h>          /* for struct super_block  */
60842 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60843 +/* declarations of functions implementing methods of space allocator plugin for
60844 +   bitmap based allocator. The functions themselves are in bitmap.c */
60845 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60846 +                                        struct super_block *, void *);
60847 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60848 +                                           struct super_block *);
60849 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60850 +                                      reiser4_blocknr_hint *, int needed,
60851 +                                      reiser4_block_nr * start,
60852 +                                      reiser4_block_nr * len);
60853 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60854 +                                       const reiser4_block_nr *, int);
60855 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60856 +                                         reiser4_block_nr,
60857 +                                         reiser4_block_nr);
60858 +extern int reiser4_pre_commit_hook_bitmap(void);
60859 +/* the remaining allocator methods expand to empty statements for bitmap */
60860 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
60861 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
60862 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
60863 +
60864 +typedef __u64 bmap_nr_t;       /* number of a bitmap block */
60865 +typedef __u32 bmap_off_t;      /* bit offset inside one bitmap block */
60866 +
60867 +#endif                         /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
60868 +
60869 +/* Make Linus happy.
60870 +   Local variables:
60871 +   c-indentation-style: "K&R"
60872 +   mode-name: "LC"
60873 +   c-basic-offset: 8
60874 +   tab-width: 8
60875 +   fill-column: 120
60876 +   scroll-step: 1
60877 +   End:
60878 +*/
60879 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/space/Makefile linux-2.6.35/fs/reiser4/plugin/space/Makefile
60880 --- linux-2.6.35.orig/fs/reiser4/plugin/space/Makefile  1970-01-01 01:00:00.000000000 +0100
60881 +++ linux-2.6.35/fs/reiser4/plugin/space/Makefile       2010-08-04 15:44:57.000000000 +0200
60882 @@ -0,0 +1,4 @@
60883 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
60884 +
60885 +space_plugins-objs := \
60886 +       bitmap.o
60887 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.35/fs/reiser4/plugin/space/space_allocator.h
60888 --- linux-2.6.35.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 01:00:00.000000000 +0100
60889 +++ linux-2.6.35/fs/reiser4/plugin/space/space_allocator.h      2010-08-04 15:44:57.000000000 +0200
60890 @@ -0,0 +1,80 @@
60891 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60892 +
60893 +#ifndef __SPACE_ALLOCATOR_H__
60894 +#define __SPACE_ALLOCATOR_H__
60895 +
60896 +#include "../../forward.h"
60897 +#include "bitmap.h"
60898 +/* DEF_SPACE_ALLOCATOR(x) defines inline sa_*() wrappers that forward to reiser4_*_x(); it is used below for bitmap.
60899 + * NOTE(review): sa_destroy_allocator() and sa_pre_commit_hook() drop the int their bitmap counterparts return. */
60900 +#define DEF_SPACE_ALLOCATOR(allocator)                                                                                 \
60901 +                                                                                                                       \
60902 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)               \
60903 +{                                                                                                                      \
60904 +       return reiser4_init_allocator_##allocator (al, s, opaque);                                                      \
60905 +}                                                                                                                      \
60906 +                                                                                                                       \
60907 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)                           \
60908 +{                                                                                                                      \
60909 +       reiser4_destroy_allocator_##allocator (al, s);                                                                  \
60910 +}                                                                                                                      \
60911 +                                                                                                                       \
60912 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,                           \
60913 +                                  int needed, reiser4_block_nr * start, reiser4_block_nr * len)                        \
60914 +{                                                                                                                      \
60915 +       return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len);                                         \
60916 +}                                                                                                                      \
60917 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)      \
60918 +{                                                                                                                      \
60919 +       reiser4_dealloc_blocks_##allocator (al, start, len);                                                            \
60920 +}                                                                                                                      \
60921 +                                                                                                                       \
60922 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)                 \
60923 +{                                                                                                                      \
60924 +       reiser4_check_blocks_##allocator (start, end, desired);                                                         \
60925 +}                                                                                                                      \
60926 +                                                                                                                       \
60927 +static inline void sa_pre_commit_hook (void)                                                                           \
60928 +{                                                                                                                      \
60929 +       reiser4_pre_commit_hook_##allocator ();                                                                         \
60930 +}                                                                                                                      \
60931 +                                                                                                                       \
60932 +static inline void sa_post_commit_hook (void)                                                                          \
60933 +{                                                                                                                      \
60934 +       reiser4_post_commit_hook_##allocator ();                                                                        \
60935 +}                                                                                                                      \
60936 +                                                                                                                       \
60937 +static inline void sa_post_write_back_hook (void)                                                                      \
60938 +{                                                                                                                      \
60939 +       reiser4_post_write_back_hook_##allocator();                                                                     \
60940 +}                                                                                                                      \
60941 +                                                                                                                       \
60942 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)                                    \
60943 +{                                                                                                                      \
60944 +       reiser4_print_info_##allocator (prefix, al);                                                                    \
60945 +}
60946 +
60947 +DEF_SPACE_ALLOCATOR(bitmap)
60948 +
60949 +/* this object is part of reiser4 private in-core super block */
60950 +struct reiser4_space_allocator {
60951 +       union {
60952 +               /* space allocators might use this pointer to reference their
60953 +                * data. */
60954 +               void *generic;
60955 +       } u;
60956 +};
60957 +
60958 +/* __SPACE_ALLOCATOR_H__ */
60959 +#endif
60960 +
60961 +/* Make Linus happy.
60962 +   Local variables:
60963 +   c-indentation-style: "K&R"
60964 +   mode-name: "LC"
60965 +   c-basic-offset: 8
60966 +   tab-width: 8
60967 +   fill-column: 120
60968 +   scroll-step: 1
60969 +   End:
60970 +*/
60971 diff -urN linux-2.6.35.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.35/fs/reiser4/plugin/tail_policy.c
60972 --- linux-2.6.35.orig/fs/reiser4/plugin/tail_policy.c   1970-01-01 01:00:00.000000000 +0100
60973 +++ linux-2.6.35/fs/reiser4/plugin/tail_policy.c        2010-08-04 15:44:57.000000000 +0200
60974 @@ -0,0 +1,113 @@
60975 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60976 + * reiser4/README */
60977 +
60978 +/* Formatting policy plugins */
60979 +
60980 +/*
60981 + * Formatting policy plugin is used by object plugin (of regular file) to
60982 + * convert file between two representations.
60983 + *
60984 + * Currently following policies are implemented:
60985 + *  never store file in formatted nodes
60986 + *  always store file in formatted nodes
60987 + *  store file in formatted nodes if file is smaller than 4 blocks (default)
60988 + */
60989 +
60990 +#include "../tree.h"
60991 +#include "../inode.h"
60992 +#include "../super.h"
60993 +#include "object.h"
60994 +#include "plugin.h"
60995 +#include "node/node.h"
60996 +#include "plugin_header.h"
60997 +
60998 +#include <linux/pagemap.h>
60999 +#include <linux/fs.h>          /* For struct inode */
61000 +
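61001 +/**
61002 + * have_formatting_never - formatting policy that never uses tail items
61003 + * @inode: inode to operate on (unused)
61004 + * @size: new object size (unused)
61005 + *
61006 + * Always returns 0.
61007 + */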
61008 +/* Never store file's tail as direct item */
61009 +/* Audited by: green(2002.06.12) */
61010 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
61011 +                     /* inode to operate on */ ,
61012 +                     loff_t size UNUSED_ARG/* new object size */)
61013 +{
61014 +       return 0;
61015 +}
61016 +
61017 +/* Always store file's tail as direct item */
61018 +/* Audited by: green(2002.06.12) */
61019 +static int
61020 +have_formatting_always(const struct inode *inode UNUSED_ARG
61021 +                      /* inode to operate on */ ,
61022 +                      loff_t size UNUSED_ARG/* new object size */)
61023 +{
61024 +       return 1;
61025 +}
61026 +
61027 +/* Decide whether the file denoted by @inode should be stored as tails only (1)
61028 +   or as extents only (0): tails are used while @size is at most 4 blocks. */
61029 +static int
61030 +have_formatting_default(const struct inode *inode
61031 +                       /* inode to operate on */ ,
61032 +                       loff_t size/* new object size */)
61033 +{
61034 +       assert("umka-1253", inode != NULL);
61035 +
61036 +       if (size > inode->i_sb->s_blocksize * 4)
61037 +               return 0;
61038 +
61039 +       return 1;
61040 +}
61041 +
61042 +/* tail plugins */
61043 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
61044 +       [NEVER_TAILS_FORMATTING_ID] = {
61045 +               .h = {
61046 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61047 +                       .id = NEVER_TAILS_FORMATTING_ID,
61048 +                       .pops = NULL,
61049 +                       .label = "never",
61050 +                       .desc = "Never store file's tail",
61051 +                       .linkage = {NULL, NULL}
61052 +               },
61053 +               .have_tail = have_formatting_never
61054 +       },
61055 +       [ALWAYS_TAILS_FORMATTING_ID] = {
61056 +               .h = {
61057 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61058 +                       .id = ALWAYS_TAILS_FORMATTING_ID,
61059 +                       .pops = NULL,
61060 +                       .label = "always",
61061 +                       .desc = "Always store file's tail",
61062 +                       .linkage = {NULL, NULL}
61063 +               },
61064 +               .have_tail = have_formatting_always
61065 +       },
61066 +       [SMALL_FILE_FORMATTING_ID] = {
61067 +               .h = {
61068 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
61069 +                       .id = SMALL_FILE_FORMATTING_ID,
61070 +                       .pops = NULL,
61071 +                       .label = "4blocks",
61072 +                       .desc = "store files shorter than 4 blocks in tail items",
61073 +                       .linkage = {NULL, NULL}
61074 +               },
61075 +               .have_tail = have_formatting_default
61076 +       }
61077 +};
61078 +
61079 +/*
61080 + * Local variables:
61081 + * c-indentation-style: "K&R"
61082 + * mode-name: "LC"
61083 + * c-basic-offset: 8
61084 + * tab-width: 8
61085 + * fill-column: 79
61086 + * End:
61087 + */
61088 diff -urN linux-2.6.35.orig/fs/reiser4/pool.c linux-2.6.35/fs/reiser4/pool.c
61089 --- linux-2.6.35.orig/fs/reiser4/pool.c 1970-01-01 01:00:00.000000000 +0100
61090 +++ linux-2.6.35/fs/reiser4/pool.c      2010-08-04 15:44:57.000000000 +0200
61091 @@ -0,0 +1,231 @@
61092 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61093 + * reiser4/README */
61094 +
61095 +/* Fast pool allocation.
61096 +
61097 +   There are situations when some sub-system normally asks memory allocator
61098 +   for only few objects, but under some circumstances could require much
61099 +   more. Typical and actually motivating example is tree balancing. It needs
61100 +   to keep track of nodes that were involved into it, and it is well-known
61101 +   that in a reasonably packed balanced tree most (92.938121%) of all
61102 +   balancings finish after working with only a few nodes (3.141592 on
61103 +   average). But in rare cases balancing can involve many more nodes
61104 +   (3*tree_height+1 in the extreme case).
61105 +
61106 +   On the one hand, we don't want to resort to dynamic allocation (slab,
61107 +    malloc(), etc.) to allocate data structures required to keep track of
61108 +   nodes during balancing. On the other hand, we cannot statically allocate
61109 +   required amount of space on the stack, because first: it is useless wastage
61110 +   of precious resource, and second: this amount is unknown in advance (tree
61111 +   height can change).
61112 +
61113 +   Pools, implemented in this file, are a solution to this problem:
61114 +
61115 +    - some configurable amount of objects is statically preallocated on the
61116 +    stack
61117 +
61118 +    - if this preallocated pool is exhausted and more objects are requested
61119 +    they are allocated dynamically.
61120 +
61121 +   Pools encapsulate distinction between statically and dynamically allocated
61122 +   objects. Both allocation and recycling look exactly the same.
61123 +
61124 +   To keep track of dynamically allocated objects, pool adds its own linkage
61125 +   to each object.
61126 +
61127 +   NOTE-NIKITA This linkage also contains some balancing-specific data. This
61128 +   is not perfect. On the other hand, balancing is currently the only client
61129 +   of pool code.
61130 +
61131 +   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
61132 +   functions in the style of tslist/tshash, i.e., make them unreadable, but
61133 +   type-safe.
61134 +
61135 +*/
61136 +
61137 +#include "debug.h"
61138 +#include "pool.h"
61139 +#include "super.h"
61140 +
61141 +#include <linux/types.h>
61142 +#include <linux/err.h>
61143 +
61144 +/* initialize new pool object @h */
61145 +static void reiser4_init_pool_obj(struct reiser4_pool_header *h)
61146 +{
61147 +       INIT_LIST_HEAD(&h->usage_linkage);
61148 +       INIT_LIST_HEAD(&h->level_linkage);
61149 +       INIT_LIST_HEAD(&h->extra_linkage);
61150 +}
61151 +
61152 +/* initialize new pool */
61153 +void reiser4_init_pool(struct reiser4_pool *pool /* pool to initialize */ ,
61154 +                      size_t obj_size /* size of objects in @pool */ ,
61155 +                      int num_of_objs /* number of preallocated objects */ ,
61156 +                      char *data/* area for preallocated objects */)
61157 +{
61158 +       struct reiser4_pool_header *h;
61159 +       int i;
61160 +
61161 +       assert("nikita-955", pool != NULL);
61162 +       assert("nikita-1044", obj_size > 0);
61163 +       assert("nikita-956", num_of_objs >= 0);
61164 +       assert("nikita-957", data != NULL);
61165 +
61166 +       memset(pool, 0, sizeof *pool);
61167 +       pool->obj_size = obj_size;
61168 +       pool->data = data;
61169 +       INIT_LIST_HEAD(&pool->free);
61170 +       INIT_LIST_HEAD(&pool->used);
61171 +       INIT_LIST_HEAD(&pool->extra);
61172 +       memset(data, 0, obj_size * num_of_objs);
61173 +       for (i = 0; i < num_of_objs; ++i) {
61174 +               h = (struct reiser4_pool_header *) (data + i * obj_size);
61175 +               reiser4_init_pool_obj(h);
61176 +               /* add pool header to the end of pool's free list */
61177 +               list_add_tail(&h->usage_linkage, &pool->free);
61178 +       }
61179 +}
61180 +
61181 +/* release pool resources
61182 +
61183 +   Meant to release all resources acquired by this pool, specifically,
61184 +   dynamically allocated objects. Currently a no-op: extra objects are
61185 +   kfree()d one by one in reiser4_pool_free().
61186 +*/
61187 +void reiser4_done_pool(struct reiser4_pool *pool UNUSED_ARG)
61188 +{
61189 +}
61190 +
61191 +/* allocate carry object from @pool
61192 +
61193 +   First, try to get preallocated object. If this fails, resort to dynamic
61194 +   allocation.
61195 +
61196 +*/
61197 +static void *reiser4_pool_alloc(struct reiser4_pool *pool)
61198 +{
61199 +       struct reiser4_pool_header *result;
61200 +
61201 +       assert("nikita-959", pool != NULL);
61202 +
61203 +       if (!list_empty(&pool->free)) {
61204 +               struct list_head *linkage;
61205 +
61206 +               linkage = pool->free.next;
61207 +               list_del(linkage);
61208 +               INIT_LIST_HEAD(linkage);
61209 +               result = list_entry(linkage, struct reiser4_pool_header,
61210 +                                   usage_linkage);
61211 +               BUG_ON(!list_empty(&result->level_linkage) ||
61212 +                      !list_empty(&result->extra_linkage));
61213 +       } else {
61214 +               /* pool is empty. Extra allocations don't deserve dedicated
61215 +                  slab to be served from, as they are expected to be rare. */
61216 +               result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
61217 +               if (result != 0) {
61218 +                       reiser4_init_pool_obj(result);
61219 +                       list_add(&result->extra_linkage, &pool->extra);
61220 +               } else
61221 +                       return ERR_PTR(RETERR(-ENOMEM));
61222 +               BUG_ON(!list_empty(&result->usage_linkage) ||
61223 +                      !list_empty(&result->level_linkage));
61224 +       }
61225 +       ++pool->objs;
61226 +       list_add(&result->usage_linkage, &pool->used);
61227 +       memset(result + 1, 0, pool->obj_size - sizeof *result);
61228 +       return result;
61229 +}
61230 +
61231 +/* return object back to the pool */
61232 +void reiser4_pool_free(struct reiser4_pool *pool,
61233 +                      struct reiser4_pool_header *h)
61234 +{
61235 +       assert("nikita-961", h != NULL);
61236 +       assert("nikita-962", pool != NULL);
61237 +
61238 +       --pool->objs;
61239 +       assert("nikita-963", pool->objs >= 0);
61240 +
61241 +       list_del_init(&h->usage_linkage);
61242 +       list_del_init(&h->level_linkage);
61243 +
61244 +       if (list_empty(&h->extra_linkage))
61245 +               /*
61246 +                * pool header is not an extra one. Push it onto free list
61247 +                * using usage_linkage
61248 +                */
61249 +               list_add(&h->usage_linkage, &pool->free);
61250 +       else {
61251 +               /* remove pool header from pool's extra list and kfree it */
61252 +               list_del(&h->extra_linkage);
61253 +               kfree(h);
61254 +       }
61255 +}
61256 +
61257 +/* add new object to the carry level list
61258 +
61259 +   Carry level is FIFO most of the time, but not always. Complications arise
61260 +   when make_space() function tries to go to the left neighbor and thus adds
61261 +   carry node before existing nodes, and also, when updating delimiting keys
61262 +   after moving data between two nodes, we want left node to be locked before
61263 +   right node.
61264 +
61265 +   Latter case is confusing at the first glance. Problem is that COP_UPDATE
61266 +   operation that updates delimiting keys is sometimes called with two nodes
61267 +   (when data are moved between two nodes) and sometimes with only one node
61268 +   (when leftmost item is deleted in a node). In any case operation is
61269 +   supplied with at least node whose left delimiting key is to be updated
61270 +   (that is "right" node).
61271 +
61272 +   @pool - from which to allocate new object;
61273 +   @list - where to add object; @order - placement (one of POOLO_*);
61274 +   @reference - after (or before) which existing object to add
61275 +*/
61276 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
61277 +                                        struct list_head *list,
61278 +                                        pool_ordering order,
61279 +                                        struct reiser4_pool_header *reference)
61280 +{
61281 +       struct reiser4_pool_header *result;
61282 +
61283 +       assert("nikita-972", pool != NULL);
61284 +
61285 +       result = reiser4_pool_alloc(pool);
61286 +       if (IS_ERR(result))
61287 +               return result;
61288 +
61289 +       assert("nikita-973", result != NULL);
61290 +
61291 +       switch (order) {
61292 +       case POOLO_BEFORE:
61293 +               __list_add(&result->level_linkage,
61294 +                          reference->level_linkage.prev,
61295 +                          &reference->level_linkage);
61296 +               break;
61297 +       case POOLO_AFTER:
61298 +               __list_add(&result->level_linkage,
61299 +                          &reference->level_linkage,
61300 +                          reference->level_linkage.next);
61301 +               break;
61302 +       case POOLO_LAST:
61303 +               list_add_tail(&result->level_linkage, list);
61304 +               break;
61305 +       case POOLO_FIRST:
61306 +               list_add(&result->level_linkage, list);
61307 +               break;
61308 +       default:
61309 +               wrong_return_value("nikita-927", "order");
61310 +       }
61311 +       return result;
61312 +}
61313 +
61314 +/* Make Linus happy.
61315 +   Local variables:
61316 +   c-indentation-style: "K&R"
61317 +   mode-name: "LC"
61318 +   c-basic-offset: 8
61319 +   tab-width: 8
61320 +   fill-column: 120
61321 +   End:
61322 +*/
61323 diff -urN linux-2.6.35.orig/fs/reiser4/pool.h linux-2.6.35/fs/reiser4/pool.h
61324 --- linux-2.6.35.orig/fs/reiser4/pool.h 1970-01-01 01:00:00.000000000 +0100
61325 +++ linux-2.6.35/fs/reiser4/pool.h      2010-08-04 15:44:57.000000000 +0200
61326 @@ -0,0 +1,57 @@
61327 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61328 + * reiser4/README */
61329 +
61330 +/* Fast pool allocation */
61331 +
61332 +#ifndef __REISER4_POOL_H__
61333 +#define __REISER4_POOL_H__
61334 +
61335 +#include <linux/types.h>
61336 +
61337 +struct reiser4_pool {
61338 +       size_t obj_size;
61339 +       int objs;
61340 +       char *data;
61341 +       struct list_head free;
61342 +       struct list_head used;
61343 +       struct list_head extra;
61344 +};
61345 +
61346 +struct reiser4_pool_header {
61347 +       /* object is either on free or "used" lists */
61348 +       struct list_head usage_linkage;
61349 +       struct list_head level_linkage;
61350 +       struct list_head extra_linkage;
61351 +};
61352 +
61353 +typedef enum {
61354 +       POOLO_BEFORE,
61355 +       POOLO_AFTER,
61356 +       POOLO_LAST,
61357 +       POOLO_FIRST
61358 +} pool_ordering;
61359 +
61360 +/* pool manipulation functions */
61361 +
61362 +extern void reiser4_init_pool(struct reiser4_pool *pool, size_t obj_size,
61363 +                             int num_of_objs, char *data);
61364 +extern void reiser4_done_pool(struct reiser4_pool *pool);
61365 +extern void reiser4_pool_free(struct reiser4_pool *pool,
61366 +                             struct reiser4_pool_header *h);
61367 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool *pool,
61368 +                                        struct list_head *list,
61369 +                                        pool_ordering order,
61370 +                                        struct reiser4_pool_header *reference);
61371 +
61372 +/* __REISER4_POOL_H__ */
61373 +#endif
61374 +
61375 +/* Make Linus happy.
61376 +   Local variables:
61377 +   c-indentation-style: "K&R"
61378 +   mode-name: "LC"
61379 +   c-basic-offset: 8
61380 +   tab-width: 8
61381 +   fill-column: 120
61382 +   End:
61383 +*/
61384 diff -urN linux-2.6.35.orig/fs/reiser4/readahead.c linux-2.6.35/fs/reiser4/readahead.c
61385 --- linux-2.6.35.orig/fs/reiser4/readahead.c    1970-01-01 01:00:00.000000000 +0100
61386 +++ linux-2.6.35/fs/reiser4/readahead.c 2010-08-04 15:44:57.000000000 +0200
61387 @@ -0,0 +1,140 @@
61388 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61389 + * reiser4/README */
61390 +
61391 +#include "forward.h"
61392 +#include "tree.h"
61393 +#include "tree_walk.h"
61394 +#include "super.h"
61395 +#include "inode.h"
61396 +#include "key.h"
61397 +#include "znode.h"
61398 +
61399 +#include <linux/swap.h>                /* for totalram_pages */
61400 +
61401 +void reiser4_init_ra_info(ra_info_t *rai)
61402 +{
61403 +       rai->key_to_stop = *reiser4_min_key();
61404 +}
61405 +
61406 +/* global formatted node readahead parameter. It can be set by mount option
61407 + * -o readahead:NUM:1 */
61408 +static inline int ra_adjacent_only(int flags)
61409 +{
61410 +       return flags & RA_ADJACENT_ONLY;
61411 +}
61412 +
61413 +/* this is used by formatted_readahead to decide whether read for right neighbor
61414 + * of node is to be issued. It returns 1 if right neighbor's first key is less
61415 + * or equal to readahead's stop key */
61416 +static int should_readahead_neighbor(znode * node, ra_info_t *info)
61417 +{
61418 +       int result;
61419 +
61420 +       read_lock_dk(znode_get_tree(node));
61421 +       result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61422 +       read_unlock_dk(znode_get_tree(node));
61423 +       return result;
61424 +}
61425 +
61426 +#define LOW_MEM_PERCENTAGE (5)
61427 +/* return true if free pages are below LOW_MEM_PERCENTAGE percent of RAM */
61428 +static int low_on_memory(void)
61429 +{
61430 +       unsigned long freepages; /* page counters are long: avoid truncation */
61431 +
61432 +       freepages = nr_free_pages();
61433 +       return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
61434 +}
61435 +
61436 +/* start read for @node and for a few of its right neighbors */
61437 +void formatted_readahead(znode * node, ra_info_t *info)
61438 +{
61439 +       struct formatted_ra_params *ra_params;
61440 +       znode *cur;
61441 +       int i;
61442 +       int grn_flags;
61443 +       lock_handle next_lh;
61444 +
61445 +       /* do nothing if node block number has not been assigned to node (which
61446 +        * means it is still in cache). */
61447 +       if (reiser4_blocknr_is_fake(znode_get_block(node)))
61448 +               return;
61449 +
61450 +       ra_params = get_current_super_ra_params();
61451 +
61452 +       if (znode_page(node) == NULL)
61453 +               jstartio(ZJNODE(node));
61454 +
61455 +       if (znode_get_level(node) != LEAF_LEVEL)
61456 +               return;
61457 +
61458 +       /* don't waste memory for read-ahead when low on memory */
61459 +       if (low_on_memory())
61460 +               return;
61461 +
61462 +       /* We can have locked nodes on upper tree levels, in this situation lock
61463 +          priorities do not help to resolve deadlocks, we have to use TRY_LOCK
61464 +          here. */
61465 +       grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61466 +
61467 +       i = 0;
61468 +       cur = zref(node);
61469 +       init_lh(&next_lh);
61470 +       while (i < ra_params->max) {
61471 +               const reiser4_block_nr * nextblk;
61472 +
61473 +               if (!should_readahead_neighbor(cur, info))
61474 +                       break;
61475 +
61476 +               if (reiser4_get_right_neighbor
61477 +                   (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61478 +                       break;
61479 +
61480 +               nextblk = znode_get_block(next_lh.node);
61481 +               if (reiser4_blocknr_is_fake(nextblk) ||
61482 +                   (ra_adjacent_only(ra_params->flags)
61483 +                    && *nextblk != *znode_get_block(cur) + 1))
61484 +                       break;
61485 +
61486 +               zput(cur);
61487 +               cur = zref(next_lh.node);
61488 +               done_lh(&next_lh);
61489 +               if (znode_page(cur) == NULL)
61490 +                       jstartio(ZJNODE(cur));
61491 +               else
61492 +                       /* Do not scan read-ahead window if pages already
61493 +                        * allocated (and i/o already started). */
61494 +                       break;
61495 +
61496 +               i++;
61497 +       }
61498 +       zput(cur);
61499 +       done_lh(&next_lh);
61500 +}
61501 +
61502 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap)
61503 +{
61504 +       reiser4_key *stop_key;
61505 +
61506 +       assert("nikita-3542", dir != NULL);
61507 +       assert("nikita-3543", tap != NULL);
61508 +
61509 +       stop_key = &tap->ra_info.key_to_stop;
61510 +       /* initialize readdir readahead information: include into readahead
61511 +        * stat data of all files of the directory */
61512 +       set_key_locality(stop_key, get_inode_oid(dir));
61513 +       set_key_type(stop_key, KEY_SD_MINOR);
61514 +       set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
61515 +       set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61516 +       set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61517 +}
61518 +
61519 +/*
61520 +   Local variables:
61521 +   c-indentation-style: "K&R"
61522 +   mode-name: "LC"
61523 +   c-basic-offset: 8
61524 +   tab-width: 8
61525 +   fill-column: 80
61526 +   End:
61527 +*/
61528 diff -urN linux-2.6.35.orig/fs/reiser4/readahead.h linux-2.6.35/fs/reiser4/readahead.h
61529 --- linux-2.6.35.orig/fs/reiser4/readahead.h    1970-01-01 01:00:00.000000000 +0100
61530 +++ linux-2.6.35/fs/reiser4/readahead.h 2010-08-04 15:44:57.000000000 +0200
61531 @@ -0,0 +1,52 @@
61532 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61533 + * reiser4/README */
61534 +
61535 +#ifndef __READAHEAD_H__
61536 +#define __READAHEAD_H__
61537 +
61538 +#include "key.h"
61539 +
61540 +typedef enum {
61541 +       RA_ADJACENT_ONLY = 1,   /* only requests nodes which are adjacent.
61542 +                                  Default is NO (not only adjacent) */
61543 +} ra_global_flags;
61544 +
61545 +/* reiser4 super block has a field of this type.
61546 +   It controls readahead during tree traversals */
61547 +struct formatted_ra_params {
61548 +       unsigned long max;      /* request not more than this amount of nodes.
61549 +                                  Default is totalram_pages / 4 */
61550 +       int flags;
61551 +};
61552 +
61553 +typedef struct {
61554 +       reiser4_key key_to_stop;
61555 +} ra_info_t;
61556 +
61557 +void formatted_readahead(znode * , ra_info_t *);
61558 +void reiser4_init_ra_info(ra_info_t *rai);
61559 +
61560 +struct reiser4_file_ra_state {
61561 +       loff_t start;           /* Current window */
61562 +       loff_t size;
61563 +       loff_t next_size;       /* Next window size */
61564 +       loff_t ahead_start;     /* Ahead window */
61565 +       loff_t ahead_size;
61566 +       loff_t max_window_size; /* Maximum readahead window */
61567 +       loff_t slow_start;      /* enlarging r/a size algorithm. */
61568 +};
61569 +
61570 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t *tap);
61571 +
61572 +/* __READAHEAD_H__ */
61573 +#endif
61574 +
61575 +/*
61576 +   Local variables:
61577 +   c-indentation-style: "K&R"
61578 +   mode-name: "LC"
61579 +   c-basic-offset: 8
61580 +   tab-width: 8
61581 +   fill-column: 120
61582 +   End:
61583 +*/
61584 diff -urN linux-2.6.35.orig/fs/reiser4/README linux-2.6.35/fs/reiser4/README
61585 --- linux-2.6.35.orig/fs/reiser4/README 1970-01-01 01:00:00.000000000 +0100
61586 +++ linux-2.6.35/fs/reiser4/README      2010-08-04 15:44:57.000000000 +0200
61587 @@ -0,0 +1,128 @@
61588 +[LICENSING]
61589 +
61590 +Reiser4 is hereby licensed under the GNU General
61591 +Public License version 2.
61592 +
61593 +Source code files that contain the phrase "licensing governed by
61594 +reiser4/README" are "governed files" throughout this file.  Governed
61595 +files are licensed under the GPL.  The portions of them owned by Hans
61596 +Reiser, or authorized to be licensed by him, have been in the past,
61597 +and likely will be in the future, licensed to other parties under
61598 +other licenses.  If you add your code to governed files, and don't
61599 +want it to be owned by Hans Reiser, put your copyright label on that
61600 +code so the poor blight and his customers can keep things straight.
61601 +All portions of governed files not labeled otherwise are owned by Hans
61602 +Reiser, and by adding your code to it, widely distributing it to
61603 +others or sending us a patch, and leaving the sentence in stating that
61604 +licensing is governed by the statement in this file, you accept this.
61605 +It will be a kindness if you identify whether Hans Reiser is allowed
61606 +to license code labeled as owned by you on your behalf other than
61607 +under the GPL, because he wants to know if it is okay to do so and put
61608 +a check in the mail to you (for non-trivial improvements) when he
61609 +makes his next sale.  He makes no guarantees as to the amount if any,
61610 +though he feels motivated to motivate contributors, and you can surely
61611 +discuss this with him before or after contributing.  You have the
61612 +right to decline to allow him to license your code contribution other
61613 +than under the GPL.
61614 +
61615 +Further licensing options are available for commercial and/or other
61616 +interests directly from Hans Reiser: reiser@namesys.com.  If you interpret
61617 +the GPL as not allowing those additional licensing options, you read
61618 +it wrongly, and Richard Stallman agrees with me, when carefully read
61619 +you can see that those restrictions on additional terms do not apply
61620 +to the owner of the copyright, and my interpretation of this shall
61621 +govern for this license.
61622 +
61623 +[END LICENSING]
61624 +
61625 +Reiser4 is a file system based on dancing tree algorithms, and is
61626 +described at http://www.namesys.com
61627 +
61628 +mkfs.reiser4 and other utilities are on our webpage or wherever your
61629 +Linux provider put them.  You really want to be running the latest
61630 +version off the website if you use fsck.
61631 +
61632 +Yes, if you update your reiser4 kernel module you do have to
61633 +recompile your kernel, most of the time.  The errors you get will be
61634 +quite cryptic if you forget to do so.
61635 +
61636 +Hideous Commercial Pitch: Spread your development costs across other OS
61637 +vendors.  Select from the best in the world, not the best in your
61638 +building, by buying from third party OS component suppliers.  Leverage
61639 +the software component development power of the internet.  Be the most
61640 +aggressive in taking advantage of the commercial possibilities of
61641 +decentralized internet development, and add value through your branded
61642 +integration that you sell as an operating system.  Let your competitors
61643 +be the ones to compete against the entire internet by themselves.  Be
61644 +hip, get with the new economic trend, before your competitors do.  Send
61645 +email to reiser@namesys.com
61646 +
61647 +Hans Reiser was the primary architect of Reiser4, but a whole team
61648 +chipped their ideas in.  He invested everything he had into Namesys
61649 +for 5.5 dark years of no money before Reiser3 finally started to work well
61650 +enough to bring in money.  He owns the copyright.
61651 +
61652 +DARPA was the primary sponsor of Reiser4.  DARPA does not endorse
61653 +Reiser4, it merely sponsors it.  DARPA is, in solely Hans's personal
61654 +opinion, unique in its willingness to invest into things more
61655 +theoretical than the VC community can readily understand, and more
61656 +longterm than allows them to be sure that they will be the ones to
61657 +extract the economic benefits from.  DARPA also integrated us into a
61658 +security community that transformed our security worldview.
61659 +
61660 +Vladimir Saveliev is our lead programmer, with us from the beginning,
61661 +and he worked long hours writing the cleanest code.  This is why he is
61662 +now the lead programmer after years of commitment to our work.  He
61663 +always made the effort to be the best he could be, and to make his
61664 +code the best that it could be.  What resulted was quite remarkable. I
61665 +don't think that money can ever motivate someone to work the way he
61666 +did, he is one of the most selfless men I know.
61667 +
61668 +Alexander Lyamin was our sysadmin, and helped to educate us in
61669 +security issues.  Moscow State University and IMT were very generous
61670 +in the internet access they provided us, and in lots of other little
61671 +ways that a generous institution can be.
61672 +
61673 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61674 +locking code, the block allocator, and finished the flushing code.
61675 +His code is always crystal clean and well structured.
61676 +
61677 +Nikita Danilov wrote the core of the balancing code, the core of the
61678 +plugins code, and the directory code.  He worked a steady pace of long
61679 +hours that produced a whole lot of well abstracted code.  He is our
61680 +senior computer scientist.
61681 +
61682 +Vladimir Demidov wrote the parser.  Writing an in kernel parser is
61683 +something very few persons have the skills for, and it is thanks to
61684 +him that we can say that the parser is really not so big compared to
61685 +various bits of our other code, and making a parser work in the kernel
61686 +was not so complicated as everyone would imagine mainly because it was
61687 +him doing it...
61688 +
61689 +Joshua McDonald wrote the transaction manager, and the flush code.
61690 +The flush code unexpectedly turned out to be extremely hairy for reasons
61691 +you can read about on our web page, and he did a great job on an
61692 +extremely difficult task.
61693 +
61694 +Nina Reiser handled our accounting, government relations, and much
61695 +more.
61696 +
61697 +Ramon Reiser developed our website.
61698 +
61699 +Beverly Palmer drew our graphics.
61700 +
61701 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61702 +and worked with Umka on developing libreiser4 and userspace plugins.
61703 +
61704 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61705 +userspace tools (reiser4progs).
61706 +
61707 +Oleg Drokin (aka Green) is the release manager who fixes everything.
61708 +It is so nice to have someone like that on the team.  He (plus Chris
61709 +and Jeff) make it possible for the entire rest of the Namesys team to
61710 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also.  It
61711 +is just amazing to watch his talent for spotting bugs in action.
61712 +
61713 +Edward Shishkin wrote cryptcompress file plugin (which manages files
61714 +built of encrypted and(or) compressed bodies) and other plugins related
61715 +to transparent encryption and compression support.
61716 diff -urN linux-2.6.35.orig/fs/reiser4/reiser4.h linux-2.6.35/fs/reiser4/reiser4.h
61717 --- linux-2.6.35.orig/fs/reiser4/reiser4.h      1970-01-01 01:00:00.000000000 +0100
61718 +++ linux-2.6.35/fs/reiser4/reiser4.h   2010-08-04 15:44:57.000000000 +0200
61719 @@ -0,0 +1,259 @@
61720 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61721 + * reiser4/README */
61722 +
61723 +/* definitions of common constants used by reiser4 */
61724 +
61725 +#if !defined( __REISER4_H__ )
61726 +#define __REISER4_H__
61727 +
61728 +#include <asm/param.h>         /* for HZ */
61729 +#include <linux/errno.h>
61730 +#include <linux/types.h>
61731 +#include <linux/fs.h>
61732 +#include <linux/hardirq.h>
61733 +#include <linux/sched.h>
61734 +
61735 +/*
61736 + * reiser4 compilation options.
61737 + */
61738 +
61739 +#if defined(CONFIG_REISER4_DEBUG)
61740 +/* turn on assertion checks */
61741 +#define REISER4_DEBUG (1)
61742 +#else
61743 +#define REISER4_DEBUG (0)
61744 +#endif
61745 +
61746 +#define REISER4_SHA256 (0)
61747 +
61748 +/*
61749 + * Turn on large keys mode. In this mode (which is default), reiser4 key has 4
61750 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
61751 + * components. Additional component, referred to as "ordering" is used to
61752 + * order items from which given object is composed of. As such, ordering is
61753 + * placed between locality and objectid. For directory item ordering contains
61754 + * initial prefix of the file name this item is for. This sorts all directory
61755 + * items within given directory lexicographically (but see
61756 + * fibration.[ch]). For file body and stat-data, ordering contains initial
61757 + * prefix of the name file was initially created with. In the common case
61758 + * (files with single name) this allows to order file bodies and stat-datas in
61759 + * the same order as their respective directory entries, thus speeding up
61760 + * readdir.
61761 + *
61762 + * Note, that kernel can only mount file system with the same key size as one
61763 + * it is compiled for, so flipping this option may render your data
61764 + * inaccessible.
61765 + */
61766 +#define REISER4_LARGE_KEY (1)
61767 +/*#define REISER4_LARGE_KEY (0)*/
61768 +
61769 +/*#define GUESS_EXISTS 1*/
61770 +
61771 +/*
61772 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61773 + * option
61774 + */
61775 +
61776 +extern const char *REISER4_SUPER_MAGIC_STRING;
61777 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61778 +                                        * beginning of device */
61779 +
61780 +/* here go tunable parameters that are not worth special entry in kernel
61781 +   configuration */
61782 +
61783 +/* default number of slots in coord-by-key caches */
61784 +#define CBK_CACHE_SLOTS    (16)
61785 +/* how many elementary tree operation to carry on the next level */
61786 +#define CARRIES_POOL_SIZE        (5)
61787 +/* size of pool of preallocated nodes for carry process. */
61788 +#define NODES_LOCKED_POOL_SIZE   (5)
61789 +
61790 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61791 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61792 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61793 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61794 +
61795 +/* we are supporting reservation of disk space on uid basis */
61796 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61797 +/* we are supporting reservation of disk space for groups */
61798 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61799 +/* we are supporting reservation of disk space for root */
61800 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61801 +/* we use rapid flush mode, see flush.c for comments.  */
61802 +#define REISER4_USE_RAPID_FLUSH (1)
61803 +
61804 +/*
61805 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61806 + */
61807 +#define REISER4_USE_ENTD (1)
61808 +
61809 +/* key allocation is Plan-A */
61810 +#define REISER4_PLANA_KEY_ALLOCATION (1)
61811 +/* key allocation follows good old 3.x scheme */
61812 +#define REISER4_3_5_KEY_ALLOCATION (0)
61813 +
61814 +/* size of hash-table for znodes */
61815 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61816 +
61817 +/* number of buckets in lnode hash-table */
61818 +#define LNODE_HTABLE_BUCKETS (1024)
61819 +
61820 +/* some ridiculously high maximal limit on height of znode tree. This
61821 +    is used in declaration of various per level arrays and
61822 +    to allocate statistics gathering array for per-level stats. */
61823 +#define REISER4_MAX_ZTREE_HEIGHT     (8)
61824 +
61825 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61826 +
61827 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
61828 +   sequential search is on average faster than binary. This is because
61829 +   of better optimization and because sequential search is more CPU
61830 +   cache friendly. This number (25) was found by experiments on dual AMD
61831 +   Athlon(tm), 1400MHz.
61832 +
61833 +   NOTE: testing in kernel has shown that binary search is more effective than
61834 +   implied by results of the user level benchmarking. Probably because in the
61835 +   node keys are separated by other data. So value was adjusted after few
61836 +   tests. More thorough tuning is needed.
61837 +*/
61838 +#define REISER4_SEQ_SEARCH_BREAK      (3)
61839 +
61840 +/* don't allow tree to be lower than this */
61841 +#define REISER4_MIN_TREE_HEIGHT       (TWIG_LEVEL)
61842 +
61843 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61844 + * available memory. */
61845 +/* Default value of maximal atom size. Can be overwritten by
61846 +   tmgr.atom_max_size mount option. By default infinity. */
61847 +#define REISER4_ATOM_MAX_SIZE         ((unsigned)(~0))
61848 +
61849 +/* Default value of maximal atom age (in jiffies). After reaching this age
61850 +   atom will be forced to commit, either synchronously or asynchronously. Can
61851 +   be overwritten by tmgr.atom_max_age mount option. */
61852 +#define REISER4_ATOM_MAX_AGE          (600 * HZ)
61853 +
61854 +/* sleeping period for ktxnmrgd */
61855 +#define REISER4_TXNMGR_TIMEOUT  (5 * HZ)
61856 +
61857 +/* timeout to wait for ent thread in writepage. Nominally 3 ms (0 if HZ < 334). */
61858 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
61859 +
61860 +/* start complaining after that many restarts in coord_by_key().
61861 +
61862 +   This either means incredibly heavy contention for this part of a tree, or
61863 +   some corruption or bug.
61864 +*/
61865 +#define REISER4_CBK_ITERATIONS_LIMIT  (100)
61866 +
61867 +/* return -EIO after that many iterations in coord_by_key().
61868 +
61869 +   I have witnessed more than 800 iterations (in 30 thread test) before cbk
61870 +   finished. --nikita
61871 +*/
61872 +#define REISER4_MAX_CBK_ITERATIONS    500000
61873 +
61874 +/* put a per-inode limit on maximal number of directory entries with identical
61875 +   keys in hashed directory.
61876 +
61877 +   Disable this until inheritance interfaces stabilize: we need some way to
61878 +   set per directory limit.
61879 +*/
61880 +#define REISER4_USE_COLLISION_LIMIT    (0)
61881 +
61882 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level
61883 +   blocks it will force them to be relocated. */
61884 +#define FLUSH_RELOCATE_THRESHOLD 64
61885 +/* If flush can find a block allocation closer than at most
61886 +   FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that position.
61887 + */
61888 +#define FLUSH_RELOCATE_DISTANCE  64
61889 +
61890 +/* If we have written this much or more blocks before encountering busy jnode
61891 +   in flush list - abort flushing hoping that next time we get called
61892 +   this jnode will be clean already, and we will save some seeks. */
61893 +#define FLUSH_WRITTEN_THRESHOLD 50
61894 +
61895 +/* The maximum number of nodes to scan left on a level during flush. */
61896 +#define FLUSH_SCAN_MAXNODES 10000
61897 +
61898 +/* per-atom limit of flushers */
61899 +#define ATOM_MAX_FLUSHERS (1)
61900 +
61901 +/* default tracing buffer size */
61902 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
61903 +
61904 +/* what size units of IO we would like cp, etc., to use, in writing to
61905 +   reiser4. In bytes.
61906 +
61907 +   Can be overwritten by optimal_io_size mount option.
61908 +*/
61909 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
61910 +
61911 +/* see comments in inode.c:oid_to_uino() */
61912 +#define REISER4_UINO_SHIFT (1 << 30)
61913 +
61914 +/* Mark function argument as unused to avoid compiler warnings. */
61915 +#define UNUSED_ARG __attribute__((unused))
61916 +
61917 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
61918 +#define NONNULL __attribute__((nonnull))
61919 +#else
61920 +#define NONNULL
61921 +#endif
61922 +
61923 +/* master super block offset in bytes.*/
61924 +#define REISER4_MASTER_OFFSET 65536
61925 +
61926 +/* size of VFS block */
61927 +#define VFS_BLKSIZE 512
61928 +/* number of bits in size of VFS block (512==2^9) */
61929 +#define VFS_BLKSIZE_BITS 9
61930 +
61931 +#define REISER4_I reiser4_inode_data
61932 +
61933 +/* implication */
61934 +#define ergo(antecedent, consequent) (!(antecedent) || (consequent))
61935 +/* logical equivalence */
61936 +#define equi(p1, p2) (ergo((p1), (p2)) && ergo((p2), (p1)))
61937 +
61938 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
61939 +
61940 +#define NOT_YET                       (0)
61941 +
61942 +/** Reiser4 specific error codes **/
61943 +
61944 +#define REISER4_ERROR_CODE_BASE 10000
61945 +
61946 +/* Neighbor is not available (side neighbor or parent) */
61947 +#define E_NO_NEIGHBOR  (REISER4_ERROR_CODE_BASE)
61948 +
61949 +/* Node was not found in cache */
61950 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
61951 +
61952 +/* node has no free space enough for completion of balancing operation */
61953 +#define E_NODE_FULL    (REISER4_ERROR_CODE_BASE + 2)
61954 +
61955 +/* repeat operation */
61956 +#define E_REPEAT       (REISER4_ERROR_CODE_BASE + 3)
61957 +
61958 +/* deadlock happens */
61959 +#define E_DEADLOCK     (REISER4_ERROR_CODE_BASE + 4)
61960 +
61961 +/* operation cannot be performed, because it would block and non-blocking mode
61962 + * was requested. */
61963 +#define E_BLOCK        (REISER4_ERROR_CODE_BASE + 5)
61964 +
61965 +/* wait some event (depends on context), then repeat */
61966 +#define E_WAIT         (REISER4_ERROR_CODE_BASE + 6)
61967 +
61968 +#endif                         /* __REISER4_H__ */
61969 +
61970 +/* Make Linus happy.
61971 +   Local variables:
61972 +   c-indentation-style: "K&R"
61973 +   mode-name: "LC"
61974 +   c-basic-offset: 8
61975 +   tab-width: 8
61976 +   fill-column: 120
61977 +   End:
61978 +*/
61979 diff -urN linux-2.6.35.orig/fs/reiser4/safe_link.c linux-2.6.35/fs/reiser4/safe_link.c
61980 --- linux-2.6.35.orig/fs/reiser4/safe_link.c    1970-01-01 01:00:00.000000000 +0100
61981 +++ linux-2.6.35/fs/reiser4/safe_link.c 2010-08-04 15:44:57.000000000 +0200
61982 @@ -0,0 +1,354 @@
61983 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
61984 + * reiser4/README */
61985 +
61986 +/* Safe-links. */
61987 +
61988 +/*
61989 + * Safe-links are used to maintain file system consistency during operations
61990 + * that span multiple transactions. For example:
61991 + *
61992 + *     1. Unlink. UNIX supports "open-but-unlinked" files, that is files
61993 + *     without user-visible names in the file system, but still opened by some
61994 + *     active process. What happens here is that unlink proper (i.e., removal
61995 + *     of the last file name) and file deletion (truncate of file body to zero
61996 + *     and deletion of stat-data, that happens when last file descriptor is
61997 + *     closed), may belong to different transactions T1 and T2. If a crash
61998 + *     happens after T1 commit, but before T2 commit, on-disk file system has
61999 + *     a file without name, that is, disk space leak.
62000 + *
62001 + *     2. Truncate. Truncate of large file may span multiple transactions. If
62002 + *     system crashes while truncate was in-progress, file is left partially
62003 + *     truncated, which violates "atomicity guarantees" of reiser4, viz. that
62004 + *     every system call is atomic.
62005 + *
62006 + * Safe-links address both above cases. Basically, safe-link is a way to post
62007 + * some operation to be executed during commit of some other transaction than
62008 + * current one. (Another way to look at the safe-link is to interpret it as a
62009 + * logical logging.)
62010 + *
62011 + * Specifically, at the beginning of unlink safe-link is inserted in the
62012 + * tree. This safe-link is normally removed by file deletion code (during
62013 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
62014 + * normally removed when truncate operation is finished.
62015 + *
62016 + * This means, that in the case of "clean umount" there are no safe-links in
62017 + * the tree. If safe-links are observed during mount, it means that (a) system
62018 + * was terminated abnormally, and (b) safe-links correspond to the "pending"
62019 + * (i.e., not finished) operations that were in-progress during system
62020 + * termination. Each safe-link records enough information to complete
62021 + * corresponding operation, and mount simply "replays" them (hence, the
62022 + * analogy with the logical logging).
62023 + *
62024 + * Safe-links are implemented as blackbox items (see
62025 + * plugin/item/blackbox.[ch]).
62026 + *
62027 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
62028 + * list" there.
62029 + */
62030 +
62031 +#include "safe_link.h"
62032 +#include "debug.h"
62033 +#include "inode.h"
62034 +
62035 +#include "plugin/item/blackbox.h"
62036 +
62037 +#include <linux/fs.h>
62038 +
62039 +/*
62040 + * On-disk format of safe-link body; @size is stored only for SAFE_TRUNCATE.
62041 + */
62042 +typedef struct safelink {
62043 +       reiser4_key sdkey;      /* key of stat-data for the file safe-link is
62044 +                                * for */
62045 +       d64 size;               /* final size for truncate (little-endian) */
62046 +} safelink_t;
62047 +
62048 +/*
62049 + * Locality under which safe-link items are stored: the objectid taken from
62050 + * the disk-format plugin's root directory key, plus one.
62051 + */
62052 +static oid_t safe_link_locality(reiser4_tree * tree)
62053 +{
62054 +       return get_key_objectid(get_super_private(tree->super)->df_plug->
62055 +                               root_dir_key(tree->super)) + 1;
62056 +}
62057 +
62058 +/*
62059 +  Construct a key for the safe-link. Key has the following format:
62060 +
62061 +|        60     | 4 |        64        | 4 |      60       |         64       |
62062 ++---------------+---+------------------+---+---------------+------------------+
62063 +|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
62064 ++---------------+---+------------------+---+---------------+------------------+
62065 +|                   |                  |                   |                  |
62066 +|     8 bytes       |     8 bytes      |      8 bytes      |      8 bytes     |
62067 +
62068 +   This is in large keys format. In small keys format second 8 byte chunk is
62069 +   out. Locality is a constant returned by safe_link_locality(). objectid is
62070 +   an oid of a file on which operation protected by this safe-link is
62071 +   performed. link-type is used to distinguish safe-links for different
62072 +   operations; it is stored in the key offset. Returns @key.
62073 +
62074 + */
62075 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
62076 +                                  reiser4_safe_link_t link, reiser4_key * key)
62077 +{
62078 +       reiser4_key_init(key);
62079 +       set_key_locality(key, safe_link_locality(tree));
62080 +       set_key_objectid(key, oid);
62081 +       set_key_offset(key, link);
62082 +       return key;
62083 +}
62084 +
62085 +/*
62086 + * how much disk space is necessary to insert and remove (in the
62087 + * error-handling path) safe-link.
62088 + */
62089 +static __u64 safe_link_tograb(reiser4_tree * tree)
62090 +{
62091 +       return
62092 +           /* insert safe link */
62093 +           estimate_one_insert_item(tree) +
62094 +           /* remove safe link */
62095 +           estimate_one_item_removal(tree) +
62096 +           /* drill to the leaf level during insertion */
62097 +           1 + estimate_one_insert_item(tree) +
62098 +           /*
62099 +            * possible update of existing safe-link. Actually, if
62100 +            * safe-link existed already (we failed to remove it), then no
62101 +            * insertion is necessary, so this term is already "covered",
62102 +            * but for simplicity let's leave it.
62103 +            */
62104 +           1;
62105 +}
62106 +
62107 +/*
62108 + * grab enough disk space to insert and remove (in the error-handling path)
62109 + * safe-link. Returns the result of reiser4_grab_reserved().
62110 + */
62111 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
62112 +{
62113 +       int result;
62114 +
62115 +       grab_space_enable();
62116 +       /* The sbinfo->delete_mutex can be taken here.
62117 +        * safe_link_release() should be called before leaving reiser4
62118 +        * context. */
62119 +       result =
62120 +           reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
62121 +       grab_space_enable();
62122 +       return result;
62123 +}
62124 +
62125 +/*
62126 + * release (via reiser4_release_reserved()) space taken by safe_link_grab().
62127 + */
62128 +void safe_link_release(reiser4_tree * tree)
62129 +{
62130 +       reiser4_release_reserved(tree->super);
62131 +}
62132 +
62133 +/*
62134 + * insert safe-link for operation @link on @inode; update it if it exists.
62135 + */
62136 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
62137 +{
62138 +       reiser4_key key;
62139 +       safelink_t sl;
62140 +       int length;
62141 +       int result;
62142 +       reiser4_tree *tree;
62143 +
62144 +       build_sd_key(inode, &sl.sdkey);
62145 +       length = sizeof sl.sdkey;
62146 +
62147 +       if (link == SAFE_TRUNCATE) {
62148 +               /*
62149 +                * for truncate we have to store final file length also,
62150 +                * expand item.
62151 +                */
62152 +               length += sizeof(sl.size);
62153 +               put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
62154 +       }
62155 +       tree = reiser4_tree_by_inode(inode);
62156 +       build_link_key(tree, get_inode_oid(inode), link, &key);
62157 +
62158 +       result = store_black_box(tree, &key, &sl, length);
62159 +       if (result == -EEXIST)
62160 +               result = update_black_box(tree, &key, &sl, length);
62161 +       return result;
62162 +}
62163 +
62164 +/*
62165 + * remove safe-link corresponding to the operation @link on the object with
62166 + * objectid @oid from the tree.
62167 + */
62168 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
62169 +{
62170 +       reiser4_key key;
62171 +
62172 +       return kill_black_box(tree, build_link_key(tree, oid, link, &key));
62173 +}
62174 +
62175 +/*
62176 + * in-memory structure to keep information extracted from safe-link. This is
62177 + * used to iterate over all safe-links.
62178 + */
62179 +struct safe_link_context {
62180 +       reiser4_tree *tree;     /* internal tree */
62181 +       reiser4_key key;        /* safe-link key */
62182 +       reiser4_key sdkey;      /* key of object stat-data */
62183 +       reiser4_safe_link_t link;       /* safe-link type */
62184 +       oid_t oid;              /* object oid */
62185 +       __u64 size;             /* final size; set only for SAFE_TRUNCATE */
62186 +};
62187 +
62188 +/*
62189 + * start iteration: seed @ctx->key with max objectid/offset in safe-link locality.
62190 + */
62191 +static void safe_link_iter_begin(reiser4_tree * tree,
62192 +                                struct safe_link_context *ctx)
62193 +{
62194 +       ctx->tree = tree;
62195 +       reiser4_key_init(&ctx->key);
62196 +       set_key_locality(&ctx->key, safe_link_locality(tree));
62197 +       set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
62198 +       set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
62199 +}
62200 +
62201 +/*
62202 + * load next safe-link into @ctx; @ctx->size is set only for SAFE_TRUNCATE.
62203 + */
62204 +static int safe_link_iter_next(struct safe_link_context *ctx)
62205 +{
62206 +       int result;
62207 +       safelink_t sl;
62208 +
62209 +       result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
62210 +       if (result == 0) {
62211 +               ctx->oid = get_key_objectid(&ctx->key);
62212 +               ctx->link = get_key_offset(&ctx->key);
62213 +               ctx->sdkey = sl.sdkey;
62214 +               if (ctx->link == SAFE_TRUNCATE)
62215 +                       ctx->size = le64_to_cpu(get_unaligned(&sl.size));
62216 +       }
62217 +       return result;
62218 +}
62219 +
62220 +/*
62221 + * true once @ctx->key has left the safe-link locality (no safe-links remain).
62222 + */
62223 +static int safe_link_iter_finished(struct safe_link_context *ctx)
62224 +{
62225 +       return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
62226 +}
62227 +
62228 +/*
62229 + * finish safe-link iteration (currently a no-op).
62230 + */
62231 +static void safe_link_iter_end(struct safe_link_context *ctx)
62232 +{
62233 +       /* nothing special */
62234 +}
62235 +
62236 +/*
62237 + * process single safe-link.
62238 + */
62239 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
62240 +                           reiser4_key * sdkey, oid_t oid, __u64 size)
62241 +{
62242 +       struct inode *inode;
62243 +       int result;
62244 +
62245 +       /*
62246 +        * obtain object inode by reiser4_iget(), then call object plugin
62247 +        * ->safelink() method to do actual work, then delete safe-link on
62248 +        * success.
62249 +        */
62250 +       inode = reiser4_iget(super, sdkey, 1);
62251 +       if (!IS_ERR(inode)) {
62252 +               file_plugin *fplug;
62253 +
62254 +               fplug = inode_file_plugin(inode);
62255 +               assert("nikita-3428", fplug != NULL);
62256 +               assert("", oid == get_inode_oid(inode));
62257 +               if (fplug->safelink != NULL) {
62258 +                       /* reiser4_txn_restart_current is not necessary because
62259 +                        * mounting is single-threaded. However, without it
62260 +                        * deadlock detection code will complain (see
62261 +                        * nikita-3361). */
62262 +                       reiser4_txn_restart_current();
62263 +                       result = fplug->safelink(inode, link, size);
62264 +               } else {
62265 +                       warning("nikita-3430",
62266 +                               "Cannot handle safelink for %lli",
62267 +                               (unsigned long long)oid);
62268 +                       reiser4_print_key("key", sdkey);
62269 +                       result = 0;
62270 +               }
62271 +               if (result != 0) {
62272 +                       warning("nikita-3431",
62273 +                               "Error processing safelink for %lli: %i",
62274 +                               (unsigned long long)oid, result);
62275 +               }
62276 +               reiser4_iget_complete(inode);
62277 +               iput(inode);
62278 +               if (result == 0) {
62279 +                       result = safe_link_grab(reiser4_get_tree(super),
62280 +                                               BA_CAN_COMMIT);
62281 +                       if (result == 0)
62282 +                               result =
62283 +                                   safe_link_del(reiser4_get_tree(super), oid,
62284 +                                                 link);
62285 +                       safe_link_release(reiser4_get_tree(super));
62286 +                       /*
62287 +                        * restart transaction: if there was large number of
62288 +                        * safe-links, their processing may fail to fit into
62289 +                        * single transaction.
62290 +                        */
62291 +                       if (result == 0)
62292 +                               reiser4_txn_restart_current();
62293 +               }
62294 +       } else
62295 +               result = PTR_ERR(inode);
62296 +       return result;
62297 +}
62298 +
62299 +/*
62300 + * iterate over all safe-links in the file system, processing them one by one.
62301 + */
62302 +int process_safelinks(struct super_block *super)
62303 +{
62304 +       struct safe_link_context ctx;
62305 +       int result;
62306 +
62307 +       if (rofs_super(super))
62308 +               /* do nothing on the read-only file system */
62309 +               return 0;
62310 +       safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62311 +       result = 0;
62312 +       do {
62313 +               result = safe_link_iter_next(&ctx);
62314 +               if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62315 +                       result = 0;
62316 +                       break;
62317 +               }
62318 +               if (result == 0)
62319 +                       result = process_safelink(super, ctx.link,
62320 +                                                 &ctx.sdkey, ctx.oid,
62321 +                                                 ctx.size);
62322 +       } while (result == 0);
62323 +       safe_link_iter_end(&ctx);
62324 +       return result;
62325 +}
62326 +
62327 +/* Make Linus happy.
62328 +   Local variables:
62329 +   c-indentation-style: "K&R"
62330 +   mode-name: "LC"
62331 +   c-basic-offset: 8
62332 +   tab-width: 8
62333 +   fill-column: 120
62334 +   scroll-step: 1
62335 +   End:
62336 +*/
62337 diff -urN linux-2.6.35.orig/fs/reiser4/safe_link.h linux-2.6.35/fs/reiser4/safe_link.h
62338 --- linux-2.6.35.orig/fs/reiser4/safe_link.h    1970-01-01 01:00:00.000000000 +0100
62339 +++ linux-2.6.35/fs/reiser4/safe_link.h 2010-08-04 15:44:57.000000000 +0200
62340 @@ -0,0 +1,29 @@
62341 +/* Copyright 2003 by Hans Reiser, licensing governed by
62342 + * reiser4/README */
62343 +
62344 +/* Safe-links. See safe_link.c for details. */
62345 +
62346 +#if !defined(__FS_SAFE_LINK_H__)
62347 +#define __FS_SAFE_LINK_H__
62348 +
62349 +#include "tree.h"
62350 +
62351 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62352 +void safe_link_release(reiser4_tree * tree);
62353 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62354 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62355 +
62356 +int process_safelinks(struct super_block *super);
62357 +
62358 +/* __FS_SAFE_LINK_H__ */
62359 +#endif
62360 +
62361 +/* Make Linus happy.
62362 +   Local variables:
62363 +   c-indentation-style: "K&R"
62364 +   mode-name: "LC"
62365 +   c-basic-offset: 8
62366 +   tab-width: 8
62367 +   fill-column: 120
62368 +   End:
62369 +*/
62370 diff -urN linux-2.6.35.orig/fs/reiser4/seal.c linux-2.6.35/fs/reiser4/seal.c
62371 --- linux-2.6.35.orig/fs/reiser4/seal.c 1970-01-01 01:00:00.000000000 +0100
62372 +++ linux-2.6.35/fs/reiser4/seal.c      2010-08-04 15:44:57.000000000 +0200
62373 @@ -0,0 +1,218 @@
62374 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62375 +/* Seals implementation. */
62376 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
62377 +   allowing to bypass tree traversal. But normal usage of coords implies that
62378 +   node pointed to by coord is locked, whereas seals don't keep a lock (or
62379 +   even a reference) to znode. Instead, each znode contains a version number,
62380 +   increased on each znode modification. This version number is copied into a
62381 +   seal when seal is created. Later, one can "validate" seal by calling
62382 +   reiser4_seal_validate(). If znode is in cache and its version number is
62383 +   still the same, seal is "pristine" and coord associated with it can be
62384 +   re-used immediately.
62385 +
62386 +   If, on the other hand, znode is out of cache, or it is obviously different
62387 +   one from the znode seal was initially attached to (for example, it is on
62388 +   the different level, or is being removed from the tree), seal is
62389 +   irreparably invalid ("burned") and tree traversal has to be repeated.
62390 +
62391 +   Otherwise, there is some hope, that while znode was modified (and seal was
62392 +   "broken" as a result), key attached to the seal is still in the node. This
62393 +   is checked by first comparing this key with delimiting keys of node and, if
62394 +   key is ok, doing intra-node lookup.
62395 +
62396 +   Znode version is maintained in the following way:
62397 +
62398 +   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
62399 +   znode_epoch is incremented and its new value is stored in ->version field
62400 +   of new znode. Whenever znode is dirtied (which means it was probably
62401 +   modified), znode_epoch is also incremented and its new value is stored in
62402 +   znode->version. This is done so, because just incrementing znode->version
62403 +   on each update is not enough: it may so happen, that znode get deleted, new
62404 +   znode is allocated for the same disk block and gets the same version
62405 +   counter, tricking seal code into false positive.
62406 +*/
62407 +
62408 +#include "forward.h"
62409 +#include "debug.h"
62410 +#include "key.h"
62411 +#include "coord.h"
62412 +#include "seal.h"
62413 +#include "plugin/item/item.h"
62414 +#include "plugin/node/node.h"
62415 +#include "jnode.h"
62416 +#include "znode.h"
62417 +#include "super.h"
62418 +
62419 +static znode *seal_node(const seal_t *seal);
62420 +static int seal_matches(const seal_t *seal, znode * node);
62421 +
62422 +/* initialise seal. This can be called several times on the same seal. @coord
62423 +   and @key can be NULL.  */
62424 +void reiser4_seal_init(seal_t *seal /* seal to initialise */ ,
62425 +                      const coord_t *coord /* coord @seal will be
62426 +                                             * attached to */ ,
62427 +                      const reiser4_key * key UNUSED_ARG /* key @seal will be
62428 +                                                          * attached to */ )
62429 +{
62430 +       assert("nikita-1886", seal != NULL);
62431 +       memset(seal, 0, sizeof *seal);
62432 +       if (coord != NULL) {
62433 +               znode *node;
62434 +
62435 +               node = coord->node;
62436 +               assert("nikita-1987", node != NULL);
62437 +               spin_lock_znode(node);
62438 +               seal->version = node->version;
62439 +               assert("nikita-1988", seal->version != 0);
62440 +               seal->block = *znode_get_block(node);
62441 +#if REISER4_DEBUG
62442 +               seal->coord1 = *coord;
62443 +               if (key != NULL)
62444 +                       seal->key = *key;
62445 +#endif
62446 +               spin_unlock_znode(node);
62447 +       }
62448 +}
62449 +
62450 +/* finish with seal */
62451 +void reiser4_seal_done(seal_t *seal/* seal to clear */)
62452 +{
62453 +       assert("nikita-1887", seal != NULL);
62454 +       seal->version = 0;
62455 +}
62456 +
62457 +/* true if seal was initialised */
62458 +int reiser4_seal_is_set(const seal_t *seal/* seal to query */)
62459 +{
62460 +       assert("nikita-1890", seal != NULL);
62461 +       return seal->version != 0;
62462 +}
62463 +
62464 +#if REISER4_DEBUG
62465 +/* helper function for reiser4_seal_validate(). It checks that item at @coord
62466 + * has expected key. This is to detect cases where node was modified but wasn't
62467 + * marked dirty. */
62468 +static inline int check_seal_match(const coord_t *coord /* coord to check */ ,
62469 +                                  const reiser4_key * k/* expected key */)
62470 +{
62471 +       reiser4_key ukey;
62472 +
62473 +       return (coord->between != AT_UNIT) ||
62474 +           /* FIXME-VS: we only can compare keys for items whose units
62475 +              represent exactly one key */
62476 +           ((coord_is_existing_unit(coord))
62477 +            && (item_is_extent(coord)
62478 +                || keyeq(k, unit_key_by_coord(coord, &ukey))))
62479 +           || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62480 +               && keyge(k, unit_key_by_coord(coord, &ukey)));
62481 +}
62482 +#endif
62483 +
62484 +/* this is used by reiser4_seal_validate. It accepts return value of
62485 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
62486 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
62487 + * reiser4_seal_validate returns -E_REPEAT and caller will call tree search.
62488 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
62489 + * distinguish between -EINVAL and -E_REPEAT. */
62490 +static int should_repeat(int return_code)
62491 +{
62492 +       return return_code == -EINVAL;
62493 +}
62494 +
62495 +/* (re-)validate seal.
62496 +
62497 +   Checks whether seal is pristine, and tries to revalidate it if possible.
62498 +
62499 +   If seal was burned, or broken irreparably, return -E_REPEAT.
62500 +
62501 +   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
62502 +   looking for is in range of keys covered by the sealed node, but item wasn't
62503 +   found by node ->lookup() method. Alternative is to return -ENOENT in this
62504 +   case, but this would complicate callers logic.
62505 +
62506 +*/
62507 +int reiser4_seal_validate(seal_t *seal /* seal to validate */,
62508 +                         coord_t *coord /* coord to validate against */,
62509 +                         const reiser4_key * key /* key to validate against */,
62510 +                         lock_handle * lh /* resulting lock handle */,
62511 +                         znode_lock_mode mode /* lock mode */,
62512 +                         znode_lock_request request/* locking priority */)
62513 +{
62514 +       znode *node;
62515 +       int result;
62516 +
62517 +       assert("nikita-1889", seal != NULL);
62518 +       assert("nikita-1881", reiser4_seal_is_set(seal));
62519 +       assert("nikita-1882", key != NULL);
62520 +       assert("nikita-1883", coord != NULL);
62521 +       assert("nikita-1884", lh != NULL);
62522 +       assert("nikita-1885", keyeq(&seal->key, key));
62523 +       assert("nikita-1989", coords_equal(&seal->coord1, coord));
62524 +
62525 +       /* obtain znode by block number */
62526 +       node = seal_node(seal);
62527 +       if (node != NULL) {
62528 +               /* znode was in cache, lock it */
62529 +               result = longterm_lock_znode(lh, node, mode, request);
62530 +               zput(node);
62531 +               if (result == 0) {
62532 +                       if (seal_matches(seal, node)) {
62533 +                               /* if seal version and znode version
62534 +                                  coincide */
62535 +                               ON_DEBUG(coord_update_v(coord));
62536 +                               assert("nikita-1990",
62537 +                                      node == seal->coord1.node);
62538 +                               assert("nikita-1898",
62539 +                                      WITH_DATA_RET(coord->node, 1,
62540 +                                                    check_seal_match(coord,
62541 +                                                                     key)));
62542 +                       } else
62543 +                               result = RETERR(-E_REPEAT);
62544 +               }
62545 +               if (result != 0) {
62546 +                       if (should_repeat(result))
62547 +                               result = RETERR(-E_REPEAT);
62548 +                       /* unlock node on failure */
62549 +                       done_lh(lh);
62550 +               }
62551 +       } else {
62552 +               /* znode wasn't in cache */
62553 +               result = RETERR(-E_REPEAT);
62554 +       }
62555 +       return result;
62556 +}
62557 +
62558 +/* helper functions */
62559 +
62560 +/* obtain reference to znode seal points to, if in cache */
62561 +static znode *seal_node(const seal_t *seal/* seal to query */)
62562 +{
62563 +       assert("nikita-1891", seal != NULL);
62564 +       return zlook(current_tree, &seal->block);
62565 +}
62566 +
62567 +/* true if @seal version and @node version coincide */
62568 +static int seal_matches(const seal_t *seal /* seal to check */ ,
62569 +                       znode * node/* node to check */)
62570 +{
62571 +       int result;
62572 +
62573 +       assert("nikita-1991", seal != NULL);
62574 +       assert("nikita-1993", node != NULL);
62575 +
62576 +       spin_lock_znode(node);
62577 +       result = (seal->version == node->version);
62578 +       spin_unlock_znode(node);
62579 +       return result;
62580 +}
62581 +
62582 +/* Make Linus happy.
62583 +   Local variables:
62584 +   c-indentation-style: "K&R"
62585 +   mode-name: "LC"
62586 +   c-basic-offset: 8
62587 +   tab-width: 8
62588 +   fill-column: 120
62589 +   scroll-step: 1
62590 +   End:
62591 +*/
62592 diff -urN linux-2.6.35.orig/fs/reiser4/seal.h linux-2.6.35/fs/reiser4/seal.h
62593 --- linux-2.6.35.orig/fs/reiser4/seal.h 1970-01-01 01:00:00.000000000 +0100
62594 +++ linux-2.6.35/fs/reiser4/seal.h      2010-08-04 15:44:57.000000000 +0200
62595 @@ -0,0 +1,49 @@
62596 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62597 +
62598 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62599 +
62600 +#ifndef __SEAL_H__
62601 +#define __SEAL_H__
62602 +
62603 +#include "forward.h"
62604 +#include "debug.h"
62605 +#include "dformat.h"
62606 +#include "key.h"
62607 +#include "coord.h"
62608 +
62609 +/* for __u?? types */
62610 +/*#include <linux/types.h>*/
62611 +
62612 +/* seal. See comment at the top of seal.c */
62613 +typedef struct seal_s {
62614 +       /* version of znode recorded at the time of seal creation */
62615 +       __u64 version;
62616 +       /* block number of znode attached to this seal */
62617 +       reiser4_block_nr block;
62618 +#if REISER4_DEBUG
62619 +       /* coord this seal is attached to. For debugging. */
62620 +       coord_t coord1;
62621 +       /* key this seal is attached to. For debugging. */
62622 +       reiser4_key key;
62623 +#endif
62624 +} seal_t;
62625 +
62626 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62627 +extern void reiser4_seal_done(seal_t *);
62628 +extern int reiser4_seal_is_set(const seal_t *);
62629 +extern int reiser4_seal_validate(seal_t *, coord_t *,
62630 +                        const reiser4_key *, lock_handle * ,
62631 +                        znode_lock_mode mode, znode_lock_request request);
62632 +
62633 +/* __SEAL_H__ */
62634 +#endif
62635 +
62636 +/* Make Linus happy.
62637 +   Local variables:
62638 +   c-indentation-style: "K&R"
62639 +   mode-name: "LC"
62640 +   c-basic-offset: 8
62641 +   tab-width: 8
62642 +   fill-column: 120
62643 +   End:
62644 +*/
62645 diff -urN linux-2.6.35.orig/fs/reiser4/search.c linux-2.6.35/fs/reiser4/search.c
62646 --- linux-2.6.35.orig/fs/reiser4/search.c       1970-01-01 01:00:00.000000000 +0100
62647 +++ linux-2.6.35/fs/reiser4/search.c    2010-08-04 15:44:57.000000000 +0200
62648 @@ -0,0 +1,1612 @@
62649 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62650 + * reiser4/README */
62651 +
62652 +#include "forward.h"
62653 +#include "debug.h"
62654 +#include "dformat.h"
62655 +#include "key.h"
62656 +#include "coord.h"
62657 +#include "seal.h"
62658 +#include "plugin/item/item.h"
62659 +#include "plugin/node/node.h"
62660 +#include "plugin/plugin.h"
62661 +#include "jnode.h"
62662 +#include "znode.h"
62663 +#include "block_alloc.h"
62664 +#include "tree_walk.h"
62665 +#include "tree.h"
62666 +#include "reiser4.h"
62667 +#include "super.h"
62668 +#include "inode.h"
62669 +
62670 +#include <linux/slab.h>
62671 +
62672 +static const char *bias_name(lookup_bias bias);
62673 +
62674 +/* tree searching algorithm, intranode searching algorithms are in
62675 +   plugin/node/ */
62676 +
62677 +/* tree lookup cache
62678 + *
62679 + * The coord by key cache consists of small list of recently accessed nodes
62680 + * maintained according to the LRU discipline. Before doing real top-to-down
62681 + * tree traversal this cache is scanned for nodes that can contain key
62682 + * requested.
62683 + *
62684 + * The efficiency of coord cache depends heavily on locality of reference for
62685 + * tree accesses. Our user level simulations show reasonably good hit ratios
62686 + * for coord cache under most loads so far.
62687 + */
62688 +
62689 +/* Initialise coord cache slot */
62690 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
62691 +{
62692 +       assert("nikita-345", slot != NULL);
62693 +
62694 +       INIT_LIST_HEAD(&slot->lru);
62695 +       slot->node = NULL;
62696 +}
62697 +
62698 +/* Initialize coord cache */
62699 +int cbk_cache_init(cbk_cache * cache/* cache to init */)
62700 +{
62701 +       int i;
62702 +
62703 +       assert("nikita-346", cache != NULL);
62704 +
62705 +       cache->slot =
62706 +               kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
62707 +                       reiser4_ctx_gfp_mask_get());
62708 +       if (cache->slot == NULL)
62709 +               return RETERR(-ENOMEM);
62710 +
62711 +       INIT_LIST_HEAD(&cache->lru);
62712 +       for (i = 0; i < cache->nr_slots; ++i) {
62713 +               cbk_cache_init_slot(cache->slot + i);
62714 +               list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62715 +       }
62716 +       rwlock_init(&cache->guard);
62717 +       return 0;
62718 +}
62719 +
62720 +/* free cbk cache data */
62721 +void cbk_cache_done(cbk_cache * cache/* cache to release */)
62722 +{
62723 +       assert("nikita-2493", cache != NULL);
62724 +       if (cache->slot != NULL) {
62725 +               kfree(cache->slot);
62726 +               cache->slot = NULL;
62727 +       }
62728 +}
62729 +
62730 +/* macro to iterate over all cbk cache slots */
62731 +#define for_all_slots(cache, slot)                                       \
62732 +       for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
62733 +            &(cache)->lru != &(slot)->lru;                               \
62734 +            (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
62735 +
62736 +#if REISER4_DEBUG
62737 +/* this function assures that [cbk-cache-invariant] invariant holds */
62738 +static int cbk_cache_invariant(const cbk_cache * cache)
62739 +{
62740 +       cbk_cache_slot *slot;
62741 +       int result;
62742 +       int unused;
62743 +
62744 +       if (cache->nr_slots == 0)
62745 +               return 1;
62746 +
62747 +       assert("nikita-2469", cache != NULL);
62748 +       unused = 0;
62749 +       result = 1;
62750 +       read_lock(&((cbk_cache *)cache)->guard);
62751 +       for_all_slots(cache, slot) {
62752 +               /* in LRU first go all `used' slots followed by `unused' */
62753 +               if (unused && (slot->node != NULL))
62754 +                       result = 0;
62755 +               if (slot->node == NULL)
62756 +                       unused = 1;
62757 +               else {
62758 +                       cbk_cache_slot *scan;
62759 +
62760 +                       /* all cached nodes are different */
62761 +                       scan = slot;
62762 +                       while (result) {
62763 +                               scan = list_entry(scan->lru.next,
62764 +                                                 cbk_cache_slot, lru);
62765 +                               if (&cache->lru == &scan->lru)
62766 +                                       break;
62767 +                               if (slot->node == scan->node)
62768 +                                       result = 0;
62769 +                       }
62770 +               }
62771 +               if (!result)
62772 +                       break;
62773 +       }
62774 +       read_unlock(&((cbk_cache *)cache)->guard);
62775 +       return result;
62776 +}
62777 +
62778 +#endif
62779 +
62780 +/* Remove references, if any, to @node from coord cache */
62781 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62782 +                         reiser4_tree * tree/* tree to remove node from */)
62783 +{
62784 +       cbk_cache_slot *slot;
62785 +       cbk_cache *cache;
62786 +       int i;
62787 +
62788 +       assert("nikita-350", node != NULL);
62789 +       assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62790 +
62791 +       cache = &tree->cbk_cache;
62792 +       assert("nikita-2470", cbk_cache_invariant(cache));
62793 +
62794 +       write_lock(&(cache->guard));
62795 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62796 +               if (slot->node == node) {
62797 +                       list_move_tail(&slot->lru, &cache->lru);
62798 +                       slot->node = NULL;
62799 +                       break;
62800 +               }
62801 +       }
62802 +       write_unlock(&(cache->guard));
62803 +       assert("nikita-2471", cbk_cache_invariant(cache));
62804 +}
62805 +
62806 +/* add to the cbk-cache in the "tree" information about "node". This
62807 +    can actually be update of existing slot in a cache. */
62808 +static void cbk_cache_add(const znode * node/* node to add to the cache */)
62809 +{
62810 +       cbk_cache *cache;
62811 +
62812 +       cbk_cache_slot *slot;
62813 +       int i;
62814 +
62815 +       assert("nikita-352", node != NULL);
62816 +
62817 +       cache = &znode_get_tree(node)->cbk_cache;
62818 +       assert("nikita-2472", cbk_cache_invariant(cache));
62819 +
62820 +       if (cache->nr_slots == 0)
62821 +               return;
62822 +
62823 +       write_lock(&(cache->guard));
62824 +       /* find slot to update/add */
62825 +       for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
62826 +               /* oops, this node is already in a cache */
62827 +               if (slot->node == node)
62828 +                       break;
62829 +       }
62830 +       /* if all slots are used, reuse least recently used one */
62831 +       if (i == cache->nr_slots) {
62832 +               slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
62833 +               slot->node = (znode *) node;
62834 +       }
62835 +       list_move(&slot->lru, &cache->lru);
62836 +       write_unlock(&(cache->guard));
62837 +       assert("nikita-2473", cbk_cache_invariant(cache));
62838 +}
62839 +
62840 +static int setup_delimiting_keys(cbk_handle * h);
62841 +static lookup_result coord_by_handle(cbk_handle * handle);
62842 +static lookup_result traverse_tree(cbk_handle * h);
62843 +static int cbk_cache_search(cbk_handle * h);
62844 +
62845 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
62846 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
62847 +
62848 +/* helper functions */
62849 +
62850 +static void update_stale_dk(reiser4_tree * tree, znode * node);
62851 +
62852 +/* release parent node during traversal */
62853 +static void put_parent(cbk_handle * h);
62854 +/* check consistency of fields */
62855 +static int sanity_check(cbk_handle * h);
62856 +/* release resources in handle */
62857 +static void hput(cbk_handle * h);
62858 +
62859 +static level_lookup_result search_to_left(cbk_handle * h);
62860 +
62861 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
62862 + * cbk_handle */
62863 +static cbk_handle *cbk_pack(cbk_handle * handle,
62864 +                           reiser4_tree * tree,
62865 +                           const reiser4_key * key,
62866 +                           coord_t *coord,
62867 +                           lock_handle * active_lh,
62868 +                           lock_handle * parent_lh,
62869 +                           znode_lock_mode lock_mode,
62870 +                           lookup_bias bias,
62871 +                           tree_level lock_level,
62872 +                           tree_level stop_level,
62873 +                           __u32 flags, ra_info_t *info)
62874 +{
62875 +       memset(handle, 0, sizeof *handle);
62876 +
62877 +       handle->tree = tree;
62878 +       handle->key = key;
62879 +       handle->lock_mode = lock_mode;
62880 +       handle->bias = bias;
62881 +       handle->lock_level = lock_level;
62882 +       handle->stop_level = stop_level;
62883 +       handle->coord = coord;
62884 +       /* set flags. See comment in tree.h:cbk_flags */
62885 +       handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
62886 +
62887 +       handle->active_lh = active_lh;
62888 +       handle->parent_lh = parent_lh;
62889 +       handle->ra_info = info;
62890 +       return handle;
62891 +}
62892 +
62893 +/* main tree lookup procedure
62894 +
62895 +   Check coord cache. If key we are looking for is not found there, call
62896 +   traverse_tree() to do real tree traversal.
62897 +
62898 +   As we have extents on the twig level, @lock_level and @stop_level can
62899 +   be different from LEAF_LEVEL and each other.
62900 +
62901 +   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
62902 +   long term locks) while calling this.
62903 +*/
62904 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
62905 +                                                * in. Usually this tree is
62906 +                                                * part of file-system
62907 +                                                * super-block */ ,
62908 +                          const reiser4_key * key /* key to look for */ ,
62909 +                          coord_t *coord       /* where to store found
62910 +                                                * position in a tree. Fields
62911 +                                                * in "coord" are only valid if
62912 +                                                * coord_by_key() returned
62913 +                                                * "CBK_COORD_FOUND" */ ,
62914 +                          lock_handle * lh,    /* resulting lock handle */
62915 +                          znode_lock_mode lock_mode    /* type of lock we
62916 +                                                        * want on node. Pass
62917 +                                                        * ZNODE_READ_LOCK here
62918 +                                                        * if you only want to
62919 +                                                        * read item found and
62920 +                                                        * ZNODE_WRITE_LOCK if
62921 +                                                        * you want to modify
62922 +                                                        * it */ ,
62923 +                          lookup_bias bias     /* what to return if coord
62924 +                                                * with exactly the @key is
62925 +                                                * not in the tree */ ,
62926 +                          tree_level lock_level/* tree level where to start
62927 +                                                * taking @lock type of
62928 +                                                * locks */ ,
62929 +                          tree_level stop_level/* tree level to stop. Pass
62930 +                                                * LEAF_LEVEL or TWIG_LEVEL
62931 +                                                * here Item being looked
62932 +                                                * for has to be between
62933 +                                                * @lock_level and
62934 +                                                * @stop_level, inclusive */ ,
62935 +                          __u32 flags /* search flags */ ,
62936 +                          ra_info_t *
62937 +                          info
62938 +                          /* information about desired tree traversal
62939 +                           * readahead */
62940 +                          )
62941 +{
62942 +       cbk_handle handle;
62943 +       lock_handle parent_lh;
62944 +       lookup_result result;
62945 +
62946 +       init_lh(lh);
62947 +       init_lh(&parent_lh);
62948 +
62949 +       assert("nikita-3023", reiser4_schedulable());
62950 +
62951 +       assert("nikita-353", tree != NULL);
62952 +       assert("nikita-354", key != NULL);
62953 +       assert("nikita-355", coord != NULL);
62954 +       assert("nikita-356", (bias == FIND_EXACT)
62955 +              || (bias == FIND_MAX_NOT_MORE_THAN));
62956 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
62957 +       /* no locks can be held during tree traversal */
62958 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62959 +
62960 +       cbk_pack(&handle,
62961 +                tree,
62962 +                key,
62963 +                coord,
62964 +                lh,
62965 +                &parent_lh,
62966 +                lock_mode, bias, lock_level, stop_level, flags, info);
62967 +
62968 +       result = coord_by_handle(&handle);
62969 +       assert("nikita-3247",
62970 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
62971 +       return result;
62972 +}
62973 +
62974 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
62975 + * from tree root. */
62976 +lookup_result reiser4_object_lookup(struct inode *object,
62977 +                                   const reiser4_key * key,
62978 +                                   coord_t *coord,
62979 +                                   lock_handle * lh,
62980 +                                   znode_lock_mode lock_mode,
62981 +                                   lookup_bias bias,
62982 +                                   tree_level lock_level,
62983 +                                   tree_level stop_level, __u32 flags,
62984 +                                   ra_info_t *info)
62985 +{
62986 +       cbk_handle handle;
62987 +       lock_handle parent_lh;
62988 +       lookup_result result;
62989 +
62990 +       init_lh(lh);
62991 +       init_lh(&parent_lh);
62992 +
62993 +       assert("nikita-3023", reiser4_schedulable());
62994 +
62995 +       assert("nikita-354", key != NULL);
62996 +       assert("nikita-355", coord != NULL);
62997 +       assert("nikita-356", (bias == FIND_EXACT)
62998 +              || (bias == FIND_MAX_NOT_MORE_THAN));
62999 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
63000 +       /* no locks can be held during tree search by key */
63001 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
63002 +
63003 +       cbk_pack(&handle,
63004 +                object != NULL ? reiser4_tree_by_inode(object) : current_tree,
63005 +                key,
63006 +                coord,
63007 +                lh,
63008 +                &parent_lh,
63009 +                lock_mode, bias, lock_level, stop_level, flags, info);
63010 +       handle.object = object;
63011 +
63012 +       result = coord_by_handle(&handle);
63013 +       assert("nikita-3247",
63014 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
63015 +       return result;
63016 +}
63017 +
63018 +/* lookup by cbk_handle. Common part of coord_by_key() and
63019 +   reiser4_object_lookup(). */
63020 +static lookup_result coord_by_handle(cbk_handle * handle)
63021 +{
63022 +       /*
63023 +        * first check cbk_cache (which is look-aside cache for our tree) and
63024 +        * if this fails, start traversal.
63025 +        */
63026 +       /* first check whether "key" is in cache of recent lookups. */
63027 +       if (cbk_cache_search(handle) == 0)
63028 +               return handle->result;
63029 +       else
63030 +               return traverse_tree(handle);
63031 +}
63032 +
63033 +/* Execute actor for each item (or unit, depending on @through_units_p),
63034 +   starting from @coord, right-ward, until either:
63035 +
63036 +   - end of the tree is reached
63037 +   - unformatted node is met
63038 +   - error occurred
63039 +   - @actor returns 0 or less
63040 +
63041 +   Error code, or last actor return value is returned.
63042 +
63043 +   This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
63044 +   sequence of entries with identical keys and the like.
63045 +*/
63046 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
63047 +                        coord_t *coord /* coord to start from */ ,
63048 +                        lock_handle * lh /* lock handle to start with and to
63049 +                                          * update along the way */ ,
63050 +                        tree_iterate_actor_t actor /* function to call on each
63051 +                                                    * item/unit */ ,
63052 +                        void *arg /* argument to pass to @actor */ ,
63053 +                        znode_lock_mode mode /* lock mode on scanned nodes */ ,
63054 +                        int through_units_p /* call @actor on each item or on
63055 +                                             * each unit */ )
63056 +{
63057 +       int result;
63058 +
63059 +       assert("nikita-1143", tree != NULL);
63060 +       assert("nikita-1145", coord != NULL);
63061 +       assert("nikita-1146", lh != NULL);
63062 +       assert("nikita-1147", actor != NULL);
63063 +
63064 +       result = zload(coord->node);
63065 +       coord_clear_iplug(coord);
63066 +       if (result != 0)
63067 +               return result;
63068 +       if (!coord_is_existing_unit(coord)) {
63069 +               zrelse(coord->node);
63070 +               return -ENOENT;
63071 +       }
63072 +       while ((result = actor(tree, coord, lh, arg)) > 0) {
63073 +               /* move further  */
63074 +               if ((through_units_p && coord_next_unit(coord)) ||
63075 +                   (!through_units_p && coord_next_item(coord))) {
63076 +                       do {
63077 +                               lock_handle couple;
63078 +
63079 +                               /* move to the next node  */
63080 +                               init_lh(&couple);
63081 +                               result =
63082 +                                   reiser4_get_right_neighbor(&couple,
63083 +                                                              coord->node,
63084 +                                                              (int)mode,
63085 +                                                              GN_CAN_USE_UPPER_LEVELS);
63086 +                               zrelse(coord->node);
63087 +                               if (result == 0) {
63088 +
63089 +                                       result = zload(couple.node);
63090 +                                       if (result != 0) {
63091 +                                               done_lh(&couple);
63092 +                                               return result;
63093 +                                       }
63094 +
63095 +                                       coord_init_first_unit(coord,
63096 +                                                             couple.node);
63097 +                                       done_lh(lh);
63098 +                                       move_lh(lh, &couple);
63099 +                               } else
63100 +                                       return result;
63101 +                       } while (node_is_empty(coord->node));
63102 +               }
63103 +
63104 +               assert("nikita-1149", coord_is_existing_unit(coord));
63105 +       }
63106 +       zrelse(coord->node);
63107 +       return result;
63108 +}
63109 +
63110 +/* return locked uber znode for @tree */
63111 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
63112 +                  znode_lock_request pri, lock_handle * lh)
63113 +{
63114 +       int result;
63115 +
63116 +       result = longterm_lock_znode(lh, tree->uber, mode, pri);
63117 +       return result;
63118 +}
63119 +
63120 +/* true if @key is strictly within @node
63121 +
63122 +   we are looking for possibly non-unique key and its item is at the edge of
63123 +   @node. Maybe it is in the neighbor.
63124 +*/
63125 +static int znode_contains_key_strict(znode * node      /* node to check key
63126 +                                                        * against */ ,
63127 +                                    const reiser4_key *
63128 +                                    key /* key to check */ ,
63129 +                                    int isunique)
63130 +{
63131 +       int answer;
63132 +
63133 +       assert("nikita-1760", node != NULL);
63134 +       assert("nikita-1722", key != NULL);
63135 +
63136 +       if (keyge(key, &node->rd_key))
63137 +               return 0;
63138 +
63139 +       answer = keycmp(&node->ld_key, key);
63140 +
63141 +       if (isunique)
63142 +               return answer != GREATER_THAN;
63143 +       else
63144 +               return answer == LESS_THAN;
63145 +}
63146 +
63147 +/*
63148 + * Virtual Root (vroot) code.
63149 + *
63150 + *     For given file system object (e.g., regular file or directory) let's
63151 + *     define its "virtual root" as lowest in the tree (that is, furthest
63152 + *     from the tree root) node such that all body items of said object are
63153 + *     located in a tree rooted at this node.
63154 + *
63155 + *     Once vroot of object is found all tree lookups for items within body of
63156 + *     this object ("object lookups") can be started from its vroot rather
63157 + *     than from real root. This has the following advantages:
63158 + *
63159 + *         1. amount of nodes traversed during lookup (and, hence, amount of
63160 + *         key comparisons made) decreases, and
63161 + *
63162 + *         2. contention on tree root is decreased. This latter was actually
63163 + *         motivating reason behind vroot, because spin lock of root node,
63164 + *         which is taken when acquiring long-term lock on root node is the
63165 + *         hottest lock in the reiser4.
63166 + *
63167 + * How to find vroot.
63168 + *
63169 + *     When vroot of object F is not yet determined, all object lookups start
63170 + *     from the root of the tree. At each tree level during traversal we have
63171 + *     a node N such that a key we are looking for (which is the key inside
63172 + *     object's body) is located within N. In function handle_vroot() called
63173 + *     from cbk_level_lookup() we check whether N is possible vroot for
63174 + *     F. Check is trivial---if neither leftmost nor rightmost item of N
63175 + *     belongs to F (and we already have helpful ->owns_item() method of
63176 + *     object plugin for this), then N is possible vroot of F. This, of
63177 + *     course, relies on the assumption that each object occupies contiguous
63178 + *     range of keys in the tree.
63179 + *
63180 + *     Thus, traversing tree downward and checking each node as we go, we can
63181 + *     find lowest such node, which, by definition, is vroot.
63182 + *
63183 + * How to track vroot.
63184 + *
63185 + *     Nohow. If actual vroot changes, next object lookup will just restart
63186 + *     from the actual tree root, refreshing object's vroot along the way.
63187 + *
63188 + */
63189 +
63190 +/*
63191 + * Set @node as vroot of @object unless an edge item of @node belongs to @object.
63192 + */
63193 +static void handle_vroot(struct inode *object, znode * node)
63194 +{
63195 +       file_plugin *fplug;
63196 +       coord_t coord;
63197 +
63198 +       fplug = inode_file_plugin(object);
63199 +       assert("nikita-3353", fplug != NULL);
63200 +       assert("nikita-3354", fplug->owns_item != NULL);
63201 +
63202 +       if (unlikely(node_is_empty(node))) /* no edge items to check */
63203 +               return;
63204 +
63205 +       coord_init_first_unit(&coord, node);
63206 +       /*
63207 +        * if leftmost item of @node belongs to @object, we cannot be sure
63208 +        * that @node is vroot of @object, because some items of @object are
63209 +        * probably in the sub-tree rooted at the left neighbor of @node.
63210 +        */
63211 +       if (fplug->owns_item(object, &coord))
63212 +               return;
63213 +       coord_init_last_unit(&coord, node);
63214 +       /* same reasoning for the rightmost item and the right neighbor */
63215 +       if (fplug->owns_item(object, &coord))
63216 +               return;
63217 +       /* otherwise, @node is possible vroot of @object */
63218 +       inode_set_vroot(object, node);
63219 +}
63220 +
63221 +/*
63222 + * helper function used by traverse_tree() to start tree traversal not from the
63223 + * tree root, but from @h->object's vroot, if possible.
63224 + */
63225 +static int prepare_object_lookup(cbk_handle * h)
63226 +{
63227 +       znode *vroot;
63228 +       int result;
63229 +
63230 +       vroot = inode_get_vroot(h->object);
63231 +       if (vroot == NULL) {
63232 +               /*
63233 +                * object doesn't have known vroot, start from real tree root.
63234 +                */
63235 +               return LOOKUP_CONT;
63236 +       }
63237 +
63238 +       h->level = znode_get_level(vroot);
63239 +       /* take a long-term lock on vroot */
63240 +       h->result = longterm_lock_znode(h->active_lh, vroot,
63241 +                                       cbk_lock_mode(h->level, h),
63242 +                                       ZNODE_LOCK_LOPRI);
63243 +       result = LOOKUP_REST; /* stays so unless lookup in vroot is performed */
63244 +       if (h->result == 0) {
63245 +               int isunique;
63246 +               int inside;
63247 +
63248 +               isunique = h->flags & CBK_UNIQUE;
63249 +               /* check that key is inside vroot and vroot is not dying */
63250 +               read_lock_dk(h->tree);
63251 +               inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
63252 +                         !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
63253 +               read_unlock_dk(h->tree);
63254 +               if (inside) {
63255 +                       h->result = zload(vroot);
63256 +                       if (h->result == 0) {
63257 +                               /* search for key in vroot. */
63258 +                               result = cbk_node_lookup(h);
63259 +                               zrelse(vroot);  /* not active_lh->node: it may have changed */
63260 +                               if (h->active_lh->node != vroot) {
63261 +                                       result = LOOKUP_REST;
63262 +                               } else if (result == LOOKUP_CONT) {
63263 +                                       move_lh(h->parent_lh, h->active_lh);
63264 +                                       h->flags &= ~CBK_DKSET;
63265 +                               }
63266 +                       }
63267 +               }
63268 +       }
63269 +
63270 +       zput(vroot);
63271 +
63272 +       if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63273 +               hput(h);
63274 +       return result;
63275 +}
63276 +
63277 +/* main driver of tree traversal: handles starting (from vroot or uber znode),
63278 +    restarts, iteration limits, error handling and completion */
63279 +static lookup_result traverse_tree(cbk_handle * h/* search handle */)
63280 +{
63281 +       int done;
63282 +       int iterations;
63283 +       int vroot_used;
63284 +
63285 +       assert("nikita-365", h != NULL);
63286 +       assert("nikita-366", h->tree != NULL);
63287 +       assert("nikita-367", h->key != NULL);
63288 +       assert("nikita-368", h->coord != NULL);
63289 +       assert("nikita-369", (h->bias == FIND_EXACT)
63290 +              || (h->bias == FIND_MAX_NOT_MORE_THAN));
63291 +       assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63292 +       assert("nikita-2949", !(h->flags & CBK_DKSET));
63293 +       assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63294 +
63295 +       done = 0;
63296 +       iterations = 0;
63297 +       vroot_used = 0;
63298 +
63299 +       /* loop for restarts */
63300 +restart:
63301 +
63302 +       assert("nikita-3024", reiser4_schedulable());
63303 +
63304 +       h->result = CBK_COORD_FOUND;
63305 +       /* connect_znode() needs it */
63306 +       h->ld_key = *reiser4_min_key();
63307 +       h->rd_key = *reiser4_max_key();
63308 +       h->flags |= CBK_DKSET;
63309 +       h->error = NULL;
63310 +
63311 +       if (!vroot_used && h->object != NULL) {
63312 +               vroot_used = 1; /* vroot is tried at most once per call */
63313 +               done = prepare_object_lookup(h);
63314 +               if (done == LOOKUP_REST)
63315 +                       goto restart;
63316 +               else if (done == LOOKUP_DONE)
63317 +                       return h->result;
63318 +       }
63319 +       if (h->parent_lh->node == NULL) {
63320 +               done =
63321 +                   get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63322 +                                  h->parent_lh);
63323 +
63324 +               assert("nikita-1637", done != -E_DEADLOCK);
63325 +
63326 +               h->block = h->tree->root_block;
63327 +               h->level = h->tree->height;
63328 +               h->coord->node = h->parent_lh->node;
63329 +
63330 +               if (done != 0)
63331 +                       return done;
63332 +       }
63333 +
63334 +       /* loop descending a tree */
63335 +       while (!done) {
63336 +
63337 +               if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63338 +                            IS_POW(iterations))) {
63339 +                       warning("nikita-1481", "Too many iterations: %i",
63340 +                               iterations);
63341 +                       reiser4_print_key("key", h->key);
63342 +                       ++iterations;
63343 +               } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63344 +                       h->error =
63345 +                           "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63346 +                       h->result = RETERR(-EIO);
63347 +                       break;
63348 +               }
63349 +               switch (cbk_level_lookup(h)) {
63350 +               case LOOKUP_CONT:
63351 +                       move_lh(h->parent_lh, h->active_lh);
63352 +                       continue;
63353 +               default:
63354 +                       wrong_return_value("nikita-372", "cbk_level"); /* falls through */
63355 +               case LOOKUP_DONE:
63356 +                       done = 1;
63357 +                       break;
63358 +               case LOOKUP_REST:
63359 +                       hput(h);
63360 +                       /* deadlock avoidance is normal case. */
63361 +                       if (h->result != -E_DEADLOCK)
63362 +                               ++iterations;
63363 +                       reiser4_preempt_point();
63364 +                       goto restart;
63365 +               }
63366 +       }
63367 +       /* that's all. The rest is error handling */
63368 +       if (unlikely(h->error != NULL)) {
63369 +               warning("nikita-373", "%s: level: %i, "
63370 +                       "lock_level: %i, stop_level: %i "
63371 +                       "lock_mode: %s, bias: %s",
63372 +                       h->error, h->level, h->lock_level, h->stop_level,
63373 +                       lock_mode_name(h->lock_mode), bias_name(h->bias));
63374 +               reiser4_print_address("block", &h->block);
63375 +               reiser4_print_key("key", h->key);
63376 +               print_coord_content("coord", h->coord);
63377 +       }
63378 +       /* `unlikely' error case */
63379 +       if (unlikely(IS_CBKERR(h->result))) {
63380 +               /* failure. do cleanup */
63381 +               hput(h);
63382 +       } else {
63383 +               assert("nikita-1605", WITH_DATA_RET
63384 +                      (h->coord->node, 1,
63385 +                       ergo((h->result == CBK_COORD_FOUND) &&
63386 +                            (h->bias == FIND_EXACT) &&
63387 +                            (!node_is_empty(h->coord->node)),
63388 +                            coord_is_existing_item(h->coord))));
63389 +       }
63390 +       return h->result;
63391 +}
63392 +
63393 +/* find delimiting keys of child
63394 +
63395 +   Determine left and right delimiting keys for child pointed to by
63396 +   @parent_coord.
63397 +   Caller must hold the dk lock of @parent's tree (asserted below).
63398 +*/
63399 +static void find_child_delimiting_keys(znode * parent  /* parent znode, passed
63400 +                                                        * locked */ ,
63401 +                                      const coord_t *parent_coord
63402 +                                                       /* coord where pointer
63403 +                                                        * to child is stored
63404 +                                                        */ ,
63405 +                                      reiser4_key * ld /* where to store left
63406 +                                                        * delimiting key */ ,
63407 +                                      reiser4_key * rd /* where to store right
63408 +                                                        * delimiting key */ )
63409 +{
63410 +       coord_t neighbor;
63411 +
63412 +       assert("nikita-1484", parent != NULL);
63413 +       assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63414 +
63415 +       coord_dup(&neighbor, parent_coord);
63416 +
63417 +       if (neighbor.between == AT_UNIT)
63418 +               /* imitate item ->lookup() behavior. */
63419 +               neighbor.between = AFTER_UNIT;
63420 +
63421 +       if (coord_set_to_left(&neighbor) == 0)
63422 +               unit_key_by_coord(&neighbor, ld);
63423 +       else {
63424 +               assert("nikita-14851", 0); /* this branch is not expected */
63425 +               *ld = *znode_get_ld_key(parent);
63426 +       }
63427 +
63428 +       coord_dup(&neighbor, parent_coord);
63429 +       if (neighbor.between == AT_UNIT)
63430 +               neighbor.between = AFTER_UNIT;
63431 +       if (coord_set_to_right(&neighbor) == 0)
63432 +               unit_key_by_coord(&neighbor, rd);
63433 +       else
63434 +               *rd = *znode_get_rd_key(parent);
63435 +}
63436 +
63437 +/*
63438 + * setup delimiting keys for a child
63439 + *
63440 + * @parent parent node
63441 + * @coord location in @parent where pointer to @child is
63442 + * @child child node
63443 + *
63444 + * Returns 1 if JNODE_DKSET was clear on the unlocked fast check, 0 otherwise.
63445 + */
63446 +int
63447 +set_child_delimiting_keys(znode * parent, const coord_t *coord, znode * child)
63448 +{
63449 +       reiser4_tree *tree;
63450 +
63451 +       assert("nikita-2952",
63452 +              znode_get_level(parent) == znode_get_level(coord->node));
63453 +
63454 +       /* fast check without taking dk lock. This is safe, because
63455 +        * JNODE_DKSET is never cleared once set. */
63456 +       if (!ZF_ISSET(child, JNODE_DKSET)) {
63457 +               tree = znode_get_tree(parent);
63458 +               write_lock_dk(tree);
63459 +               if (likely(!ZF_ISSET(child, JNODE_DKSET))) { /* recheck under lock */
63460 +                       find_child_delimiting_keys(parent, coord,
63461 +                                                  &child->ld_key,
63462 +                                                  &child->rd_key);
63463 +                       ON_DEBUG(child->ld_key_version =
63464 +                                atomic_inc_return(&delim_key_version);
63465 +                                child->rd_key_version =
63466 +                                atomic_inc_return(&delim_key_version););
63467 +                       ZF_SET(child, JNODE_DKSET);
63468 +               }
63469 +               write_unlock_dk(tree);
63470 +               return 1;
63471 +       }
63472 +       return 0;
63473 +}
63474 +
63475 +/* Perform tree lookup at one level. This is called from traverse_tree()
63476 +   function that drives lookup through tree and calls cbk_node_lookup() to
63477 +   perform lookup within one node.
63478 +
63479 +   See comments in the code.
63480 +*/
63481 +static level_lookup_result cbk_level_lookup(cbk_handle * h/* search handle */)
63482 +{
63483 +       int ret;
63484 +       int setdk;
63485 +       int ldkeyset = 0;
63486 +       reiser4_key ldkey;
63487 +       reiser4_key key;
63488 +       znode *active;
63489 +
63490 +       assert("nikita-3025", reiser4_schedulable());
63491 +
63492 +       /* acquire reference to @active node */
63493 +       active =
63494 +           zget(h->tree, &h->block, h->parent_lh->node, h->level,
63495 +                reiser4_ctx_gfp_mask_get());
63496 +
63497 +       if (IS_ERR(active)) {
63498 +               h->result = PTR_ERR(active);
63499 +               return LOOKUP_DONE;
63500 +       }
63501 +
63502 +       /* lock @active */
63503 +       h->result = longterm_lock_znode(h->active_lh,
63504 +                                       active,
63505 +                                       cbk_lock_mode(h->level, h),
63506 +                                       ZNODE_LOCK_LOPRI);
63507 +       /* longterm_lock_znode() acquires additional reference to znode (which
63508 +          will be later released by longterm_unlock_znode()). Release
63509 +          reference acquired by zget().
63510 +        */
63511 +       zput(active);
63512 +       if (unlikely(h->result != 0))
63513 +               goto fail_or_restart;
63514 +
63515 +       setdk = 0;
63516 +       /* if @active is accessed for the first time, setup delimiting keys on
63517 +          it. Delimiting keys are taken from the parent node. See
63518 +          setup_delimiting_keys() for details.
63519 +        */
63520 +       if (h->flags & CBK_DKSET) {
63521 +               setdk = setup_delimiting_keys(h);
63522 +               h->flags &= ~CBK_DKSET;
63523 +       } else {
63524 +               znode *parent;
63525 +
63526 +               parent = h->parent_lh->node;
63527 +               h->result = zload(parent);
63528 +               if (unlikely(h->result != 0))
63529 +                       goto fail_or_restart;
63530 +
63531 +               if (!ZF_ISSET(active, JNODE_DKSET))
63532 +                       setdk = set_child_delimiting_keys(parent,
63533 +                                                         h->coord, active);
63534 +               else {
63535 +                       read_lock_dk(h->tree);
63536 +                       find_child_delimiting_keys(parent, h->coord, &ldkey,
63537 +                                                  &key);
63538 +                       read_unlock_dk(h->tree);
63539 +                       ldkeyset = 1;
63540 +               }
63541 +               zrelse(parent);
63542 +       }
63543 +
63544 +       /* this is an ugly kludge. Reminder: this is necessary, because
63545 +          ->lookup() method returns coord with ->between field probably set
63546 +          to something different from AT_UNIT.
63547 +        */
63548 +       h->coord->between = AT_UNIT;
63549 +
63550 +       if (znode_just_created(active) && (h->coord->node != NULL)) {
63551 +               write_lock_tree(h->tree);
63552 +               /* if we are going to load znode right now, setup
63553 +                  ->in_parent: coord where pointer to this node is stored in
63554 +                  parent.
63555 +                */
63556 +               coord_to_parent_coord(h->coord, &active->in_parent);
63557 +               write_unlock_tree(h->tree);
63558 +       }
63559 +
63560 +       /* check connectedness without holding tree lock---false negatives
63561 +        * will be re-checked by connect_znode(), and false positives are
63562 +        * impossible---@active cannot suddenly turn into unconnected
63563 +        * state. */
63564 +       if (!znode_is_connected(active)) {
63565 +               h->result = connect_znode(h->coord, active);
63566 +               if (unlikely(h->result != 0)) {
63567 +                       put_parent(h);
63568 +                       goto fail_or_restart;
63569 +               }
63570 +       }
63571 +
63572 +       jload_prefetch(ZJNODE(active));
63573 +
63574 +       if (setdk)
63575 +               update_stale_dk(h->tree, active);
63576 +
63577 +       /* put_parent() cannot be called earlier, because connect_znode()
63578 +          assumes parent node is referenced; */
63579 +       put_parent(h);
63580 +
63581 +       if ((!znode_contains_key_lock(active, h->key) &&
63582 +            (h->flags & CBK_TRUST_DK))
63583 +           || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63584 +               /* 1. key was moved out of this node while this thread was
63585 +                  waiting for the lock. Restart. More elaborate solution is
63586 +                  to determine where key moved (to the left, or to the right)
63587 +                  and try to follow it through sibling pointers.
63588 +
63589 +                  2. or, node itself is going to be removed from the
63590 +                  tree. Release lock and restart.
63591 +                */
63592 +               h->result = -E_REPEAT;
63593 +       }
63594 +       if (h->result == -E_REPEAT)
63595 +               return LOOKUP_REST;
63596 +
63597 +       h->result = zload_ra(active, h->ra_info);
63598 +       if (h->result)
63599 +               return LOOKUP_DONE;
63600 +
63601 +       /* sanity checks */
63602 +       if (sanity_check(h)) {
63603 +               zrelse(active);
63604 +               return LOOKUP_DONE;
63605 +       }
63606 +
63607 +       /* check that key of leftmost item in the @active is the same as in
63608 +        * its parent */
63609 +       if (ldkeyset && !node_is_empty(active) &&
63610 +           !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63611 +               warning("vs-3533", "Keys are inconsistent. Fsck?");
63612 +               reiser4_print_key("inparent", &ldkey);
63613 +               reiser4_print_key("inchild", &key);
63614 +               h->result = RETERR(-EIO);
63615 +               zrelse(active);
63616 +               return LOOKUP_DONE;
63617 +       }
63618 +
63619 +       if (h->object != NULL)
63620 +               handle_vroot(h->object, active);
63621 +
63622 +       ret = cbk_node_lookup(h);
63623 +
63624 +       /* h->active_lh->node might change, but active is yet to be zrelsed */
63625 +       zrelse(active);
63626 +
63627 +       return ret;
63628 +
63629 +fail_or_restart: /* only -E_DEADLOCK restarts; other failures end lookup */
63630 +       if (h->result == -E_DEADLOCK)
63631 +               return LOOKUP_REST;
63632 +       return LOOKUP_DONE;
63633 +}
63634 +
63635 +#if REISER4_DEBUG
63636 +/* check left and right delimiting keys of a znode against its neighbors */
63637 +void check_dkeys(znode * node)
63638 +{
63639 +       znode *left;
63640 +       znode *right;
63641 +
63642 +       read_lock_tree(current_tree);
63643 +       read_lock_dk(current_tree);
63644 +
63645 +       assert("vs-1710", znode_is_any_locked(node));
63646 +       assert("vs-1197",
63647 +              !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63648 +
63649 +       left = node->left;
63650 +       right = node->right;
63651 +
63652 +       if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63653 +           && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63654 +               /* check left neighbor. Note that left neighbor is not locked,
63655 +                  so its delimiting keys might be wrong */
63656 +               assert("vs-1198",
63657 +                      (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63658 +                       || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63659 +
63660 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63661 +           && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63662 +               /* check right neighbor. Note that right neighbor is not
63663 +                  locked, so its delimiting keys might be wrong */
63664 +               assert("vs-1199",
63665 +                      (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63666 +                       || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63667 +
63668 +       read_unlock_dk(current_tree);
63669 +       read_unlock_tree(current_tree);
63670 +}
63671 +#endif
63672 +
63673 +/* true if @key is left delimiting key of @node (compared under dk read lock) */
63674 +static int key_is_ld(znode * node, const reiser4_key * key)
63675 +{
63676 +       int ld;
63677 +
63678 +       assert("nikita-1716", node != NULL);
63679 +       assert("nikita-1758", key != NULL);
63680 +
63681 +       read_lock_dk(znode_get_tree(node));
63682 +       assert("nikita-1759", znode_contains_key(node, key));
63683 +       ld = keyeq(znode_get_ld_key(node), key);
63684 +       read_unlock_dk(znode_get_tree(node));
63685 +       return ld;
63686 +}
63687 +
63688 +/* Process one node during tree traversal.
63689 +   Called by cbk_level_lookup(), prepare_object_lookup() and
63690 +   cbk_cache_scan_slots(). */
63691 +static level_lookup_result cbk_node_lookup(cbk_handle * h/* search handle */)
63692 +{
63693 +       /* node plugin of @active */
63694 +       node_plugin *nplug;
63695 +       /* item plugin of item that was found */
63696 +       item_plugin *iplug;
63697 +       /* search bias */
63698 +       lookup_bias node_bias;
63699 +       /* node we are operating upon */
63700 +       znode *active;
63701 +       /* tree we are searching in */
63702 +       reiser4_tree *tree;
63703 +       /* result */
63704 +       int result;
63705 +
63706 +       assert("nikita-379", h != NULL);
63707 +
63708 +       active = h->active_lh->node;
63709 +       tree = h->tree; /* NOTE(review): not referenced again in this function */
63710 +
63711 +       nplug = active->nplug;
63712 +       assert("nikita-380", nplug != NULL);
63713 +
63714 +       ON_DEBUG(check_dkeys(active));
63715 +
63716 +       /* return item from "active" node with maximal key not greater than
63717 +          "key"  */
63718 +       node_bias = h->bias;
63719 +       result = nplug->lookup(active, h->key, node_bias, h->coord);
63720 +       if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63721 +               /* error occurred */
63722 +               h->result = result;
63723 +               return LOOKUP_DONE;
63724 +       }
63725 +       if (h->level == h->stop_level) {
63726 +               /* welcome to the stop level */
63727 +               assert("nikita-381", h->coord->node == active);
63728 +               if (result == NS_FOUND) {
63729 +                       /* success of tree lookup */
63730 +                       if (!(h->flags & CBK_UNIQUE)
63731 +                           && key_is_ld(active, h->key))
63732 +                               return search_to_left(h);
63733 +                       else
63734 +                               h->result = CBK_COORD_FOUND;
63735 +               } else {
63736 +                       h->result = CBK_COORD_NOTFOUND;
63737 +               }
63738 +               if (!(h->flags & CBK_IN_CACHE))
63739 +                       cbk_cache_add(active);
63740 +               return LOOKUP_DONE;
63741 +       }
63742 +
63743 +       if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63744 +               h->error = "not found on internal node";
63745 +               h->result = result;
63746 +               return LOOKUP_DONE;
63747 +       }
63748 +
63749 +       assert("vs-361", h->level > h->stop_level);
63750 +
63751 +       if (handle_eottl(h, &result)) {
63752 +               assert("vs-1674", (result == LOOKUP_DONE ||
63753 +                                  result == LOOKUP_REST));
63754 +               return result;
63755 +       }
63756 +
63757 +       /* go down to next level: fetch child block number into h->block */
63758 +       check_me("vs-12", zload(h->coord->node) == 0);
63759 +       assert("nikita-2116", item_is_internal(h->coord));
63760 +       iplug = item_plugin_by_coord(h->coord);
63761 +       iplug->s.internal.down_link(h->coord, h->key, &h->block);
63762 +       zrelse(h->coord->node);
63763 +       --h->level;
63764 +       return LOOKUP_CONT;     /* continue */
63765 +}
63766 +
63767 +/* scan cbk_cache slots looking for a match for @h; 0 on hit, else error code */
63768 +static int cbk_cache_scan_slots(cbk_handle * h/* cbk handle */)
63769 +{
63770 +       level_lookup_result llr;
63771 +       znode *node;
63772 +       reiser4_tree *tree;
63773 +       cbk_cache_slot *slot;
63774 +       cbk_cache *cache;
63775 +       tree_level level;
63776 +       int isunique;
63777 +       const reiser4_key *key;
63778 +       int result;
63779 +
63780 +       assert("nikita-1317", h != NULL);
63781 +       assert("nikita-1315", h->tree != NULL);
63782 +       assert("nikita-1316", h->key != NULL);
63783 +
63784 +       tree = h->tree;
63785 +       cache = &tree->cbk_cache;
63786 +       if (cache->nr_slots == 0)
63787 +               /* size of cbk cache was set to 0 by mount time option. */
63788 +               return RETERR(-ENOENT);
63789 +
63790 +       assert("nikita-2474", cbk_cache_invariant(cache));
63791 +       node = NULL;            /* to keep gcc happy */
63792 +       level = h->level;
63793 +       key = h->key;
63794 +       isunique = h->flags & CBK_UNIQUE;
63795 +       result = RETERR(-ENOENT);
63796 +
63797 +       /*
63798 +        * this is time-critical function and dragons had, hence, been settled
63799 +        * here.
63800 +        *
63801 +        * Loop below scans cbk cache slots trying to find matching node with
63802 +        * suitable range of delimiting keys and located at the h->level.
63803 +        *
63804 +        * Scan is done under cbk cache spin lock that protects slot->node
63805 +        * pointers. If suitable node is found we want to pin it in
63806 +        * memory. But slot->node can point to the node with x_count 0
63807 +        * (unreferenced). Such node can be recycled at any moment, or can
63808 +        * already be in the process of being recycled (within jput()).
63809 +        *
63810 +        * As we found node in the cbk cache, it means that jput() hasn't yet
63811 +        * called cbk_cache_invalidate().
63812 +        *
63813 +        * We acquire reference to the node without holding tree lock, and
63814 +        * later, check node's RIP bit. This avoids races with jput().
63815 +        */
63816 +
63817 +       rcu_read_lock();
63818 +       read_lock(&((cbk_cache *)cache)->guard);
63819 +
63820 +       slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63821 +       slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63822 +       BUG_ON(&slot->lru != &cache->lru);/* next then prev must give list head */
63823 +       while (1) {
63824 +
63825 +               slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63826 +
63827 +               if (&cache->lru != &slot->lru)
63828 +                       node = slot->node;
63829 +               else
63830 +                       node = NULL;
63831 +
63832 +               if (unlikely(node == NULL))
63833 +                       break;
63834 +
63835 +               /*
63836 +                * this is (hopefully) the only place in the code where we are
63837 +                * working with delimiting keys without holding dk lock. This
63838 +                * is fine here, because this is only "guess" anyway---keys
63839 +                * are rechecked under dk lock below.
63840 +                */
63841 +               if (znode_get_level(node) == level &&
63842 +                   /* reiser4_min_key < key < reiser4_max_key */
63843 +                   znode_contains_key_strict(node, key, isunique)) {
63844 +                       zref(node);
63845 +                       result = 0;
63846 +                       spin_lock_prefetch(&tree->tree_lock);
63847 +                       break;
63848 +               }
63849 +       }
63850 +       read_unlock(&((cbk_cache *)cache)->guard);
63851 +
63852 +       assert("nikita-2475", cbk_cache_invariant(cache));
63853 +
63854 +       if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63855 +               result = -ENOENT;
63856 +
63857 +       rcu_read_unlock();
63858 +
63859 +       if (result != 0) {
63860 +               h->result = CBK_COORD_NOTFOUND;
63861 +               return RETERR(-ENOENT);
63862 +       }
63863 +
63864 +       result =
63865 +           longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
63866 +                               ZNODE_LOCK_LOPRI);
63867 +       zput(node);
63868 +       if (result != 0)
63869 +               return result;
63870 +       result = zload(node);
63871 +       if (result != 0)
63872 +               return result;
63873 +
63874 +       /* recheck keys, this time under dk lock */
63875 +       read_lock_dk(tree);
63876 +       result = (znode_contains_key_strict(node, key, isunique) &&
63877 +               !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
63878 +       read_unlock_dk(tree);
63879 +       if (result) {
63880 +               /* do lookup inside node */
63881 +               llr = cbk_node_lookup(h);
63882 +               /* if cbk_node_lookup() wandered to another node (due to eottl
63883 +                  or non-unique keys), adjust @node */
63884 +               /*node = h->active_lh->node; */
63885 +
63886 +               if (llr != LOOKUP_DONE) {
63887 +                       /* restart or continue on the next level */
63888 +                       result = RETERR(-ENOENT);
63889 +               } else if (IS_CBKERR(h->result))
63890 +                       /* io or oom */
63891 +                       result = RETERR(-ENOENT);
63892 +               else {
63893 +                       /* good. Either item found or definitely not found. */
63894 +                       result = 0;
63895 +
63896 +                       write_lock(&(cache->guard));
63897 +                       if (slot->node == h->active_lh->node) {
63898 +                               /* if this node is still in cbk cache---move
63899 +                                  its slot to the head of the LRU list. */
63900 +                               list_move(&slot->lru, &cache->lru);
63901 +                       }
63902 +                       write_unlock(&(cache->guard));
63903 +               }
63904 +       } else {
63905 +               /* race. While this thread was waiting for the lock, node was
63906 +                  rebalanced and item we are looking for, shifted out of it
63907 +                  (if it ever was here).
63908 +
63909 +                  Continuing scanning is almost hopeless: node key range was
63910 +                  moved to, is almost certainly at the beginning of the LRU
63911 +                  list at this time, because it's hot, but restarting
63912 +                  scanning from the very beginning is complex. Just return,
63913 +                  so that cbk() will be performed. This is not that
63914 +                  important, because such races should be rare. Are they?
63915 +                */
63916 +               result = RETERR(-ENOENT);       /* -ERAUGHT */
63917 +       }
63918 +       zrelse(node);
63919 +       assert("nikita-2476", cbk_cache_invariant(cache));
63920 +       return result;
63921 +}
63922 +
63923 +/* look for item with given key in the coord cache
63924 +
63925 +   This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
63926 +   which is a small LRU list of znodes accessed lately. For each znode in
63927 +   this list, it checks whether key we are looking for fits into key
63928 +   range covered by this node. If so, and in addition, node lies at allowed
63929 +   level (this is to handle extents on a twig level), node is locked, and
63930 +   lookup inside it is performed.
63931 +
63932 +   Levels from h->stop_level up to h->lock_level are scanned in turn. Returns
63933 +   0 as soon as one scan succeeds, otherwise the result of the last scan.
63934 +   TODO: measure the cost of this cache search against that of coord_by_key.
63935 +*/
63936 +static int cbk_cache_search(cbk_handle * h/* cbk handle */)
63937 +{
63938 +       int result = 0;
63939 +       tree_level level;
63940 +
63941 +       /* add CBK_IN_CACHE to the handle flags for the duration of the scan.
63942 +        * While it is set, lookup code (e.g. search_to_left()) does not add
63943 +        * the node it finds to the cbk cache. */
63944 +       h->flags |= CBK_IN_CACHE;
63945 +       for (level = h->stop_level; level <= h->lock_level; ++level) {
63946 +               h->level = level;
63947 +               result = cbk_cache_scan_slots(h);
63948 +               if (result != 0) {
63949 +                       done_lh(h->active_lh);
63950 +                       done_lh(h->parent_lh);
63951 +               } else {
63952 +                       assert("nikita-1319", !IS_CBKERR(h->result));
63953 +                       break;
63954 +               }
63955 +       }
63956 +       h->flags &= ~CBK_IN_CACHE;
63957 +       return result;
63958 +}
63959 +
63960 +/* type of lock we want to obtain during tree traversal: the mode the user
63961 +    asked for on levels up to and including h->lock_level, read lock above. */
63962 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
63963 +{
63964 +       assert("nikita-382", h != NULL);
63965 +
63966 +       return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
63967 +}
63968 +
63969 +/* make right delimiting key of @node equal to ld key of its right neighbor */
63970 +static void stale_dk(reiser4_tree * tree, znode * node)
63971 +{
63972 +       znode *right;
63973 +
63974 +       read_lock_tree(tree);
63975 +       write_lock_dk(tree);
63976 +       right = node->right;
63977 +
63978 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63979 +           right && ZF_ISSET(right, JNODE_DKSET) &&
63980 +           !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
63981 +               znode_set_rd_key(node, znode_get_ld_key(right));
63982 +
63983 +       write_unlock_dk(tree);
63984 +       read_unlock_tree(tree);
63985 +}
63986 +
63987 +/* check, under read locks only, whether right delimiting key of @node is out
63988 + * of date; only if it is, call stale_dk() which updates it under write lock. */
63989 +static void update_stale_dk(reiser4_tree * tree, znode * node)
63990 +{
63991 +       znode *right;
63992 +       reiser4_key rd;
63993 +
63994 +       read_lock_tree(tree);
63995 +       read_lock_dk(tree);
63996 +       rd = *znode_get_rd_key(node);
63997 +       right = node->right;
63998 +       if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63999 +                    right && ZF_ISSET(right, JNODE_DKSET) &&
64000 +                    !keyeq(&rd, znode_get_ld_key(right)))) {
64001 +               assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
64002 +               read_unlock_dk(tree);
64003 +               read_unlock_tree(tree);
64004 +               stale_dk(tree, node);
64005 +               return;
64006 +       }
64007 +       read_unlock_dk(tree);
64008 +       read_unlock_tree(tree);
64009 +}
64010 +
64011 +/*
64012 + * handle searches for a non-unique key.
64013 + *
64014 + * Suppose that we are looking for an item with possibly non-unique key 100.
64015 + *
64016 + * Root node contains two pointers: one to a node with left delimiting key 0,
64017 + * and another to a node with left delimiting key 100. Item we are interested in
64018 + * may well happen in the sub-tree rooted at the first pointer.
64019 + *
64020 + * To handle this search_to_left() is called when search reaches stop
64021 + * level. This function checks whether it is _possible_ that item we are looking
64022 + * for is in the left neighbor (this can be done by comparing delimiting keys) and
64023 + * if so, tries to lock left neighbor (this is low priority lock, so it can
64024 + * deadlock, tree traversal is just restarted if it did) and then checks
64025 + * whether left neighbor actually contains items with our key.
64026 + *
64027 + * Note that this is done on the stop level only. It is possible to try such
64028 + * left-check on each level, but as duplicate keys are supposed to be rare
64029 + * (very unlikely that more than one node is completely filled with items with
64030 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
64031 + * Note: labels -E_NO_NEIGHBOR and default jump into the if-branch of case 0.
64032 + */
64033 +static level_lookup_result search_to_left(cbk_handle * h/* search handle */)
64034 +{
64035 +       level_lookup_result result;
64036 +       coord_t *coord;
64037 +       znode *node;
64038 +       znode *neighbor;
64039 +
64040 +       lock_handle lh;
64041 +
64042 +       assert("nikita-1761", h != NULL);
64043 +       assert("nikita-1762", h->level == h->stop_level);
64044 +
64045 +       init_lh(&lh);
64046 +       coord = h->coord;
64047 +       node = h->active_lh->node;
64048 +       assert("nikita-1763", coord_is_leftmost_unit(coord));
64049 +
64050 +       h->result =
64051 +           reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
64052 +                                     GN_CAN_USE_UPPER_LEVELS);
64053 +       neighbor = NULL;
64054 +       switch (h->result) {
64055 +       case -E_DEADLOCK:
64056 +               result = LOOKUP_REST;
64057 +               break;
64058 +       case 0:{
64059 +                       node_plugin *nplug;
64060 +                       coord_t crd;
64061 +                       lookup_bias bias;
64062 +
64063 +                       neighbor = lh.node;
64064 +                       h->result = zload(neighbor);
64065 +                       if (h->result != 0) {
64066 +                               result = LOOKUP_DONE;
64067 +                               break;
64068 +                       }
64069 +
64070 +                       nplug = neighbor->nplug;
64071 +
64072 +                       coord_init_zero(&crd);
64073 +                       bias = h->bias;
64074 +                       h->bias = FIND_EXACT;
64075 +                       h->result =
64076 +                           nplug->lookup(neighbor, h->key, h->bias, &crd);
64077 +                       h->bias = bias;
64078 +
64079 +                       if (h->result == NS_NOT_FOUND) {
64080 +       case -E_NO_NEIGHBOR:
64081 +                               h->result = CBK_COORD_FOUND;
64082 +                               if (!(h->flags & CBK_IN_CACHE))
64083 +                                       cbk_cache_add(node);
64084 +       default:                /* some other error */
64085 +                               result = LOOKUP_DONE;
64086 +                       } else if (h->result == NS_FOUND) {
64087 +                               read_lock_dk(znode_get_tree(neighbor));
64088 +                               h->rd_key = *znode_get_ld_key(node);
64089 +                               leftmost_key_in_node(neighbor, &h->ld_key);
64090 +                               read_unlock_dk(znode_get_tree(neighbor));
64091 +                               h->flags |= CBK_DKSET;
64092 +
64093 +                               h->block = *znode_get_block(neighbor);
64094 +                               /* clear coord->node so that cbk_level_lookup()
64095 +                                  wouldn't overwrite parent hint in neighbor.
64096 +
64097 +                                  Parent hint was set up by
64098 +                                  reiser4_get_left_neighbor()
64099 +                                */
64100 +                               /* FIXME: why do we have to spinlock here? */
64101 +                               write_lock_tree(znode_get_tree(neighbor));
64102 +                               h->coord->node = NULL;
64103 +                               write_unlock_tree(znode_get_tree(neighbor));
64104 +                               result = LOOKUP_CONT;
64105 +                       } else {
64106 +                               result = LOOKUP_DONE;
64107 +                       }
64108 +                       if (neighbor != NULL)
64109 +                               zrelse(neighbor);
64110 +               }
64111 +       }
64112 +       done_lh(&lh);
64113 +       return result;
64114 +}
64115 +
64116 +/* debugging aid: map search bias @bias to a printable name */
64117 +static const char *bias_name(lookup_bias bias/* bias to get name of */)
64118 +{
64119 +       static char unknown[30];
64120 +
64121 +       switch (bias) {
64122 +       case FIND_EXACT:
64123 +               return "exact";
64124 +       case FIND_MAX_NOT_MORE_THAN:
64125 +               return "left-slant";
64126 +       default:
64127 +               /* not a known bias: report its numeric value instead */
64128 +               sprintf(unknown, "unknown: %i", bias);
64129 +               return unknown;
64130 +       }
64131 +}
64132 +
64133 +#if REISER4_DEBUG
64134 +/* debugging aid: print human readable information about @p. Copes with NULL
64135 +   @p and with a coord whose ->node is NULL or not loaded (prints nothing) */
64136 +void print_coord_content(const char *prefix /* prefix to print */ ,
64137 +                        coord_t *p/* coord to print */)
64138 +{
64139 +       reiser4_key key;
64140 +
64141 +       if (p == NULL) {
64142 +               printk("%s: null\n", prefix);
64143 +               return;
64144 +       }
64145 +       if (p->node == NULL || !znode_is_loaded(p->node))
64146 +               return;
64147 +       if (coord_is_existing_item(p))
64148 +               printk("%s: data: %p, length: %i\n", prefix,
64149 +                      item_body_by_coord(p), item_length_by_coord(p));
64150 +       item_key_by_coord(p, &key);
64151 +       reiser4_print_key(prefix, &key);
64152 +}
64153 +
64154 +/* debugging aid: printk @prefix followed by sprint_address() form of @block */
64155 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
64156 +                  const reiser4_block_nr * block/* block number to print */)
64157 +{
64158 +       printk("%s: %s\n", prefix, sprint_address(block));
64159 +}
64160 +#endif
64161 +
64162 +/* format @block into a static buffer (overwritten by next call) and return it */
64163 +char *sprint_address(const reiser4_block_nr *
64164 +                    block/* block number to print, may be NULL */)
64165 +{
64166 +       static char address[30];
64167 +
64168 +       if (block == NULL)
64169 +               sprintf(address, "null");
64170 +       else if (reiser4_blocknr_is_fake(block))
64171 +               sprintf(address, "%llx", (unsigned long long)(*block));
64172 +       else
64173 +               sprintf(address, "%llu", (unsigned long long)(*block));
64174 +       return address;
64175 +}
64176 +
64177 +/* during traversal: unlock parent node held in h->parent_lh, if there is one */
64178 +static void put_parent(cbk_handle * h/* search handle */)
64179 +{
64180 +       assert("nikita-383", h != NULL);
64181 +       if (h->parent_lh->node != NULL)
64182 +               longterm_unlock_znode(h->parent_lh);
64183 +}
64184 +
64185 +/* helper function used by coord_by_key(): release both lock handles stored in
64186 +   the search handle, the parent one first and then the active one. */
64187 +static void hput(cbk_handle * h/* search handle */)
64188 +{
64189 +       assert("nikita-385", h != NULL);
64190 +       done_lh(h->parent_lh);
64191 +       done_lh(h->active_lh);
64192 +}
64193 +
64194 +/* Helper function used by cbk(): set delimiting keys of child node (stored in
64195 +   h->active_lh->node) from h->ld_key/h->rd_key. Returns 0 if JNODE_DKSET was seen set, else 1 */
64196 +static int setup_delimiting_keys(cbk_handle * h/* search handle */)
64197 +{
64198 +       znode *active;
64199 +       reiser4_tree *tree;
64200 +
64201 +       assert("nikita-1088", h != NULL);
64202 +
64203 +       active = h->active_lh->node;
64204 +
64205 +       /* fast check without taking dk lock. This is safe, because
64206 +        * JNODE_DKSET is never cleared once set. */
64207 +       if (!ZF_ISSET(active, JNODE_DKSET)) {
64208 +               tree = znode_get_tree(active);
64209 +               write_lock_dk(tree);
64210 +               if (!ZF_ISSET(active, JNODE_DKSET)) {
64211 +                       znode_set_ld_key(active, &h->ld_key);
64212 +                       znode_set_rd_key(active, &h->rd_key);
64213 +                       ZF_SET(active, JNODE_DKSET);
64214 +               }
64215 +               write_unlock_dk(tree);
64216 +               return 1;
64217 +       }
64218 +       return 0;
64219 +}
64220 +
64221 +/* true if @block makes sense for the @tree. Used to detect corrupted node
64222 + * pointers. Thin wrapper around reiser4_blocknr_is_sane_for(). */
64223 +static int
64224 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
64225 +                   reiser4_tree * tree/* tree to check against */)
64226 +{
64227 +       assert("nikita-757", block != NULL);
64228 +       assert("nikita-758", tree != NULL);
64229 +
64230 +       /* presumably rejects blocks beyond the end of the device -- TODO confirm */
64231 +       return reiser4_blocknr_is_sane_for(tree->super, block);
64232 +}
64233 +
64234 +/* validate h->level and h->block; on failure set h->error, h->result = -EIO and return LOOKUP_DONE, else 0 */
64235 +static int sanity_check(cbk_handle * h/* search handle */)
64236 +{
64237 +       assert("nikita-384", h != NULL);
64238 +
64239 +       if (h->level < h->stop_level) {
64240 +               h->error = "Buried under leaves";
64241 +               h->result = RETERR(-EIO);
64242 +               return LOOKUP_DONE;
64243 +       } else if (!block_nr_is_correct(&h->block, h->tree)) {
64244 +               h->error = "bad block number";
64245 +               h->result = RETERR(-EIO);
64246 +               return LOOKUP_DONE;
64247 +       } else
64248 +               return 0;
64249 +}
64250 +
64251 +/* Make Linus happy.
64252 +   Local variables:
64253 +   c-indentation-style: "K&R"
64254 +   mode-name: "LC"
64255 +   c-basic-offset: 8
64256 +   tab-width: 8
64257 +   fill-column: 120
64258 +   scroll-step: 1
64259 +   End:
64260 +*/
64261 diff -urN linux-2.6.35.orig/fs/reiser4/status_flags.c linux-2.6.35/fs/reiser4/status_flags.c
64262 --- linux-2.6.35.orig/fs/reiser4/status_flags.c 1970-01-01 01:00:00.000000000 +0100
64263 +++ linux-2.6.35/fs/reiser4/status_flags.c      2010-08-04 15:44:57.000000000 +0200
64264 @@ -0,0 +1,174 @@
64265 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64266 + * reiser4/README */
64267 +
64268 +/* Functions that deal with reiser4 status block, query status and update it,
64269 + * if needed */
64270 +
64271 +#include <linux/bio.h>
64272 +#include <linux/highmem.h>
64273 +#include <linux/fs.h>
64274 +#include <linux/blkdev.h>
64275 +#include "debug.h"
64276 +#include "dformat.h"
64277 +#include "status_flags.h"
64278 +#include "super.h"
64279 +
64280 +/* End I/O handler for the status bio. Marks the page uptodate if the bio has
64281 +   BIO_UPTODATE set, flags a page error otherwise, and unconditionally unlocks
64282 +   the page so waiters see that io was done. @err is unused; bio is not freed. */
64283 +static void reiser4_status_endio(struct bio *bio, int err)
64284 +{
64285 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64286 +               SetPageUptodate(bio->bi_io_vec->bv_page);
64287 +       } else {
64288 +               ClearPageUptodate(bio->bi_io_vec->bv_page);
64289 +               SetPageError(bio->bi_io_vec->bv_page);
64290 +       }
64291 +       unlock_page(bio->bi_io_vec->bv_page);
64292 +}
64293 +
64294 +/* Initialise status code: read and verify the status block living at @block.
64295 +   Expected to be called from the disk format code. Returns 0, -ENOMEM, -EIO or -EINVAL (bad magic). */
64296 +int reiser4_status_init(reiser4_block_nr block)
64297 +{
64298 +       struct super_block *sb = reiser4_get_current_sb();
64299 +       struct reiser4_status *statuspage;
64300 +       struct bio *bio;
64301 +       struct page *page;
64302 +
64303 +       get_super_private(sb)->status_page = NULL;
64304 +       get_super_private(sb)->status_bio = NULL;
64305 +
64306 +       page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64307 +       if (!page)
64308 +               return -ENOMEM;
64309 +
64310 +       bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64311 +       if (bio != NULL) {
64312 +               bio->bi_sector = block * (sb->s_blocksize >> 9);
64313 +               bio->bi_bdev = sb->s_bdev;
64314 +               bio->bi_io_vec[0].bv_page = page;
64315 +               bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64316 +               bio->bi_io_vec[0].bv_offset = 0;
64317 +               bio->bi_vcnt = 1;
64318 +               bio->bi_size = sb->s_blocksize;
64319 +               bio->bi_end_io = reiser4_status_endio;
64320 +       } else {
64321 +               __free_pages(page, 0);
64322 +               return -ENOMEM;
64323 +       }
64324 +       lock_page(page);
64325 +       submit_bio(READ, bio);
64326 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64327 +       wait_on_page_locked(page);
64328 +       if (!PageUptodate(page)) {      /* read failed: drop page and bio */
64329 +               warning("green-2007", "I/O error while tried to read status page\n");
64330 +               __free_pages(page, 0);
64331 +               bio_put(bio);
64332 +               return -EIO;
64333 +       }
64334 +
64335 +       statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64336 +       if (memcmp(statuspage->magic, REISER4_STATUS_MAGIC,
64337 +                  sizeof(REISER4_STATUS_MAGIC))) {
64338 +               /* Magic does not match. */
64339 +               kunmap_atomic((char *)statuspage, KM_USER0);
64340 +               warning("green-2008", "Wrong magic in status block\n");
64341 +               __free_pages(page, 0);
64342 +               bio_put(bio);
64343 +               return -EINVAL;
64344 +       }
64345 +       kunmap_atomic((char *)statuspage, KM_USER0);
64346 +
64347 +       get_super_private(sb)->status_page = page;
64348 +       get_super_private(sb)->status_bio = bio;
64349 +       return 0;
64350 +}
64351 +
64352 +/* Query the status of fs. Returns one of REISER4_STATUS_MOUNT_*, telling if
64353 +   the FS can be safely mounted. Also if "status" and "extended" parameters are
64354 +   given, it will fill actual parts of status from the status page there. */
64355 +int reiser4_status_query(u64 *status, u64 *extended)
64356 +{
64357 +       struct super_block *sb = reiser4_get_current_sb();
64358 +       struct reiser4_status *statuspage;
64359 +       int retval;
64360 +
64361 +       if (!get_super_private(sb)->status_page)
64362 +               /* No status page: *status and *extended are left untouched */
64363 +               return REISER4_STATUS_MOUNT_UNKNOWN;
64364 +       statuspage = (struct reiser4_status *)
64365 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64366 +       switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {
64367 +       /* FIXME: this cast is a hack for 32 bit arches to work. */
64368 +       case REISER4_STATUS_OK:
64369 +               retval = REISER4_STATUS_MOUNT_OK;
64370 +               break;
64371 +       case REISER4_STATUS_CORRUPTED:
64372 +               retval = REISER4_STATUS_MOUNT_WARN;
64373 +               break;
64374 +       case REISER4_STATUS_DAMAGED:
64375 +       case REISER4_STATUS_DESTROYED:
64376 +       case REISER4_STATUS_IOERROR:
64377 +               retval = REISER4_STATUS_MOUNT_RO;
64378 +               break;
64379 +       default:
64380 +               retval = REISER4_STATUS_MOUNT_UNKNOWN;
64381 +               break;
64382 +       }
64383 +
64384 +       if (status)
64385 +               *status = le64_to_cpu(get_unaligned(&statuspage->status));
64386 +       if (extended)
64387 +               *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64388 +
64389 +       kunmap_atomic((char *)statuspage, KM_USER0);
64390 +       return retval;
64391 +}
64392 +
64393 +/* Called when something bad happens (e.g. from reiser4_panic): fills the status
64394 +   page and submits it to disk without waiting. Returns -1 if there is no status page. */
64395 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64396 +{
64397 +       struct super_block *sb = reiser4_get_current_sb();
64398 +       struct reiser4_status *statuspage;
64399 +       struct bio *bio = get_super_private(sb)->status_bio;
64400 +
64401 +       if (!get_super_private(sb)->status_page)
64402 +               return -1;      /* No status page? */
64403 +       statuspage = (struct reiser4_status *)
64404 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64405 +
64406 +       put_unaligned(cpu_to_le64(status), &statuspage->status);
64407 +       put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64408 +       strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN - 1);
64409 +       statuspage->texterror[REISER4_TEXTERROR_LEN - 1] = '\0'; /* strncpy may not terminate */
64410 +
64411 +       kunmap_atomic((char *)statuspage, KM_USER0);
64412 +       bio->bi_bdev = sb->s_bdev;
64413 +       bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64414 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64415 +       bio->bi_io_vec[0].bv_offset = 0;
64416 +       bio->bi_vcnt = 1;
64417 +       bio->bi_size = sb->s_blocksize;
64418 +       bio->bi_end_io = reiser4_status_endio;
64419 +       /* Safe as nobody should touch our page. We can block, but have no other choice */
64420 +       lock_page(get_super_private(sb)->status_page);
64421 +       submit_bio(WRITE, bio);
64422 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64423 +       return 0;               /* We do not wait for io to finish. */
64424 +}
64425 +
64426 +/* Frees the status page and bio, if any. Called by disk format at umount time */
64427 +int reiser4_status_finish(void)
64428 +{
64429 +       struct super_block *sb = reiser4_get_current_sb();
64430 +
64431 +       if (get_super_private(sb)->status_page != NULL)
64432 +               __free_pages(get_super_private(sb)->status_page, 0);
64433 +       if (get_super_private(sb)->status_bio != NULL)
64434 +               bio_put(get_super_private(sb)->status_bio);
64435 +       get_super_private(sb)->status_page = NULL;
64436 +       get_super_private(sb)->status_bio = NULL;
64437 +       return 0;
64438 +}
64439 diff -urN linux-2.6.35.orig/fs/reiser4/status_flags.h linux-2.6.35/fs/reiser4/status_flags.h
64440 --- linux-2.6.35.orig/fs/reiser4/status_flags.h 1970-01-01 01:00:00.000000000 +0100
64441 +++ linux-2.6.35/fs/reiser4/status_flags.h      2010-08-04 15:44:57.000000000 +0200
64442 @@ -0,0 +1,47 @@
64443 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64444 + * reiser4/README */
64445 +
64446 +/* Here we declare structures and flags that store reiser4 status on disk.
64447 +   The status that helps us to find out if the filesystem is valid or if it
64448 +   contains some critical, or not so critical errors */
64449 +
64450 +#if !defined(__REISER4_STATUS_FLAGS_H__)
64451 +#define __REISER4_STATUS_FLAGS_H__
64452 +
64453 +#include "dformat.h"
64454 +/* These are major status flags */
64455 +#define REISER4_STATUS_OK 0
64456 +#define REISER4_STATUS_CORRUPTED 0x1
64457 +#define REISER4_STATUS_DAMAGED 0x2
64458 +#define REISER4_STATUS_DESTROYED 0x4
64459 +#define REISER4_STATUS_IOERROR 0x8
64460 +
64461 +/* Return values for reiser4_status_query() */
64462 +#define REISER4_STATUS_MOUNT_OK 0
64463 +#define REISER4_STATUS_MOUNT_WARN 1
64464 +#define REISER4_STATUS_MOUNT_RO 2
64465 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
64466 +
64467 +#define REISER4_TEXTERROR_LEN 256
64468 +
64469 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64470 +/* Status block layout. We probably need to keep its size under sector size which is 512 bytes */
64471 +struct reiser4_status {
64472 +       char magic[16];         /* compared against REISER4_STATUS_MAGIC */
64473 +       d64 status;             /* Current FS state */
64474 +       d64 extended_status;    /* Any additional info that might make sense in
64475 +                                * addition to "status". E.g. last sector where
64476 +                                * io error happened if status is
64477 +                                * "io error encountered" */
64478 +       d64 stacktrace[10];     /* Last ten function calls made (addresses) */
64479 +       char texterror[REISER4_TEXTERROR_LEN];  /* Any error message if
64480 +                                                * appropriate, otherwise filled
64481 +                                                * with zeroes */
64482 +};
64483 +
64484 +int reiser4_status_init(reiser4_block_nr block);
64485 +int reiser4_status_query(u64 *status, u64 *extended);
64486 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
64487 +int reiser4_status_finish(void);
64488 +
64489 +#endif
64490 diff -urN linux-2.6.35.orig/fs/reiser4/super.c linux-2.6.35/fs/reiser4/super.c
64491 --- linux-2.6.35.orig/fs/reiser4/super.c        1970-01-01 01:00:00.000000000 +0100
64492 +++ linux-2.6.35/fs/reiser4/super.c     2010-08-04 15:44:57.000000000 +0200
64493 @@ -0,0 +1,306 @@
64494 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64495 + * reiser4/README */
64496 +
64497 +/* Super-block manipulations. */
64498 +
64499 +#include "debug.h"
64500 +#include "dformat.h"
64501 +#include "key.h"
64502 +#include "plugin/security/perm.h"
64503 +#include "plugin/space/space_allocator.h"
64504 +#include "plugin/plugin.h"
64505 +#include "tree.h"
64506 +#include "vfs_ops.h"
64507 +#include "super.h"
64508 +#include "reiser4.h"
64509 +
64510 +#include <linux/types.h>       /* for __u??  */
64511 +#include <linux/fs.h>          /* for struct super_block  */
64512 +
64513 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64514 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64515 +static __u64 reserved_for_root(const struct super_block *super);
64516 +
64517 +/* Return reiser4-specific part of super block (super->s_fs_info), unchecked */
64518 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super)
64519 +{
64520 +       return (reiser4_super_info_data *) super->s_fs_info;
64521 +}
64522 +
64523 +/* Return reiser4 fstype (REISER4_SUPER_MAGIC): value that is returned in
64524 + * ->f_type field by statfs(). @super is only used by the assertions. */
64525 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64526 +{
64527 +       assert("nikita-448", super != NULL);
64528 +       assert("nikita-449", is_reiser4_super(super));
64529 +       return (long)REISER4_SUPER_MAGIC;
64530 +}
64531 +
64532 +/* functions to read/modify fields of reiser4_super_info_data */
64533 +
64534 +/* get number of blocks in file system (the block_count field) */
64535 +__u64 reiser4_block_count(const struct super_block *super      /* super block
64536 +                                                                  queried */ )
64537 +{
64538 +       assert("vs-494", super != NULL);
64539 +       assert("vs-495", is_reiser4_super(super));
64540 +       return get_super_private(super)->block_count;
64541 +}
64542 +
64543 +#if REISER4_DEBUG
64544 +/*
64545 + * number of blocks in the current file system (built under REISER4_DEBUG only)
64546 + */
64547 +__u64 reiser4_current_block_count(void)
64548 +{
64549 +       return get_current_super_private()->block_count;
64550 +}
64551 +#endif  /*  REISER4_DEBUG  */
64552 +
64553 +/* set number of blocks in filesystem and derive blocks_reserved from it */
64554 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64555 +{
64556 +       assert("vs-501", super != NULL);
64557 +       assert("vs-502", is_reiser4_super(super));
64558 +       get_super_private(super)->block_count = nr;
64559 +       /*
64560 +        * For the proper calculation of the reserved space counter (5% of
64561 +        * device block counter) we would need a 64 bit division which is missing
64562 +        * in Linux on i386 platform. Because we do not need a precise calculation
64563 +        * here we can replace a div64 operation by this combination of
64564 +        * multiplication and shift: 51. / (2^10) == .0498 .
64565 +        * FIXME: this is a bug. It comes up only for very small filesystems
64566 +        * which probably are never used. Nevertheless, it is a bug. Number of
64567 +        * reserved blocks must be not less than maximal number of blocks which
64568 +        * get grabbed with BA_RESERVED.
64569 +        */
64570 +       get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64571 +}
64572 +
64573 +/* amount of blocks used (allocated for data) in file system: blocks_used */
64574 +__u64 reiser4_data_blocks(const struct super_block *super      /* super block
64575 +                                                                  queried */ )
64576 +{
64577 +       assert("nikita-452", super != NULL);
64578 +       assert("nikita-453", is_reiser4_super(super));
64579 +       return get_super_private(super)->blocks_used;
64580 +}
64581 +
64582 +/* set number of blocks used in filesystem (blocks_used) to @nr */
64583 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64584 +{
64585 +       assert("vs-503", super != NULL);
64586 +       assert("vs-504", is_reiser4_super(super));
64587 +       get_super_private(super)->blocks_used = nr;
64588 +}
64589 +
64590 +/* amount of free blocks in file system (the blocks_free counter) */
64591 +__u64 reiser4_free_blocks(const struct super_block *super      /* super block
64592 +                                                                  queried */ )
64593 +{
64594 +       assert("nikita-454", super != NULL);
64595 +       assert("nikita-455", is_reiser4_super(super));
64596 +       return get_super_private(super)->blocks_free;
64597 +}
64598 +
64599 +/* set number of blocks free in filesystem (blocks_free) to @nr */
64600 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64601 +{
64602 +       assert("vs-505", super != NULL);
64603 +       assert("vs-506", is_reiser4_super(super));
64604 +       get_super_private(super)->blocks_free = nr;
64605 +}
64606 +
64607 +/* get mkfs unique identifier (the mkfs_id field) */
64608 +__u32 reiser4_mkfs_id(const struct super_block *super  /* super block
64609 +                                                          queried */ )
64610 +{
64611 +       assert("vpf-221", super != NULL);
64612 +       assert("vpf-222", is_reiser4_super(super));
64613 +       return get_super_private(super)->mkfs_id;
64614 +}
64615 +
64616 +/* amount of free blocks in the committed state of the file system */
64617 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
64618 +{
64619 +       assert("vs-497", super != NULL);
64620 +       assert("vs-498", is_reiser4_super(super));
64621 +       return get_super_private(super)->blocks_free_committed;
64622 +}
64623 +
64624 +/* amount of blocks in the file system reserved for @uid and @gid */
64625 +long reiser4_reserved_blocks(const struct super_block *super   /* super block
64626 +                                                                  queried */ ,
64627 +                            uid_t uid /* user id */ ,
64628 +                            gid_t gid/* group id */)
64629 +{
64630 +       long reserved;
64631 +
64632 +       assert("nikita-456", super != NULL);
64633 +       assert("nikita-457", is_reiser4_super(super));
64634 +
64635 +       reserved = 0;
64636 +       if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64637 +               reserved += reserved_for_gid(super, gid);
64638 +       if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64639 +               reserved += reserved_for_uid(super, uid);
64640 +       if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64641 +               reserved += reserved_for_root(super);
64642 +       return reserved;
64643 +}
64644 +
64645 +/* get value of grabbed blocks counter */
64646 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
64647 +{
64648 +       assert("zam-512", super != NULL);
64649 +       assert("zam-513", is_reiser4_super(super));
64650 +
64651 +       return get_super_private(super)->blocks_grabbed;
64652 +}
64653 +
64654 +__u64 reiser4_flush_reserved(const struct super_block *super)
64655 +{
64656 +       assert("vpf-285", super != NULL);
64657 +       assert("vpf-286", is_reiser4_super(super));
64658 +
64659 +       return get_super_private(super)->blocks_flush_reserved;
64660 +}
64661 +
64662 +/* get value of counter of fake allocated formatted blocks */
64663 +__u64 reiser4_fake_allocated(const struct super_block *super)
64664 +{
64665 +       assert("zam-516", super != NULL);
64666 +       assert("zam-517", is_reiser4_super(super));
64667 +
64668 +       return get_super_private(super)->blocks_fake_allocated;
64669 +}
64670 +
64671 +/* get value of counter of fake allocated unformatted blocks */
64672 +__u64 reiser4_fake_allocated_unformatted(const struct super_block *super)
64673 +{
64674 +       assert("zam-516", super != NULL);
64675 +       assert("zam-517", is_reiser4_super(super));
64676 +
64677 +       return get_super_private(super)->blocks_fake_allocated_unformatted;
64678 +}
64679 +
64680 +/* get value of counter of clustered blocks */
64681 +__u64 reiser4_clustered_blocks(const struct super_block *super)
64682 +{
64683 +       assert("edward-601", super != NULL);
64684 +       assert("edward-602", is_reiser4_super(super));
64685 +
64686 +       return get_super_private(super)->blocks_clustered;
64687 +}
64688 +
64689 +/* space allocator used by this file system */
64690 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64691 +                                                     *super)
64692 +{
64693 +       assert("nikita-1965", super != NULL);
64694 +       assert("nikita-1966", is_reiser4_super(super));
64695 +       return &get_super_private(super)->space_allocator;
64696 +}
64697 +
64698 +/* return fake inode used to bind formatted nodes in the page cache */
64699 +struct inode *reiser4_get_super_fake(const struct super_block *super)
64700 +{
64701 +       assert("nikita-1757", super != NULL);
64702 +       return get_super_private(super)->fake;
64703 +}
64704 +
64705 +/* return fake inode used to bind copied on capture nodes in the page cache */
64706 +struct inode *reiser4_get_cc_fake(const struct super_block *super)
64707 +{
64708 +       assert("nikita-1757", super != NULL);
64709 +       return get_super_private(super)->cc;
64710 +}
64711 +
64712 +/* return fake inode used to bind bitmaps and journal heads */
64713 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64714 +{
64715 +       assert("nikita-17571", super != NULL);
64716 +       return get_super_private(super)->bitmap;
64717 +}
64718 +
64719 +/* tree used by this file system */
64720 +reiser4_tree *reiser4_get_tree(const struct super_block *super)
64721 +{
64722 +       assert("nikita-460", super != NULL);
64723 +       assert("nikita-461", is_reiser4_super(super));
64724 +       return &get_super_private(super)->tree;
64725 +}
64726 +
64727 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
64728 +   use in assertions. */
64729 +int is_reiser4_super(const struct super_block *super)
64730 +{
64731 +       return
64732 +           super != NULL &&
64733 +           get_super_private(super) != NULL &&
64734 +           super->s_op == &(get_super_private(super)->ops.super);
64735 +}
64736 +
64737 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64738 +{
64739 +       return test_bit((int)f, &get_super_private(super)->fs_flags);
64740 +}
64741 +
64742 +/* amount of blocks reserved for given group in file system */
64743 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG,
64744 +                             gid_t gid UNUSED_ARG/* group id */)
64745 +{
64746 +       return 0;
64747 +}
64748 +
64749 +/* amount of blocks reserved for given user in file system */
64750 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG,
64751 +                             uid_t uid UNUSED_ARG/* user id */)
64752 +{
64753 +       return 0;
64754 +}
64755 +
64756 +/* amount of blocks reserved for super user in file system */
64757 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG)
64758 +{
64759 +       return 0;
64760 +}
64761 +
64762 +/*
64763 + * true if block number @blk makes sense for the file system at @super.
64764 + */
64765 +int
64766 +reiser4_blocknr_is_sane_for(const struct super_block *super,
64767 +                           const reiser4_block_nr * blk)
64768 +{
64769 +       reiser4_super_info_data *sbinfo;
64770 +
64771 +       assert("nikita-2957", super != NULL);
64772 +       assert("nikita-2958", blk != NULL);
64773 +
64774 +       if (reiser4_blocknr_is_fake(blk))
64775 +               return 1;
64776 +
64777 +       sbinfo = get_super_private(super);
64778 +       return *blk < sbinfo->block_count;
64779 +}
64780 +
64781 +#if REISER4_DEBUG
64782 +/*
64783 + * true, if block number @blk makes sense for the current file system
64784 + */
64785 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64786 +{
64787 +       return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64788 +}
64789 +#endif  /*  REISER4_DEBUG  */
64790 +
64791 +/* Make Linus happy.
64792 +   Local variables:
64793 +   c-indentation-style: "K&R"
64794 +   mode-name: "LC"
64795 +   c-basic-offset: 8
64796 +   tab-width: 8
64797 +   fill-column: 120
64798 +   End:
64799 +*/
64800 diff -urN linux-2.6.35.orig/fs/reiser4/super.h linux-2.6.35/fs/reiser4/super.h
64801 --- linux-2.6.35.orig/fs/reiser4/super.h        1970-01-01 01:00:00.000000000 +0100
64802 +++ linux-2.6.35/fs/reiser4/super.h     2010-08-04 15:44:57.000000000 +0200
64803 @@ -0,0 +1,466 @@
64804 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64805 + * reiser4/README */
64806 +
64807 +/* Super-block functions. See super.c for details. */
64808 +
64809 +#if !defined(__REISER4_SUPER_H__)
64810 +#define __REISER4_SUPER_H__
64811 +
64812 +#include <linux/exportfs.h>
64813 +
64814 +#include "tree.h"
64815 +#include "entd.h"
64816 +#include "wander.h"
64817 +#include "fsdata.h"
64818 +#include "plugin/object.h"
64819 +#include "plugin/space/space_allocator.h"
64820 +
64821 +/*
64822 + * Flush algorithms parameters.
64823 + */
64824 +struct flush_params {
64825 +       unsigned relocate_threshold;
64826 +       unsigned relocate_distance;
64827 +       unsigned written_threshold;
64828 +       unsigned scan_maxnodes;
64829 +};
64830 +
64831 +typedef enum {
64832 +       /*
64833 +        * True if this file system doesn't support hard-links (multiple names)
64834 +        * for directories: this is default UNIX behavior.
64835 +        *
64836 +        * If hard-links on directories are not allowed, file system is Acyclic
64837 +        * Directed Graph (modulo dot, and dotdot, of course).
64838 +        *
64839 +        * This is used by reiser4_link().
64840 +        */
64841 +       REISER4_ADG = 0,
64842 +       /*
64843 +        * set if all nodes in internal tree have the same node layout plugin.
64844 +        * If so, znode_guess_plugin() will return tree->node_plugin instead
64845 +        * of guessing plugin by plugin id stored in the node.
64846 +        */
64847 +       REISER4_ONE_NODE_PLUGIN = 1,
64848 +       /* if set, bsd gid assignment is supported. */
64849 +       REISER4_BSD_GID = 2,
64850 +       /* [mac]_time are 32 bit in inode */
64851 +       REISER4_32_BIT_TIMES = 3,
64852 +       /* presumably: do not load all bitmap blocks at mount time (confirm) */
64853 +       REISER4_DONT_LOAD_BITMAP = 5,
64854 +       /* enforce atomicity during write(2) */
64855 +       REISER4_ATOMIC_WRITE = 6,
64856 +       /* don't use write barriers in the log writer code. */
64857 +       REISER4_NO_WRITE_BARRIER = 7
64858 +} reiser4_fs_flag;
64859 +
64860 +/*
64861 + * VFS related operation vectors.
64862 + */
64863 +struct object_ops {
64864 +       struct super_operations super;
64865 +       struct dentry_operations dentry;
64866 +       struct export_operations export;
64867 +};
64868 +
64869 +/* reiser4-specific part of super block
64870 +
64871 +   Locking
64872 +
64873 +   Fields immutable after mount:
64874 +
64875 +    ->oid*
64876 +    ->space*
64877 +    ->default_[ug]id
64878 +    ->mkfs_id
64879 +    ->trace_flags
64880 +    ->debug_flags
64881 +    ->fs_flags
64882 +    ->df_plug
64883 +    ->optimal_io_size
64884 +    ->plug
64885 +    ->flush
64886 +    ->u (bad name)
64887 +    ->txnmgr
64888 +    ->ra_params
64889 +    ->fsuid
64890 +    ->journal_header
64891 +    ->journal_footer
64892 +
64893 +   Fields protected by ->lnode_guard
64894 +
64895 +    ->lnode_htable
64896 +
64897 +   Fields protected by per-super block spin lock
64898 +
64899 +    ->block_count
64900 +    ->blocks_used
64901 +    ->blocks_free
64902 +    ->blocks_free_committed
64903 +    ->blocks_grabbed
64904 +    ->blocks_fake_allocated_unformatted
64905 +    ->blocks_fake_allocated
64906 +    ->blocks_flush_reserved
64907 +    ->eflushed
64908 +    ->blocknr_hint_default
64909 +
64910 +   After journal replaying during mount,
64911 +
64912 +    ->last_committed_tx
64913 +
64914 +   is protected by ->tmgr.commit_mutex
64915 +
64916 +   Invariants involving this data-type:
64917 +
64918 +      [sb-block-counts]
64919 +      [sb-grabbed]
64920 +      [sb-fake-allocated]
64921 +*/
64922 +struct reiser4_super_info_data {
64923 +       /*
64924 +        * guard spinlock which protects reiser4 super block fields (currently
64925 +        * blocks_free, blocks_free_committed)
64926 +        */
64927 +       spinlock_t guard;
64928 +
64929 +       /* next oid that will be returned by oid_allocate() */
64930 +       oid_t next_to_use;
64931 +       /* total number of used oids */
64932 +       oid_t oids_in_use;
64933 +
64934 +       /* space manager plugin */
64935 +       reiser4_space_allocator space_allocator;
64936 +
64937 +       /* reiser4 internal tree */
64938 +       reiser4_tree tree;
64939 +
64940 +       /*
64941 +        * default user id used for light-weight files without their own
64942 +        * stat-data.
64943 +        */
64944 +       uid_t default_uid;
64945 +
64946 +       /*
64947 +        * default group id used for light-weight files without their own
64948 +        * stat-data.
64949 +        */
64950 +       gid_t default_gid;
64951 +
64952 +       /* mkfs identifier generated at mkfs time. */
64953 +       __u32 mkfs_id;
64954 +       /* amount of blocks in a file system */
64955 +       __u64 block_count;
64956 +
64957 +       /* inviolable reserve */
64958 +       __u64 blocks_reserved;
64959 +
64960 +       /* amount of blocks used by file system data and meta-data. */
64961 +       __u64 blocks_used;
64962 +
64963 +       /*
64964 +        * amount of free blocks. This is "working" free blocks counter. It is
64965 +        * like "working" bitmap, please see block_alloc.c for description.
64966 +        */
64967 +       __u64 blocks_free;
64968 +
64969 +       /*
64970 +        * free block count for fs committed state. This is "commit" version of
64971 +        * free block counter.
64972 +        */
64973 +       __u64 blocks_free_committed;
64974 +
64975 +       /*
64976 +        * number of blocks reserved for further allocation, for all
64977 +        * threads.
64978 +        */
64979 +       __u64 blocks_grabbed;
64980 +
64981 +       /* number of fake allocated unformatted blocks in tree. */
64982 +       __u64 blocks_fake_allocated_unformatted;
64983 +
64984 +       /* number of fake allocated formatted blocks in tree. */
64985 +       __u64 blocks_fake_allocated;
64986 +
64987 +       /* number of blocks reserved for flush operations. */
64988 +       __u64 blocks_flush_reserved;
64989 +
64990 +       /* number of blocks reserved for cluster operations. */
64991 +       __u64 blocks_clustered;
64992 +
64993 +       /* unique file-system identifier */
64994 +       __u32 fsuid;
64995 +
64996 +       /* On-disk format version. If it is not equal to the disk_format
64997 +          plugin version, some format updates (e.g. enlarging plugin
64998 +          set, etc) may take place on mount. */
64999 +       int version;
65000 +
65001 +       /* file-system wide flags. See reiser4_fs_flag enum */
65002 +       unsigned long fs_flags;
65003 +
65004 +       /* transaction manager */
65005 +       txn_mgr tmgr;
65006 +
65007 +       /* ent thread */
65008 +       entd_context entd;
65009 +
65010 +       /* fake inode used to bind formatted nodes */
65011 +       struct inode *fake;
65012 +       /* inode used to bind bitmaps (and journal heads) */
65013 +       struct inode *bitmap;
65014 +       /* inode used to bind copied on capture nodes */
65015 +       struct inode *cc;
65016 +
65017 +       /* disk layout plugin */
65018 +       disk_format_plugin *df_plug;
65019 +
65020 +       /* disk layout specific part of reiser4 super info data */
65021 +       union {
65022 +               format40_super_info format40;
65023 +       } u;
65024 +
65025 +       /* value we return in st_blksize on stat(2) */
65026 +       unsigned long optimal_io_size;
65027 +
65028 +       /* parameters for the flush algorithm */
65029 +       struct flush_params flush;
65030 +
65031 +       /* pointers to jnodes for journal header and footer */
65032 +       jnode *journal_header;
65033 +       jnode *journal_footer;
65034 +
65035 +       journal_location jloc;
65036 +
65037 +       /* head block number of last committed transaction */
65038 +       __u64 last_committed_tx;
65039 +
65040 +       /*
65041 +        * we remember last written location for using as a hint for new block
65042 +        * allocation
65043 +        */
65044 +       __u64 blocknr_hint_default;
65045 +
65046 +       /* committed number of files (oid allocator state variable ) */
65047 +       __u64 nr_files_committed;
65048 +
65049 +       struct formatted_ra_params ra_params;
65050 +
65051 +       /*
65052 +        * A mutex for serializing cut tree operations when out of free space:
65053 +        * only one cut_tree thread at a time is allowed to grab space from
65054 +        * the reserved area (it is 5% of disk space)
65055 +        */
65056 +       struct mutex delete_mutex;
65057 +       /* task owning ->delete_mutex */
65058 +       struct task_struct *delete_mutex_owner;
65059 +
65060 +       /* Diskmap's blocknumber */
65061 +       __u64 diskmap_block;
65062 +
65063 +       /* What to do in case of error */
65064 +       int onerror;
65065 +
65066 +       /* operations for objects on this file system */
65067 +       struct object_ops ops;
65068 +
65069 +       /*
65070 +        * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
65071 +        * more details
65072 +        */
65073 +       struct d_cursor_info d_info;
65074 +
65075 +#ifdef CONFIG_REISER4_BADBLOCKS
65076 +       /* Alternative master superblock offset (in bytes) */
65077 +       unsigned long altsuper;
65078 +#endif
65079 +       struct repacker *repacker;
65080 +       struct page *status_page;
65081 +       struct bio *status_bio;
65082 +
65083 +#if REISER4_DEBUG
65084 +       /*
65085 +        * minimum used blocks value (includes super blocks, bitmap blocks and
65086 +        * other fs reserved areas), depends on fs format and fs size.
65087 +        */
65088 +       __u64 min_blocks_used;
65089 +
65090 +       /*
65091 +        * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
65092 +        * are kept on a list anchored at sbinfo->all_jnodes. This list is
65093 +        * protected by sbinfo->all_guard spin lock. This lock should be taken
65094 +        * with _irq modifier, because it is also modified from interrupt
65095 +        * contexts (by RCU).
65096 +        */
65097 +       spinlock_t all_guard;
65098 +       /* list of all jnodes */
65099 +       struct list_head all_jnodes;
65100 +#endif
65101 +       struct dentry *debugfs_root;
65102 +};
65103 +
65104 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
65105 +                                                         super_block * super);
65106 +
65107 +/* Return reiser4-specific part of super block */
65108 +static inline reiser4_super_info_data *get_super_private(const struct
65109 +                                                        super_block * super)
65110 +{
65111 +       assert("nikita-447", super != NULL);
65112 +
65113 +       return (reiser4_super_info_data *) super->s_fs_info;
65114 +}
65115 +
65116 +/* get ent context for the @super */
65117 +static inline entd_context *get_entd_context(struct super_block *super)
65118 +{
65119 +       return &get_super_private(super)->entd;
65120 +}
65121 +
65122 +/* "Current" super-block: main super block used during current system
65123 +   call. Reference to this super block is stored in reiser4_context. */
65124 +static inline struct super_block *reiser4_get_current_sb(void)
65125 +{
65126 +       return get_current_context()->super;
65127 +}
65128 +
65129 +/* Reiser4-specific part of "current" super-block: main super block used
65130 +   during current system call. Reference to this super block is stored in
65131 +   reiser4_context. */
65132 +static inline reiser4_super_info_data *get_current_super_private(void)
65133 +{
65134 +       return get_super_private(reiser4_get_current_sb());
65135 +}
65136 +
65137 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
65138 +{
65139 +       return &(get_current_super_private()->ra_params);
65140 +}
65141 +
65142 +/*
65143 + * true, if file system on @super is read-only
65144 + */
65145 +static inline int rofs_super(struct super_block *super)
65146 +{
65147 +       return super->s_flags & MS_RDONLY;
65148 +}
65149 +
65150 +/*
65151 + * true, if @tree represents read-only file system
65152 + */
65153 +static inline int rofs_tree(reiser4_tree * tree)
65154 +{
65155 +       return rofs_super(tree->super);
65156 +}
65157 +
65158 +/*
65159 + * true, if file system where @inode lives on, is read-only
65160 + */
65161 +static inline int rofs_inode(struct inode *inode)
65162 +{
65163 +       return rofs_super(inode->i_sb);
65164 +}
65165 +
65166 +/*
65167 + * true, if file system where @node lives on, is read-only
65168 + */
65169 +static inline int rofs_jnode(jnode * node)
65170 +{
65171 +       return rofs_tree(jnode_get_tree(node));
65172 +}
65173 +
65174 +extern __u64 reiser4_current_block_count(void);
65175 +
65176 +extern void build_object_ops(struct super_block *super, struct object_ops *ops);
65177 +
65178 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
65179 +
65180 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
65181 +{
65182 +       spin_lock(&(sbinfo->guard));
65183 +}
65184 +
65185 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
65186 +{
65187 +       assert_spin_locked(&(sbinfo->guard));
65188 +       spin_unlock(&(sbinfo->guard));
65189 +}
65190 +
65191 +extern __u64 reiser4_flush_reserved(const struct super_block *);
65192 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
65193 +extern long reiser4_statfs_type(const struct super_block *super);
65194 +extern __u64 reiser4_block_count(const struct super_block *super);
65195 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
65196 +extern __u64 reiser4_data_blocks(const struct super_block *super);
65197 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
65198 +extern __u64 reiser4_free_blocks(const struct super_block *super);
65199 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
65200 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
65201 +
65202 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
65203 +
65204 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
65205 +extern __u64 reiser4_fake_allocated(const struct super_block *);
65206 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
65207 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
65208 +
65209 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
65210 +                                   gid_t gid);
65211 +
65212 +extern reiser4_space_allocator *
65213 +reiser4_get_space_allocator(const struct super_block *super);
65214 +extern reiser4_oid_allocator *
65215 +reiser4_get_oid_allocator(const struct super_block *super);
65216 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
65217 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
65218 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
65219 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
65220 +extern int is_reiser4_super(const struct super_block *super);
65221 +
65222 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
65223 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
65224 +                                      const reiser4_block_nr * blk);
65225 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
65226 +extern int reiser4_done_super(struct super_block *s);
65227 +
65228 +/* steps of fill super */
65229 +extern int reiser4_init_fs_info(struct super_block *);
65230 +extern void reiser4_done_fs_info(struct super_block *);
65231 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
65232 +extern int reiser4_init_read_super(struct super_block *, int silent);
65233 +extern int reiser4_init_root_inode(struct super_block *);
65234 +extern reiser4_plugin *get_default_plugin(pset_member memb);
65235 +
65236 +/* Maximal possible object id. */
65237 +#define  ABSOLUTE_MAX_OID ((oid_t)~0)
65238 +
65239 +#define OIDS_RESERVED  (1 << 16)
65240 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
65241 +oid_t oid_allocate(struct super_block *);
65242 +int oid_release(struct super_block *, oid_t);
65243 +oid_t oid_next(const struct super_block *);
65244 +void oid_count_allocated(void);
65245 +void oid_count_released(void);
65246 +long oids_used(const struct super_block *);
65247 +
65248 +#if REISER4_DEBUG
65249 +void print_fs_info(const char *prefix, const struct super_block *);
65250 +#endif
65251 +
65252 +extern void destroy_reiser4_cache(struct kmem_cache **);
65253 +
65254 +extern struct super_operations reiser4_super_operations;
65255 +extern struct export_operations reiser4_export_operations;
65256 +extern struct dentry_operations reiser4_dentry_operations;
65257 +
65258 +/* __REISER4_SUPER_H__ */
65259 +#endif
65260 +
65261 +/*
65262 + * Local variables:
65263 + * c-indentation-style: "K&R"
65264 + * mode-name: "LC"
65265 + * c-basic-offset: 8
65266 + * tab-width: 8
65267 + * fill-column: 120
65268 + * End:
65269 + */
65270 diff -urN linux-2.6.35.orig/fs/reiser4/super_ops.c linux-2.6.35/fs/reiser4/super_ops.c
65271 --- linux-2.6.35.orig/fs/reiser4/super_ops.c    1970-01-01 01:00:00.000000000 +0100
65272 +++ linux-2.6.35/fs/reiser4/super_ops.c 2010-08-04 17:01:34.000000000 +0200
65273 @@ -0,0 +1,737 @@
65274 +/* Copyright 2005 by Hans Reiser, licensing governed by
65275 + * reiser4/README */
65276 +
65277 +#include "inode.h"
65278 +#include "page_cache.h"
65279 +#include "ktxnmgrd.h"
65280 +#include "flush.h"
65281 +#include "safe_link.h"
65282 +
65283 +#include <linux/vfs.h>
65284 +#include <linux/writeback.h>
65285 +#include <linux/mount.h>
65286 +#include <linux/seq_file.h>
65287 +#include <linux/debugfs.h>
65288 +
65289 +/* slab cache for inodes */
65290 +static struct kmem_cache *inode_cache;
65291 +
65292 +static struct dentry *reiser4_debugfs_root = NULL;
65293 +
65294 +/**
65295 + * init_once - constructor for reiser4 inodes
65296 + * @obj: object of the reiser4 inode cache (struct reiser4_inode_object)
65297 + *       to be initialized
65298 + *
65299 + * Initialization function for objects of the reiser4 inode cache. It is set
65300 + * as the cache constructor on inode cache creation (see init_inodes()).
65301 + */
65302 +static void init_once(void *obj)
65303 +{
65304 +       struct reiser4_inode_object *info;
65305 +
65306 +       info = obj;
65307 +
65308 +       /* initialize vfs inode */
65309 +       inode_init_once(&info->vfs_inode);
65310 +
65311 +       /*
65312 +        * initialize reiser4 specific part of inode.
65313 +        * NOTE-NIKITA add here initializations for locks, list heads,
65314 +        * etc. that will be added to our private inode part.
65315 +        */
65316 +       INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65317 +       init_rwsem(&info->p.conv_sem);
65318 +       /* init semaphore which is used during inode loading */
65319 +       loading_init_once(&info->p);
65320 +       INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65321 +                       GFP_ATOMIC);
65322 +#if REISER4_DEBUG
65323 +       info->p.nr_jnodes = 0;
65324 +#endif
65325 +}
65326 +
65327 +/**
65328 + * init_inodes - create inode cache
65329 + *
65330 + * Initializes slab cache of inodes. It is part of reiser4 module initialization
65331 + */
65332 +static int init_inodes(void)
65333 +{
65334 +       inode_cache = kmem_cache_create("reiser4_inode",
65335 +                                       sizeof(struct reiser4_inode_object),
65336 +                                       0,
65337 +                                       SLAB_HWCACHE_ALIGN |
65338 +                                       SLAB_RECLAIM_ACCOUNT, init_once);
65339 +       if (inode_cache == NULL)
65340 +               return RETERR(-ENOMEM);
65341 +       return 0;
65342 +}
65343 +
65344 +/**
65345 + * done_inodes - delete inode cache
65346 + *
65347 + * This is called on reiser4 module unloading or system shutdown.
65348 + */
65349 +static void done_inodes(void)
65350 +{
65351 +       destroy_reiser4_cache(&inode_cache);
65352 +}
65353 +
65354 +/**
65355 + * reiser4_alloc_inode - alloc_inode of super operations
65356 + * @super: super block new inode is allocated for
65357 + *
65358 + * Allocates new inode, initializes reiser4 specific part of it.
65359 + */
65360 +static struct inode *reiser4_alloc_inode(struct super_block *super)
65361 +{
65362 +       struct reiser4_inode_object *obj;
65363 +
65364 +       assert("nikita-1696", super != NULL);
65365 +       obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65366 +       if (obj != NULL) {
65367 +               reiser4_inode *info;
65368 +
65369 +               info = &obj->p;
65370 +
65371 +               info->pset = plugin_set_get_empty();
65372 +               info->hset = plugin_set_get_empty();
65373 +               info->extmask = 0;
65374 +               info->locality_id = 0ull;
65375 +               info->plugin_mask = 0;
65376 +               info->heir_mask = 0;
65377 +#if !REISER4_INO_IS_OID
65378 +               info->oid_hi = 0;
65379 +#endif
65380 +               reiser4_seal_init(&info->sd_seal, NULL, NULL);
65381 +               coord_init_invalid(&info->sd_coord, NULL);
65382 +               info->flags = 0;
65383 +               spin_lock_init(&info->guard);
65384 +               /* this deals with info's loading semaphore */
65385 +               loading_alloc(info);
65386 +               info->vroot = UBER_TREE_ADDR;
65387 +               return &obj->vfs_inode;
65388 +       } else
65389 +               return NULL;
65390 +}
65391 +
65392 +/**
65393 + * reiser4_destroy_inode - destroy_inode of super operations
65394 + * @inode: inode being destroyed
65395 + *
65396 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
65397 + */
65398 +static void reiser4_destroy_inode(struct inode *inode)
65399 +{
65400 +       reiser4_inode *info;
65401 +
65402 +       info = reiser4_inode_data(inode);
65403 +
65404 +       assert("vs-1220", inode_has_no_jnodes(info));
65405 +
65406 +       if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65407 +               file_plugin *fplug = inode_file_plugin(inode);
65408 +               if (fplug->destroy_inode != NULL)
65409 +                       fplug->destroy_inode(inode);
65410 +       }
65411 +       reiser4_dispose_cursors(inode);
65412 +       if (info->pset)
65413 +               plugin_set_put(info->pset);
65414 +       if (info->hset)
65415 +               plugin_set_put(info->hset);
65416 +
65417 +       /*
65418 +        * cannot add similar assertion about ->i_list as prune_icache return
65419 +        * inode into slab with dangling ->list.{next,prev}. This is safe,
65420 +        * because they are re-initialized in the new_inode().
65421 +        */
65422 +       assert("nikita-2895", list_empty(&inode->i_dentry));
65423 +       assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65424 +       assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65425 +
65426 +       /* this deals with info's loading semaphore */
65427 +       loading_destroy(info);
65428 +
65429 +       kmem_cache_free(inode_cache,
65430 +                       container_of(info, struct reiser4_inode_object, p));
65431 +}
65432 +
65433 +/**
65434 + * reiser4_dirty_inode - dirty_inode of super operations
65435 + * @inode: inode being dirtied
65436 + *
65437 + * Updates stat data.
65438 + */
65439 +static void reiser4_dirty_inode(struct inode *inode)
65440 +{
65441 +       int result;
65442 +
65443 +       if (!is_in_reiser4_context())
65444 +               return;
65445 +       assert("", !IS_RDONLY(inode));
65446 +       assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65447 +                   get_current_context()->grabbed_blocks));
65448 +
65449 +       result = reiser4_update_sd(inode);
65450 +       if (result)
65451 +               warning("", "failed to dirty inode for %llu: %d",
65452 +                       get_inode_oid(inode), result);
65453 +}
65454 +
65455 +/**
65456 + * reiser4_delete_inode - delete_inode of super operations
65457 + * @inode: inode to delete
65458 + *
65459 + * Calls file plugin's delete_object method to delete object items from
65460 + * filesystem tree, truncates inode pages and calls end_writeback().
65461 + */
65462 +static void reiser4_delete_inode(struct inode *inode)
65463 +{
65464 +       reiser4_context *ctx;
65465 +       file_plugin *fplug;
65466 +
65467 +       ctx = reiser4_init_context(inode->i_sb);
65468 +       if (IS_ERR(ctx)) {
65469 +               warning("vs-15", "failed to init context");
65470 +               return;
65471 +       }
65472 +
65473 +       if (is_inode_loaded(inode)) {
65474 +               fplug = inode_file_plugin(inode);
65475 +               if (fplug != NULL && fplug->delete_object != NULL)
65476 +                       fplug->delete_object(inode);
65477 +       }
65478 +
65479 +       truncate_inode_pages(&inode->i_data, 0);
65480 +       inode->i_blocks = 0;
65481 +       end_writeback(inode);
65482 +       reiser4_exit_context(ctx);
65483 +}
65484 +
65485 +/**
65486 + * reiser4_put_super - put_super of super operations
65487 + * @super: super block to free
65488 + *
65489 + * Stops daemons, releases resources; in short, unmounts.
65490 + */
65491 +static void reiser4_put_super(struct super_block *super)
65492 +{
65493 +       reiser4_super_info_data *sbinfo;
65494 +       reiser4_context *ctx;
65495 +
65496 +       sbinfo = get_super_private(super);
65497 +       assert("vs-1699", sbinfo);
65498 +
65499 +       debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65500 +       debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65501 +       debugfs_remove(sbinfo->debugfs_root);
65502 +
65503 +       ctx = reiser4_init_context(super);
65504 +       if (IS_ERR(ctx)) {
65505 +               warning("vs-17", "failed to init context");
65506 +               return;
65507 +       }
65508 +
65509 +       /* have disk format plugin to free its resources */
65510 +       if (get_super_private(super)->df_plug->release)
65511 +               get_super_private(super)->df_plug->release(super);
65512 +
65513 +       reiser4_done_formatted_fake(super);
65514 +
65515 +       /* stop daemons: entd and ktxnmgrd */
65516 +       reiser4_done_entd(super);
65517 +       reiser4_done_ktxnmgrd(super);
65518 +       reiser4_done_txnmgr(&sbinfo->tmgr);
65519 +
65520 +       reiser4_done_fs_info(super);
65521 +       reiser4_exit_context(ctx);
65522 +}
65523 +
65524 +/**
65525 + * reiser4_write_super - write_super of super operations
65526 + * @super: super block to write
65527 + *
65528 + * Captures znode associated with super block, commits all transactions.
65529 + */
65530 +static void reiser4_write_super(struct super_block *super)
65531 +{
65532 +       int ret;
65533 +       reiser4_context *ctx;
65534 +
65535 +       assert("vs-1700", !rofs_super(super));
65536 +
65537 +       ctx = reiser4_init_context(super);
65538 +       if (IS_ERR(ctx)) {
65539 +               warning("vs-16", "failed to init context");
65540 +               return;
65541 +       }
65542 +
65543 +       ret = reiser4_capture_super_block(super);
65544 +       if (ret != 0)
65545 +               warning("vs-1701",
65546 +                       "reiser4_capture_super_block failed in write_super: %d",
65547 +                       ret);
65548 +       ret = txnmgr_force_commit_all(super, 0);
65549 +       if (ret != 0)
65550 +               warning("jmacd-77113",
65551 +                       "txn_force failed in write_super: %d", ret);
65552 +
65553 +       super->s_dirt = 0;
65554 +
65555 +       reiser4_exit_context(ctx);
65556 +}
65557 +
65558 +/**
65559 + * reiser4_statfs - statfs of super operations
65560 + * @dentry: dentry of the file system being queried
65561 + * @statfs: buffer to fill with statistics
65562 + *
65563 + * Returns information about filesystem.
65564 + */
65565 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65566 +{
65567 +       sector_t total;
65568 +       sector_t reserved;
65569 +       sector_t free;
65570 +       sector_t forroot;
65571 +       sector_t deleted;
65572 +       reiser4_context *ctx;
65573 +       struct super_block *super = dentry->d_sb;
65574 +
65575 +       assert("nikita-408", super != NULL);
65576 +       assert("nikita-409", statfs != NULL);
65577 +
65578 +       ctx = reiser4_init_context(super);
65579 +       if (IS_ERR(ctx))
65580 +               return PTR_ERR(ctx);
65581 +
65582 +       statfs->f_type = reiser4_statfs_type(super);
65583 +       statfs->f_bsize = super->s_blocksize;
65584 +
65585 +       /*
65586 +        * 5% of total block space is reserved. This is needed for flush and
65587 +        * for truncates (so that we are able to perform truncate/unlink even
65588 +        * on the otherwise completely full file system). If this reservation
65589 +        * is hidden from statfs(2), users will mistakenly guess that they
65590 +        * have enough free space to complete some operation, which is
65591 +        * frustrating.
65592 +        *
65593 +        * Another possible solution is to subtract ->blocks_reserved from
65594 +        * ->f_bfree, but changing available space seems less intrusive than
65595 +        * letting user to see 5% of disk space to be used directly after
65596 +        * mkfs.
65597 +        */
65598 +       total = reiser4_block_count(super);
65599 +       reserved = get_super_private(super)->blocks_reserved;
65600 +       deleted = txnmgr_count_deleted_blocks();
65601 +       free = reiser4_free_blocks(super) + deleted;
65602 +       forroot = reiser4_reserved_blocks(super, 0, 0);
65603 +
65604 +       /*
65605 +        * These counters may be in inconsistent state because we take the
65606 +        * values without keeping any global spinlock.  Here we do a sanity
65607 +        * check that free block counter does not exceed the number of all
65608 +        * blocks.
65609 +        */
65610 +       if (free > total)
65611 +               free = total;
65612 +       statfs->f_blocks = total - reserved;
65613 +       /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65614 +       if (free > reserved)
65615 +               free -= reserved;
65616 +       else
65617 +               free = 0;
65618 +       statfs->f_bfree = free;
65619 +
65620 +       if (free > forroot)
65621 +               free -= forroot;
65622 +       else
65623 +               free = 0;
65624 +       statfs->f_bavail = free;
65625 +
65626 +       statfs->f_files = 0;
65627 +       statfs->f_ffree = 0;
65628 +
65629 +       /* maximal acceptable name length depends on directory plugin. */
65630 +       assert("nikita-3351", super->s_root->d_inode != NULL);
65631 +       statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65632 +       reiser4_exit_context(ctx);
65633 +       return 0;
65634 +}
65635 +
65636 +/**
65637 + * reiser4_clear_inode - clear_inode of super operations
65638 + * @inode: inode about to be destroyed
65639 + *
65640 + * Sanity check: an inode being destroyed should have all jnodes detached.
65641 + */
65642 +static void reiser4_clear_inode(struct inode *inode)
65643 +{
65644 +#if REISER4_DEBUG
65645 +       reiser4_inode *r4_inode;
65646 +
65647 +       r4_inode = reiser4_inode_data(inode);
65648 +       if (!inode_has_no_jnodes(r4_inode))
65649 +               warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65650 +                       r4_inode->nr_jnodes);
65651 +#endif
65652 +}
65653 +
65654 +/**
65655 + * reiser4_writeback_inodes - writeback_inodes of super operations
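65656 + * @super: super block whose dirty inodes are to be written back
65657 + * @wb: writeback context, handed on to generic_writeback_sb_inodes
65658 + * @wbc: writeback control; kupdate-style requests are skipped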
65659 + *
65660 + * This method is called by background and non-background writeback. Reiser4's
65661 + * implementation uses generic_writeback_sb_inodes to call reiser4_writepages
65662 + * for each of dirty inodes. reiser4_writepages handles pages dirtied via shared
65663 + * mapping - dirty pages get into atoms. Writeout is called to flush some atoms.
65664 + */
65665 +static int reiser4_writeback_inodes(struct super_block *super,
65666 +                                   struct bdi_writeback *wb,
65667 +                                   struct writeback_control *wbc,
65668 +                                   bool only_this_sb)
65669 +{
65670 +       int ret;
65671 +       long to_write;
65672 +       reiser4_context *ctx;
65673 +
65674 +       if (wbc->for_kupdate)
65675 +               /* reiser4 has its own means of periodical write-out */
65676 +               goto skip;
65677 +       assert("vs-49", wbc->older_than_this == NULL);
65678 +
65679 +       spin_unlock(&inode_lock);
65680 +       ctx = reiser4_init_context(super);
65681 +       if (IS_ERR(ctx)) {
65682 +               warning("vs-13", "failed to init context");
65683 +               spin_lock(&inode_lock);
65684 +               goto skip;
65685 +       }
65686 +       to_write = wbc->nr_to_write;
65687 +       /*
65688 +        * call reiser4_writepages for each of dirty inodes to turn
65689 +        * dirty pages into transactions if they were not yet.
65690 +        */
65691 +       spin_lock(&inode_lock);
65692 +       ret = generic_writeback_sb_inodes(super, wb, wbc, only_this_sb);
65693 +       spin_unlock(&inode_lock);
65694 +
65695 +       wbc->nr_to_write = to_write;
65696 +
65697 +       /* flush goes here */
65698 +       reiser4_writeout(super, wbc);
65699 +
65700 +       /* avoid recursive calls to ->writeback_inodes */
65701 +       context_set_commit_async(ctx);
65702 +       reiser4_exit_context(ctx);
65703 +       spin_lock(&inode_lock);
65704 +
65705 +       return wbc->nr_to_write <= 0 ? 1 : ret;
65706 + skip:
65707 +       writeback_skip_sb_inodes(super, wb);
65708 +       return 0;
65709 +}
65710 +
65711 +/**
65712 + * reiser4_show_options - show_options of super operations
65713 + * @m: file where to write information
65714 + * @mnt: mount structure
65715 + *
65716 + * Makes reiser4 mount options visible in /proc/mounts.
65717 + */
65718 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65719 +{
65720 +       struct super_block *super;
65721 +       reiser4_super_info_data *sbinfo;
65722 +
65723 +       super = mnt->mnt_sb;
65724 +       sbinfo = get_super_private(super);
65725 +
65726 +       seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65727 +       seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65728 +       seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65729 +       seq_printf(m, ",atom_max_flushers=0x%x",
65730 +                  sbinfo->tmgr.atom_max_flushers);
65731 +       seq_printf(m, ",cbk_cache_slots=0x%x",
65732 +                  sbinfo->tree.cbk_cache.nr_slots);
65733 +
65734 +       return 0;
65735 +}
65736 +
65737 +struct super_operations reiser4_super_operations = {
65738 +       .alloc_inode = reiser4_alloc_inode,
65739 +       .destroy_inode = reiser4_destroy_inode,
65740 +       .dirty_inode = reiser4_dirty_inode,
65741 +       .evict_inode = reiser4_delete_inode,
65742 +       .put_super = reiser4_put_super,
65743 +       .write_super = reiser4_write_super,
65744 +       .statfs = reiser4_statfs,
65745 +//     .clear_inode = reiser4_clear_inode,
65746 +       .writeback_inodes = reiser4_writeback_inodes,
65747 +       .show_options = reiser4_show_options
65748 +};
65749 +
65750 +/**
65751 + * fill_super - initialize super block on mount
65752 + * @super: super block to fill
65753 + * @data: reiser4 specific mount option
65754 + * @silent: passed on to reiser4_init_read_super
65755 + *
65756 + * This is to be called by reiser4_get_sb. Mounts filesystem.
65757 + */
65758 +static int fill_super(struct super_block *super, void *data, int silent)
65759 +{
65760 +       reiser4_context ctx;
65761 +       int result;
65762 +       reiser4_super_info_data *sbinfo;
65763 +
65764 +       assert("zam-989", super != NULL);
65765 +
65766 +       super->s_op = NULL;
65767 +       init_stack_context(&ctx, super);
65768 +
65769 +       /* allocate reiser4 specific super block */
65770 +       if ((result = reiser4_init_fs_info(super)) != 0)
65771 +               goto failed_init_sinfo;
65772 +
65773 +       sbinfo = get_super_private(super);
65774 +       /* initialize various reiser4 parameters, parse mount options */
65775 +       if ((result = reiser4_init_super_data(super, data)) != 0)
65776 +               goto failed_init_super_data;
65777 +
65778 +       /* read reiser4 master super block, initialize disk format plugin */
65779 +       if ((result = reiser4_init_read_super(super, silent)) != 0)
65780 +               goto failed_init_read_super;
65781 +
65782 +       /* initialize transaction manager */
65783 +       reiser4_init_txnmgr(&sbinfo->tmgr);
65784 +
65785 +       /* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
65786 +       if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65787 +               goto failed_init_ktxnmgrd;
65788 +
65789 +       /* initialize entd context and start kernel thread entd */
65790 +       if ((result = reiser4_init_entd(super)) != 0)
65791 +               goto failed_init_entd;
65792 +
65793 +       /* initialize address spaces for formatted nodes and bitmaps */
65794 +       if ((result = reiser4_init_formatted_fake(super)) != 0)
65795 +               goto failed_init_formatted_fake;
65796 +
65797 +       /* initialize disk format plugin */
65798 +       if ((result = get_super_private(super)->df_plug->init_format(super,
65799 +                                                                   data)) != 0)
65800 +               goto failed_init_disk_format;
65801 +
65802 +       /*
65803 +        * There are some 'committed' versions of reiser4 super block counters,
65804 +        * which correspond to reiser4 on-disk state. These counters are
65805 +        * initialized here
65806 +        */
65807 +       sbinfo->blocks_free_committed = sbinfo->blocks_free;
65808 +       sbinfo->nr_files_committed = oids_used(super);
65809 +
65810 +       /* get inode of root directory */
65811 +       if ((result = reiser4_init_root_inode(super)) != 0)
65812 +               goto failed_init_root_inode;
65813 +
65814 +       if ((result = get_super_private(super)->df_plug->version_update(super)) != 0)
65815 +               goto failed_update_format_version;
65816 +
65817 +       process_safelinks(super);
65818 +       reiser4_exit_context(&ctx);
65819 +
65820 +       sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65821 +                                                 reiser4_debugfs_root);
65822 +       if (sbinfo->debugfs_root) {
65823 +               sbinfo->tmgr.debugfs_atom_count =
65824 +                       debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65825 +                                          sbinfo->debugfs_root,
65826 +                                          &sbinfo->tmgr.atom_count);
65827 +               sbinfo->tmgr.debugfs_id_count =
65828 +                       debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65829 +                                          sbinfo->debugfs_root,
65830 +                                          &sbinfo->tmgr.id_count);
65831 +       }
65832 +       return 0;
65833 +
65834 + failed_update_format_version:
65835 + failed_init_root_inode:
65836 +       if (sbinfo->df_plug->release)
65837 +               sbinfo->df_plug->release(super);
65838 + failed_init_disk_format:
65839 +       reiser4_done_formatted_fake(super);
65840 + failed_init_formatted_fake:
65841 +       reiser4_done_entd(super);
65842 + failed_init_entd:
65843 +       reiser4_done_ktxnmgrd(super);
65844 + failed_init_ktxnmgrd:
65845 +       reiser4_done_txnmgr(&sbinfo->tmgr);
65846 + failed_init_read_super:
65847 + failed_init_super_data:
65848 +       reiser4_done_fs_info(super);
65849 + failed_init_sinfo:
65850 +       reiser4_exit_context(&ctx);
65851 +       return result;
65852 +}
65853 +
65854 +/**
65855 + * reiser4_get_sb - get_sb of file_system_type operations
65856 + * @fs_type: file system type, passed on to get_sb_bdev
65857 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65858 + * @dev_name: block device file name
65859 + * @data: specific mount options
65860 + *
65861 + * Reiser4 mount entry.
65862 + */
65863 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
65864 +                       const char *dev_name, void *data, struct vfsmount *mnt)
65865 +{
65866 +       return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
65867 +}
65868 +
65869 +/* structure describing the reiser4 filesystem implementation */
65870 +static struct file_system_type reiser4_fs_type = {
65871 +       .owner = THIS_MODULE,
65872 +       .name = "reiser4",
65873 +       .fs_flags = FS_REQUIRES_DEV,
65874 +       .get_sb = reiser4_get_sb,
65875 +       .kill_sb = kill_block_super,
65876 +       .next = NULL
65877 +};
65878 +
65879 +void destroy_reiser4_cache(struct kmem_cache **cachep)
65880 +{
65881 +       BUG_ON(*cachep == NULL);
65882 +       kmem_cache_destroy(*cachep);
65883 +       *cachep = NULL;
65884 +}
65885 +
65886 +/**
65887 + * init_reiser4 - reiser4 initialization entry point
65888 + *
65889 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
65890 + * on kernel initialization or during reiser4 module load.
65891 + */
65892 +static int __init init_reiser4(void)
65893 +{
65894 +       int result;
65895 +
65896 +       printk(KERN_INFO
65897 +              "Loading Reiser4. "
65898 +              "See www.namesys.com for a description of Reiser4.\n");
65899 +
65900 +       /* initialize slab cache of inodes */
65901 +       if ((result = init_inodes()) != 0)
65902 +               goto failed_inode_cache;
65903 +
65904 +       /* initialize cache of znodes */
65905 +       if ((result = init_znodes()) != 0)
65906 +               goto failed_init_znodes;
65907 +
65908 +       /* initialize all plugins */
65909 +       if ((result = init_plugins()) != 0)
65910 +               goto failed_init_plugins;
65911 +
65912 +       /* initialize cache of plugin_set-s and plugin_set's hash table */
65913 +       if ((result = init_plugin_set()) != 0)
65914 +               goto failed_init_plugin_set;
65915 +
65916 +       /* initialize caches of txn_atom-s and txn_handle-s */
65917 +       if ((result = init_txnmgr_static()) != 0)
65918 +               goto failed_init_txnmgr_static;
65919 +
65920 +       /* initialize cache of jnodes */
65921 +       if ((result = init_jnodes()) != 0)
65922 +               goto failed_init_jnodes;
65923 +
65924 +       /* initialize cache of flush queues */
65925 +       if ((result = reiser4_init_fqs()) != 0)
65926 +               goto failed_init_fqs;
65927 +
65928 +       /* initialize cache of structures attached to dentry->d_fsdata */
65929 +       if ((result = reiser4_init_dentry_fsdata()) != 0)
65930 +               goto failed_init_dentry_fsdata;
65931 +
65932 +       /* initialize cache of structures attached to file->private_data */
65933 +       if ((result = reiser4_init_file_fsdata()) != 0)
65934 +               goto failed_init_file_fsdata;
65935 +
65936 +       /*
65937 +        * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
65938 +        * more details
65939 +        */
65940 +       if ((result = reiser4_init_d_cursor()) != 0)
65941 +               goto failed_init_d_cursor;
65942 +
65943 +       if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
65944 +               reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
65945 +               return 0;
65946 +       }
65947 +
65948 +       reiser4_done_d_cursor();
65949 + failed_init_d_cursor:
65950 +       reiser4_done_file_fsdata();
65951 + failed_init_file_fsdata:
65952 +       reiser4_done_dentry_fsdata();
65953 + failed_init_dentry_fsdata:
65954 +       reiser4_done_fqs();
65955 + failed_init_fqs:
65956 +       done_jnodes();
65957 + failed_init_jnodes:
65958 +       done_txnmgr_static();
65959 + failed_init_txnmgr_static:
65960 +       done_plugin_set();
65961 + failed_init_plugin_set:
65962 + failed_init_plugins:
65963 +       done_znodes();
65964 + failed_init_znodes:
65965 +       done_inodes();
65966 + failed_inode_cache:
65967 +       return result;
65968 +}
65969 +
65970 +/**
65971 + * done_reiser4 - reiser4 exit entry point
65972 + *
65973 + * Unregisters reiser4 filesystem type, deletes caches. It is called on shutdown
65974 + * or at module unload.
65975 + */
65976 +static void __exit done_reiser4(void)
65977 +{
65978 +       int result;
65979 +
65980 +       debugfs_remove(reiser4_debugfs_root);
65981 +       result = unregister_filesystem(&reiser4_fs_type);
65982 +       BUG_ON(result != 0);
65983 +       reiser4_done_d_cursor();
65984 +       reiser4_done_file_fsdata();
65985 +       reiser4_done_dentry_fsdata();
65986 +       reiser4_done_fqs();
65987 +       done_jnodes();
65988 +       done_txnmgr_static();
65989 +       done_plugin_set();
65990 +       done_znodes();
65991 +       destroy_reiser4_cache(&inode_cache);
65992 +}
65993 +
65994 +module_init(init_reiser4);
65995 +module_exit(done_reiser4);
65996 +
65997 +MODULE_DESCRIPTION("Reiser4 filesystem");
65998 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
65999 +
66000 +MODULE_LICENSE("GPL");
66001 +
66002 +/*
66003 + * Local variables:
66004 + * c-indentation-style: "K&R"
66005 + * mode-name: "LC"
66006 + * c-basic-offset: 8
66007 + * tab-width: 8
66008 + * fill-column: 79
66009 + * End:
66010 + */
66011 diff -urN linux-2.6.35.orig/fs/reiser4/tap.c linux-2.6.35/fs/reiser4/tap.c
66012 --- linux-2.6.35.orig/fs/reiser4/tap.c  1970-01-01 01:00:00.000000000 +0100
66013 +++ linux-2.6.35/fs/reiser4/tap.c       2010-08-04 15:44:57.000000000 +0200
66014 @@ -0,0 +1,376 @@
66015 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66016 + * reiser4/README */
66017 +
66018 +/*
66019 +   Tree Access Pointer (tap).
66020 +
66021 +   A tap is a data structure combining a coord and a lock handle (mostly).
66022 +   It is useful when one has to scan tree nodes (for example, in readdir or
66023 +   flush), because tap functions allow moving a tap in either direction,
66024 +   transparently crossing unit/item/node borders.
66025 +
66026 +   A tap doesn't provide automatic synchronization of its fields, as it is
66027 +   supposed to be a per-thread object.
66028 +*/
66029 +
66030 +#include "forward.h"
66031 +#include "debug.h"
66032 +#include "coord.h"
66033 +#include "tree.h"
66034 +#include "context.h"
66035 +#include "tap.h"
66036 +#include "znode.h"
66037 +#include "tree_walk.h"
66038 +
66039 +#if REISER4_DEBUG
66040 +static int tap_invariant(const tap_t *tap);
66041 +static void tap_check(const tap_t *tap);
66042 +#else
66043 +#define tap_check(tap) noop
66044 +#endif
66045 +
66046 +/** load node tap is pointing to, if not loaded already */
66047 +int reiser4_tap_load(tap_t *tap)
66048 +{
66049 +       tap_check(tap);
66050 +       if (tap->loaded == 0) {
66051 +               int result;
66052 +
66053 +               result = zload_ra(tap->coord->node, &tap->ra_info);
66054 +               if (result != 0)
66055 +                       return result;
66056 +               coord_clear_iplug(tap->coord);
66057 +       }
66058 +       ++tap->loaded;
66059 +       tap_check(tap);
66060 +       return 0;
66061 +}
66062 +
66063 +/** release node tap is pointing to. Dual to tap_load() */
66064 +void reiser4_tap_relse(tap_t *tap)
66065 +{
66066 +       tap_check(tap);
66067 +       if (tap->loaded > 0) {
66068 +               --tap->loaded;
66069 +               if (tap->loaded == 0)
66070 +                       zrelse(tap->coord->node);
66071 +       }
66072 +       tap_check(tap);
66073 +}
66074 +
66075 +/**
66076 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
66077 + * @mode
66078 + */
66079 +void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
66080 +                     znode_lock_mode mode)
66081 +{
66082 +       tap->coord = coord;
66083 +       tap->lh = lh;
66084 +       tap->mode = mode;
66085 +       tap->loaded = 0;
66086 +       INIT_LIST_HEAD(&tap->linkage);
66087 +       reiser4_init_ra_info(&tap->ra_info);
66088 +}
66089 +
66090 +/** add @tap to the per-thread list of all taps */
66091 +void reiser4_tap_monitor(tap_t *tap)
66092 +{
66093 +       assert("nikita-2623", tap != NULL);
66094 +       tap_check(tap);
66095 +       list_add(&tap->linkage, reiser4_taps_list());
66096 +       tap_check(tap);
66097 +}
66098 +
66099 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
66100 + * loaded. */
66101 +void reiser4_tap_copy(tap_t *dst, tap_t *src)
66102 +{
66103 +       assert("nikita-3193", src != NULL);
66104 +       assert("nikita-3194", dst != NULL);
66105 +
66106 +       *dst->coord = *src->coord;
66107 +       if (src->lh->node)
66108 +               copy_lh(dst->lh, src->lh);
66109 +       dst->mode = src->mode;
66110 +       dst->loaded = 0;
66111 +       INIT_LIST_HEAD(&dst->linkage);
66112 +       dst->ra_info = src->ra_info;
66113 +}
66114 +
66115 +/** finish with @tap */
66116 +void reiser4_tap_done(tap_t *tap)
66117 +{
66118 +       assert("nikita-2565", tap != NULL);
66119 +       tap_check(tap);
66120 +       if (tap->loaded > 0)
66121 +               zrelse(tap->coord->node);
66122 +       done_lh(tap->lh);
66123 +       tap->loaded = 0;
66124 +       list_del_init(&tap->linkage);
66125 +       tap->coord->node = NULL;
66126 +}
66127 +
66128 +/**
66129 + * move @tap to the new node, locked with @target. Load @target, if @tap was
66130 + * already loaded.
66131 + */
66132 +int reiser4_tap_move(tap_t *tap, lock_handle * target)
66133 +{
66134 +       int result = 0;
66135 +
66136 +       assert("nikita-2567", tap != NULL);
66137 +       assert("nikita-2568", target != NULL);
66138 +       assert("nikita-2570", target->node != NULL);
66139 +       assert("nikita-2569", tap->coord->node == tap->lh->node);
66140 +
66141 +       tap_check(tap);
66142 +       if (tap->loaded > 0)
66143 +               result = zload_ra(target->node, &tap->ra_info);
66144 +
66145 +       if (result == 0) {
66146 +               if (tap->loaded > 0)
66147 +                       zrelse(tap->coord->node);
66148 +               done_lh(tap->lh);
66149 +               copy_lh(tap->lh, target);
66150 +               tap->coord->node = target->node;
66151 +               coord_clear_iplug(tap->coord);
66152 +       }
66153 +       tap_check(tap);
66154 +       return result;
66155 +}
66156 +
66157 +/**
66158 + * move @tap to @target. Unless @tap already points to @target, a lock on
66159 + * @target is acquired (and @target is loaded, if @tap was already loaded).
66160 + */
66161 +static int tap_to(tap_t *tap, znode * target)
66162 +{
66163 +       int result;
66164 +
66165 +       assert("nikita-2624", tap != NULL);
66166 +       assert("nikita-2625", target != NULL);
66167 +
66168 +       tap_check(tap);
66169 +       result = 0;
66170 +       if (tap->coord->node != target) {
66171 +               lock_handle here;
66172 +
66173 +               init_lh(&here);
66174 +               result = longterm_lock_znode(&here, target,
66175 +                                            tap->mode, ZNODE_LOCK_HIPRI);
66176 +               if (result == 0) {
66177 +                       result = reiser4_tap_move(tap, &here);
66178 +                       done_lh(&here);
66179 +               }
66180 +       }
66181 +       tap_check(tap);
66182 +       return result;
66183 +}
66184 +
66185 +/**
66186 + * move @tap to given @target, loading and locking @target->node if
66187 + * necessary
66188 + */
66189 +int tap_to_coord(tap_t *tap, coord_t *target)
66190 +{
66191 +       int result;
66192 +
66193 +       tap_check(tap);
66194 +       result = tap_to(tap, target->node);
66195 +       if (result == 0)
66196 +               coord_dup(tap->coord, target);
66197 +       tap_check(tap);
66198 +       return result;
66199 +}
66200 +
66201 +/** return list of all taps */
66202 +struct list_head *reiser4_taps_list(void)
66203 +{
66204 +       return &get_current_context()->taps;
66205 +}
66206 +
66207 +/** helper function for go_{next,prev}_{item,unit,node}() */
66208 +int go_dir_el(tap_t *tap, sideof dir, int units_p)
66209 +{
66210 +       coord_t dup;
66211 +       coord_t *coord;
66212 +       int result;
66213 +
66214 +       int (*coord_dir) (coord_t *);
66215 +       int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
66216 +       void (*coord_init) (coord_t *, const znode *);
66217 +       ON_DEBUG(int (*coord_check) (const coord_t *));
66218 +
66219 +       assert("nikita-2556", tap != NULL);
66220 +       assert("nikita-2557", tap->coord != NULL);
66221 +       assert("nikita-2558", tap->lh != NULL);
66222 +       assert("nikita-2559", tap->coord->node != NULL);
66223 +
66224 +       tap_check(tap);
66225 +       if (dir == LEFT_SIDE) {
66226 +               coord_dir = units_p ? coord_prev_unit : coord_prev_item;
66227 +               get_dir_neighbor = reiser4_get_left_neighbor;
66228 +               coord_init = coord_init_last_unit;
66229 +       } else {
66230 +               coord_dir = units_p ? coord_next_unit : coord_next_item;
66231 +               get_dir_neighbor = reiser4_get_right_neighbor;
66232 +               coord_init = coord_init_first_unit;
66233 +       }
66234 +       ON_DEBUG(coord_check =
66235 +                units_p ? coord_is_existing_unit : coord_is_existing_item);
66236 +       assert("nikita-2560", coord_check(tap->coord));
66237 +
66238 +       coord = tap->coord;
66239 +       coord_dup(&dup, coord); /* try the step on a copy first */
66240 +       if (coord_dir(&dup) != 0) { /* presumably: step left the node */
66241 +               do {
66242 +                       /* move to the neighboring node on the @dir side */
66243 +                       lock_handle dup; /* shadows the outer coord_t dup */
66244 +
66245 +                       init_lh(&dup);
66246 +                       result =
66247 +                           get_dir_neighbor(&dup, coord->node, (int)tap->mode,
66248 +                                            GN_CAN_USE_UPPER_LEVELS);
66249 +                       if (result == 0) {
66250 +                               result = reiser4_tap_move(tap, &dup);
66251 +                               if (result == 0)
66252 +                                       coord_init(tap->coord, dup.node);
66253 +                               done_lh(&dup);
66254 +                       }
66255 +                       /* skip empty nodes */
66256 +               } while ((result == 0) && node_is_empty(coord->node));
66257 +       } else {
66258 +               result = 0;
66259 +               coord_dup(coord, &dup); /* commit the in-node step */
66260 +       }
66261 +       assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
66262 +       tap_check(tap);
66263 +       return result;
66264 +}
66265 +
66266 +/**
66267 + * move @tap to the next unit, transparently crossing item and node
66268 + * boundaries
66269 + */
66270 +int go_next_unit(tap_t *tap)
66271 +{
66272 +       return go_dir_el(tap, RIGHT_SIDE, 1);
66273 +}
66274 +
66275 +/**
66276 + * move @tap to the previous unit, transparently crossing item and node
66277 + * boundaries
66278 + */
66279 +int go_prev_unit(tap_t *tap)
66280 +{
66281 +       return go_dir_el(tap, LEFT_SIDE, 1);
66282 +}
66283 +
66284 +/**
66285 + * Apply @actor to @tap @shift times. This is used to move @tap by
66286 + * @shift units (or items, or nodes) in either direction.
66287 + */
66288 +static int rewind_to(tap_t *tap, go_actor_t actor, int shift)
66289 +{
66290 +       int result;
66291 +
66292 +       assert("nikita-2555", shift >= 0);
66293 +       assert("nikita-2562", tap->coord->node == tap->lh->node);
66294 +
66295 +       tap_check(tap);
66296 +       result = reiser4_tap_load(tap);
66297 +       if (result != 0)
66298 +               return result;
66299 +
66300 +       for (; shift > 0; --shift) {
66301 +               result = actor(tap);
66302 +               assert("nikita-2563", tap->coord->node == tap->lh->node);
66303 +               if (result != 0)
66304 +                       break;
66305 +       }
66306 +       reiser4_tap_relse(tap);
66307 +       tap_check(tap);
66308 +       return result;
66309 +}
66310 +
66311 +/** move @tap @shift units rightward */
66312 +int rewind_right(tap_t *tap, int shift)
66313 +{
66314 +       return rewind_to(tap, go_next_unit, shift);
66315 +}
66316 +
66317 +/** move @tap @shift units leftward, one go_prev_unit() step at a time */
66318 +int rewind_left(tap_t *tap, int shift)
66319 +{
66320 +       return rewind_to(tap, go_prev_unit, shift);
66321 +}
66322 +
66323 +#if REISER4_DEBUG
66324 +/** debugging function: print @tap (a NULL @tap too) in human readable form */
66325 +static void print_tap(const char *prefix, const tap_t *tap)
66326 +{
66327 +       if (tap == NULL) {
66328 +               printk("%s: null tap\n", prefix);
66329 +               return;
66330 +       }
66331 +       /* self-linked ->linkage means "on no list"; ->lh may still be NULL */
66332 +       printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66333 +              tap->loaded, !(&tap->linkage == tap->linkage.next &&
66334 +                             &tap->linkage == tap->linkage.prev),
66335 +              tap->lh ? tap->lh->node : NULL, lock_mode_name(tap->mode));
66336 +       print_coord("\tcoord", tap->coord, 0);
66337 +}
66338 +
66339 +/** check [tap-sane]: 0 if it holds, else the number of the failed clause */
66340 +static int tap_invariant(const tap_t *tap)
66341 +{
66342 +       /* [tap-sane] invariant */
66343 +
66344 +       if (tap == NULL)
66345 +               return 1; /* a NULL tap counts as broken */
66346 +       /* tap->mode is one of
66347 +        *
66348 +        * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66349 +        */
66350 +       if (tap->mode != ZNODE_NO_LOCK &&
66351 +           tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66352 +               return 2;
66353 +       /* tap->coord != NULL, and */
66354 +       if (tap->coord == NULL)
66355 +               return 3;
66356 +       /* tap->lh != NULL, and */
66357 +       if (tap->lh == NULL)
66358 +               return 4;
66359 +       /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66360 +       if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66361 +               return 5;
66362 +       /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66363 +       if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66364 +               return 6;
66365 +       return 0;
66366 +}
66367 +
66368 +/** debugging function: dump @tap and panic if [tap-sane] does not hold */
66369 +static void tap_check(const tap_t *tap)
66370 +{
66371 +       const int broken = tap_invariant(tap);
66372 +
66373 +       /* zero means that every clause of the invariant holds */
66374 +       if (broken == 0)
66375 +               return;
66376 +       print_tap("broken", tap);
66377 +       reiser4_panic("nikita-2831", "tap broken: %i\n", broken);
66378 +}
66379 +#endif
66380 +
66381 +/* Make Linus happy.
66382 +   Local variables:
66383 +   c-indentation-style: "K&R"
66384 +   mode-name: "LC"
66385 +   c-basic-offset: 8
66386 +   tab-width: 8
66387 +   fill-column: 120
66388 +   scroll-step: 1
66389 +   End:
66390 +*/
66391 diff -urN linux-2.6.35.orig/fs/reiser4/tap.h linux-2.6.35/fs/reiser4/tap.h
66392 --- linux-2.6.35.orig/fs/reiser4/tap.h  1970-01-01 01:00:00.000000000 +0100
66393 +++ linux-2.6.35/fs/reiser4/tap.h       2010-08-04 15:44:57.000000000 +0200
66394 @@ -0,0 +1,70 @@
66395 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66396 +
66397 +/* Tree Access Pointers. See tap.c for more details. */
66398 +
66399 +#if !defined(__REISER4_TAP_H__)
66400 +#define __REISER4_TAP_H__
66401 +
66402 +#include "forward.h"
66403 +#include "readahead.h"
66404 +
66405 +/**
66406 +    tree_access_pointer aka tap. Data structure combining coord_t and lock
66407 +    handle.
66408 +    Invariants involving this data-type, see doc/lock-ordering for details:
66409 +
66410 +      [tap-sane] (spelled out clause by clause in tap_invariant(), tap.c)
66411 + */
66412 +struct tree_access_pointer {
66413 +       /* coord tap is at; [tap-sane] requires it to be non-NULL */
66414 +       coord_t *coord;
66415 +       /* lock handle on ->coord->node; [tap-sane] requires it to be non-NULL */
66416 +       lock_handle *lh;
66417 +       /* mode of lock acquired by this tap: ZNODE_{NO,READ,WRITE}_LOCK */
66418 +       znode_lock_mode mode;
66419 +       /* incremented by reiser4_tap_load().
66420 +          Decremented by reiser4_tap_relse(). */
66421 +       int loaded;
66422 +       /* linkage into the list of taps, walked by for_all_taps() */
66423 +       struct list_head linkage;
66424 +       /* read-ahead hint */
66425 +       ra_info_t ra_info;
66426 +};
66427 +
66428 +typedef int (*go_actor_t) (tap_t *tap); /* one step of rewind_*() */
66429 +
66430 +extern int reiser4_tap_load(tap_t *tap);
66431 +extern void reiser4_tap_relse(tap_t *tap);
66432 +extern void reiser4_tap_init(tap_t *tap, coord_t *coord, lock_handle * lh,
66433 +                    znode_lock_mode mode);
66434 +extern void reiser4_tap_monitor(tap_t *tap);
66435 +extern void reiser4_tap_copy(tap_t *dst, tap_t *src);
66436 +extern void reiser4_tap_done(tap_t *tap);
66437 +extern int reiser4_tap_move(tap_t *tap, lock_handle * target);
66438 +extern int tap_to_coord(tap_t *tap, coord_t *target);
66439 +
66440 +extern int go_dir_el(tap_t *tap, sideof dir, int units_p);
66441 +extern int go_next_unit(tap_t *tap); /* go_dir_el(tap, RIGHT_SIDE, 1) */
66442 +extern int go_prev_unit(tap_t *tap); /* go_dir_el(tap, LEFT_SIDE, 1) */
66443 +extern int rewind_right(tap_t *tap, int shift); /* @shift >= 0 */
66444 +extern int rewind_left(tap_t *tap, int shift); /* @shift >= 0 */
66445 +
66446 +extern struct list_head *reiser4_taps_list(void); /* head for for_all_taps() */
66447 +
66448 +#define for_all_taps(tap) /* walk all taps on reiser4_taps_list() */         \
66449 +       for ((tap) = list_entry(reiser4_taps_list()->next, tap_t, linkage);    \
66450 +            reiser4_taps_list() != &(tap)->linkage;                           \
66451 +            (tap) = list_entry((tap)->linkage.next, tap_t, linkage))
66452 +
66453 +/* __REISER4_TAP_H__ */
66454 +#endif
66455 +/* Make Linus happy.
66456 +   Local variables:
66457 +   c-indentation-style: "K&R"
66458 +   mode-name: "LC"
66459 +   c-basic-offset: 8
66460 +   tab-width: 8
66461 +   fill-column: 120
66462 +   scroll-step: 1
66463 +   End:
66464 +*/
66465 diff -urN linux-2.6.35.orig/fs/reiser4/tree.c linux-2.6.35/fs/reiser4/tree.c
66466 --- linux-2.6.35.orig/fs/reiser4/tree.c 1970-01-01 01:00:00.000000000 +0100
66467 +++ linux-2.6.35/fs/reiser4/tree.c      2010-08-04 15:44:57.000000000 +0200
66468 @@ -0,0 +1,1878 @@
66469 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66470 + * reiser4/README */
66471 +
66472 +/*
66473 + * KEYS IN A TREE.
66474 + *
66475 + * The tree consists of nodes located on the disk. Node in the tree is either
66476 + * formatted or unformatted. Formatted node is one that has structure
66477 + * understood by the tree balancing and traversal code. Formatted nodes are
66478 + * further classified into leaf and internal nodes. The latter distinction is
66479 + * (almost) only of historical importance: general structure of leaves and
66480 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
66481 + * that are part of bodies of ordinary files and attributes.
66482 + *
66483 + * Each node in the tree spans some interval in the key space. Key ranges for
66484 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
66485 + * sense, because of the non-unique keys: intersection of key ranges for
66486 + * different nodes is either empty, or consists of exactly one key.
66487 + *
66488 + * Formatted node consists of a sequence of items. Each item spans some
66489 + * interval in key space. Key ranges for all items in a tree are disjoint,
66490 + * modulo non-unique keys again. Items within nodes are ordered in the key
66491 + * order of the smallest key in an item.
66492 + *
66493 + * Particular type of item can be further split into units. Unit is piece of
66494 + * item that can be cut from item and moved into another item of the same
66495 + * type. Units are used by balancing code to repack data during balancing.
66496 + *
66497 + * Unit can be further split into smaller entities (for example, extent unit
66498 + * represents several pages, and it is natural for extent code to operate on
66499 + * particular pages and even bytes within one unit), but this is of no
66500 + * relevance to the generic balancing and lookup code.
66501 + *
66502 + * Although item is said to "span" range or interval of keys, it is not
66503 + * necessary that item contains piece of data addressable by each and every
66504 + * key in this range. For example, compound directory item, consisting of
66505 + * units corresponding to directory entries and keyed by hashes of file names,
66506 + * looks more as having "discrete spectrum": only some disjoint keys inside
66507 + * range occupied by this item really address data.
66508 + *
66509 + * Nonetheless, each item always has well-defined least (minimal) key, that
66510 + * is recorded in item header, stored in the node this item is in. Also, item
66511 + * plugin can optionally define method ->max_key_inside() returning maximal
66512 + * key that can _possibly_ be located within this item. This method is used
66513 + * (mainly) to determine when given piece of data should be merged into
66514 + * existing item, instead of creating new one. Because of this, even though
66515 + * ->max_key_inside() can be larger than any key actually located in the item,
66516 + * intervals
66517 + *
66518 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66519 + *
66520 + * are still disjoint for all items within the _same_ node.
66521 + *
66522 + * In memory node is represented by znode. It plays several roles:
66523 + *
66524 + *  . something locks are taken on
66525 + *
66526 + *  . something tracked by transaction manager (this is going to change)
66527 + *
66528 + *  . something used to access node data
66529 + *
66530 + *  . something used to maintain tree structure in memory: sibling and
66531 + *  parental linkage.
66532 + *
66533 + *  . something used to organize nodes into "slums"
66534 + *
66535 + * For more on znodes see znode.[ch]
66536 + *
66537 + * DELIMITING KEYS
66538 + *
66539 + *   To simplify balancing, allow some flexibility in locking and speed up
66540 + *   important coord cache optimization, we keep delimiting keys of nodes in
66541 + *   memory. Depending on disk format (implemented by appropriate node plugin)
66542 + *   node on disk can record both left and right delimiting key, only one of
66543 + *   them, or none. Still, our balancing and tree traversal code keep both
66544 + *   delimiting keys for a node that is in memory stored in the znode. When
66545 + *   node is first brought into memory during tree traversal, its left
66546 + *   delimiting key is taken from its parent, and its right delimiting key is
66547 + *   either next key in its parent, or is right delimiting key of parent if
66548 + *   node is the rightmost child of parent.
66549 + *
66550 + *   Physical consistency of delimiting key is protected by special dk
66551 + *   read-write lock. That is, delimiting keys can only be inspected or
66552 + *   modified under this lock. But dk lock is only sufficient for fast
66553 + *   "pessimistic" check, because to simplify code and to decrease lock
66554 + *   contention, balancing (carry) only updates delimiting keys right before
66555 + *   unlocking all locked nodes on the given tree level. For example,
66556 + *   coord-by-key cache scans LRU list of recently accessed znodes. For each
66557 + *   node it first does fast check under dk spin lock. If key looked for is
66558 + *   not between delimiting keys for this node, next node is inspected and so
66559 + *   on. If key is inside of the key range, long term lock is taken on node
66560 + *   and key range is rechecked.
66561 + *
66562 + * COORDINATES
66563 + *
66564 + *   To find something in the tree, you supply a key, and the key is resolved
66565 + *   by coord_by_key() into a coord (coordinate) that is valid as long as the
66566 + *   node the coord points to remains locked.  As mentioned above trees
66567 + *   consist of nodes that consist of items that consist of units. A unit is
66568 + *   the smallest and indivisible piece of tree as far as balancing and tree
66569 + *   search are concerned. Each node, item, and unit can be addressed by
66570 + *   giving its level in the tree and the key occupied by this entity.  A node
66571 + *   knows what the key ranges are of the items within it, and how to find its
66572 + *   items and invoke their item handlers, but it does not know how to access
66573 + *   individual units within its items except through the item handlers.
66574 + *   coord is a structure containing a pointer to the node, the ordinal number
66575 + *   of the item within this node (a sort of item offset), and the ordinal
66576 + *   number of the unit within this item.
66577 + *
66578 + * TREE LOOKUP
66579 + *
66580 + *   There are two types of access to the tree: lookup and modification.
66581 + *
66582 + *   Lookup is a search for the key in the tree. Search can look for either
66583 + *   exactly the key given to it, or for the largest key that is not greater
66584 + *   than the key given to it. This distinction is determined by "bias"
66585 + *   parameter of search routine (coord_by_key()). coord_by_key() either
66586 + *   returns error (key is not in the tree, or some kind of external error
66587 + *   occurred), or successfully resolves key into coord.
66588 + *
66589 + *   This resolution is done by traversing tree top-to-bottom from root level
66590 + *   to the desired level. On levels above twig level (level one above the
66591 + *   leaf level) nodes consist exclusively of internal items. Internal item is
66592 + *   nothing more than pointer to the tree node on the child level. On twig
66593 + *   level nodes consist of internal items intermixed with extent
66594 + *   items. Internal items form normal search tree structure used by traversal
66595 + * to descend through the tree.
66596 + *
66597 + * TREE LOOKUP OPTIMIZATIONS
66598 + *
66599 + * Tree lookup described above is expensive even if all nodes traversed are
66600 + * already in the memory: for each node binary search within it has to be
66601 + * performed and binary searches are CPU consuming and tend to destroy CPU
66602 + * caches.
66603 + *
66604 + * Several optimizations are used to work around this:
66605 + *
66606 + *   . cbk_cache (look-aside cache for tree traversals, see search.c for
66607 + *   details)
66608 + *
66609 + *   . seals (see seal.[ch])
66610 + *
66611 + *   . vroot (see search.c)
66612 + *
66613 + * General search-by-key is layered thusly:
66614 + *
66615 + *                   [check seal, if any]   --ok--> done
66616 + *                           |
66617 + *                         failed
66618 + *                           |
66619 + *                           V
66620 + *                     [vroot defined] --no--> node = tree_root
66621 + *                           |                   |
66622 + *                          yes                  |
66623 + *                           |                   |
66624 + *                           V                   |
66625 + *                       node = vroot            |
66626 + *                                 |             |
66627 + *                                 |             |
66628 + *                                 |             |
66629 + *                                 V             V
66630 + *                            [check cbk_cache for key]  --ok--> done
66631 + *                                        |
66632 + *                                      failed
66633 + *                                        |
66634 + *                                        V
66635 + *                       [start tree traversal from node]
66636 + *
66637 + */
66638 +
66639 +#include "forward.h"
66640 +#include "debug.h"
66641 +#include "dformat.h"
66642 +#include "key.h"
66643 +#include "coord.h"
66644 +#include "plugin/item/static_stat.h"
66645 +#include "plugin/item/item.h"
66646 +#include "plugin/node/node.h"
66647 +#include "plugin/plugin.h"
66648 +#include "txnmgr.h"
66649 +#include "jnode.h"
66650 +#include "znode.h"
66651 +#include "block_alloc.h"
66652 +#include "tree_walk.h"
66653 +#include "carry.h"
66654 +#include "carry_ops.h"
66655 +#include "tap.h"
66656 +#include "tree.h"
66657 +#include "vfs_ops.h"
66658 +#include "page_cache.h"
66659 +#include "super.h"
66660 +#include "reiser4.h"
66661 +#include "inode.h"
66662 +
66663 +#include <linux/fs.h>          /* for struct super_block  */
66664 +#include <linux/spinlock.h>
66665 +
66666 +/* Disk address (block number) never ever used for any real tree node. This is
66667 +   used as the block number of the "uber" znode.
66668 +
66669 +   Invalid block addresses are 0 by tradition, hence the value below.
66670 +
66671 +*/
66672 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66673 +
66674 +#define CUT_TREE_MIN_ITERATIONS 64
66675 +
66676 +static int find_child_by_addr(znode * parent, znode * child, coord_t *result);
66677 +
66678 +/* return node plugin (->nplug) of coord->node; neither may be NULL */
66679 +node_plugin *node_plugin_by_coord(const coord_t *coord)
66680 +{
66681 +       assert("vs-1", coord != NULL);
66682 +       assert("vs-2", coord->node != NULL);
66683 +
66684 +       return coord->node->nplug;
66685 +}
66686 +
66687 +/* insert new item with @key into @tree unless such key is already there.
66688 + * @coord is left updated so that a subsequent insert operation can use it. */
66689 +insert_result insert_by_key(reiser4_tree * tree        /* tree to insert new item
66690 +                                                * into */ ,
66691 +                           const reiser4_key * key /* key of new item */ ,
66692 +                           reiser4_item_data * data    /* parameters for item
66693 +                                                        * creation */ ,
66694 +                           coord_t *coord /* resulting insertion coord */ ,
66695 +                           lock_handle * lh    /* resulting lock
66696 +                                                * handle */ ,
66697 +                           tree_level stop_level /* level where to insert */ ,
66698 +                           __u32 flags/* insertion flags */)
66699 +{
66700 +       int cbk;
66701 +
66702 +       assert("nikita-358", tree != NULL);
66703 +       assert("nikita-360", coord != NULL);
66704 +
66705 +       cbk = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66706 +                          FIND_EXACT, stop_level, stop_level,
66707 +                          flags | CBK_FOR_INSERT, NULL/*ra_info */);
66708 +
66709 +       /* exact match: refuse to create a duplicate */
66710 +       if (cbk == CBK_COORD_FOUND)
66711 +               return IBK_ALREADY_EXISTS;
66712 +
66713 +       /* anything but "not found" is handed back to the caller as is */
66714 +       if (cbk != CBK_COORD_NOTFOUND)
66715 +               return cbk;
66716 +
66717 +       /* key is absent: create the item at the coord the lookup left */
66718 +       assert("nikita-2017", coord->node != NULL);
66719 +       return insert_by_coord(coord, data, key, lh, 0/*flags */);
66720 +}
66721 +
66722 +/* insert item by calling carry. Helper for insert_by_coord() (when short-cut
66723 +   insertion is impossible) and for insert_extent_by_coord(). */
66724 +static insert_result insert_with_carry_by_coord(coord_t *coord,
66725 +                                       /* coord where to insert */
66726 +                                               lock_handle * lh,
66727 +                                       /* lock handle of insertion node */
66728 +                                               reiser4_item_data * data,
66729 +                                       /* parameters of new item */
66730 +                                               const reiser4_key * key,
66731 +                                       /* key of new item */
66732 +                                               carry_opcode cop,
66733 +                                       /* carry operation to perform */
66734 +                                               cop_insert_flag flags
66735 +                                       /* carry flags */ )
66736 +{
66737 +       int result;
66738 +       carry_pool *pool;
66739 +       carry_level *lowest_level;
66740 +       carry_insert_data *cdata;
66741 +       carry_op *op;
66742 +
66743 +       assert("umka-314", coord != NULL);
66744 +
66745 +       /* allocate carry_pool, 3 carry_level-s and carry_insert_data at once */
66746 +       pool =
66747 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66748 +                           sizeof(*cdata));
66749 +       if (IS_ERR(pool))
66750 +               return PTR_ERR(pool);
66751 +       lowest_level = (carry_level *) (pool + 1); /* levels follow the pool */
66752 +       init_carry_level(lowest_level, pool);
66753 +
66754 +       op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66755 +       if (IS_ERR(op) || (op == NULL)) {
66756 +               done_carry_pool(pool);
66757 +               return RETERR(op ? PTR_ERR(op) : -EIO);
66758 +       }
66759 +       cdata = (carry_insert_data *) (lowest_level + 3); /* after 3 levels */
66760 +       cdata->coord = coord;
66761 +       cdata->data = data;
66762 +       cdata->key = key;
66763 +       op->u.insert.d = cdata;
66764 +       if (flags == 0) /* no flags given: use the tree-wide defaults */
66765 +               flags = znode_get_tree(coord->node)->carry.insert_flags;
66766 +       op->u.insert.flags = flags;
66767 +       op->u.insert.type = COPT_ITEM_DATA;
66768 +       op->u.insert.child = NULL;
66769 +       if (lh != NULL) { /* ask carry to track @lh */
66770 +               assert("nikita-3245", lh->node == coord->node);
66771 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
66772 +               lowest_level->tracked = lh;
66773 +       }
66774 +
66775 +       result = reiser4_carry(lowest_level, NULL);
66776 +       done_carry_pool(pool);
66777 +
66778 +       return result;
66779 +}
66780 +
66781 +/* form carry queue to perform paste of @data with @key at @coord, and launch
66782 +   its execution by calling carry().
66783 +
66784 +   Instruct carry to update @lh if after balancing insertion coord moves into
66785 +   different block.
66786 +
66787 +*/
66788 +static int paste_with_carry(coord_t *coord,    /* coord of paste */
66789 +                           lock_handle * lh,   /* lock handle of node
66790 +                                                * where item is
66791 +                                                * pasted */
66792 +                           reiser4_item_data * data,   /* parameters of new
66793 +                                                        * item */
66794 +                           const reiser4_key * key,    /* key of new item */
66795 +                           unsigned flags/* paste flags */)
66796 +{
66797 +       int result;
66798 +       carry_pool *pool;
66799 +       carry_level *lowest_level;
66800 +       carry_insert_data *cdata;
66801 +       carry_op *op;
66802 +
66803 +       assert("umka-315", coord != NULL);
66804 +       assert("umka-316", key != NULL);
66805 +
66806 +       pool =
66807 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66808 +                           sizeof(*cdata));
66809 +       if (IS_ERR(pool))
66810 +               return PTR_ERR(pool);
66811 +       lowest_level = (carry_level *) (pool + 1); /* levels follow the pool */
66812 +       init_carry_level(lowest_level, pool);
66813 +
66814 +       op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66815 +       if (IS_ERR(op) || (op == NULL)) {
66816 +               done_carry_pool(pool);
66817 +               return RETERR(op ? PTR_ERR(op) : -EIO);
66818 +       }
66819 +       cdata = (carry_insert_data *) (lowest_level + 3); /* after 3 levels */
66820 +       cdata->coord = coord;
66821 +       cdata->data = data;
66822 +       cdata->key = key;
66823 +       op->u.paste.d = cdata;
66824 +       if (flags == 0) /* no flags given: use the tree-wide defaults */
66825 +               flags = znode_get_tree(coord->node)->carry.paste_flags;
66826 +       op->u.paste.flags = flags;
66827 +       op->u.paste.type = COPT_ITEM_DATA;
66828 +       if (lh != NULL) { /* ask carry to track @lh */
66829 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
66830 +               lowest_level->tracked = lh;
66831 +       }
66832 +
66833 +       result = reiser4_carry(lowest_level, NULL);
66834 +       done_carry_pool(pool);
66835 +
66836 +       return result;
66837 +}
66838 +
66839 +/* insert item at the given coord.
66840 +
66841 +   First try to skip carry by directly calling ->create_item() method of node
66842 +   plugin. If this is impossible (there is not enough free space in the node,
66843 +   or leftmost item in the node is created), call insert_with_carry_by_coord()
66844 +   that will do full carry().
66845 +   -E_NODE_FULL: item does not fit and @flags forbid shifting and allocation.
66846 +*/
66847 +insert_result insert_by_coord(coord_t *coord   /* coord where to
66848 +                                                * insert. coord->node has
66849 +                                                * to be write locked by
66850 +                                                * caller */ ,
66851 +                             reiser4_item_data * data  /* data to be
66852 +                                                        * inserted */ ,
66853 +                             const reiser4_key * key /* key of new item */ ,
66854 +                             lock_handle * lh  /* lock handle of write
66855 +                                                * lock on node */ ,
66856 +                             __u32 flags/* insertion flags */)
66857 +{
66858 +       unsigned item_size;
66859 +       int result;
66860 +       znode *node;
66861 +
66862 +       assert("vs-247", coord != NULL);
66863 +       assert("vs-248", data != NULL);
66864 +       assert("vs-249", data->length >= 0);
66865 +       assert("nikita-1191", znode_is_write_locked(coord->node));
66866 +
66867 +       node = coord->node; /* remembered: this is what gets zrelse()d */
66868 +       coord_clear_iplug(coord);
66869 +       result = zload(node);
66870 +       if (result != 0)
66871 +               return result;
66872 +
66873 +       item_size = space_needed(node, NULL, data, 1);
66874 +       if (item_size > znode_free_space(node) &&
66875 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66876 +           && (flags & COPI_DONT_ALLOCATE)) {
66877 +               /* we are forced to use free space of coord->node and new item
66878 +                  does not fit into it.
66879 +
66880 +                  Currently we get here only when we allocate and copy units
66881 +                  of extent item from a node to its left neighbor during
66882 +                  "squalloc"-ing.  If @node (this is left neighbor) does not
66883 +                  have enough free space - we do not want to attempt any
66884 +                  shifting and allocations because we are in squeezing and
66885 +                  everything to the left of @node is tightly packed.
66886 +                */
66887 +               result = -E_NODE_FULL;
66888 +       } else if ((item_size <= znode_free_space(node)) &&
66889 +                  !coord_is_before_leftmost(coord) &&
66890 +                  (node_plugin_by_node(node)->fast_insert != NULL)
66891 +                  && node_plugin_by_node(node)->fast_insert(coord)) {
66892 +               /* shortcut insertion without carry() overhead.
66893 +
66894 +                  Only possible if:
66895 +
66896 +                  - there is enough free space
66897 +
66898 +                  - insertion is not into the leftmost position in a node
66899 +                  (otherwise it would require updating of delimiting key in a
66900 +                  parent)
66901 +
66902 +                  - node plugin agrees with this
66903 +
66904 +                */
66905 +               result =
66906 +                   node_plugin_by_node(node)->create_item(coord, key, data,
66907 +                                                          NULL);
66908 +               znode_make_dirty(node);
66909 +       } else {
66910 +               /* otherwise do full-fledged carry(). */
66911 +               result =
66912 +                   insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
66913 +                                              flags);
66914 +       }
66915 +       zrelse(node); /* pairs with zload() above */
66916 +       return result;
66917 +}
66918 +
66919 +/* @coord is on leaf level; @data goes to twig level via COP_EXTENT carry op */
66920 +insert_result
66921 +insert_extent_by_coord(coord_t *coord,         /* coord where to insert.
66922 +                                               * coord->node has to be write
66923 +                                               * locked by caller */
66924 +                      reiser4_item_data *data,/* data to be inserted */
66925 +                      const reiser4_key *key, /* key of new item */
66926 +                      lock_handle *lh         /* lock handle of write lock
66927 +                                                 on node */)
66928 +{
66929 +       assert("vs-405", coord != NULL);
66930 +       assert("vs-406", data != NULL);
66931 +       assert("vs-407", data->length > 0);
66932 +       assert("vs-408", znode_is_write_locked(coord->node));
66933 +       assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
66934 +
66935 +       return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
66936 +                                         0 /*flags */ );
66937 +}
66938 +
66939 +/* Insert into the item at the given coord.
66940 +
66941 +   First try to skip carry by directly calling ->paste() method of item
66942 +   plugin. If this is impossible (there is not enough free space in the node,
66943 +   or we are pasting into leftmost position in the node), call
66944 +   paste_with_carry() that will do full carry().
66945 +   -E_NODE_FULL: data does not fit and @flags forbid shifting and allocation.
66946 +*/
66947 +/* paste_into_item */
66948 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
66949 +                    lock_handle * lh /* lock handle on node involved */ ,
66950 +                    const reiser4_key * key /* key of unit being pasted */ ,
66951 +                    reiser4_item_data * data /* parameters for new unit */ ,
66952 +                    unsigned flags /* insert/paste flags */ )
66953 +{
66954 +       int result;
66955 +       int size_change;
66956 +       node_plugin *nplug;
66957 +       item_plugin *iplug;
66958 +
66959 +       assert("umka-317", coord != NULL);
66960 +       assert("umka-318", key != NULL);
66961 +
66962 +       iplug = item_plugin_by_coord(coord);
66963 +       nplug = node_plugin_by_coord(coord);
66964 +
66965 +       assert("nikita-1480", iplug == data->iplug);
66966 +
66967 +       size_change = space_needed(coord->node, coord, data, 0);
66968 +       if (size_change > (int)znode_free_space(coord->node) &&
66969 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66970 +           && (flags & COPI_DONT_ALLOCATE)) {
66971 +               /* we are forced to use free space of coord->node and new data
66972 +                  does not fit into it. */
66973 +               return -E_NODE_FULL;
66974 +       }
66975 +
66976 +       /* shortcut paste without carry() overhead.
66977 +
66978 +          Only possible if:
66979 +
66980 +          - there is enough free space
66981 +
66982 +          - paste is not into the leftmost unit in a node (otherwise
66983 +          it would require updating of delimiting key in a parent)
66984 +          (as coded, the fast path additionally needs coord->unit_pos != 0)
66985 +          - node plugin agrees with this
66986 +
66987 +          - item plugin agrees with us
66988 +        */
66989 +       if (size_change <= (int)znode_free_space(coord->node) &&
66990 +           (coord->item_pos != 0 ||
66991 +            coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
66992 +           coord->unit_pos != 0 && nplug->fast_paste != NULL &&
66993 +           nplug->fast_paste(coord) &&
66994 +           iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
66995 +               if (size_change > 0) /* positive delta goes before ->paste() */
66996 +                       nplug->change_item_size(coord, size_change);
66997 +               /* NOTE-NIKITA: huh? where @key is used? */
66998 +               result = iplug->b.paste(coord, data, NULL);
66999 +               if (size_change < 0) /* negative delta goes after ->paste() */
67000 +                       nplug->change_item_size(coord, size_change);
67001 +               znode_make_dirty(coord->node);
67002 +       } else
67003 +               /* otherwise do full-fledged carry(). */
67004 +               result = paste_with_carry(coord, lh, data, key, flags);
67005 +       return result;
67006 +}
67007 +
67008 +/* grow (data->length >= 0) or shrink (data->length < 0) the item at @coord */
67009 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
67010 +                       reiser4_item_data * data /* parameters of resize */ ,
67011 +                       reiser4_key * key /* key of new unit */ ,
67012 +                       lock_handle * lh        /* lock handle of node
67013 +                                                * being modified */ ,
67014 +                       cop_insert_flag flags /* carry flags */ )
67015 +{
67016 +       znode *loaded;
67017 +       int ret;
67018 +
67019 +       assert("nikita-362", coord != NULL);
67020 +       assert("nikita-363", data != NULL);
67021 +       assert("vs-245", data->length != 0);
67022 +
67023 +       loaded = coord->node;
67024 +       coord_clear_iplug(coord);
67025 +       ret = zload(loaded);
67026 +       if (ret != 0)
67027 +               return ret;
67028 +
67029 +       if (data->length >= 0)
67030 +               ret = insert_into_item(coord, lh, key, data, flags);
67031 +       else
67032 +               ret = node_plugin_by_coord(coord)->shrink_item(coord,
67033 +                                                              -data->length);
67034 +
67035 +       zrelse(loaded);
67036 +       return ret;
67037 +}
67038 +
67039 +/* Insert flow @f at @coord via one COP_INSERT_FLOW carry operation posted on coord->node, with @lh set as the lock handle tracked by carry. Returns the reiser4_carry() result, or an error if the carry pool or the operation cannot be set up. */
67040 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
67041 +{
67042 +       int result;
67043 +       carry_pool *pool;
67044 +       carry_level *lowest_level;
67045 +       reiser4_item_data *data;
67046 +       carry_op *op;
67047 +
67048 +       pool =  /* one allocation sized for the pool, 3 carry levels and the item data */
67049 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67050 +                           sizeof(*data));
67051 +       if (IS_ERR(pool))
67052 +               return PTR_ERR(pool);
67053 +       lowest_level = (carry_level *) (pool + 1);      /* carry levels start right after the pool header */
67054 +       init_carry_level(lowest_level, pool);
67055 +
67056 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
67057 +                       0 /* operate directly on coord -> node */ );
67058 +       if (IS_ERR(op) || (op == NULL)) {
67059 +               done_carry_pool(pool);
67060 +               return RETERR(op ? PTR_ERR(op) : -EIO); /* a NULL op is reported as -EIO */
67061 +       }
67062 +
67063 +       /* these are permanent during insert_flow */
67064 +       data = (reiser4_item_data *) (lowest_level + 3);        /* item data sits after the 3 carry levels */
67065 +       data->user = 1;
67066 +       data->iplug = item_plugin_by_id(FORMATTING_ID);
67067 +       data->arg = NULL;
67068 +       /* data->length and data->data will be set before calling paste or
67069 +          insert; they start out cleared here */
67070 +       data->length = 0;
67071 +       data->data = NULL;
67072 +
67073 +       op->u.insert_flow.flags = 0;
67074 +       op->u.insert_flow.insert_point = coord;
67075 +       op->u.insert_flow.flow = f;
67076 +       op->u.insert_flow.data = data;
67077 +       op->u.insert_flow.new_nodes = 0;
67078 +
67079 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
67080 +       lowest_level->tracked = lh;     /* @lh is the handle carry is asked to track */
67081 +
67082 +       result = reiser4_carry(lowest_level, NULL);
67083 +       done_carry_pool(pool);  /* the pool is released whether or not carry succeeded */
67084 +
67085 +       return result;
67086 +}
67087 +
67088 +/* Given a coord in parent node, obtain a znode for the corresponding child. Returns ERR_PTR(-EIO) if @parent is at or below leaf level or the item at @parent_coord is not internal; otherwise returns whatever zlook()/zget() produced. */
67089 +znode *child_znode(const coord_t * parent_coord        /* coord of pointer to
67090 +                                                * child */ ,
67091 +                  znode * parent /* parent of child */ ,
67092 +                  int incore_p /* if !0 only return child if already in
67093 +                                * memory */ ,
67094 +                  int setup_dkeys_p    /* if !0 update delimiting keys of
67095 +                                        * child */ )
67096 +{
67097 +       znode *child;
67098 +
67099 +       assert("nikita-1374", parent_coord != NULL);
67100 +       assert("nikita-1482", parent != NULL);
67101 +#if REISER4_DEBUG
67102 +       if (setup_dkeys_p)
67103 +               assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
67104 +#endif
67105 +       assert("nikita-2947", znode_is_any_locked(parent));
67106 +
67107 +       if (znode_get_level(parent) <= LEAF_LEVEL) {
67108 +               /* trying to get child of leaf node */
67109 +               warning("nikita-1217", "Child of maize?");
67110 +               return ERR_PTR(RETERR(-EIO));
67111 +       }
67112 +       if (item_is_internal(parent_coord)) {
67113 +               reiser4_block_nr addr;
67114 +               item_plugin *iplug;
67115 +               reiser4_tree *tree;
67116 +
67117 +               iplug = item_plugin_by_coord(parent_coord);
67118 +               assert("vs-512", iplug->s.internal.down_link);
67119 +               iplug->s.internal.down_link(parent_coord, NULL, &addr); /* fills addr, which is then used as the child's block address */
67120 +
67121 +               tree = znode_get_tree(parent);
67122 +               if (incore_p)
67123 +                       child = zlook(tree, &addr);     /* lookup only; the NULL check below suggests this may find nothing */
67124 +               else
67125 +                       child =
67126 +                           zget(tree, &addr, parent,
67127 +                                znode_get_level(parent) - 1,
67128 +                                reiser4_ctx_gfp_mask_get());
67129 +               if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p) /* only for a real znode */
67130 +                       set_child_delimiting_keys(parent, parent_coord, child);
67131 +       } else {
67132 +               warning("nikita-1483", "Internal item expected");
67133 +               child = ERR_PTR(RETERR(-EIO));
67134 +       }
67135 +       return child;
67136 +}
67137 +
67138 +/* Remove znode from its transaction: give back the node's block (deferred deallocation for a real block, "fake allocated" -> free otherwise), then uncapture its page or, when it has no page, the jnode itself. @node must have JNODE_HEARD_BANSHEE set. */
67139 +static void uncapture_znode(znode * node)
67140 +{
67141 +       struct page *page;
67142 +
67143 +       assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
67144 +
67145 +       if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
67146 +               int ret;
67147 +
67148 +               /* An already allocated block goes right to the atom's delete set. */
67149 +               ret =
67150 +                   reiser4_dealloc_block(znode_get_block(node), 0,
67151 +                                         BA_DEFER | BA_FORMATTED);
67152 +               if (ret)        /* failure is only logged; uncapturing continues */
67153 +                       warning("zam-942",
67154 +                               "can\'t add a block (%llu) number to atom's delete set\n",
67155 +                               (unsigned long long)(*znode_get_block(node)));
67156 +
67157 +               spin_lock_znode(node);
67158 +               /* Here we return flush reserved block which was reserved at the
67159 +                * moment when this allocated node was marked dirty and still
67160 +                * not used by flush in node relocation procedure.  */
67161 +               if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
67162 +                       txn_atom *atom;
67163 +
67164 +                       atom = jnode_get_atom(ZJNODE(node));    /* presumably returns with the atom locked: it is unlocked below */
67165 +                       assert("zam-939", atom != NULL);
67166 +                       spin_unlock_znode(node);
67167 +                       flush_reserved2grabbed(atom, (__u64) 1);
67168 +                       spin_unlock_atom(atom);
67169 +               } else
67170 +                       spin_unlock_znode(node);
67171 +       } else {
67172 +               /* znode has assigned block which is counted as "fake
67173 +                  allocated". Return it back to "free blocks". */
67174 +               fake_allocated2free((__u64) 1, BA_FORMATTED);
67175 +       }
67176 +
67177 +       /*
67178 +        * uncapture page from transaction. There is a possibility of a race
67179 +        * with ->releasepage(): reiser4_releasepage() detaches page from this
67180 +        * jnode and we have nothing to uncapture. To avoid this, get
67181 +        * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
67182 +        * will deal with released page itself.
67183 +        */
67184 +       spin_lock_znode(node);
67185 +       page = znode_page(node);
67186 +       if (likely(page != NULL)) {
67187 +               /*
67188 +                * reiser4_uncapture_page() can only be called when we are sure
67189 +                * that znode is pinned in memory, which we are, because
67190 +                * forget_znode() is only called from longterm_unlock_znode().
67191 +                */
67192 +               page_cache_get(page);   /* pin the page before dropping the znode spin lock */
67193 +               spin_unlock_znode(node);
67194 +               lock_page(page);
67195 +               reiser4_uncapture_page(page);
67196 +               unlock_page(page);
67197 +               page_cache_release(page);
67198 +       } else {
67199 +               txn_atom *atom;
67200 +
67201 +               /* handle "flush queued" znodes: the znode spin lock taken above is still held on entry to this loop */
67202 +               while (1) {
67203 +                       atom = jnode_get_atom(ZJNODE(node));
67204 +                       assert("zam-943", atom != NULL);
67205 +
67206 +                       if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
67207 +                           || !atom->nr_running_queues)
67208 +                               break;
67209 +
67210 +                       spin_unlock_znode(node);        /* drop the znode lock while waiting on the atom */
67211 +                       reiser4_atom_wait_event(atom);
67212 +                       spin_lock_znode(node);
67213 +               }
67214 +
67215 +               reiser4_uncapture_block(ZJNODE(node));  /* NOTE(review): the znode spin lock is not released in this branch after this call - presumably reiser4_uncapture_block() drops it; confirm */
67216 +               spin_unlock_atom(atom);
67217 +               zput(node);
67218 +       }
67219 +}
67220 +
67221 +/* This is called from longterm_unlock_znode() when last lock is released from
67222 +   the node that has been removed from the tree. Here the node is taken off the
67223 +   sibling list, its lock is invalidated and it is uncaptured from its transaction. @handle must be a write lock on a node with JNODE_HEARD_BANSHEE set. */
67224 +void forget_znode(lock_handle * handle)
67225 +{
67226 +       znode *node;
67227 +       reiser4_tree *tree;
67228 +
67229 +       assert("umka-319", handle != NULL);
67230 +
67231 +       node = handle->node;
67232 +       tree = znode_get_tree(node);
67233 +
67234 +       assert("vs-164", znode_is_write_locked(node));
67235 +       assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
67236 +       assert_rw_locked(&(node->lock.guard));
67237 +
67238 +       /* We assume that this node was detached from its parent before
67239 +        * unlocking, it gives no way to reach this node from parent through a
67240 +        * down link.  The node should have no children and, thereby, can't be
67241 +        * reached from them by their parent pointers.  The only way to obtain a
67242 +        * reference to the node is to use sibling pointers from its left and
67243 +        * right neighbors.  In the next several lines we remove the node from
67244 +        * the sibling list. */
67245 +
67246 +       write_lock_tree(tree);
67247 +       sibling_list_remove(node);
67248 +       znode_remove(node, tree);
67249 +       write_unlock_tree(tree);
67250 +
67251 +       /* Here we set JNODE_DYING and cancel all pending lock requests.  It
67252 +        * forces all lock requestor threads to repeat iterations of getting
67253 +        * lock on a child, neighbor or parent node.  But, those threads can't
67254 +        * come to this node again, because this node is no longer a child,
67255 +        * neighbor or parent of any other node.  This order of znode
67256 +        * invalidation does not allow other threads to waste cpu time in a busy
67257 +        * loop, trying to lock dying object.  The exception is in the flush
67258 +        * code when we take node directly from atom's capture list.*/
67259 +       reiser4_invalidate_lock(handle);
67260 +       uncapture_znode(node);
67261 +}
67262 +
67263 +/* Check that internal item at @pointer really contains pointer to @child. Returns NS_FOUND when @pointer is an existing unit of an internal item whose down link equals @child's block number, NS_NOT_FOUND otherwise. */
67264 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
67265 +                                                * @child */ ,
67266 +                      const znode * child /* child znode */ )
67267 +{
67268 +       assert("nikita-1016", pointer != NULL);
67269 +       assert("nikita-1017", child != NULL);
67270 +       assert("nikita-1018", pointer->node != NULL);
67271 +
67272 +       assert("nikita-1325", znode_is_any_locked(pointer->node));
67273 +
67274 +       assert("nikita-2985",
67275 +              znode_get_level(pointer->node) == znode_get_level(child) + 1);
67276 +
67277 +       coord_clear_iplug((coord_t *) pointer); /* const is cast away here: @pointer is handed to a helper taking a non-const coord */
67278 +
67279 +       if (coord_is_existing_unit(pointer)) {
67280 +               item_plugin *iplug;
67281 +               reiser4_block_nr addr;
67282 +
67283 +               if (item_is_internal(pointer)) {
67284 +                       iplug = item_plugin_by_coord(pointer);
67285 +                       assert("vs-513", iplug->s.internal.down_link);
67286 +                       iplug->s.internal.down_link(pointer, NULL, &addr);
67287 +                       /* check that cached value is correct: compare the item's down link with @child's block number */
67288 +                       if (disk_addr_eq(&addr, znode_get_block(child))) {
67289 +                               return NS_FOUND;
67290 +                       }
67291 +               }
67292 +       }
67293 +       /* warning ("jmacd-1002", "tree pointer incorrect"); */
67294 +       return NS_NOT_FOUND;
67295 +}
67296 +
67297 +/* find coord of pointer to new @child in @parent.
67298 +
67299 +   Find the &coord_t in the @parent where pointer to a given @child will
67300 +   be in: the pointer to @left is located with find_child_ptr() and @result is set to the position just after it (between == AFTER_UNIT).
67301 +   Returns NS_NOT_FOUND (through RETERR) on success, since the new pointer does not exist yet, or -EIO when @left's pointer cannot be found. @child is only checked by an assertion.
67302 +*/
67303 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67304 +                      znode *
67305 +                      child UNUSED_ARG /* child znode, passed locked */ ,
67306 +                      znode * left /* left brother of new node */ ,
67307 +                      coord_t * result /* where result is stored in */ )
67308 +{
67309 +       int ret;
67310 +
67311 +       assert("nikita-1486", parent != NULL);
67312 +       assert("nikita-1487", child != NULL);
67313 +       assert("nikita-1488", result != NULL);
67314 +
67315 +       ret = find_child_ptr(parent, left, result);
67316 +       if (ret != NS_FOUND) {
67317 +               warning("nikita-1489", "Cannot find brother position: %i", ret);
67318 +               return RETERR(-EIO);
67319 +       } else {
67320 +               result->between = AFTER_UNIT;   /* new pointer goes right after @left's pointer */
67321 +               return RETERR(NS_NOT_FOUND);
67322 +       }
67323 +}
67324 +
67325 +/* find coord of pointer to @child in @parent.
67326 +
67327 +   Find the &coord_t in the @parent where pointer to a given @child is in. The position cached in child->in_parent is tried first, then a lookup by @child's left delimiting key, and finally a scan by block number (find_child_by_addr()).
67328 +   Returns NS_FOUND with @result set, otherwise the result of the last lookup/scan performed. @parent must be loaded.
67329 +*/
67330 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67331 +                  znode * child /* child znode, passed locked */ ,
67332 +                  coord_t * result /* where result is stored in */ )
67333 +{
67334 +       int lookup_res;
67335 +       node_plugin *nplug;
67336 +       /* left delimiting key of a child */
67337 +       reiser4_key ld;
67338 +       reiser4_tree *tree;
67339 +
67340 +       assert("nikita-934", parent != NULL);
67341 +       assert("nikita-935", child != NULL);
67342 +       assert("nikita-936", result != NULL);
67343 +       assert("zam-356", znode_is_loaded(parent));
67344 +
67345 +       coord_init_zero(result);
67346 +       result->node = parent;
67347 +
67348 +       nplug = parent->nplug;
67349 +       assert("nikita-939", nplug != NULL);
67350 +
67351 +       tree = znode_get_tree(parent);
67352 +       /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67353 +        * not aliased to ->in_parent of some znode. Otherwise,
67354 +        * parent_coord_to_coord() below would modify data protected by tree
67355 +        * lock. */
67356 +       read_lock_tree(tree);
67357 +       /* fast path. Try to use cached value. Lock tree to keep
67358 +          node->pos_in_parent and pos->*_blocknr consistent. NOTE(review): if item_pos is an unsigned short (as the cast further down suggests), integer promotion makes "item_pos + 1" never equal 0, so the test below would always be true - confirm. */
67359 +       if (child->in_parent.item_pos + 1 != 0) {
67360 +               parent_coord_to_coord(&child->in_parent, result);
67361 +               if (check_tree_pointer(result, child) == NS_FOUND) {
67362 +                       read_unlock_tree(tree);
67363 +                       return NS_FOUND;
67364 +               }
67365 +
67366 +               child->in_parent.item_pos = (unsigned short)~0; /* cached position was stale: invalidate it. NOTE(review): stored under the tree read lock, while the other in_parent updates here take the write lock - confirm */
67367 +       }
67368 +       read_unlock_tree(tree);
67369 +
67370 +       /* if the above failed, find some key from @child. We are looking for the
67371 +          least key in a child. */
67372 +       read_lock_dk(tree);
67373 +       ld = *znode_get_ld_key(child);
67374 +       read_unlock_dk(tree);
67375 +       /*
67376 +        * now, lookup parent with key just found. Note, that left delimiting
67377 +        * key doesn't identify node uniquely, because (in extremely rare
67378 +        * case) two nodes can have equal left delimiting keys, if one of them
67379 +        * is completely filled with directory entries that all happened to be
67380 +        * hash collision. But, we check block number in check_tree_pointer()
67381 +        * and, so, are safe.
67382 +        */
67383 +       lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67384 +       /* update cached pos_in_node, then verify that the found item really points to @child */
67385 +       if (lookup_res == NS_FOUND) {
67386 +               write_lock_tree(tree);
67387 +               coord_to_parent_coord(result, &child->in_parent);
67388 +               write_unlock_tree(tree);
67389 +               lookup_res = check_tree_pointer(result, child);
67390 +       }
67391 +       if (lookup_res == NS_NOT_FOUND) /* last resort: scan @parent for @child's block number */
67392 +               lookup_res = find_child_by_addr(parent, child, result);
67393 +       return lookup_res;
67394 +}
67395 +
67396 +/* find coord of pointer to @child in @parent by scanning
67397 +
67398 +   Find the &coord_t in the @parent where pointer to a given @child
67399 +   is in by scanning all units in @parent and checking each with
67400 +   check_tree_pointer(), i.e. comparing block numbers with that of @child.
67401 +   On a match the position is also cached in child->in_parent. Returns NS_FOUND or NS_NOT_FOUND.
67402 +*/
67403 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67404 +                             znode * child /* child znode, passed locked */ ,
67405 +                             coord_t * result /* where result is stored in */ )
67406 +{
67407 +       int ret;
67408 +
67409 +       assert("nikita-1320", parent != NULL);
67410 +       assert("nikita-1321", child != NULL);
67411 +       assert("nikita-1322", result != NULL);
67412 +
67413 +       ret = NS_NOT_FOUND;
67414 +
67415 +       for_all_units(result, parent) {
67416 +               if (check_tree_pointer(result, child) == NS_FOUND) {
67417 +                       write_lock_tree(znode_get_tree(parent));        /* in_parent is updated under the tree write lock */
67418 +                       coord_to_parent_coord(result, &child->in_parent);
67419 +                       write_unlock_tree(znode_get_tree(parent));
67420 +                       ret = NS_FOUND;
67421 +                       break;
67422 +               }
67423 +       }
67424 +       return ret;
67425 +}
67426 +
67427 +/* true, if @addr is "unallocated block number", which is just address, with
67428 +   highest bit set: the bits selected by REISER4_BLOCKNR_STATUS_BIT_MASK must equal REISER4_UNALLOCATED_STATUS_VALUE. */
67429 +int is_disk_addr_unallocated(const reiser4_block_nr * addr     /* address to
67430 +                                                                * check */ )
67431 +{
67432 +       assert("nikita-1766", addr != NULL);
67433 +       cassert(sizeof(reiser4_block_nr) == 8); /* the status-bit test relies on an 8-byte block number */
67434 +       return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
67435 +           REISER4_UNALLOCATED_STATUS_VALUE;
67436 +}
67437 +
67438 +/* returns true if removing bytes of given range of key [from_key, to_key]
67439 +   causes removing of whole item @from, i.e. @from_key is not greater than the item's first key and @to_key is not less than its last key. @from must be an extent item. */
67440 +static int
67441 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
67442 +                       const reiser4_key * to_key)
67443 +{
67444 +       item_plugin *iplug;
67445 +       reiser4_key key_in_item;
67446 +
67447 +       assert("umka-325", from != NULL);
67448 +       assert("", item_is_extent(from));
67449 +
67450 +       /* check first key just in case: a range starting past it leaves the head of the item in place */
67451 +       item_key_by_coord(from, &key_in_item);
67452 +       if (keygt(from_key, &key_in_item))
67453 +               return 0;
67454 +
67455 +       /* check last key */
67456 +       iplug = item_plugin_by_coord(from);
67457 +       assert("vs-611", iplug && iplug->s.file.append_key);
67458 +
67459 +       iplug->s.file.append_key(from, &key_in_item);
67460 +       set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1); /* step back one offset from the append key to get the key compared as the item's last */
67461 +
67462 +       if (keylt(to_key, &key_in_item))
67463 +               /* last byte is not removed */
67464 +               return 0;
67465 +       return 1;
67466 +}
67467 +
67468 +/* helper function for prepare_twig_kill(): @left and @right are formatted
67469 + * neighbors of the extent item being removed (either may be NULL and is then skipped). Load and read-lock the neighbors
67470 + * and store lock handles into @kdata (kdata->left, kdata->right) for later use by kill_hook_extent(). Returns 0 or the first error, in which case both handles are released and any node loaded here is zrelse()d. */
67471 +static int
67472 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67473 +{
67474 +       int result;
67475 +       int left_loaded;
67476 +       int right_loaded;
67477 +
67478 +       result = 0;
67479 +       left_loaded = right_loaded = 0;
67480 +
67481 +       if (left != NULL) {
67482 +               result = zload(left);
67483 +               if (result == 0) {
67484 +                       left_loaded = 1;
67485 +                       result = longterm_lock_znode(kdata->left, left,
67486 +                                                    ZNODE_READ_LOCK,
67487 +                                                    ZNODE_LOCK_LOPRI);
67488 +               }
67489 +       }
67490 +       if (result == 0 && right != NULL) {
67491 +               result = zload(right);
67492 +               if (result == 0) {
67493 +                       right_loaded = 1;
67494 +                       result = longterm_lock_znode(kdata->right, right,
67495 +                                                    ZNODE_READ_LOCK,
67496 +                                                    ZNODE_LOCK_HIPRI |
67497 +                                                    ZNODE_LOCK_NONBLOCK);
67498 +               }
67499 +       }
67500 +       if (result != 0) {      /* undo everything acquired so far */
67501 +               done_lh(kdata->left);
67502 +               done_lh(kdata->right);
67503 +               if (left_loaded != 0)
67504 +                       zrelse(left);
67505 +               if (right_loaded != 0)
67506 +                       zrelse(right);
67507 +       }
67508 +       return result;
67509 +}
67510 +
67511 +static void done_children(carry_kill_data * kdata)     /* for each of kdata->left / kdata->right that holds a node: zrelse() the node and release the lock handle */
67512 +{
67513 +       if (kdata->left != NULL && kdata->left->node != NULL) {
67514 +               zrelse(kdata->left->node);      /* release the node before done_lh() gives up the handle */
67515 +               done_lh(kdata->left);
67516 +       }
67517 +       if (kdata->right != NULL && kdata->right->node != NULL) {
67518 +               zrelse(kdata->right->node);
67519 +               done_lh(kdata->right);
67520 +       }
67521 +}
67522 +
67523 +/* part of cut_node. It is called when cut_node is called to remove or cut part
67524 +   of extent item. When head of that item is removed - we have to update right
67525 +   delimiting key of left neighbor of extent. When item is removed completely - we
67526 +   have to set sibling link between left and right neighbor of removed
67527 +   extent. This may return -E_DEADLOCK because of trying to get left neighbor
67528 +   locked. So, caller should repeat an attempt. The formatted children found here are loaded and locked into kdata->left / kdata->right by prepare_children(). Returns 0 when nothing needs preparing or on success, otherwise an error.
67529 +*/
67530 +/* Audited by: umka (2002.06.16) */
67531 +static int
67532 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67533 +{
67534 +       int result;
67535 +       reiser4_key key;
67536 +       lock_handle left_lh;
67537 +       lock_handle right_lh;
67538 +       coord_t left_coord;
67539 +       coord_t *from;
67540 +       znode *left_child;
67541 +       znode *right_child;
67542 +       reiser4_tree *tree;
67543 +       int left_zloaded_here, right_zloaded_here;
67544 +
67545 +       from = kdata->params.from;
67546 +       assert("umka-326", from != NULL);
67547 +       assert("umka-327", kdata->params.to != NULL);
67548 +
67549 +       /* for one extent item only yet */
67550 +       assert("vs-591", item_is_extent(from));
67551 +       assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67552 +
67553 +       if ((kdata->params.from_key
67554 +            && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67555 +           || from->unit_pos != 0) {
67556 +               /* head of item @from is not removed, there is nothing to
67557 +                  worry about */
67558 +               return 0;
67559 +       }
67560 +
67561 +       result = 0;
67562 +       left_zloaded_here = 0;
67563 +       right_zloaded_here = 0;
67564 +
67565 +       left_child = right_child = NULL;
67566 +
67567 +       coord_dup(&left_coord, from);
67568 +       init_lh(&left_lh);
67569 +       init_lh(&right_lh);
67570 +       if (coord_prev_unit(&left_coord)) {
67571 +               /* @from is leftmost item in its node */
67572 +               if (!locked_left_neighbor) {
67573 +                       result =
67574 +                           reiser4_get_left_neighbor(&left_lh, from->node,
67575 +                                                     ZNODE_READ_LOCK,
67576 +                                                     GN_CAN_USE_UPPER_LEVELS);
67577 +                       switch (result) {
67578 +                       case 0:
67579 +                               break;
67580 +                       case -E_NO_NEIGHBOR:
67581 +                               /* there is no formatted node to the left of
67582 +                                  from->node */
67583 +                               warning("vs-605",
67584 +                                       "extent item has smallest key in "
67585 +                                       "the tree and it is about to be removed");
67586 +                               return 0;
67587 +                       case -E_DEADLOCK:
67588 +                               /* need to restart */
67589 +                       default:
67590 +                               return result;
67591 +                       }
67592 +
67593 +                       /* we have acquired left neighbor of from->node */
67594 +                       result = zload(left_lh.node);
67595 +                       if (result)
67596 +                               goto done;
67597 +
67598 +                       locked_left_neighbor = left_lh.node;
67599 +               } else {
67600 +                       /* squalloc_right_twig_cut should have supplied locked
67601 +                        * left neighbor */
67602 +                       assert("vs-834",
67603 +                              znode_is_write_locked(locked_left_neighbor));
67604 +                       result = zload(locked_left_neighbor);
67605 +                       if (result)     /* nothing has been acquired here yet, so a plain return is enough */
67606 +                               return result;
67607 +               }
67608 +
67609 +               left_zloaded_here = 1;
67610 +               coord_init_last_unit(&left_coord, locked_left_neighbor);
67611 +       }
67612 +
67613 +       if (!item_is_internal(&left_coord)) {
67614 +               /* what else but extent can be on twig level */
67615 +               assert("vs-606", item_is_extent(&left_coord));
67616 +
67617 +               /* there is no left formatted child */
67618 +               if (left_zloaded_here)
67619 +                       zrelse(locked_left_neighbor);
67620 +               done_lh(&left_lh);
67621 +               return 0;
67622 +       }
67623 +
67624 +       tree = znode_get_tree(left_coord.node);
67625 +       left_child = child_znode(&left_coord, left_coord.node, 1, 0);   /* in-core lookup only, delimiting keys are not set up */
67626 +
67627 +       if (IS_ERR(left_child)) {
67628 +               result = PTR_ERR(left_child);
67629 +               goto done;
67630 +       }
67631 +
67632 +       /* left child is acquired, calculate new right delimiting key for it
67633 +          and get right child if it is necessary. NOTE(review): @key is assigned in the branches below but is not read again later in this function - confirm it is still needed. */
67634 +       if (item_removed_completely
67635 +           (from, kdata->params.from_key, kdata->params.to_key)) {
67636 +               /* try to get right child of removed item */
67637 +               coord_t right_coord;
67638 +
67639 +               assert("vs-607",
67640 +                      kdata->params.to->unit_pos ==
67641 +                      coord_last_unit_pos(kdata->params.to));
67642 +               coord_dup(&right_coord, kdata->params.to);
67643 +               if (coord_next_unit(&right_coord)) {
67644 +                       /* @to is rightmost unit in the node */
67645 +                       result =
67646 +                           reiser4_get_right_neighbor(&right_lh, from->node,
67647 +                                                      ZNODE_READ_LOCK,
67648 +                                                      GN_CAN_USE_UPPER_LEVELS);
67649 +                       switch (result) {
67650 +                       case 0:
67651 +                               result = zload(right_lh.node);
67652 +                               if (result)
67653 +                                       goto done;
67654 +
67655 +                               right_zloaded_here = 1;
67656 +                               coord_init_first_unit(&right_coord,
67657 +                                                     right_lh.node);
67658 +                               item_key_by_coord(&right_coord, &key);
67659 +                               break;
67660 +
67661 +                       case -E_NO_NEIGHBOR:
67662 +                               /* there is no formatted node to the right of
67663 +                                  from->node */
67664 +                               read_lock_dk(tree);
67665 +                               key = *znode_get_rd_key(from->node);
67666 +                               read_unlock_dk(tree);
67667 +                               right_coord.node = NULL;        /* marks "no right neighbor" for the check below */
67668 +                               result = 0;
67669 +                               break;
67670 +                       default:
67671 +                               /* real error */
67672 +                               goto done;
67673 +                       }
67674 +               } else {
67675 +                       /* there is an item to the right of @from - take its key */
67676 +                       item_key_by_coord(&right_coord, &key);
67677 +               }
67678 +
67679 +               /* try to get right child of @from */
67680 +               if (right_coord.node && /* there is right neighbor of @from */
67681 +                   item_is_internal(&right_coord)) {   /* it is internal item */
67682 +                       right_child = child_znode(&right_coord,
67683 +                                                 right_coord.node, 1, 0);
67684 +
67685 +                       if (IS_ERR(right_child)) {
67686 +                               result = PTR_ERR(right_child);
67687 +                               goto done;
67688 +                       }
67689 +
67690 +               }
67691 +               /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
67692 +                  update of right delimiting key of left_child */
67693 +               result = prepare_children(left_child, right_child, kdata);
67694 +       } else {
67695 +               /* head of item @to is removed. left_child has to get right delimiting key update. Prepare it for that */
67696 +               result = prepare_children(left_child, NULL, kdata);
67697 +       }
67698 +
67699 +      done:
67700 +       if (right_child)        /* NOTE(review): on the IS_ERR() paths above this pointer is an error value, not NULL - confirm zput() tolerates that */
67701 +               zput(right_child);
67702 +       if (right_zloaded_here)
67703 +               zrelse(right_lh.node);
67704 +       done_lh(&right_lh);
67705 +
67706 +       if (left_child)
67707 +               zput(left_child);
67708 +       if (left_zloaded_here)
67709 +               zrelse(locked_left_neighbor);
67710 +       done_lh(&left_lh);
67711 +       return result;
67712 +}
67713 +
67714 +/* Remove the part of node content between coordinates @from and @to. The units to which @from and @to are set
67715 +   are cut completely. Returns the result of reiser4_carry(), or the error from allocating/posting the carry op. */
67716 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67717 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,       /* first key to be removed */
67718 +                    const reiser4_key * to_key,        /* last key to be removed */
67719 +                    reiser4_key *
67720 +                    smallest_removed /* smallest key actually removed */ )
67721 +{
67722 +       int result;
67723 +       carry_pool *pool;
67724 +       carry_level *lowest_level;
67725 +       carry_cut_data *cut_data;
67726 +       carry_op *op;
67727 +
67728 +       assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67729 +
67730 +       pool =
67731 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67732 +                           sizeof(*cut_data)); /* one allocation: pool header, 3 carry levels, cut data */
67733 +       if (IS_ERR(pool))
67734 +               return PTR_ERR(pool);
67735 +       lowest_level = (carry_level *) (pool + 1); /* carry levels start right after the pool header */
67736 +       init_carry_level(lowest_level, pool);
67737 +
67738 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67739 +       assert("vs-1509", op != 0);
67740 +       if (IS_ERR(op)) {
67741 +               done_carry_pool(pool); /* nothing was carried yet; only the pool has to be released */
67742 +               return PTR_ERR(op);
67743 +       }
67744 +
67745 +       cut_data = (carry_cut_data *) (lowest_level + 3); /* cut data lives right after the 3 carry levels */
67746 +       cut_data->params.from = from;
67747 +       cut_data->params.to = to;
67748 +       cut_data->params.from_key = from_key;
67749 +       cut_data->params.to_key = to_key;
67750 +       cut_data->params.smallest_removed = smallest_removed;
67751 +
67752 +       op->u.cut_or_kill.is_cut = 1; /* "cut" flavour of COP_CUT; kill_node_content() stores 0 here */
67753 +       op->u.cut_or_kill.u.cut = cut_data;
67754 +
67755 +       result = reiser4_carry(lowest_level, NULL);
67756 +       done_carry_pool(pool);
67757 +
67758 +       return result;
67759 +}
67760 +
67761 +/* kill part of the node
67762 +
67763 +   Kill part or whole content of node.
67764 +
67765 +   Kill data between @from and @to of @from->node and call reiser4_carry() to
67766 +   make corresponding changes in the tree. @from->node may become empty. If so -
67767 +   pointer to it will be removed. Neighboring nodes are not changed. Smallest
67768 +   removed key is stored in @smallest_removed. Returns 0 or an error code.
67769 +
67770 +*/
67771 +int kill_node_content(coord_t * from,  /* coord of the first unit/item that will be eliminated */
67772 +                     coord_t * to,     /* coord of the last unit/item that will be eliminated */
67773 +                     const reiser4_key * from_key,     /* first key to be removed */
67774 +                     const reiser4_key * to_key,       /* last key to be removed */
67775 +                     reiser4_key * smallest_removed,   /* smallest key actually removed */
67776 +                     znode * locked_left_neighbor,     /* this is set when kill_node_content is called with left neighbor
67777 +                                                        * locked (in squalloc_right_twig_cut, namely) */
67778 +                     struct inode *inode,      /* inode of file whose item (or its part) is to be killed. This is necessary to
67779 +                                                  invalidate pages together with item pointing to them */
67780 +                     int truncate)
67781 +{                              /* @truncate: this call is made for file truncate */
67782 +       int result;
67783 +       carry_pool *pool;
67784 +       carry_level *lowest_level;
67785 +       carry_kill_data *kdata;
67786 +       lock_handle *left_child;
67787 +       lock_handle *right_child;
67788 +       carry_op *op;
67789 +
67790 +       assert("umka-328", from != NULL);
67791 +       assert("vs-316", !node_is_empty(from->node));
67792 +       assert("nikita-1812", coord_is_existing_unit(from)
67793 +              && coord_is_existing_unit(to));
67794 +
67795 +       /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
67796 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67797 +                              sizeof(carry_kill_data) +
67798 +                              2 * sizeof(lock_handle) +
67799 +                              5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67800 +       if (IS_ERR(pool))
67801 +               return PTR_ERR(pool);
67802 +
67803 +       lowest_level = (carry_level *) (pool + 1); /* carry levels start right after the pool header */
67804 +       init_carry_level(lowest_level, pool);
67805 +
67806 +       kdata = (carry_kill_data *) (lowest_level + 3); /* kill data follows the 3 carry levels */
67807 +       left_child = (lock_handle *) (kdata + 1); /* then the two lock handles, back to back */
67808 +       right_child = left_child + 1;
67809 +
67810 +       init_lh(left_child);
67811 +       init_lh(right_child);
67812 +
67813 +       kdata->params.from = from;
67814 +       kdata->params.to = to;
67815 +       kdata->params.from_key = from_key;
67816 +       kdata->params.to_key = to_key;
67817 +       kdata->params.smallest_removed = smallest_removed;
67818 +       kdata->params.truncate = truncate;
67819 +       kdata->flags = 0;
67820 +       kdata->inode = inode;
67821 +       kdata->left = left_child;
67822 +       kdata->right = right_child;
67823 +       /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67824 +       kdata->buf = (char *)(right_child + 1);
67825 +
67826 +       if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67827 +               /* left child of extent item may have to get updated right
67828 +                  delimiting key and to get linked with right child of extent
67829 +                  @from if it will be removed completely */
67830 +               result = prepare_twig_kill(kdata, locked_left_neighbor);
67831 +               if (result) {
67832 +                       done_children(kdata); /* release whatever prepare_twig_kill() stored in kdata->left/right */
67833 +                       done_carry_pool(pool);
67834 +                       return result;
67835 +               }
67836 +       }
67837 +
67838 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67839 +       if (IS_ERR(op) || (op == NULL)) {
67840 +               done_children(kdata);
67841 +               done_carry_pool(pool);
67842 +               return RETERR(op ? PTR_ERR(op) : -EIO); /* a NULL op is reported as -EIO */
67843 +       }
67844 +
67845 +       op->u.cut_or_kill.is_cut = 0; /* "kill" flavour of COP_CUT; cut_node_content() stores 1 here */
67846 +       op->u.cut_or_kill.u.kill = kdata;
67847 +
67848 +       result = reiser4_carry(lowest_level, NULL);
67849 +
67850 +       done_children(kdata);
67851 +       done_carry_pool(pool);
67852 +       return result;
67853 +}
67854 +
67855 +void /* stand-in for the tail item kill hook; reiser4_delete_node() calls it for nodes it deletes without loading */
67856 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67857 +{
67858 +       if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
67859 +               pgoff_t start_pg, end_pg;
67860 +
67861 +               start_pg = start >> PAGE_CACHE_SHIFT; /* index of the page holding the first killed byte */
67862 +               end_pg = (end - 1) >> PAGE_CACHE_SHIFT; /* index of the page holding byte @end - 1 */
67863 +
67864 +               if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
67865 +                       /*
67866 +                        * @start is page aligned: kill up to the page boundary, within one page.
67867 +                        */
67868 +                       assert("vs-123456", start_pg == end_pg);
67869 +                       reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
67870 +                                                truncate);
67871 +               } else if (start_pg != end_pg) {
67872 +                       /*
67873 +                        * page boundary is within killed portion of node: invalidate starting at page end_pg.
67874 +                        */
67875 +                       assert("vs-654321", end_pg - start_pg == 1);
67876 +                       reiser4_invalidate_pages(inode->i_mapping, end_pg,
67877 +                                                end_pg - start_pg, 1); /* NOTE(review): literal 1, not @truncate as above - confirm intended */
67878 +               }
67879 +       }
67880 +       inode_sub_bytes(inode, end - start); /* done whether or not the inode has the MMAP flag */
67881 +}
67882 +
67883 +/**
67884 + * Delete whole @node from the reiser4 tree without loading it.
67885 + *
67886 + * @node: node to be deleted (asserted to be write-locked),
67887 + * @smallest_removed: must not be NULL; on success it is set to the left
67888 + *                    delimiting key of the deleted node,
67889 + * @object: inode pointer, if we truncate a file body.
67890 + * @truncate: true if called for file truncate.
67891 + *
67892 + * @return: 0 if success, error code otherwise.
67893 + *
67894 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
67895 + * contains the right value of the smallest removed key from the previous
67896 + * cut_worker() iteration.  This is needed for proper accounting of
67897 + * "i_blocks" and "i_bytes" fields of the @object.
67898 + */
67899 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
67900 +                       struct inode *object, int truncate)
67901 +{
67902 +       lock_handle parent_lock;
67903 +       coord_t cut_from;
67904 +       coord_t cut_to;
67905 +       reiser4_tree *tree;
67906 +       int ret;
67907 +
67908 +       assert("zam-937", node != NULL);
67909 +       assert("zam-933", znode_is_write_locked(node));
67910 +       assert("zam-999", smallest_removed != NULL);
67911 +
67912 +       init_lh(&parent_lock);
67913 +
67914 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
67915 +       if (ret)
67916 +               return ret;
67917 +
67918 +       assert("zam-934", !znode_above_root(parent_lock.node));
67919 +
67920 +       ret = zload(parent_lock.node);
67921 +       if (ret)
67922 +               goto failed_nozrelse; /* parent was not loaded: skip the zrelse() below */
67923 +
67924 +       ret = find_child_ptr(parent_lock.node, node, &cut_from);
67925 +       if (ret)
67926 +               goto failed;
67927 +
67928 +       /* decrement child counter and set parent pointer to NULL before
67929 +          deleting the pointer from parent node because of checks in
67930 +          internal_kill_item_hook (we can delete the last item from the parent
67931 +          node, the parent node is going to be deleted and its c_count should
67932 +          be zero). */
67933 +
67934 +       tree = znode_get_tree(node);
67935 +       write_lock_tree(tree);
67936 +       init_parent_coord(&node->in_parent, NULL);
67937 +       --parent_lock.node->c_count;
67938 +       write_unlock_tree(tree);
67939 +
67940 +       assert("zam-989", item_is_internal(&cut_from));
67941 +
67942 +       /* @node should be deleted after unlocking. */
67943 +       ZF_SET(node, JNODE_HEARD_BANSHEE);
67944 +
67945 +       /* remove a pointer from the parent node to the node being deleted. */
67946 +       coord_dup(&cut_to, &cut_from); /* cut exactly one unit: from == to */
67947 +       /* FIXME: shouldn't this be kill_node_content */
67948 +       ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
67949 +       if (ret)
67950 +               /* FIXME(Zam): Should we re-connect the node to its parent if
67951 +                * cut_node fails? */
67952 +               goto failed;
67953 +
67954 +       {
67955 +               reiser4_tree *tree = current_tree; /* NOTE: shadows the function-level @tree */
67956 +               __u64 start_offset = 0, end_offset = 0;
67957 +
67958 +               read_lock_tree(tree);
67959 +               write_lock_dk(tree);
67960 +               if (object) {
67961 +                       /* We use @smallest_removed and the left delimiting of
67962 +                        * the current node for @object->i_blocks, i_bytes
67963 +                        * calculation.  We assume that the items after the
67964 +                        * *@smallest_removed key have been deleted from the
67965 +                        * file body. */
67966 +                       start_offset = get_key_offset(znode_get_ld_key(node));
67967 +                       end_offset = get_key_offset(smallest_removed);
67968 +               }
67969 +
67970 +               assert("zam-1021", znode_is_connected(node));
67971 +               if (node->left) /* the left neighbor inherits the right delimiting key of the deleted node */
67972 +                       znode_set_rd_key(node->left, znode_get_rd_key(node));
67973 +
67974 +               *smallest_removed = *znode_get_ld_key(node);
67975 +
67976 +               write_unlock_dk(tree);
67977 +               read_unlock_tree(tree);
67978 +
67979 +               if (object) {
67980 +                       /* we used to perform actions which are to be performed on items on their removal from tree in
67981 +                          special item method - kill_hook. Here for optimization reasons we avoid reading node
67982 +                          containing item we remove and can not call item's kill hook. Instead we call function which
67983 +                          does exactly the same things as tail kill hook in assumption that node we avoid reading
67984 +                          contains only one item and that item is a tail one. */
67985 +                       fake_kill_hook_tail(object, start_offset, end_offset,
67986 +                                           truncate);
67987 +               }
67988 +       }
67989 +      failed: /* also reached on success, with ret == 0 */
67990 +       zrelse(parent_lock.node);
67991 +      failed_nozrelse:
67992 +       done_lh(&parent_lock);
67993 +
67994 +       return ret;
67995 +}
67996 +
67997 +static int can_delete(const reiser4_key *key, znode *node) /* result of keyle(@key, left delimiting key of @node) */
67998 +{
67999 +       int result;
68000 +
68001 +       read_lock_dk(current_tree); /* the delimiting key is read under the dk read lock */
68002 +       result = keyle(key, znode_get_ld_key(node));
68003 +       read_unlock_dk(current_tree);
68004 +       return result;
68005 +}
68006 +
68007 +/**
68008 + * Cut items in the key range [@from_key, @to_key], one node per iteration,
68009 + * moving from @tap to the left (not optimal, but simpler to implement).
68010 + *
68011 + * @tap: the point deletion process begins from,
68012 + * @from_key: the beginning of the deleted key range,
68013 + * @to_key: the end of the deleted key range,
68014 + * @smallest_removed: the smallest removed key,
68015 + * @object: inode owning the cut items, may be NULL,
68016 + * @truncate: true if called for file truncate.
68017 + * @progress: counts successful iterations; @smallest_removed is actual only if it is non-zero.
68018 + *
68019 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long
68020 + * reiser4_cut_tree operation was interrupted for allowing atom commit.
68021 + */
68022 +int
68023 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
68024 +                      const reiser4_key * to_key,
68025 +                      reiser4_key * smallest_removed, struct inode *object,
68026 +                      int truncate, int *progress)
68027 +{
68028 +       lock_handle next_node_lock;
68029 +       coord_t left_coord;
68030 +       int result;
68031 +
68032 +       assert("zam-931", tap->coord->node != NULL);
68033 +       assert("zam-932", znode_is_write_locked(tap->coord->node));
68034 +
68035 +       *progress = 0;
68036 +       init_lh(&next_node_lock);
68037 +
68038 +       while (1) {
68039 +               znode *node;    /* node from which items are cut */
68040 +               node_plugin *nplug;     /* node plugin for @node */
68041 +
68042 +               node = tap->coord->node;
68043 +
68044 +               /* Move next_node_lock to the next node on the left. */
68045 +               result =
68046 +                   reiser4_get_left_neighbor(&next_node_lock, node,
68047 +                                             ZNODE_WRITE_LOCK,
68048 +                                             GN_CAN_USE_UPPER_LEVELS);
68049 +               if (result != 0 && result != -E_NO_NEIGHBOR)
68050 +                       break;
68051 +               /* Check can we delete the node as a whole. */
68052 +               if (*progress && znode_get_level(node) == LEAF_LEVEL &&
68053 +                   can_delete(from_key, node)) {
68054 +                       result = reiser4_delete_node(node, smallest_removed,
68055 +                                                    object, truncate);
68056 +               } else {
68057 +                       result = reiser4_tap_load(tap);
68058 +                       if (result) /* leave through the loop exit so that next_node_lock is released by done_lh() */
68059 +                               break;
68060 +
68061 +                       /* Prepare the second (right) point for cut_node() */
68062 +                       if (*progress)
68063 +                               coord_init_last_unit(tap->coord, node);
68064 +
68065 +                       else if (item_plugin_by_coord(tap->coord)->b.lookup ==
68066 +                                NULL)
68067 +                               /* set rightmost unit for the items without lookup method */
68068 +                               tap->coord->unit_pos =
68069 +                                   coord_last_unit_pos(tap->coord);
68070 +
68071 +                       nplug = node->nplug;
68072 +
68073 +                       assert("vs-686", nplug);
68074 +                       assert("vs-687", nplug->lookup);
68075 +
68076 +                       /* left_coord is leftmost unit cut from @node */
68077 +                       result = nplug->lookup(node, from_key,
68078 +                                              FIND_MAX_NOT_MORE_THAN,
68079 +                                              &left_coord);
68080 +
68081 +                       if (IS_CBKERR(result))
68082 +                               break;
68083 +
68084 +                       /* adjust coordinates so that they are set to existing units */
68085 +                       if (coord_set_to_right(&left_coord)
68086 +                           || coord_set_to_left(tap->coord)) {
68087 +                               result = 0;
68088 +                               break;
68089 +                       }
68090 +
68091 +                       if (coord_compare(&left_coord, tap->coord) ==
68092 +                           COORD_CMP_ON_RIGHT) {
68093 +                               /* keys from @from_key to @to_key are not in the tree */
68094 +                               result = 0;
68095 +                               break;
68096 +                       }
68097 +
68098 +                       if (left_coord.item_pos != tap->coord->item_pos) {
68099 +                               /* do not allow to cut more than one item. It is added to solve problem of truncating
68100 +                                  partially converted files. If file is partially converted there may exist a twig node
68101 +                                  containing both internal item or items pointing to leaf nodes with formatting items
68102 +                                  and extent item. We do not want to kill internal items being at twig node here
68103 +                                  because cut_tree_worker assumes killing them from level level */
68104 +                               coord_dup(&left_coord, tap->coord);
68105 +                               assert("vs-1652",
68106 +                                      coord_is_existing_unit(&left_coord));
68107 +                               left_coord.unit_pos = 0;
68108 +                       }
68109 +
68110 +                       /* cut data from one node */
68111 +                       /* *smallest_removed = *reiser4_min_key(); */
68112 +                       result =
68113 +                           kill_node_content(&left_coord, tap->coord, from_key,
68114 +                                             to_key, smallest_removed,
68115 +                                             next_node_lock.node, object,
68116 +                                             truncate);
68117 +                       reiser4_tap_relse(tap);
68118 +               }
68119 +               if (result)
68120 +                       break;
68121 +
68122 +               ++(*progress);
68123 +
68124 +               /* Check whether all items with keys >= from_key were removed
68125 +                * from the tree. */
68126 +               if (keyle(smallest_removed, from_key))
68127 +                       /* result = 0; */
68128 +                       break;
68129 +
68130 +               if (next_node_lock.node == NULL) /* no left neighbor (-E_NO_NEIGHBOR above): nothing more to cut */
68131 +                       break;
68132 +
68133 +               result = reiser4_tap_move(tap, &next_node_lock);
68134 +               done_lh(&next_node_lock);
68135 +               if (result)
68136 +                       break;
68137 +
68138 +               /* Break long reiser4_cut_tree operation (deletion of a large
68139 +                  file) if atom requires commit. */
68140 +               if (*progress > CUT_TREE_MIN_ITERATIONS
68141 +                   && current_atom_should_commit()) {
68142 +                       result = -E_REPEAT;
68143 +                       break;
68144 +               }
68145 +       }
68146 +       done_lh(&next_node_lock); /* common exit: every "break" above ends up releasing the neighbor lock here */
68147 +       /* assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key())); */
68148 +       return result;
68149 +}
68150 +
68151 +/* There is a fundamental problem with optimizing deletes: VFS does it
68152 +   one file at a time.  Another problem is that if an item can be
68153 +   anything, then deleting items must be done one at a time.  It just
68154 +   seems clean to write this so that it takes a from key and a to key,
68155 +   and cuts everything between them, though.  */
68156 +
68157 +/* use this function with care if deleting more than what is part of a single file. */
68158 +/* do not use this when cutting a single item, it is suboptimal for that */
68159 +
68160 +/* You are encouraged to write plugin specific versions of this.  It
68161 +   cannot be optimal for all plugins because it works item at a time,
68162 +   and some plugins could sometimes work node at a time. Regular files
68163 +   however are not optimizable to work node at a time because of
68164 +   extents needing to free the blocks they point to.
68165 +
68166 +   Optimizations compared to v3 code:
68167 +
68168 +   It does not balance (that task is left to memory pressure code).
68169 +
68170 +   Nodes are deleted only if empty.
68171 +
68172 +   Uses extents.
68173 +
68174 +   Performs read-ahead of formatted nodes whose contents are part of
68175 +   the deletion.
68176 +*/
68177 +
68178 +/**
68179 + * Delete everything from the reiser4 tree between two keys: @from_key and
68180 + * @to_key.
68181 + * @tree: the tree (only checked by an assertion in this function),
68182 + * @from_key: the beginning of the deleted key range,
68183 + * @to_key: the end of the deleted key range,
68184 + * @smallest_removed_p: where the smallest removed key is stored, may be NULL,
68185 + * @object: owner of cutting items.
68186 + * @truncate: true if called for file truncate.
68187 + * @progress: return true if a progress in file items deletions was made,
68188 + *            the smallest removed key value is actual in that case.
68189 + *
68190 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
68191 + * operation was interrupted for allowing atom commit.
68192 + */
68193 +
68194 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
68195 +                           const reiser4_key * to_key,
68196 +                           reiser4_key * smallest_removed_p,
68197 +                           struct inode *object, int truncate, int *progress)
68198 +{
68199 +       lock_handle lock;
68200 +       int result;
68201 +       tap_t tap;
68202 +       coord_t right_coord;
68203 +       reiser4_key smallest_removed;
68204 +       int (*cut_tree_worker) (tap_t *, const reiser4_key *,
68205 +                               const reiser4_key *, reiser4_key *,
68206 +                               struct inode *, int, int *);
68207 +       STORE_COUNTERS;
68208 +
68209 +       assert("umka-329", tree != NULL);
68210 +       assert("umka-330", from_key != NULL);
68211 +       assert("umka-331", to_key != NULL);
68212 +       assert("zam-936", keyle(from_key, to_key));
68213 +
68214 +       if (smallest_removed_p == NULL) /* caller is not interested: use local storage */
68215 +               smallest_removed_p = &smallest_removed;
68216 +
68217 +       init_lh(&lock);
68218 +
68219 +       do { /* executed once; "break" is used to skip the rest of the body */
68220 +               /* Find rightmost item to cut away from the tree. */
68221 +               result = reiser4_object_lookup(object, to_key, &right_coord,
68222 +                                              &lock, ZNODE_WRITE_LOCK,
68223 +                                              FIND_MAX_NOT_MORE_THAN,
68224 +                                              TWIG_LEVEL, LEAF_LEVEL,
68225 +                                              CBK_UNIQUE, NULL /*ra_info */);
68226 +               if (result != CBK_COORD_FOUND) /* in this case the worker is not called and *@progress is not written */
68227 +                       break;
68228 +               if (object == NULL
68229 +                   || inode_file_plugin(object)->cut_tree_worker == NULL)
68230 +                       cut_tree_worker = cut_tree_worker_common; /* default worker */
68231 +               else
68232 +                       cut_tree_worker =
68233 +                           inode_file_plugin(object)->cut_tree_worker;
68234 +               reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
68235 +               result =
68236 +                   cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
68237 +                                   object, truncate, progress);
68238 +               reiser4_tap_done(&tap);
68239 +
68240 +               reiser4_preempt_point();
68241 +
68242 +       } while (0);
68243 +
68244 +       done_lh(&lock);
68245 +
68246 +       if (result) {
68247 +               switch (result) {
68248 +               case -E_NO_NEIGHBOR:
68249 +                       result = 0; /* not an error for the caller */
68250 +                       break;
68251 +               case -E_DEADLOCK:
68252 +                       result = -E_REPEAT; /* fall through */
68253 +               case -E_REPEAT:
68254 +               case -ENOMEM:
68255 +               case -ENOENT:
68256 +                       break; /* returned without a warning */
68257 +               default:
68258 +                       warning("nikita-2861", "failure: %i", result);
68259 +               }
68260 +       }
68261 +
68262 +       CHECK_COUNTERS;
68263 +       return result;
68264 +}
68265 +
68266 +/* Repeat reiser4_cut_tree_object() for as long as it returns -E_REPEAT.
68267 + * Unlike cut_file_items, it does not end current transaction if -E_REPEAT
68268 + * is returned by reiser4_cut_tree_object(). Returns its final result. */
68269 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68270 +                    const reiser4_key * to, struct inode *inode, int truncate)
68271 +{
68272 +       int result;
68273 +       int progress; /* required by the callee; its value is not used here */
68274 +
68275 +       do {
68276 +               result = reiser4_cut_tree_object(tree, from, to, NULL,
68277 +                                                inode, truncate, &progress);
68278 +       } while (result == -E_REPEAT);
68279 +
68280 +       return result;
68281 +}
68282 +
68283 +/* Finish reiser4 tree initialization: fill in @tree and get the uber znode. Returns 0 or an error code. */
68284 +int reiser4_init_tree(reiser4_tree * tree      /* pointer to structure being
68285 +                                        * initialized */ ,
68286 +             const reiser4_block_nr * root_block       /* address of a root block
68287 +                                                        * on a disk */ ,
68288 +             tree_level height /* height of a tree */ ,
68289 +             node_plugin * nplug /* default node plugin */ )
68290 +{
68291 +       int result;
68292 +
68293 +       assert("nikita-306", tree != NULL);
68294 +       assert("nikita-307", root_block != NULL);
68295 +       assert("nikita-308", height > 0);
68296 +       assert("nikita-309", nplug != NULL);
68297 +       assert("zam-587", tree->super != NULL);
68298 +
68299 +       tree->root_block = *root_block;
68300 +       tree->height = height;
68301 +       tree->estimate_one_insert = calc_estimate_one_insert(height); /* cached value derived from @height */
68302 +       tree->nplug = nplug;
68303 +
68304 +       tree->znode_epoch = 1ull;
68305 +
68306 +       cbk_cache_init(&tree->cbk_cache);
68307 +
68308 +       result = znodes_tree_init(tree);
68309 +       if (result == 0)
68310 +               result = jnodes_tree_init(tree);
68311 +       if (result == 0) {
68312 +               tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68313 +                                 reiser4_ctx_gfp_mask_get());
68314 +               if (IS_ERR(tree->uber)) {
68315 +                       result = PTR_ERR(tree->uber);
68316 +                       tree->uber = NULL; /* so that reiser4_done_tree() does not zput() an error pointer */
68317 +               }
68318 +       }
68319 +       return result;
68320 +}
68321 +
68322 +/* Release resources associated with @tree. A NULL @tree is accepted and ignored. */
68323 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68324 +{
68325 +       if (tree == NULL)
68326 +               return;
68327 +
68328 +       if (tree->uber != NULL) {
68329 +               zput(tree->uber); /* drop the reference taken by zget() in reiser4_init_tree() */
68330 +               tree->uber = NULL;
68331 +       }
68332 +       znodes_tree_done(tree);
68333 +       jnodes_tree_done(tree);
68334 +       cbk_cache_done(&tree->cbk_cache);
68335 +}
68336 +
68337 +/* Make Linus happy.
68338 +   Local variables:
68339 +   c-indentation-style: "K&R"
68340 +   mode-name: "LC"
68341 +   c-basic-offset: 8
68342 +   tab-width: 8
68343 +   fill-column: 120
68344 +   scroll-step: 1
68345 +   End:
68346 +*/
68347 diff -urN linux-2.6.35.orig/fs/reiser4/tree.h linux-2.6.35/fs/reiser4/tree.h
68348 --- linux-2.6.35.orig/fs/reiser4/tree.h 1970-01-01 01:00:00.000000000 +0100
68349 +++ linux-2.6.35/fs/reiser4/tree.h      2010-08-04 15:44:57.000000000 +0200
68350 @@ -0,0 +1,577 @@
68351 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68352 + * reiser4/README */
68353 +
68354 +/* Tree operations. See fs/reiser4/tree.c for comments */
68355 +
68356 +#if !defined( __REISER4_TREE_H__ )
68357 +#define __REISER4_TREE_H__
68358 +
68359 +#include "forward.h"
68360 +#include "debug.h"
68361 +#include "dformat.h"
68362 +#include "plugin/node/node.h"
68363 +#include "plugin/plugin.h"
68364 +#include "znode.h"
68365 +#include "tap.h"
68366 +
68367 +#include <linux/types.h>       /* for __u??  */
68368 +#include <linux/fs.h>          /* for struct super_block  */
68369 +#include <linux/spinlock.h>
68370 +#include <linux/sched.h>       /* for struct task_struct */
68371 +
68372 +/* fictive block number never actually used */
68373 +extern const reiser4_block_nr UBER_TREE_ADDR;
68374 +
68375 +/* &cbk_cache_slot - entry in a coord cache.
68376 +
68377 +   This is an entry in the coord_by_key (cbk) cache, which is represented
68378 +   by &cbk_cache.
68379 +
68380 +*/
68381 +typedef struct cbk_cache_slot {
68382 +       /* cached node */
68383 +       znode *node;
68384 +       /* linkage to the next cbk cache slot in LRU order (list headed by cbk_cache.lru) */
68385 +       struct list_head lru;
68386 +} cbk_cache_slot;
68387 +
68388 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
68389 +
68390 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
68391 +   successful lookups (we don't cache negative results as dentry cache
68392 +   does). Cache consists of a relatively small number of entries kept in LRU
68393 +   order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
68394 +   which we can obtain the range of keys that is covered by this znode. Before
68395 +   embarking into real tree traversal we scan cbk_cache slot by slot and for
68396 +   each slot check whether key we are looking for is between minimal and
68397 +   maximal keys for node pointed to by this slot. If no match is found, real
68398 +   tree traversal is performed and if result is successful, appropriate entry
68399 +   is inserted into cache, possibly pulling least recently used entry out of
68400 +   it.
68401 +
68402 +   Tree spin lock is used to protect coord cache. If contention for this
68403 +   lock proves to be too high, finer-grained locking can be added.
68404 +
68405 +   Invariants involving parts of this data-type:
68406 +
68407 +      [cbk-cache-invariant]
68408 +*/
68409 +typedef struct cbk_cache {
68410 +       /* serializator. NOTE(review): the comment above names the tree spin lock - confirm which lock is used */
68411 +       rwlock_t guard;
68412 +       int nr_slots; /* presumably the number of entries in @slot - confirm */
68413 +       /* head of LRU list of cache slots */
68414 +       struct list_head lru;
68415 +       /* actual array of slots */
68416 +       cbk_cache_slot *slot;
68417 +} cbk_cache;
68418 +
68419 +/* level_lookup_result - possible outcome of looking up key at some level.
68420 +   This is used by coord_by_key when traversing tree downward. */
68421 +typedef enum {
68422 +       /* continue to the next level */
68423 +       LOOKUP_CONT,
68424 +       /* done. Either required item was found, or we can prove it
68425 +          doesn't exist, or some error occurred. */
68426 +       LOOKUP_DONE,
68427 +       /* restart traversal from the root. Infamous "repetition". */
68428 +       LOOKUP_REST
68429 +} level_lookup_result;
68430 +
68431 +/* This is a representation of the internal reiser4 tree where all file-system
68432 +   data and meta-data are stored. This structure is passed to all tree
68433 +   manipulation functions. It's different from the super block because:
68434 +   we don't want to limit ourselves to a strictly one-to-one mapping
68435 +   between super blocks and trees, and because they are logically
68436 +   different: there are things in a super block that have no relation to
68437 +   the tree (bitmaps, journalling area, mount options, etc.) and there
68438 +   are things in a tree that bear no relation to the super block, like
68439 +   the tree of znodes.
68440 +
68441 +   At this time, there is only one tree
68442 +   per filesystem, and this struct is part of the super block.  We only
68443 +   call the super block the super block for historical reasons (most
68444 +   other filesystems call the per filesystem metadata the super block).
68445 +*/
68446 +
68447 +struct reiser4_tree {
68448 +       /* block_nr == 0 is fake znode. Write lock it, while changing
68449 +          tree height. */
68450 +       /* disk address of root node of a tree */
68451 +       reiser4_block_nr root_block;
68452 +
68453 +       /* level of the root node. If this is 1, tree consists of root
68454 +          node only */
68455 +       tree_level height;
68456 +
68457 +       /*
68458 +        * this is cached here to avoid calling plugins through function
68459 +        * dereference all the time.
68460 +        */
68461 +       __u64 estimate_one_insert;
68462 +
68463 +       /* cache of recent tree lookup results */
68464 +       cbk_cache cbk_cache;
68465 +
68466 +       /* hash table to look up znodes by block number. */
68467 +       z_hash_table zhash_table;
68468 +       z_hash_table zfake_table;
68469 +       /* hash table to look up jnodes by inode and offset. */
68470 +       j_hash_table jhash_table;
68471 +
68472 +       /* lock protecting:
68473 +          - parent pointers,
68474 +          - sibling pointers,
68475 +          - znode hash table
68476 +          - coord cache
68477 +        */
68478 +       /* NOTE: The "giant" tree lock can be replaced by more spin locks,
68479 +          hoping they will be less contended. We can use one spin lock per
68480 +          znode hash bucket.  At the cost of some code complexity, sibling
68481 +          pointers can be protected by both znode spin locks.  Although this
68482 +          looks more SMP scalable, we should test this locking change on n-way
68483 +          (n > 4) SMP machines.  Current 4-way machine tests do not show that
68484 +          the tree lock is contended or that it is a bottleneck (2003.07.25). */
68485 +
68486 +       rwlock_t tree_lock;
68487 +
68488 +       /* lock protecting delimiting keys */
68489 +       rwlock_t dk_lock;
68490 +
68491 +       /* spin lock protecting znode_epoch */
68492 +       spinlock_t epoch_lock;
68493 +       /* version stamp used to mark znode updates. See seal.[ch] for more
68494 +        * information. */
68495 +       __u64 znode_epoch;
68496 +
68497 +       znode *uber;
68498 +       node_plugin *nplug;
68499 +       struct super_block *super;
68500 +       struct {
68501 +               /* carry flags used for insertion of new nodes */
68502 +               __u32 new_node_flags;
68503 +               /* carry flags used for insertion of new extents */
68504 +               __u32 new_extent_flags;
68505 +               /* carry flags used for paste operations */
68506 +               __u32 paste_flags;
68507 +               /* carry flags used for insert operations */
68508 +               __u32 insert_flags;
68509 +       } carry;
68510 +};
68511 +
68512 +extern int reiser4_init_tree(reiser4_tree * tree,
68513 +                            const reiser4_block_nr * root_block,
68514 +                            tree_level height, node_plugin * default_plugin);
68515 +extern void reiser4_done_tree(reiser4_tree * tree);
68516 +
68517 +/* cbk flags: options for coord_by_key() */
68518 +typedef enum {
68519 +       /* coord_by_key() is called for insertion. This is necessary because
68520 +          of extents being located at the twig level. For explanation, see
68521 +          comment just above is_next_item_internal().
68522 +        */
68523 +       CBK_FOR_INSERT = (1 << 0),
68524 +       /* coord_by_key() is called with a key that is known to be unique */
68525 +       CBK_UNIQUE = (1 << 1),
68526 +       /* coord_by_key() can trust delimiting keys. This option is not user
68527 +          accessible. coord_by_key() will set it automatically. It will
68528 +          only be cleared by a special case in extents-on-the-twig-level
68529 +          handling, where it is necessary to insert an item with a key smaller
68530 +          than the leftmost key in a node. This is necessary because of extents
68531 +          being located at the twig level. For explanation, see comment just
68532 +          above is_next_item_internal().
68533 +        */
68534 +       CBK_TRUST_DK = (1 << 2),
68535 +       CBK_READA = (1 << 3),   /* original: readahead leaves which contain items of a certain file */
68536 +       CBK_READDIR_RA = (1 << 4),      /* readdir: readahead whole directory and all its stat datas */
68537 +       CBK_DKSET = (1 << 5),
68538 +       CBK_EXTENDED_COORD = (1 << 6),  /* "coord_t is actually" (sic; NOTE(review): truncated) */
68539 +       CBK_IN_CACHE = (1 << 7),        /* node is already in cache */
68540 +       CBK_USE_CRABLOCK = (1 << 8)     /* use crab_lock instead of long term
68541 +                                        * lock */
68542 +} cbk_flags;
68543 +
68544 +/* insertion outcome. IBK = insert by key */
68545 +typedef enum {
68546 +       IBK_INSERT_OK = 0,
68547 +       IBK_ALREADY_EXISTS = -EEXIST,
68548 +       IBK_IO_ERROR = -EIO,
68549 +       IBK_NO_SPACE = -E_NODE_FULL,
68550 +       IBK_OOM = -ENOMEM
68551 +} insert_result;
68552 +
68553 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
68554 +
68555 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68556 +                                    lock_handle * lh, void *arg);
68557 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68558 +                               lock_handle * lh,
68559 +                               tree_iterate_actor_t actor, void *arg,
68560 +                               znode_lock_mode mode, int through_units_p);
68561 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68562 +                         znode_lock_request pri, lock_handle * lh);
68563 +
68564 +/* return node plugin of @node */
68565 +static inline node_plugin *node_plugin_by_node(const znode *
68566 +                                              node /* node to query */ )
68567 +{
68568 +       assert("vs-213", node != NULL);
68569 +       assert("vs-214", znode_is_loaded(node));
68570 +
68571 +       return node->nplug;
68572 +}
68573 +
68574 +/* number of items in @node */
68575 +static inline pos_in_node_t node_num_items(const znode * node)
68576 +{
68577 +       assert("nikita-2754", znode_is_loaded(node));
68578 +       assert("nikita-2468",
68579 +              node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68580 +
68581 +       return node->nr_items;
68582 +}
68583 +
68584 +/* Return the number of items at the present node.  Asserts coord->node !=
68585 +   NULL. */
68586 +static inline unsigned coord_num_items(const coord_t * coord)
68587 +{
68588 +       assert("jmacd-9805", coord->node != NULL);
68589 +
68590 +       return node_num_items(coord->node);
68591 +}
68592 +
68593 +/* true if @node is empty */
68594 +static inline int node_is_empty(const znode * node)
68595 +{
68596 +       return node_num_items(node) == 0;
68597 +}
68598 +
68599 +typedef enum {
68600 +       SHIFTED_SOMETHING = 0,
68601 +       SHIFT_NO_SPACE = -E_NODE_FULL,
68602 +       SHIFT_IO_ERROR = -EIO,
68603 +       SHIFT_OOM = -ENOMEM,
68604 +} shift_result;
68605 +
68606 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68607 +extern int is_coord_in_node(const coord_t * coord);
68608 +extern int key_in_node(const reiser4_key *, const coord_t *);
68609 +extern void coord_item_move_to(coord_t * coord, int items);
68610 +extern void coord_unit_move_to(coord_t * coord, int units);
68611 +
68612 +/* there are two types of repetitive accesses (ra): intra-syscall
68613 +   (local) and inter-syscall (global). Local ra is used when
68614 +   during single syscall we add/delete several items and units in the
68615 +   same place in a tree. Note that plan-A fragments local ra by
68616 +   separating stat-data and file body in key-space. Global ra is
68617 +   used when user does repetitive modifications in the same place in a
68618 +   tree.
68619 +
68620 +   Our ra implementation serves following purposes:
68621 +    1 it affects balancing decisions so that next operation in a row
68622 +      can be performed faster;
68623 +    2 it affects lower-level read-ahead in page-cache;
68624 +    3 it allows us to avoid unnecessary lookups by maintaining some state
68625 +      across several operations (this is only for local ra);
68626 +    4 it leaves room for lazy-micro-balancing: when we start a sequence of
68627 +      operations they are performed without actually doing any intra-node
68628 +      shifts, until we finish sequence or scope of sequence leaves
68629 +      current node, only then we really pack node (local ra only).
68630 +*/
68631 +
68632 +/* another thing that can be useful is to keep per-tree and/or
68633 +   per-process cache of recent lookups. This cache can be organised as a
68634 +   list of block numbers of formatted nodes sorted by starting key in
68635 +   this node. Balancings should invalidate appropriate parts of this
68636 +   cache.
68637 +*/
68638 +
68639 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68640 +                          coord_t * coord, lock_handle * handle,
68641 +                          znode_lock_mode lock, lookup_bias bias,
68642 +                          tree_level lock_level, tree_level stop_level,
68643 +                          __u32 flags, ra_info_t *);
68644 +
68645 +lookup_result reiser4_object_lookup(struct inode *object,
68646 +                                   const reiser4_key * key,
68647 +                                   coord_t * coord,
68648 +                                   lock_handle * lh,
68649 +                                   znode_lock_mode lock_mode,
68650 +                                   lookup_bias bias,
68651 +                                   tree_level lock_level,
68652 +                                   tree_level stop_level,
68653 +                                   __u32 flags, ra_info_t * info);
68654 +
68655 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68656 +                           reiser4_item_data * data, coord_t * coord,
68657 +                           lock_handle * lh,
68658 +                           tree_level stop_level, __u32 flags);
68659 +insert_result insert_by_coord(coord_t * coord,
68660 +                             reiser4_item_data * data, const reiser4_key * key,
68661 +                             lock_handle * lh, __u32);
68662 +insert_result insert_extent_by_coord(coord_t * coord,
68663 +                                    reiser4_item_data * data,
68664 +                                    const reiser4_key * key, lock_handle * lh);
68665 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68666 +                    const reiser4_key * to_key,
68667 +                    reiser4_key * smallest_removed);
68668 +int kill_node_content(coord_t * from, coord_t * to,
68669 +                     const reiser4_key * from_key, const reiser4_key * to_key,
68670 +                     reiser4_key * smallest_removed,
68671 +                     znode * locked_left_neighbor, struct inode *inode,
68672 +                     int truncate);
68673 +
68674 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68675 +                       reiser4_key * key, lock_handle * lh, cop_insert_flag);
68676 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68677 +                    reiser4_item_data * data, unsigned);
68678 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68679 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
68680 +                      coord_t * result);
68681 +
68682 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68683 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68684 +
68685 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68686 +
68687 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68688 +                                 const reiser4_key *, reiser4_key *,
68689 +                                 struct inode *, int, int *);
68690 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68691 +                                  const reiser4_key *, reiser4_key *,
68692 +                                  struct inode *, int, int *);
68693 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68694 +                           const reiser4_key * to, struct inode *, int);
68695 +
68696 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68697 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68698 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68699 +                             znode * left, coord_t * result);
68700 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68701 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68702 +                                    znode * child);
68703 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
68704 +                         int incore_p, int setup_dkeys_p);
68705 +
68706 +extern int cbk_cache_init(cbk_cache * cache);
68707 +extern void cbk_cache_done(cbk_cache * cache);
68708 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68709 +
68710 +extern char *sprint_address(const reiser4_block_nr * block);
68711 +
68712 +#if REISER4_DEBUG
68713 +extern void print_coord_content(const char *prefix, coord_t * p);
68714 +extern void reiser4_print_address(const char *prefix,
68715 +                       const reiser4_block_nr * block);
68716 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68717 +                          __u32 flags);
68718 +extern void check_dkeys(znode *node);
68719 +#else
68720 +#define print_coord_content(p, c) noop
68721 +#define reiser4_print_address(p, b) noop
68722 +#endif
68723 +
68724 +extern void forget_znode(lock_handle * handle);
68725 +extern int deallocate_znode(znode * node);
68726 +
68727 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68728 +
68729 +/* struct used internally to pack all numerous arguments of tree lookup.
68730 +    Used to avoid passing a lot of arguments to helper functions. */
68731 +typedef struct cbk_handle {
68732 +       /* tree we are in */
68733 +       reiser4_tree *tree;
68734 +       /* key we are going after */
68735 +       const reiser4_key *key;
68736 +       /* coord we will store result in */
68737 +       coord_t *coord;
68738 +       /* type of lock to take on target node */
68739 +       znode_lock_mode lock_mode;
68740 +       /* lookup bias. See comments at the declaration of lookup_bias */
68741 +       lookup_bias bias;
68742 +       /* lock level: level starting from which tree traversal starts taking
68743 +        * write locks. */
68744 +       tree_level lock_level;
68745 +       /* level where search will stop. Either item will be found between
68746 +          lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68747 +          returned.
68748 +        */
68749 +       tree_level stop_level;
68750 +       /* level we are currently at */
68751 +       tree_level level;
68752 +       /* block number of @active node. Tree traversal operates on two
68753 +          nodes: active and parent.  */
68754 +       reiser4_block_nr block;
68755 +       /* put here error message to be printed by caller */
68756 +       const char *error;
68757 +       /* result passed back to caller */
68758 +       lookup_result result;
68759 +       /* lock handles for active and parent */
68760 +       lock_handle *parent_lh;
68761 +       lock_handle *active_lh;
68762 +       reiser4_key ld_key;
68763 +       reiser4_key rd_key;
68764 +       /* flags, passed to the cbk routine. Bits of this bitmask are defined
68765 +          in tree.h:cbk_flags enum. */
68766 +       __u32 flags;
68767 +       ra_info_t *ra_info;
68768 +       struct inode *object;
68769 +} cbk_handle;
68770 +
68771 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68772 +
68773 +/* eottl.c */
68774 +extern int handle_eottl(cbk_handle *h, int *outcome);
68775 +
68776 +int lookup_multikey(cbk_handle * handle, int nr_keys);
68777 +int lookup_couple(reiser4_tree * tree,
68778 +                 const reiser4_key * key1, const reiser4_key * key2,
68779 +                 coord_t * coord1, coord_t * coord2,
68780 +                 lock_handle * lh1, lock_handle * lh2,
68781 +                 znode_lock_mode lock_mode, lookup_bias bias,
68782 +                 tree_level lock_level, tree_level stop_level, __u32 flags,
68783 +                 int *result1, int *result2);
68784 +
68785 +static inline void read_lock_tree(reiser4_tree *tree)
68786 +{
68787 +       /* check that tree is not locked */
68788 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68789 +                   LOCK_CNT_NIL(read_locked_tree) &&
68790 +                   LOCK_CNT_NIL(write_locked_tree)));
68791 +       /* check that spinlocks of lower priorities are not held */
68792 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68793 +                   LOCK_CNT_NIL(rw_locked_dk) &&
68794 +                   LOCK_CNT_NIL(spin_locked_stack)));
68795 +
68796 +       read_lock(&(tree->tree_lock));
68797 +
68798 +       LOCK_CNT_INC(read_locked_tree);
68799 +       LOCK_CNT_INC(rw_locked_tree);
68800 +       LOCK_CNT_INC(spin_locked);
68801 +}
68802 +
68803 +static inline void read_unlock_tree(reiser4_tree *tree)
68804 +{
68805 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68806 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68807 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68808 +
68809 +       LOCK_CNT_DEC(read_locked_tree);
68810 +       LOCK_CNT_DEC(rw_locked_tree);
68811 +       LOCK_CNT_DEC(spin_locked);
68812 +
68813 +       read_unlock(&(tree->tree_lock));
68814 +}
68815 +
68816 +static inline void write_lock_tree(reiser4_tree *tree)
68817 +{
68818 +       /* check that tree is not locked */
68819 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68820 +                   LOCK_CNT_NIL(read_locked_tree) &&
68821 +                   LOCK_CNT_NIL(write_locked_tree)));
68822 +       /* check that spinlocks of lower priorities are not held */
68823 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68824 +                   LOCK_CNT_NIL(rw_locked_dk) &&
68825 +                   LOCK_CNT_NIL(spin_locked_stack)));
68826 +
68827 +       write_lock(&(tree->tree_lock));
68828 +
68829 +       LOCK_CNT_INC(write_locked_tree);
68830 +       LOCK_CNT_INC(rw_locked_tree);
68831 +       LOCK_CNT_INC(spin_locked);
68832 +}
68833 +
68834 +static inline void write_unlock_tree(reiser4_tree *tree)
68835 +{
68836 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68837 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68838 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68839 +
68840 +       LOCK_CNT_DEC(write_locked_tree);
68841 +       LOCK_CNT_DEC(rw_locked_tree);
68842 +       LOCK_CNT_DEC(spin_locked);
68843 +
68844 +       write_unlock(&(tree->tree_lock));
68845 +}
68846 +
68847 +static inline void read_lock_dk(reiser4_tree *tree)
68848 +{
68849 +       /* check that dk is not locked */
68850 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68851 +                   LOCK_CNT_NIL(read_locked_dk) &&
68852 +                   LOCK_CNT_NIL(write_locked_dk)));
68853 +       /* check that spinlocks of lower priorities are not held */
68854 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
68855 +
68856 +       read_lock(&((tree)->dk_lock));
68857 +
68858 +       LOCK_CNT_INC(read_locked_dk);
68859 +       LOCK_CNT_INC(rw_locked_dk);
68860 +       LOCK_CNT_INC(spin_locked);
68861 +}
68862 +
68863 +static inline void read_unlock_dk(reiser4_tree *tree)
68864 +{
68865 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
68866 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68867 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68868 +
68869 +       LOCK_CNT_DEC(read_locked_dk);
68870 +       LOCK_CNT_DEC(rw_locked_dk);
68871 +       LOCK_CNT_DEC(spin_locked);
68872 +
68873 +       read_unlock(&(tree->dk_lock));
68874 +}
68875 +
68876 +static inline void write_lock_dk(reiser4_tree *tree)
68877 +{
68878 +       /* check that dk is not locked */
68879 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68880 +                   LOCK_CNT_NIL(read_locked_dk) &&
68881 +                   LOCK_CNT_NIL(write_locked_dk)));
68882 +       /* check that spinlocks of lower priorities are not held */
68883 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
68884 +
68885 +       write_lock(&((tree)->dk_lock));
68886 +
68887 +       LOCK_CNT_INC(write_locked_dk);
68888 +       LOCK_CNT_INC(rw_locked_dk);
68889 +       LOCK_CNT_INC(spin_locked);
68890 +}
68891 +
68892 +static inline void write_unlock_dk(reiser4_tree *tree)
68893 +{
68894 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
68895 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68896 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68897 +
68898 +       LOCK_CNT_DEC(write_locked_dk);
68899 +       LOCK_CNT_DEC(rw_locked_dk);
68900 +       LOCK_CNT_DEC(spin_locked);
68901 +
68902 +       write_unlock(&(tree->dk_lock));
68903 +}
68904 +
68905 +/* estimate api. Implementation is in estimate.c */
68906 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
68907 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
68908 +reiser4_block_nr estimate_insert_flow(tree_level);
68909 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
68910 +reiser4_block_nr calc_estimate_one_insert(tree_level);
68911 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
68912 +reiser4_block_nr estimate_insert_cluster(struct inode *);
68913 +reiser4_block_nr estimate_update_cluster(struct inode *);
68914 +
68915 +/* __REISER4_TREE_H__ */
68916 +#endif
68917 +
68918 +/* Make Linus happy.
68919 +   Local variables:
68920 +   c-indentation-style: "K&R"
68921 +   mode-name: "LC"
68922 +   c-basic-offset: 8
68923 +   tab-width: 8
68924 +   fill-column: 120
68925 +   scroll-step: 1
68926 +   End:
68927 +*/
68928 diff -urN linux-2.6.35.orig/fs/reiser4/tree_mod.c linux-2.6.35/fs/reiser4/tree_mod.c
68929 --- linux-2.6.35.orig/fs/reiser4/tree_mod.c     1970-01-01 01:00:00.000000000 +0100
68930 +++ linux-2.6.35/fs/reiser4/tree_mod.c  2010-08-04 15:44:57.000000000 +0200
68931 @@ -0,0 +1,386 @@
68932 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68933 + * reiser4/README */
68934 +
68935 +/*
68936 + * Functions to add/delete new nodes to/from the tree.
68937 + *
68938 + * Functions from this file are used by carry (see carry*) to handle:
68939 + *
68940 + *     . insertion of new formatted node into tree
68941 + *
68942 + *     . addition of new tree root, increasing tree height
68943 + *
68944 + *     . removing tree root, decreasing tree height
68945 + *
68946 + */
68947 +
68948 +#include "forward.h"
68949 +#include "debug.h"
68950 +#include "dformat.h"
68951 +#include "key.h"
68952 +#include "coord.h"
68953 +#include "plugin/plugin.h"
68954 +#include "jnode.h"
68955 +#include "znode.h"
68956 +#include "tree_mod.h"
68957 +#include "block_alloc.h"
68958 +#include "tree_walk.h"
68959 +#include "tree.h"
68960 +#include "super.h"
68961 +
68962 +#include <linux/err.h>
68963 +
68964 +static int add_child_ptr(znode * parent, znode * child);
68965 +/* warning only issued if error is not -E_REPEAT */
68966 +#define ewarning( error, ... )                 \
68967 +       if( ( error ) != -E_REPEAT )            \
68968 +               warning( __VA_ARGS__ )
68969 +
68970 +/* allocate new node on @level, right of @brother; returns znode or ERR_PTR() */
68971 +znode * reiser4_new_node(znode * brother /* existing left neighbor
68972 +                                         *  of new node */,
68973 +                        tree_level level /* tree level at which new node is to
68974 +                                          * be allocated */)
68975 +{
68976 +       znode *result;
68977 +       int retcode;
68978 +       reiser4_block_nr blocknr;
68979 +
68980 +       assert("nikita-930", brother != NULL);
68981 +       assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
68982 +
68983 +       retcode = assign_fake_blocknr_formatted(&blocknr);
68984 +       if (retcode == 0) {
68985 +               result =
68986 +                   zget(znode_get_tree(brother), &blocknr, NULL, level,
68987 +                        reiser4_ctx_gfp_mask_get());
68988 +               if (IS_ERR(result)) {
68989 +                       ewarning(PTR_ERR(result), "nikita-929",
68990 +                                "Cannot allocate znode for carry: %li",
68991 +                                PTR_ERR(result));
68992 +                       return result;
68993 +               }
68994 +               /* cheap test, can be executed even when debugging is off */
68995 +               if (!znode_just_created(result)) {
68996 +                       warning("nikita-2213",
68997 +                               "Allocated already existing block: %llu",
68998 +                               (unsigned long long)blocknr);
68999 +                       zput(result);
69000 +                       return ERR_PTR(RETERR(-EIO));
69001 +               }
69002 +
69003 +               assert("nikita-931", result != NULL);
69004 +               result->nplug = znode_get_tree(brother)->nplug;
69005 +               assert("nikita-933", result->nplug != NULL);
69006 +
69007 +               retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
69008 +               if (retcode == 0) {
69009 +                       ZF_SET(result, JNODE_CREATED);
69010 +                       zrelse(result);
69011 +               } else {
69012 +                       zput(result);
69013 +                       result = ERR_PTR(retcode);
69014 +               }
69015 +       } else {
69016 +               /* failure to allocate new node during balancing.
69017 +                  This should never happen. Ever. Returning -E_REPEAT
69018 +                  is not a viable solution, because "out of disk space"
69019 +                  is not a transient error that will go away by itself.
69020 +                */
69021 +               ewarning(retcode, "nikita-928",
69022 +                        "Cannot allocate block for carry: %i", retcode);
69023 +               result = ERR_PTR(retcode);
69024 +       }
69025 +       assert("nikita-1071", result != NULL);
69026 +       return result;
69027 +}
69028 +
69029 +/* allocate new root and add it to the tree. Returns new root or ERR_PTR().
69030 +
69031 +   This helper function is called by add_new_root().
69032 +   NOTE(review): when zload() or locking fails, @new_root is replaced by
69033 +   ERR_PTR() with no zput() visible here -- confirm no reference leaks. */
69034 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
69035 +                    znode * fake /* "fake" znode */ )
69036 +{
69037 +       reiser4_tree *tree = znode_get_tree(old_root);
69038 +       znode *new_root = NULL; /* to shut gcc up */
69039 +       int result;
69040 +
69041 +       assert("nikita-1069", old_root != NULL);
69042 +       assert("umka-262", fake != NULL);
69043 +       assert("umka-263", tree != NULL);
69044 +
69045 +       /* "fake" znode---one always hanging just above current root. This
69046 +          node is locked when new root is created or existing root is
69047 +          deleted. Downward tree traversal takes lock on it before taking
69048 +          lock on a root node. This avoids race conditions with root
69049 +          manipulations.
69050 +
69051 +        */
69052 +       assert("nikita-1348", znode_above_root(fake));
69053 +       assert("nikita-1211", znode_is_root(old_root));
69054 +
69055 +       result = 0;
69056 +       if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
69057 +               warning("nikita-1344", "Tree is too tall: %i", tree->height);
69058 +               /* ext2 returns -ENOSPC when it runs out of free inodes with a
69059 +                  following comment (fs/ext2/ialloc.c:441): Is it really
69060 +                  ENOSPC?
69061 +
69062 +                  -EXFULL? -EINVAL?
69063 +                */
69064 +               result = RETERR(-ENOSPC);
69065 +       } else {
69066 +               /* Allocate block for new root. It's not that
69067 +                  important where it will be allocated, as root is
69068 +                  almost always in memory. Moreover, allocate on
69069 +                  flush can be going here.
69070 +                */
69071 +               assert("nikita-1448", znode_is_root(old_root));
69072 +               new_root = reiser4_new_node(fake, tree->height + 1);
69073 +               if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
69074 +                       lock_handle rlh;
69075 +
69076 +                       init_lh(&rlh);
69077 +                       result =
69078 +                           longterm_lock_znode(&rlh, new_root,
69079 +                                               ZNODE_WRITE_LOCK,
69080 +                                               ZNODE_LOCK_LOPRI);
69081 +                       if (result == 0) {
69082 +                               parent_coord_t *in_parent;
69083 +
69084 +                               znode_make_dirty(fake);
69085 +
69086 +                               /* new root is a child of "fake" node */
69087 +                               write_lock_tree(tree);
69088 +
69089 +                               ++tree->height;
69090 +
69091 +                               /* recalculate max balance overhead */
69092 +                               tree->estimate_one_insert =
69093 +                                   estimate_one_insert_item(tree);
69094 +
69095 +                               tree->root_block = *znode_get_block(new_root);
69096 +                               in_parent = &new_root->in_parent;
69097 +                               init_parent_coord(in_parent, fake);
69098 +                               /* manually insert new root into sibling
69099 +                                * list. With this all nodes involved in
69100 +                                * balancing are connected after balancing is
69101 +                                * done---useful invariant to check. */
69102 +                               sibling_list_insert_nolock(new_root, NULL);
69103 +                               write_unlock_tree(tree);
69104 +
69105 +                               /* insert a pointer to @old_root into the
69106 +                                  new root. */
69107 +                               assert("nikita-1110",
69108 +                                      WITH_DATA(new_root,
69109 +                                                node_is_empty(new_root)));
69110 +                               write_lock_dk(tree);
69111 +                               znode_set_ld_key(new_root, reiser4_min_key());
69112 +                               znode_set_rd_key(new_root, reiser4_max_key());
69113 +                               write_unlock_dk(tree);
69114 +                               if (REISER4_DEBUG) {
69115 +                                       ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
69116 +                                       ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
69117 +                                       ZF_SET(old_root, JNODE_ORPHAN);
69118 +                               }
69119 +                               result = add_child_ptr(new_root, old_root);
69120 +                               done_lh(&rlh);
69121 +                       }
69122 +                       zrelse(new_root);
69123 +               }
69124 +       }
69125 +       if (result != 0)
69126 +               new_root = ERR_PTR(result);
69127 +       return new_root;
69128 +}
69129 +
69130 +/* Build the &reiser4_item_data describing a pointer to @child.
69131 +
69132 +   Fills @data so that it can later be used to insert a pointer to @child
69133 +   into its parent: data->data, data->user, data->length and data->iplug are
69134 +   set here; the remaining fields of @data are left untouched.
69135 +*/
69136 +void build_child_ptr_data(znode * child        /* node, pointer to which will be
69137 +                                        * inserted */ ,
69138 +                         reiser4_item_data * data /* where to store result */ )
69139 +{
69140 +       assert("nikita-1116", child != NULL);
69141 +       assert("nikita-1117", data != NULL);
69142 +
69143 +       /*
69144 +        * NOTE: use address of child's blocknr as address of data to be
69145 +        * inserted. As a result the data gets into the on-disk structure in
69146 +        * cpu byte order. internal's create_hook converts it to little endian
69147 +        * byte order.
69148 +        */
69149 +       data->data = (char *)znode_get_block(child);
69150 +       /* data->data points into kernel space, not user space */
69151 +       data->user = 0;
69152 +       data->length = sizeof(reiser4_block_nr);
69153 +       /* FIXME-VS: hardcoded internal item? */
69154 +
69155 +       /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
69156 +       data->iplug = item_plugin_by_id(NODE_POINTER_ID);
69157 +}
69158 +
69159 +/* add pointer to @child into empty @parent.
69160 +
69161 +   This is used when pointer to old root is inserted into new root which is
69162 +   empty. The new item is keyed by the left delimiting key of @child. Returns
69163 +   the zload() error, or else the result of the node plugin's ->create_item(). */
69164 +static int add_child_ptr(znode * parent, znode * child)
69165 +{
69166 +       coord_t coord;
69167 +       reiser4_item_data data;
69168 +       int result;
69169 +       reiser4_key key;
69170 +
69171 +       assert("nikita-1111", parent != NULL);
69172 +       assert("nikita-1112", child != NULL);
69173 +       assert("nikita-1115",
69174 +              znode_get_level(parent) == znode_get_level(child) + 1);
69175 +
69176 +       result = zload(parent);
69177 +       if (result != 0)
69178 +               return result;
69179 +       assert("nikita-1113", node_is_empty(parent));
69180 +       coord_init_first_unit(&coord, parent);
69181 +
69182 +       build_child_ptr_data(child, &data);
69183 +       data.arg = NULL;
69184 +       /* copy child's left delimiting key while holding the dk read lock */
69185 +       read_lock_dk(znode_get_tree(parent));
69186 +       key = *znode_get_ld_key(child);
69187 +       read_unlock_dk(znode_get_tree(parent));
69188 +
69189 +       result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
69190 +                                                         NULL);
69191 +       znode_make_dirty(parent);       /* NOTE(review): done even if create_item() failed */
69192 +       zrelse(parent);         /* pairs with zload() above */
69193 +       return result;
69194 +}
69195 +
69196 +/* actually remove tree root: make @new_root the root, decrement tree height, reinitialise @old_root; 0 or error */
69197 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
69198 +                                                 * being removed */,
69199 +                            znode * old_root /* root node that is being
69200 +                                              * removed */ ,
69201 +                            znode * new_root   /* new root---sole child of
69202 +                                                * @old_root */,
69203 +                    const reiser4_block_nr * new_root_blk /* disk address of
69204 +                                                           * @new_root */)
69205 +{
69206 +       znode *uber;
69207 +       int result;
69208 +       lock_handle handle_for_uber;
69209 +
69210 +       assert("umka-265", tree != NULL);
69211 +       assert("nikita-1198", new_root != NULL);
69212 +       assert("nikita-1199",
69213 +              znode_get_level(new_root) + 1 == znode_get_level(old_root));
69214 +
69215 +       assert("nikita-1201", znode_is_write_locked(old_root));
69216 +
69217 +       assert("nikita-1203",
69218 +              disk_addr_eq(new_root_blk, znode_get_block(new_root)));
69219 +
69220 +       init_lh(&handle_for_uber);
69221 +       /* obtain and lock "fake" znode protecting changes in tree height. */
69222 +       result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
69223 +                               &handle_for_uber);
69224 +       if (result == 0) {
69225 +               uber = handle_for_uber.node;
69226 +
69227 +               znode_make_dirty(uber);
69228 +
69229 +               /* don't take long term lock on @new_root. Take the tree lock instead. */
69230 +
69231 +               write_lock_tree(tree);
69232 +
69233 +               tree->root_block = *new_root_blk;
69234 +               --tree->height;
69235 +
69236 +               /* recalculate max balance overhead */
69237 +               tree->estimate_one_insert = estimate_one_insert_item(tree);
69238 +
69239 +               assert("nikita-1202",
69240 +                      tree->height == znode_get_level(new_root));
69241 +
69242 +               /* new root is a child of the "fake" node */
69243 +               init_parent_coord(&new_root->in_parent, uber);
69244 +               ++uber->c_count;        /* presumably the child counter of uber -- TODO confirm */
69245 +
69246 +               /* sibling_list_insert_nolock(new_root, NULL); */
69247 +               write_unlock_tree(tree);
69248 +
69249 +               /* reinitialise old root; it is marked dirty even if ->init() fails. */
69250 +               result = node_plugin_by_node(old_root)->init(old_root);
69251 +               znode_make_dirty(old_root);
69252 +               if (result == 0) {
69253 +                       assert("nikita-1279", node_is_empty(old_root));
69254 +                       ZF_SET(old_root, JNODE_HEARD_BANSHEE);
69255 +                       old_root->c_count = 0;
69256 +               }
69257 +       }
69258 +       done_lh(&handle_for_uber);      /* also reached when get_uber_znode() failed */
69259 +
69260 +       return result;
69261 +}
69262 +
69263 +/* remove tree root
69264 +
69265 +   This function removes tree root, decreasing tree height by one.  Tree root
69266 +   and its only child (that is going to become new tree root) are write locked
69267 +   at the entry.
69268 +
69269 +   To remove tree root we need to take lock on special "fake" znode that
69270 +   protects changes of tree height. See comments in reiser4_add_tree_root() for
69271 +   more on this.
69272 +
69273 +   Also parent pointers have to be updated in
69274 +   old and new root. To simplify code, function is split into two parts: outer
69275 +   reiser4_kill_tree_root() collects all necessary arguments and calls
69276 +   reiser4_kill_root() to do the actual job.
69277 +   Returns the result of reiser4_kill_root(), or the error from child_znode().
69278 +*/
69279 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69280 +                                              removing */)
69281 +{
69282 +       int result;
69283 +       coord_t down_link;
69284 +       znode *new_root;
69285 +       reiser4_tree *tree;
69286 +
69287 +       assert("umka-266", current_tree != NULL);
69288 +       assert("nikita-1194", old_root != NULL);
69289 +       assert("nikita-1196", znode_is_root(old_root));
69290 +       assert("nikita-1200", node_num_items(old_root) == 1);
69291 +       assert("nikita-1401", znode_is_write_locked(old_root));
69292 +       /* the single item of @old_root is the down link to the future root */
69293 +       coord_init_first_unit(&down_link, old_root);
69294 +
69295 +       tree = znode_get_tree(old_root);
69296 +       new_root = child_znode(&down_link, old_root, 0, 1);
69297 +       if (!IS_ERR(new_root)) {
69298 +               result =
69299 +                       reiser4_kill_root(tree, old_root, new_root,
69300 +                                         znode_get_block(new_root));
69301 +               zput(new_root); /* drop the reference returned by child_znode() */
69302 +       } else
69303 +               result = PTR_ERR(new_root);
69304 +
69305 +       return result;
69306 +}
69307 +
69308 +/* Make Linus happy.
69309 +   Local variables:
69310 +   c-indentation-style: "K&R"
69311 +   mode-name: "LC"
69312 +   c-basic-offset: 8
69313 +   tab-width: 8
69314 +   fill-column: 120
69315 +   scroll-step: 1
69316 +   End:
69317 +*/
69318 diff -urN linux-2.6.35.orig/fs/reiser4/tree_mod.h linux-2.6.35/fs/reiser4/tree_mod.h
69319 --- linux-2.6.35.orig/fs/reiser4/tree_mod.h     1970-01-01 01:00:00.000000000 +0100
69320 +++ linux-2.6.35/fs/reiser4/tree_mod.h  2010-08-04 15:44:57.000000000 +0200
69321 @@ -0,0 +1,29 @@
69322 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69323 + * reiser4/README */
69324 +
69325 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69326 + * comments. */
69327 +
69328 +#if !defined( __REISER4_TREE_MOD_H__ )
69329 +#define __REISER4_TREE_MOD_H__
69330 +
69331 +#include "forward.h"
69332 +
69333 +znode *reiser4_new_node(znode * brother, tree_level level);
69334 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69335 +int reiser4_kill_tree_root(znode * old_root);
69336 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
69337 +
69338 +/* __REISER4_TREE_MOD_H__ */
69339 +#endif
69340 +
69341 +/* Make Linus happy.
69342 +   Local variables:
69343 +   c-indentation-style: "K&R"
69344 +   mode-name: "LC"
69345 +   c-basic-offset: 8
69346 +   tab-width: 8
69347 +   fill-column: 120
69348 +   scroll-step: 1
69349 +   End:
69350 +*/
69351 diff -urN linux-2.6.35.orig/fs/reiser4/tree_walk.c linux-2.6.35/fs/reiser4/tree_walk.c
69352 --- linux-2.6.35.orig/fs/reiser4/tree_walk.c    1970-01-01 01:00:00.000000000 +0100
69353 +++ linux-2.6.35/fs/reiser4/tree_walk.c 2010-08-04 15:44:57.000000000 +0200
69354 @@ -0,0 +1,927 @@
69355 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69356 + * reiser4/README */
69357 +
69358 +/* Routines and macros to:
69359 +
69360 +   get_left_neighbor()
69361 +
69362 +   get_right_neighbor()
69363 +
69364 +   get_parent()
69365 +
69366 +   get_first_child()
69367 +
69368 +   get_last_child()
69369 +
69370 +   various routines to walk the whole tree and do things to it like
69371 +   repack it, or move it to tertiary storage.  Please make them as
69372 +   generic as is reasonable.
69373 +
69374 +*/
69375 +
69376 +#include "forward.h"
69377 +#include "debug.h"
69378 +#include "dformat.h"
69379 +#include "coord.h"
69380 +#include "plugin/item/item.h"
69381 +#include "jnode.h"
69382 +#include "znode.h"
69383 +#include "tree_walk.h"
69384 +#include "tree.h"
69385 +#include "super.h"
69386 +
69387 +/* These macros are used internally in tree_walk.c so that a single routine,
69388 +   lock_neighbor(), can follow the parent, left or right pointer of a znode;
69389 +   the pointer is selected by its byte offset within the znode structure. */
69390 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69391 +#define FIELD_OFFSET(name)  offsetof(znode, name)
69392 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69393 +#define LEFT_PTR_OFFSET   FIELD_OFFSET(left)
69394 +#define RIGHT_PTR_OFFSET  FIELD_OFFSET(right)
69395 +
69396 +/* Generic procedure to get and longterm-lock a neighbor of @node: its left or
69397 +    right sibling or its parent, selected by @ptr_offset (byte offset of the
69398 +    pointer within znode). Must be called with the tree lock held; the lock is
69399 +    dropped around longterm_lock_znode() and re-taken before returning. Returns
69400 +    0 with @result holding the lock, -E_NO_NEIGHBOR if the link is NULL or not
69401 +    connected, or another error from longterm_lock_znode(). */
69402 +static int lock_neighbor(
69403 +                               /* resulting lock handle */
69404 +                               lock_handle * result,
69405 +                               /* znode whose neighbor (or parent) is to be locked */
69406 +                               znode * node,
69407 +                               /* pointer to neighbor (or parent) znode field offset, in bytes from
69408 +                                  the base address of znode structure  */
69409 +                               int ptr_offset,
69410 +                               /* lock mode for longterm_lock_znode call */
69411 +                               znode_lock_mode mode,
69412 +                               /* lock request for longterm_lock_znode call */
69413 +                               znode_lock_request req,
69414 +                               /* GN_* flags; @rlocked: non-zero if tree lock is read-held, zero if write-held */
69415 +                               int flags, int rlocked)
69416 +{
69417 +       reiser4_tree *tree = znode_get_tree(node);      /* NOTE(review): @node is used before the NULL assert */
69418 +       znode *neighbor;
69419 +       int ret;
69420 +
69421 +       assert("umka-236", node != NULL);
69422 +       assert("umka-237", tree != NULL);
69423 +       assert_rw_locked(&(tree->tree_lock));
69424 +       /* translate GN_* flags into lock request bits */
69425 +       if (flags & GN_TRY_LOCK)
69426 +               req |= ZNODE_LOCK_NONBLOCK;
69427 +       if (flags & GN_SAME_ATOM)
69428 +               req |= ZNODE_LOCK_DONT_FUSE;
69429 +
69430 +       /* get neighbor's address by using the sibling link, quit the while
69431 +          loop (and return) if the link is not available. */
69432 +       while (1) {
69433 +               neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69434 +
69435 +               /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
69436 +                * node pointed by it is not connected.
69437 +                *
69438 +                * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
69439 +                * check and allows passing reference to not connected znode to
69440 +                * subsequent longterm_lock_znode() call.  This kills possible
69441 +                * busy loop if we are trying to get longterm lock on locked but
69442 +                * not yet connected parent node. */
69443 +               if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69444 +                                         || znode_is_connected(neighbor))) {
69445 +                       return RETERR(-E_NO_NEIGHBOR);
69446 +               }
69447 +
69448 +               /* protect it from deletion. */
69449 +               zref(neighbor);
69450 +               /* release the tree lock (in the mode it is held) around the longterm lock request */
69451 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69452 +
69453 +               ret = longterm_lock_znode(result, neighbor, mode, req);
69454 +
69455 +               /* The lock handle obtains its own reference, release the one from above. */
69456 +               zput(neighbor);
69457 +
69458 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69459 +
69460 +               /* restart if the node we got a reference to is being
69461 +                  invalidated. we should not get a reference to this node
69462 +                  again. */
69463 +               if (ret == -EINVAL)
69464 +                       continue;
69465 +               if (ret)
69466 +                       return ret;
69467 +
69468 +               /* check if neighbor link still points to just locked znode;
69469 +                  the link could have been changed while the process slept. */
69470 +               if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69471 +                       return 0;
69472 +
69473 +               /* znode was locked by mistake; unlock it and restart locking
69474 +                  process from beginning. */
69475 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69476 +               longterm_unlock_znode(result);
69477 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69478 +       }
69479 +}
69480 +
69481 +/* get parent of @node with longterm lock, accepts GN_* flags; takes the tree read lock itself. */
69482 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69483 +                            znode * node /* child node */ ,
69484 +                            znode_lock_mode mode
69485 +                            /* type of lock: read or write */ ,
69486 +                            int flags /* GN_* flags */ )
69487 +{
69488 +       int result;
69489 +
69490 +       read_lock_tree(znode_get_tree(node));
69491 +       result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
69492 +                              ZNODE_LOCK_HIPRI, flags, 1);     /* 1: tree lock is read-held */
69493 +       read_unlock_tree(znode_get_tree(node));
69494 +       return result;
69495 +}
69496 +
69497 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT bit in
69498 +   @flags; left uses ZNODE_LOCK_LOPRI, right ZNODE_LOCK_HIPRI; @rlocked is passed to lock_neighbor() */
69499 +/* Audited by: umka (2002.06.14) */
69500 +static inline int
69501 +lock_side_neighbor(lock_handle * result,
69502 +                  znode * node, znode_lock_mode mode, int flags, int rlocked)
69503 +{
69504 +       int ret;
69505 +       int ptr_offset;
69506 +       znode_lock_request req;
69507 +
69508 +       if (flags & GN_GO_LEFT) {
69509 +               ptr_offset = LEFT_PTR_OFFSET;
69510 +               req = ZNODE_LOCK_LOPRI;
69511 +       } else {
69512 +               ptr_offset = RIGHT_PTR_OFFSET;
69513 +               req = ZNODE_LOCK_HIPRI;
69514 +       }
69515 +
69516 +       ret =
69517 +           lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
69518 +
69519 +       if (ret == -E_NO_NEIGHBOR)      /* if we walk left or right, -E_NO_NEIGHBOR does not
69520 +                                        * guarantee that the neighbor is absent in the
69521 +                                        * tree; in this case we return -ENOENT, which
69522 +                                        * means the neighbor was at least not found in
69523 +                                        * cache */
69524 +               return RETERR(-ENOENT);
69525 +
69526 +       return ret;
69527 +}
69528 +
69529 +#if REISER4_DEBUG
69530 +/* debugging check: walk sibling list both ways from @node, asserting that links are mutual; always returns 1 */
69531 +int check_sibling_list(znode * node)
69532 +{
69533 +       znode *scan;
69534 +       znode *next;
69535 +
69536 +       assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69537 +
69538 +       if (node == NULL)
69539 +               return 1;
69540 +
69541 +       if (ZF_ISSET(node, JNODE_RIP))  /* nodes flagged JNODE_RIP are not checked */
69542 +               return 1;
69543 +
69544 +       assert("nikita-3270", node != NULL);    /* always true here: NULL was handled above */
69545 +       assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69546 +       /* walk to the left while nodes are left-connected */
69547 +       for (scan = node; znode_is_left_connected(scan); scan = next) {
69548 +               next = scan->left;
69549 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69550 +                       assert("nikita-3271", znode_is_right_connected(next));
69551 +                       assert("nikita-3272", next->right == scan);
69552 +               } else
69553 +                       break;
69554 +       }
69555 +       for (scan = node; znode_is_right_connected(scan); scan = next) {        /* same walk, to the right */
69556 +               next = scan->right;
69557 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69558 +                       assert("nikita-3273", znode_is_left_connected(next));
69559 +                       assert("nikita-3274", next->left == scan);
69560 +               } else
69561 +                       break;
69562 +       }
69563 +       return 1;
69564 +}
69565 +
69566 +#endif
69567 +
69568 +/* Znode sibling pointers maintenance. */
69569 +
69570 +/* Znode sibling pointers are established between any neighboring nodes which
69571 +   are in cache.  There are two znode state bits (JNODE_LEFT_CONNECTED,
69572 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
69573 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
69574 +
69575 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69576 +   take care of searching for znode neighbors (a hash table lookup may be
69577 +   required), establishing sibling pointers between them and setting the
69578 +   JNODE_*_CONNECTED state bits. */
69579 +
69580 +/* adjust sibling pointers and `connected' state bits so that @left and @right
69581 +   become neighbors; works if one neighbor is NULL (was not found). */
69582 +
69583 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69584 +void link_left_and_right(znode * left, znode * right)
69585 +{
69586 +       assert("nikita-3275", check_sibling_list(left));
69587 +       assert("nikita-3275", check_sibling_list(right));
69588 +
69589 +       if (left != NULL) {
69590 +               if (left->right == NULL) {
69591 +                       left->right = right;
69592 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
69593 +
69594 +                       ON_DEBUG(left->right_version =
69595 +                                atomic_inc_return(&delim_key_version);
69596 +                           );
69597 +
69598 +               } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69599 +                          && left->right != right) {
69600 +                       /* old right neighbor is flagged JNODE_HEARD_BANSHEE: detach it and link @right instead */
69601 +                       ON_DEBUG(left->right->left_version =
69602 +                                atomic_inc_return(&delim_key_version);
69603 +                                left->right_version =
69604 +                                atomic_inc_return(&delim_key_version););
69605 +
69606 +                       left->right->left = NULL;
69607 +                       left->right = right;
69608 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
69609 +               } else
69610 +                       /*
69611 +                        * there is a race condition in renew_sibling_link()
69612 +                        * and assertions below check that it is only one
69613 +                        * there. Thread T1 calls renew_sibling_link() without
69614 +                        * GN_NO_ALLOC flag. zlook() doesn't find neighbor
69615 +                        * node, but before T1 gets to the
69616 +                        * link_left_and_right(), another thread T2 creates
69617 +                        * neighbor node and connects it. check for
69618 +                        * left->right == NULL above protects T1 from
69619 +                        * overwriting correct left->right pointer installed
69620 +                        * by T2.
69621 +                        */
69622 +                       assert("nikita-3302",
69623 +                              right == NULL || left->right == right);
69624 +       }
69625 +       if (right != NULL) {    /* mirror image of the @left handling above */
69626 +               if (right->left == NULL) {
69627 +                       right->left = left;
69628 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
69629 +
69630 +                       ON_DEBUG(right->left_version =
69631 +                                atomic_inc_return(&delim_key_version);
69632 +                           );
69633 +
69634 +               } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69635 +                          && right->left != left) {
69636 +
69637 +                       ON_DEBUG(right->left->right_version =
69638 +                                atomic_inc_return(&delim_key_version);
69639 +                                right->left_version =
69640 +                                atomic_inc_return(&delim_key_version););
69641 +
69642 +                       right->left->right = NULL;
69643 +                       right->left = left;
69644 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
69645 +
69646 +               } else
69647 +                       assert("nikita-3303",
69648 +                              left == NULL || right->left == left);
69649 +       }
69650 +       assert("nikita-3275", check_sibling_list(left));
69651 +       assert("nikita-3275", check_sibling_list(right));
69652 +}
69653 +
69654 +/* link @second as the left (if @to_left) or right neighbor of @first. Audited by: umka (2002.06.14) */
69655 +static void link_znodes(znode * first, znode * second, int to_left)
69656 +{
69657 +       if (to_left)
69658 +               link_left_and_right(second, first);
69659 +       else
69660 +               link_left_and_right(first, second);
69661 +}
69662 +
69663 +/* move @coord to the next unit (to the left or right, depending on GN_GO_LEFT
69664 +   bit in @flags) in horizontal direction, even across node boundary, in which
69665 +   case the side node is read locked into @handle and loaded. Must be called
69666 +   under the tree write lock (dropped and re-taken inside); it protects
69667 +   nonexistence of sibling link if lock_side_neighbor() fails with -ENOENT. */
69668 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69669 +{
69670 +       int ret;
69671 +       znode *node;
69672 +       reiser4_tree *tree;
69673 +
69674 +       assert("umka-243", coord != NULL);
69675 +       assert("umka-244", handle != NULL);
69676 +       assert("zam-1069", handle->node == NULL);
69677 +       /* first try to step within the current node; zero means success */
69678 +       ret =
69679 +           (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69680 +           coord_next_unit(coord);
69681 +       if (!ret)
69682 +               return 0;
69683 +       /* otherwise lock the side neighbor; last argument 0: tree lock is write-held */
69684 +       ret =
69685 +           lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69686 +       if (ret)
69687 +               return ret;
69688 +
69689 +       node = handle->node;
69690 +       tree = znode_get_tree(node);
69691 +       write_unlock_tree(tree);
69692 +
69693 +       coord_init_zero(coord);
69694 +
69695 +       /* We avoid synchronous read here if it is specified by flag. */
69696 +       if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69697 +               ret = jstartio(ZJNODE(handle->node));
69698 +               if (!ret)
69699 +                       ret = -E_REPEAT;
69700 +               goto error_locked;
69701 +       }
69702 +
69703 +       /* the corresponding zrelse() should be called by the clients of
69704 +          far_next_coord(), at the place where this node gets unlocked. */
69705 +       ret = zload(handle->node);
69706 +       if (ret)
69707 +               goto error_locked;
69708 +
69709 +       if (flags & GN_GO_LEFT)
69710 +               coord_init_last_unit(coord, node);
69711 +       else
69712 +               coord_init_first_unit(coord, node);
69713 +
69714 +       if (0) {        /* entered only via the error_locked label */
69715 +             error_locked:
69716 +               longterm_unlock_znode(handle);
69717 +       }
69718 +       write_lock_tree(tree);  /* re-take the tree lock on both success and error paths */
69719 +       return ret;
69720 +}
69721 +
69722 +/* Very significant function which performs a step in horizontal direction
69723 +   when sibling pointer is not available.  Actually, it is the only function
69724 +   which does it. *@nr_locked is incremented if a side parent gets locked.
69725 +   Note: this function does not restore locking status at exit; the caller
69726 +   must take care of proper unlocking and zrelse()-ing of @handle->node. */
69727 +static int
69728 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69729 +                  tree_level level, int flags, int *nr_locked)
69730 +{
69731 +       int ret;
69732 +       int to_left = flags & GN_GO_LEFT;
69733 +       reiser4_block_nr da;
69734 +       /* parent of the neighbor node; we set it to parent until not sharing
69735 +          of one parent between child and neighbor node is detected */
69736 +       znode *side_parent = coord->node;
69737 +       reiser4_tree *tree = znode_get_tree(child);
69738 +       znode *neighbor = NULL;
69739 +
69740 +       assert("umka-245", coord != NULL);
69741 +       assert("umka-246", handle != NULL);
69742 +       assert("umka-247", child != NULL);
69743 +       assert("umka-303", tree != NULL);
69744 +
69745 +       init_lh(handle);
69746 +       write_lock_tree(tree);
69747 +       ret = far_next_coord(coord, handle, flags);
69748 +       /* on -ENOENT we fall through with neighbor == NULL: @child is linked to NULL and -ENOENT is returned */
69749 +       if (ret) {
69750 +               if (ret != -ENOENT) {
69751 +                       write_unlock_tree(tree);
69752 +                       return ret;
69753 +               }
69754 +       } else {
69755 +               item_plugin *iplug;
69756 +
69757 +               if (handle->node != NULL) {
69758 +                       (*nr_locked)++; /* read-modify-write: caller must have initialised *nr_locked */
69759 +                       side_parent = handle->node;
69760 +               }
69761 +
69762 +               /* does the coord object point to an internal item? We do not
69763 +                  support sibling pointers between znodes for formatted and
69764 +                  unformatted nodes and return -E_NO_NEIGHBOR in that case. */
69765 +               iplug = item_plugin_by_coord(coord);
69766 +               if (!item_is_internal(coord)) {
69767 +                       link_znodes(child, NULL, to_left);
69768 +                       write_unlock_tree(tree);
69769 +                       /* we know there can't be formatted neighbor */
69770 +                       return RETERR(-E_NO_NEIGHBOR);
69771 +               }
69772 +               write_unlock_tree(tree);
69773 +               /* read the block number of the neighbor from the internal item into @da */
69774 +               iplug->s.internal.down_link(coord, NULL, &da);
69775 +
69776 +               if (flags & GN_NO_ALLOC) {
69777 +                       neighbor = zlook(tree, &da);
69778 +               } else {
69779 +                       neighbor =
69780 +                           zget(tree, &da, side_parent, level,
69781 +                                reiser4_ctx_gfp_mask_get());
69782 +               }
69783 +
69784 +               if (IS_ERR(neighbor)) {
69785 +                       ret = PTR_ERR(neighbor);
69786 +                       return ret;     /* tree lock is not held at this point */
69787 +               }
69788 +
69789 +               if (neighbor)
69790 +                       /* update delimiting keys */
69791 +                       set_child_delimiting_keys(coord->node, coord, neighbor);
69792 +
69793 +               write_lock_tree(tree);
69794 +       }
69795 +       /* tree write lock is held here on every path */
69796 +       if (likely(neighbor == NULL ||
69797 +                  (znode_get_level(child) == znode_get_level(neighbor)
69798 +                   && child != neighbor)))
69799 +               link_znodes(child, neighbor, to_left);
69800 +       else {
69801 +               warning("nikita-3532",
69802 +                       "Sibling nodes on the different levels: %i != %i\n",
69803 +                       znode_get_level(child), znode_get_level(neighbor));
69804 +               ret = RETERR(-EIO);
69805 +       }
69806 +
69807 +       write_unlock_tree(tree);
69808 +
69809 +       /* if GN_NO_ALLOC isn't set we keep the reference to the neighbor znode */
69810 +       if (neighbor != NULL && (flags & GN_NO_ALLOC))
69811 +               /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69812 +               zput(neighbor);
69813 +
69814 +       return ret;
69815 +}
69816 +
69817 +/* Establish the sibling link of @node on one side (chosen by GN_GO_LEFT in @flags), starting from @coord in its parent. */
69818 +/* Returns 0 also when there is no neighbor (-ENOENT, -E_NO_NEIGHBOR). Audited by: umka (2002.06.14) */
69819 +static int connect_one_side(coord_t * coord, znode * node, int flags)
69820 +{
69821 +       coord_t local;
69822 +       lock_handle handle;
69823 +       int nr_locked = 0;      /* renew_sibling_link() increments it, so it must start from a defined value */
69824 +       int ret;
69825 +
69826 +       assert("umka-248", coord != NULL);
69827 +       assert("umka-249", node != NULL);
69828 +       /* work on a copy so that the caller's @coord is not moved */
69829 +       coord_dup_nocheck(&local, coord);
69830 +
69831 +       init_lh(&handle);
69832 +       /* GN_NO_ALLOC: the neighbor is only looked up, never allocated */
69833 +       ret =
69834 +           renew_sibling_link(&local, &handle, node, znode_get_level(node),
69835 +                              flags | GN_NO_ALLOC, &nr_locked);
69836 +
69837 +       if (handle.node != NULL) {
69838 +               /* complementary operations for zload() and lock() in far_next_coord() */
69839 +               zrelse(handle.node);
69840 +               longterm_unlock_znode(&handle);
69841 +       }
69842 +
69843 +       /* we catch error codes which are not interesting for us because we
69844 +          run renew_sibling_link() only for znode connection. */
69845 +       if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69846 +               return 0;
69847 +
69848 +       return ret;
69849 +}
69850 +
69851 +/* if @child is not in `connected' state, performs hash searches for right and
69852 +   left neighbor nodes and establishes sibling links; returns 0 or error code */
69853 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69854 +int connect_znode(coord_t * parent_coord, znode * child)
69855 +{
69856 +       reiser4_tree *tree = znode_get_tree(child);
69857 +       int ret = 0;
69858 +
69859 +       assert("zam-330", parent_coord != NULL);
69860 +       assert("zam-331", child != NULL);
69861 +       assert("zam-332", parent_coord->node != NULL);
69862 +       assert("umka-305", tree != NULL);
69863 +
69864 +       /* it is trivial to `connect' root znode because it can't have
69865 +          neighbors: both links are NULL and both sides are marked connected */
69866 +       if (znode_above_root(parent_coord->node)) {
69867 +               child->left = NULL;
69868 +               child->right = NULL;
69869 +               ZF_SET(child, JNODE_LEFT_CONNECTED);
69870 +               ZF_SET(child, JNODE_RIGHT_CONNECTED);
69871 +
69872 +               ON_DEBUG(child->left_version =
69873 +                        atomic_inc_return(&delim_key_version);
69874 +                        child->right_version =
69875 +                        atomic_inc_return(&delim_key_version););
69876 +
69877 +               return 0;
69878 +       }
69879 +
69880 +       /* load parent node; it is released at zrelse_and_ret below */
69881 +       coord_clear_iplug(parent_coord);
69882 +       ret = zload(parent_coord->node);
69883 +
69884 +       if (ret != 0)
69885 +               return ret;
69886 +
69887 +       /* protect `connected' state check by tree_lock */
69888 +       read_lock_tree(tree);
69889 +
69890 +       if (!znode_is_right_connected(child)) {
69891 +               read_unlock_tree(tree);
69892 +               /* connect right (no GN_GO_LEFT); tree lock is dropped around the call */
69893 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
69894 +               if (ret)
69895 +                       goto zrelse_and_ret;
69896 +
69897 +               read_lock_tree(tree);
69898 +       }
69899 +
69900 +       ret = znode_is_left_connected(child);
69901 +
69902 +       read_unlock_tree(tree);
69903 +
69904 +       if (!ret) {
69905 +               ret =
69906 +                   connect_one_side(parent_coord, child,
69907 +                                    GN_NO_ALLOC | GN_GO_LEFT);
69908 +       } else
69909 +               ret = 0;
69910 +
69911 +      zrelse_and_ret:
69912 +       zrelse(parent_coord->node);
69913 +
69914 +       return ret;
69915 +}
69916 +
69917 +/* this function is like renew_sibling_link() but allocates neighbor node if
69918 +   it doesn't exist and `connects' it. It may require making two steps in
69919 +   horizontal direction: the first one finds/allocates the neighbor node, the
69920 +   second one finds the neighbor of that neighbor to connect the freshly
69921 +   allocated znode. Returns 0 or a negative error code. */
69922 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69923 +static int
69924 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
69925 +{
69926 +       coord_t local;
69927 +       lock_handle empty[2];
69928 +       reiser4_tree *tree = znode_get_tree(node);
69929 +       znode *neighbor = NULL;
69930 +       int nr_locked = 0;
69931 +       int ret;
69932 +
69933 +       assert("umka-250", coord != NULL);
69934 +       assert("umka-251", node != NULL);
69935 +       assert("umka-307", tree != NULL);
69936 +       assert("umka-308", level <= tree->height);
69937 +
69938 +       /* umka (2002.06.14)
69939 +          Here probably should be a check for validity of the given "level".
69940 +          Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
69941 +        */
69942 +
69943 +       coord_dup(&local, coord);
69944 +
69945 +       ret =
69946 +           renew_sibling_link(&local, &empty[0], node, level,
69947 +                              flags & ~GN_NO_ALLOC, &nr_locked);
69948 +       if (ret)
69949 +               goto out;
69950 +
69951 +       /* tree lock is not needed here because we keep parent node(s) locked
69952 +          and reference to neighbor znode incremented */
69953 +       neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
69954 +
69955 +       read_lock_tree(tree);
69956 +       ret = znode_is_connected(neighbor);
69957 +       read_unlock_tree(tree);
69958 +       if (ret) {
69959 +               ret = 0;
69960 +               goto out;
69961 +       }
69962 +
69963 +       ret =
69964 +           renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
69965 +                              flags | GN_NO_ALLOC, &nr_locked);
69966 +       /* second renew_sibling_link() call is used for znode connection only,
69967 +          so -ENOENT and -E_NO_NEIGHBOR are treated as success */
69968 +       if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
69969 +               ret = 0;
69970 +
69971 +      out:
69972 +
69973 +       for (--nr_locked; nr_locked >= 0; --nr_locked) {
69974 +               zrelse(empty[nr_locked].node);
69975 +               longterm_unlock_znode(&empty[nr_locked]);
69976 +       }
69977 +
69978 +       if (neighbor != NULL)
69979 +               /* decrement znode reference counter without actually
69980 +                  releasing it. */
69981 +               atomic_dec(&ZJNODE(neighbor)->x_count);
69982 +
69983 +       return ret;
69984 +}
69985 +
69986 +/*
69987 +   reiser4_get_neighbor() -- lock node's neighbor.
69988 +
69989 +   reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
69990 +   given parameter) using sibling link to it. If sibling link is not available
69991 +   (i.e. neighbor znode is not in cache) and flags allow reading blocks, we go
69992 +   one level up for information about neighbor's disk address. We lock node's
69993 +   parent, if it is common parent for both 'node' and its neighbor, neighbor's
69994 +   disk address is in next (to left or to right) down link from link that points
69995 +   to original node. If not, we need to lock parent's neighbor, read its content
69996 +   and take first(last) downlink with neighbor's disk address.  That locking
69997 +   could be done by using sibling link and lock_neighbor() function, if sibling
69998 +   link exists. Otherwise we have to go a level up again until we find a
69999 +   common parent or valid sibling link. Then go down
70000 +   allocating/connecting/locking/reading nodes until neighbor of first one is
70001 +   locked.
70002 +
70003 +   @neighbor:  result lock handle,
70004 +   @node: a node which we lock neighbor of,
70005 +   @lock_mode: lock mode {LM_READ, LM_WRITE},
70006 +   @flags: logical OR of {GN_*} (see description above) subset.
70007 +
70008 +   @return: 0 on success, negative value if lock was impossible due to an error
70009 +   or lack of neighbor node.
70010 +*/
70011 +
70012 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
70013 +int
70014 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70015 +                    znode_lock_mode lock_mode, int flags)
70016 +{
70017 +       reiser4_tree *tree = znode_get_tree(node);
70018 +       lock_handle path[REAL_MAX_ZTREE_HEIGHT];
70019 +
70020 +       coord_t coord;
70021 +
70022 +       tree_level base_level;
70023 +       tree_level h = 0;
70024 +       int ret;
70025 +
70026 +       assert("umka-252", tree != NULL);
70027 +       assert("umka-253", neighbor != NULL);
70028 +       assert("umka-254", node != NULL);
70029 +
70030 +       base_level = znode_get_level(node);
70031 +
70032 +       assert("umka-310", base_level <= tree->height);
70033 +
70034 +       coord_init_zero(&coord);
70035 +
70036 +      again:
70037 +       /* first, we try to use simple lock_neighbor() which requires sibling
70038 +          link existence */
70039 +       read_lock_tree(tree);
70040 +       ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
70041 +       read_unlock_tree(tree);
70042 +       if (!ret) {
70043 +               /* load znode content if requested. NOTE(review): this zload()s @node, not neighbor->node -- confirm */
70044 +               if (flags & GN_LOAD_NEIGHBOR) {
70045 +                       ret = zload(node);
70046 +                       if (ret)
70047 +                               longterm_unlock_znode(neighbor);
70048 +               }
70049 +               return ret;
70050 +       }
70051 +
70052 +       /* only -ENOENT means we may look upward and try to connect
70053 +          @node with its neighbor (if @flags allow us to do it) */
70054 +       if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
70055 +               return ret;
70056 +
70057 +       /* before establishing the sibling link we lock the parent node; it is
70058 +          required by renew_neighbor() to work.  */
70059 +       init_lh(&path[0]);
70060 +       ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
70061 +       if (ret)
70062 +               return ret;
70063 +       if (znode_above_root(path[0].node)) {
70064 +               longterm_unlock_znode(&path[0]);
70065 +               return RETERR(-E_NO_NEIGHBOR);
70066 +       }
70067 +
70068 +       while (1) {
70069 +               znode *child = (h == 0) ? node : path[h - 1].node;
70070 +               znode *parent = path[h].node;
70071 +
70072 +               ret = zload(parent);
70073 +               if (ret)
70074 +                       break;
70075 +
70076 +               ret = find_child_ptr(parent, child, &coord);
70077 +
70078 +               if (ret) {
70079 +                       zrelse(parent);
70080 +                       break;
70081 +               }
70082 +
70083 +               /* try to establish missing sibling link */
70084 +               ret = renew_neighbor(&coord, child, h + base_level, flags);
70085 +
70086 +               zrelse(parent);
70087 +
70088 +               switch (ret) {
70089 +               case 0:
70090 +                       /* unlocking of parent znode prevents simple
70091 +                          deadlock situation */
70092 +                       done_lh(&path[h]);
70093 +
70094 +                       /* depending on the tree level we are on, we repeat
70095 +                          the first locking attempt ...  */
70096 +                       if (h == 0)
70097 +                               goto again;
70098 +
70099 +                       /* ... or repeat establishing of sibling link at
70100 +                          one level below. */
70101 +                       --h;
70102 +                       break;
70103 +
70104 +               case -ENOENT:
70105 +                       /* sibling link is not available -- we go
70106 +                          upward. */
70107 +                       init_lh(&path[h + 1]);
70108 +                       ret =
70109 +                           reiser4_get_parent(&path[h + 1], parent,
70110 +                                              ZNODE_READ_LOCK);
70111 +                       if (ret)
70112 +                               goto fail;
70113 +                       ++h;
70114 +                       if (znode_above_root(path[h].node)) {
70115 +                               ret = RETERR(-E_NO_NEIGHBOR);
70116 +                               goto fail;
70117 +                       }
70118 +                       break;
70119 +
70120 +               case -E_DEADLOCK:
70121 +                       /* there was lock request from hi-pri locker. if
70122 +                          it is possible we unlock last parent node and
70123 +                          re-lock it again. */
70124 +                       for (; reiser4_check_deadlock(); h--) {
70125 +                               done_lh(&path[h]);
70126 +                               if (h == 0)
70127 +                                       goto fail;
70128 +                       }
70129 +
70130 +                       break;
70131 +
70132 +               default:        /* other errors. */
70133 +                       goto fail;
70134 +               }
70135 +       }
70136 +      fail:
70137 +       ON_DEBUG(check_lock_node_data(node));
70138 +       ON_DEBUG(check_lock_data());
70139 +
70140 +       /* unlock path: path[h] down to path[0] */
70141 +       do {
70142 +               /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
70143 +                  fail; path[0] is already done_lh-ed, therefore
70144 +                  longterm_unlock_znode(&path[h]); is not applicable */
70145 +               done_lh(&path[h]);
70146 +               --h;
70147 +       } while (h + 1 != 0);
70148 +
70149 +       return ret;
70150 +}
70151 +
70152 +/* remove @node from sibling list, linking its neighbors to each other; tree write lock must be held */
70153 +/* Audited by: umka (2002.06.14) */
70154 +void sibling_list_remove(znode * node)
70155 +{
70156 +       reiser4_tree *tree;
70157 +
70158 +       tree = znode_get_tree(node);
70159 +       assert("umka-255", node != NULL);
70160 +       assert_rw_write_locked(&(tree->tree_lock));
70161 +       assert("nikita-3275", check_sibling_list(node));
70162 +
70163 +       write_lock_dk(tree);
70164 +       if (znode_is_right_connected(node) && node->right != NULL &&
70165 +           znode_is_left_connected(node) && node->left != NULL) {
70166 +               assert("zam-32245",
70167 +                      keyeq(znode_get_rd_key(node),
70168 +                            znode_get_ld_key(node->right)));
70169 +               znode_set_rd_key(node->left, znode_get_ld_key(node->right));
70170 +       }
70171 +       write_unlock_dk(tree);
70172 +
70173 +       if (znode_is_right_connected(node) && node->right != NULL) {
70174 +               assert("zam-322", znode_is_left_connected(node->right));
70175 +               node->right->left = node->left;
70176 +               ON_DEBUG(node->right->left_version =
70177 +                        atomic_inc_return(&delim_key_version);
70178 +                   );
70179 +       }
70180 +       if (znode_is_left_connected(node) && node->left != NULL) {
70181 +               assert("zam-323", znode_is_right_connected(node->left));
70182 +               node->left->right = node->right;
70183 +               ON_DEBUG(node->left->right_version =
70184 +                        atomic_inc_return(&delim_key_version);
70185 +                   );
70186 +       }
70187 +
70188 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
70189 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70190 +       ON_DEBUG(node->left = node->right = NULL;
70191 +                node->left_version = atomic_inc_return(&delim_key_version);
70192 +                node->right_version = atomic_inc_return(&delim_key_version););
70193 +       assert("nikita-3276", check_sibling_list(node));
70194 +}
70195 +
70196 +/* disconnect @node from sibling list; neighbors' links to it are set to NULL, not bridged */
70197 +void sibling_list_drop(znode * node)
70198 +{
70199 +       znode *right;
70200 +       znode *left;
70201 +
70202 +       assert("nikita-2464", node != NULL);
70203 +       assert("nikita-3277", check_sibling_list(node));
70204 +
70205 +       right = node->right;
70206 +       if (right != NULL) {
70207 +               assert("nikita-2465", znode_is_left_connected(right));
70208 +               right->left = NULL;
70209 +               ON_DEBUG(right->left_version =
70210 +                        atomic_inc_return(&delim_key_version);
70211 +                   );
70212 +       }
70213 +       left = node->left;
70214 +       if (left != NULL) {
70215 +               assert("zam-323", znode_is_right_connected(left));
70216 +               left->right = NULL;
70217 +               ON_DEBUG(left->right_version =
70218 +                        atomic_inc_return(&delim_key_version);
70219 +                   );
70220 +       }
70221 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
70222 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
70223 +       ON_DEBUG(node->left = node->right = NULL;
70224 +                node->left_version = atomic_inc_return(&delim_key_version);
70225 +                node->right_version = atomic_inc_return(&delim_key_version););
70226 +}
70227 +
70228 +/* Insert @new into sibling list and mark it connected on both sides. Regular
70229 +   balancing inserts it after (at right side of) existing and locked node
70230 +   @before, except when adding a new tree root: @before is NULL in that case. */
70231 +void sibling_list_insert_nolock(znode * new, znode * before)
70232 +{
70233 +       assert("zam-334", new != NULL);
70234 +       assert("nikita-3298", !znode_is_left_connected(new));
70235 +       assert("nikita-3299", !znode_is_right_connected(new));
70236 +       assert("nikita-3300", new->left == NULL);
70237 +       assert("nikita-3301", new->right == NULL);
70238 +       assert("nikita-3278", check_sibling_list(new));
70239 +       assert("nikita-3279", check_sibling_list(before));
70240 +
70241 +       if (before != NULL) {
70242 +               assert("zam-333", znode_is_connected(before));
70243 +               new->right = before->right;
70244 +               new->left = before;
70245 +               ON_DEBUG(new->right_version =
70246 +                        atomic_inc_return(&delim_key_version);
70247 +                        new->left_version =
70248 +                        atomic_inc_return(&delim_key_version););
70249 +               if (before->right != NULL) {
70250 +                       before->right->left = new;
70251 +                       ON_DEBUG(before->right->left_version =
70252 +                                atomic_inc_return(&delim_key_version);
70253 +                           );
70254 +               }
70255 +               before->right = new;
70256 +               ON_DEBUG(before->right_version =
70257 +                        atomic_inc_return(&delim_key_version);
70258 +                   );
70259 +       } else {
70260 +               new->right = NULL;
70261 +               new->left = NULL;
70262 +               ON_DEBUG(new->right_version =
70263 +                        atomic_inc_return(&delim_key_version);
70264 +                        new->left_version =
70265 +                        atomic_inc_return(&delim_key_version););
70266 +       }
70267 +       ZF_SET(new, JNODE_LEFT_CONNECTED);
70268 +       ZF_SET(new, JNODE_RIGHT_CONNECTED);
70269 +       assert("nikita-3280", check_sibling_list(new));
70270 +       assert("nikita-3281", check_sibling_list(before));
70271 +}
70272 +
70273 +/*
70274 +   Local variables:
70275 +   c-indentation-style: "K&R"
70276 +   mode-name: "LC"
70277 +   c-basic-offset: 8
70278 +   tab-width: 8
70279 +   fill-column: 80
70280 +   End:
70281 +*/
70282 diff -urN linux-2.6.35.orig/fs/reiser4/tree_walk.h linux-2.6.35/fs/reiser4/tree_walk.h
70283 --- linux-2.6.35.orig/fs/reiser4/tree_walk.h    1970-01-01 01:00:00.000000000 +0100
70284 +++ linux-2.6.35/fs/reiser4/tree_walk.h 2010-08-04 15:44:57.000000000 +0200
70285 @@ -0,0 +1,125 @@
70286 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70287 +
70288 +/* definitions of reiser4 tree walk functions */
70289 +
70290 +#ifndef __FS_REISER4_TREE_WALK_H__
70291 +#define __FS_REISER4_TREE_WALK_H__
70292 +
70293 +#include "debug.h"
70294 +#include "forward.h"
70295 +
70296 +/* establishes horizontal links between cached znodes */
70297 +int connect_znode(coord_t * coord, znode * node);
70298 +
70299 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70300 +  have the following common arguments:
70301 +
70302 +  return codes:
70303 +
70304 +  @return : 0        - OK,
70305 +
70306 +ZAM-FIXME-HANS: wrong return code name.  Change them all.
70307 +           -ENOENT  - neighbor is not in cache, what is detected by sibling
70308 +                      link absence.
70309 +
70310 +            -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
70311 +                       found (because we are left-/right- most node of the
70312 +                      tree, for example). Also, this return code is for
70313 +                      reiser4_get_parent() when we see no parent link -- it
70314 +                      means that our node is root node.
70315 +
70316 +            -E_DEADLOCK - deadlock detected (request from high-priority process
70317 +                      received), other error codes conform to
70318 +                      /usr/include/asm/errno.h .
70319 +*/
70320 +
70321 +int
70322 +reiser4_get_parent_flags(lock_handle * result, znode * node,
70323 +                        znode_lock_mode mode, int flags);
70324 +
70325 +/* bit definitions for the `flags' argument of reiser4_get_neighbor() */
70326 +typedef enum {
70327 +       /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
70328 +        * find a not allocated, not connected neighbor by going through upper
70329 +        * levels */
70330 +       GN_CAN_USE_UPPER_LEVELS = 0x1,
70331 +       /* locking left neighbor instead of right one */
70332 +       GN_GO_LEFT = 0x2,
70333 +       /* automatically load neighbor node content */
70334 +       GN_LOAD_NEIGHBOR = 0x4,
70335 +       /* return -E_REPEAT if can't lock  */
70336 +       GN_TRY_LOCK = 0x8,
70337 +       /* used internally in tree_walk.c, causes renew_sibling to not
70338 +          allocate neighbor znode, but only search for it in znode cache */
70339 +       GN_NO_ALLOC = 0x10,
70340 +       /* do not go across atom boundaries */
70341 +       GN_SAME_ATOM = 0x20,
70342 +       /* allow locking of not connected nodes */
70343 +       GN_ALLOW_NOT_CONNECTED = 0x40,
70344 +       /*  Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
70345 +       GN_ASYNC = 0x80
70346 +} znode_get_neigbor_flags;
70347 +
70348 +/* Commonly used wrapper: reiser4_get_parent_flags() with GN_ALLOW_NOT_CONNECTED. */
70349 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
70350 +                                    znode_lock_mode mode)
70351 +{
70352 +       return reiser4_get_parent_flags(result, node, mode,
70353 +                                       GN_ALLOW_NOT_CONNECTED);
70354 +}
70355 +
70356 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70357 +                        znode_lock_mode lock_mode, int flags);
70358 +
70359 +/* wrappers for the most common usages of reiser4_get_neighbor(): left sets GN_GO_LEFT, right clears it */
70360 +static inline int
70361 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70362 +                         int flags)
70363 +{
70364 +       return reiser4_get_neighbor(result, node, lock_mode,
70365 +                                   flags | GN_GO_LEFT);
70366 +}
70367 +
70368 +static inline int
70369 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70370 +                          int flags)
70371 +{
70372 +       ON_DEBUG(check_lock_node_data(node));
70373 +       ON_DEBUG(check_lock_data());
70374 +       return reiser4_get_neighbor(result, node, lock_mode,
70375 +                                   flags & (~GN_GO_LEFT));
70376 +}
70377 +
70378 +extern void sibling_list_remove(znode * node);
70379 +extern void sibling_list_drop(znode * node);
70380 +extern void sibling_list_insert_nolock(znode * new, znode * before);
70381 +extern void link_left_and_right(znode * left, znode * right);
70382 +
70383 +/* Callbacks invoked by tree_walk() when it ...  */
70384 +struct tree_walk_actor {
70385 +       /* ... meets a formatted node, */
70386 +       int (*process_znode) (tap_t *, void *);
70387 +       /* ... meets an extent, */
70388 +       int (*process_extent) (tap_t *, void *);
70389 +       /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70390 +        * the node or extent processing function. */
70391 +       int (*before) (void *);
70392 +};
70393 +
70394 +#if REISER4_DEBUG
70395 +int check_sibling_list(znode * node);
70396 +#else
70397 +#define check_sibling_list(n) (1)
70398 +#endif
70399 +
70400 +#endif                         /* __FS_REISER4_TREE_WALK_H__ */
70401 +
70402 +/*
70403 +   Local variables:
70404 +   c-indentation-style: "K&R"
70405 +   mode-name: "LC"
70406 +   c-basic-offset: 8
70407 +   tab-width: 8
70408 +   fill-column: 120
70409 +   End:
70410 +*/
70411 diff -urN linux-2.6.35.orig/fs/reiser4/txnmgr.c linux-2.6.35/fs/reiser4/txnmgr.c
70412 --- linux-2.6.35.orig/fs/reiser4/txnmgr.c       1970-01-01 01:00:00.000000000 +0100
70413 +++ linux-2.6.35/fs/reiser4/txnmgr.c    2010-08-04 15:44:57.000000000 +0200
70414 @@ -0,0 +1,3165 @@
70415 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70416 + * reiser4/README */
70417 +
70418 +/* Joshua MacDonald wrote the first draft of this code. */
70419 +
70420 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70421 +filesystem scales only as well as its worst locking design.  You need to
70422 +substantially restructure this code. Josh was not as experienced a programmer
70423 +as you.  Particularly review how the locking style differs from what you did
70424 +for znodes using hi-lo priority locking, and present to me an opinion on
70425 +whether the differences are well founded.  */
70426 +
70427 +/* I cannot help but to disagree with the sentiment above. Locking of
70428 + * transaction manager is _not_ badly designed, and, at the very least, is not
70429 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70430 + * locking on znodes, especially on the root node of the tree. --nikita,
70431 + * 2003.10.13 */
70432 +
70433 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles.  The
70434 +   txnmgr processes capture_block requests and manages the relationship between jnodes and
70435 +   atoms through the various stages of a transcrash, and it also oversees the fusion and
70436 +   capture-on-copy processes.  The main difficulty with this task is maintaining a
70437 +   deadlock-free lock ordering between atoms and jnodes/handles.  The reason for the
70438 +   difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70439 +   must be broken.  The main requirement is that atom-fusion be deadlock free, so once you
70440 +   hold the atom_lock you may then wait to acquire any jnode or handle lock.  This implies
70441 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
70442 +   atom, you must use trylock() and possibly reverse the order.
70443 +
70444 +   This code implements the design documented at:
70445 +
70446 +     http://namesys.com/txn-doc.html
70447 +
70448 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70449 +above document and reference the new.  Be sure to provide some credit to Josh.  I already have some writings on this
70450 +topic in v4.html, but they are lacking in details present in the above.  Cure that.  Remember to write for the bright 12
70451 +year old --- define all technical terms used.
70452 +
70453 +*/
70454 +
70455 +/* Thoughts on the external transaction interface:
70456 +
70457 +   In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70458 +   creates state that lasts for the duration of a system call and is called at the start
70459 +   of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70460 +   occupying the scope of a single system call.  We wish to give certain applications an
70461 +   interface to begin and close (commit) transactions.  Since our implementation of
70462 +   transactions does not yet support isolation, allowing an application to open a
70463 +   transaction implies trusting it to later close the transaction.  Part of the
70464 +   transaction interface will be aimed at enabling that trust, but the interface for
70465 +   actually using transactions is fairly narrow.
70466 +
70467 +   BEGIN_TRANSCRASH: Returns a transcrash identifier.  It should be possible to translate
70468 +   this identifier into a string that a shell-script could use, allowing you to start a
70469 +   transaction by issuing a command.  Once open, the transcrash should be set in the task
70470 +   structure, and there should be options (I suppose) to allow it to be carried across
70471 +   fork/exec.  A transcrash has several options:
70472 +
70473 +     - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70474 +     on writes (WRITE_FUSING) and allow "dirty reads".  If the application wishes to
70475 +     capture on reads as well, it should set READ_FUSING.
70476 +
70477 +     - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70478 +     eventually close (or else the machine must crash).  If the application dies an
70479 +     unexpected death with an open transcrash, for example, or if it hangs for a long
70480 +     duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70481 +     This is a dangerous option, but it is one way to solve the problem until isolated
70482 +     transcrashes are available for untrusted applications.
70483 +
70484 +     It seems to be what databases do, though it is unclear how one avoids a DoS attack
70485 +     creating a vulnerability based on resource starvation.  Guaranteeing that some
70486 +     minimum amount of computational resources are made available would seem more correct
70487 +     than guaranteeing some amount of time.  When we again have someone to code the work,
70488 +     this issue should be considered carefully.  -Hans
70489 +
70490 +   RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70491 +   many dirty blocks it expects.  The reserve_blocks interface should be called at a point
70492 +   where it is safe for the application to fail, because the system may not be able to
70493 +   grant the allocation and the application must be able to back-out.  For this reason,
70494 +   the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70495 +   the application may also wish to extend the allocation after beginning its transcrash.
70496 +
70497 +   CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70498 +   modifications that require transaction protection.  When isolated transactions are
70499 +   supported the CLOSE operation is replaced by either COMMIT or ABORT.  For example, if a
70500 +   RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70501 +   CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70502 +   why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70503 +
70504 +   For actually implementing these out-of-system-call-scoped transcrashes, the
70505 +   reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70506 +   transcrash.  Currently there are no dynamically-allocated transcrashes, but there is a
70507 +   "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70508 +*/
70509 +
70510 +/* Extending the other system call interfaces for future transaction features:
70511 +
70512 +   Specialized applications may benefit from passing flags to the ordinary system call
70513 +   interface such as read(), write(), or stat().  For example, the application specifies
70514 +   WRITE_FUSING by default but wishes to add that a certain read() command should be
70515 +   treated as READ_FUSING.  But which read?  Is it the directory-entry read, the stat-data
70516 +   read, or the file-data read?  These issues are straight-forward, but there are a lot of
70517 +   them and adding the necessary flags-passing code will be tedious.
70518 +
70519 +   When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70520 +   flag, which specifies that although it is a read operation being requested, a
70521 +   write-lock should be taken.  The reason is that read-locks are shared while write-locks
70522 +   are exclusive, so taking a read-lock when a later-write is known in advance will often
70523 +   lead to deadlock.  If a reader knows it will write later, it should issue read
70524 +   requests with the RMW flag set.
70525 +*/
70526 +
70527 +/*
70528 +   The znode/atom deadlock avoidance.
70529 +
70530 +   FIXME(Zam): writing of this comment is in progress.
70531 +
70532 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
70533 +   atom locking, which makes the reiser4 locking scheme more complex.  It had
70534 +   deadlocks until we implemented deadlock avoidance algorithms.  Those deadlocks
70535 +   looked like this: one stopped thread waits for a long-term lock on a znode,
70536 +   while the thread that owns that lock waits until fusion with another atom is
70537 +   allowed.
70538 +
70539 +   The source of the deadlocks is an optimization of not capturing index nodes
70540 +   for read.  Let's prove it.  Suppose we have dumb node capturing scheme which
70541 +   unconditionally captures each block before locking it.
70542 +
70543 +   That scheme has no deadlocks.  Let's begin with a thread whose stage is
70544 +   ASTAGE_CAPTURE_WAIT and which waits for a znode lock.  The thread can't wait
70545 +   for a capture because its stage allows fusion with any atom except those
70546 +   being committed currently.  The process of atom commit can't deadlock because
70547 +   the atom commit procedure does not acquire locks and does not fuse with other
70548 +   atoms.  Reiser4 does capturing right before going to sleep inside the
70549 +   longterm_lock_znode() function, which means the znode we want to lock is
70550 +   already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.  If we
70551 +   continue the analysis we see that no process in the sequence can be
70552 +   waiting for atom fusion.  Thereby there are no deadlocks of the described kind.
70553 +
70554 +   The capturing optimization makes the deadlocks possible.  A thread can wait for
70555 +   a lock whose owner did not capture that node.  The lock owner's current atom
70556 +   is not fused with the first atom and it does not get the ASTAGE_CAPTURE_WAIT
70557 +   state.  A deadlock is possible when that atom meets another one which is in
70558 +   ASTAGE_CAPTURE_WAIT already.
70559 +
70560 +   The deadlock avoidance scheme includes two algorithms:
70561 +
70562 +   The first algorithm is used when a thread captures a node which is locked but
70563 +   not captured by another thread.  Those nodes are marked MISSED_IN_CAPTURE at
70564 +   the moment we skip their capturing.  If such a node (marked MISSED_IN_CAPTURE)
70565 +   is being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the
70566 +   routine which forces all lock owners to join the current atom is executed.
70567 +
70568 +   The second algorithm does not allow skipping capture of already captured nodes.
70569 +
70570 +   Both algorithms together prevent waiting for a longterm lock without atom
70571 +   fusion with the atoms of all lock owners; such waiting is the key ingredient of
70572 +   atom/znode locking deadlocks.
70573 +*/
70574 +
70575 +/*
70576 + * Transactions and mmap(2).
70577 + *
70578 + *     1. Transactions are not supported for accesses through mmap(2), because
70579 + *     this would effectively amount to user-level transactions whose duration
70580 + *     is beyond control of the kernel.
70581 + *
70582 + *     2. That said, we still want to preserve some decency with regard to
70583 + *     mmap(2). During normal write(2) call, following sequence of events
70584 + *     happens:
70585 + *
70586 + *         1. page is created;
70587 + *
70588 + *         2. jnode is created, dirtied and captured into current atom.
70589 + *
70590 + *         3. extent is inserted and modified.
70591 + *
70592 + *     Steps (2) and (3) take place under long term lock on the twig node.
70593 + *
70594 + *     When file is accessed through mmap(2) page is always created during
70595 + *     page fault.
70596 + *     After this (in reiser4_readpage()->reiser4_readpage_extent()):
70597 + *
70598 + *         1. if access is made to non-hole page new jnode is created, (if
70599 + *         necessary)
70600 + *
70601 + *         2. if access is made to the hole page, jnode is not created (XXX
70602 + *         not clear why).
70603 + *
70604 + *     Also, even if page is created by write page fault it is not marked
70605 + *     dirty immediately by handle_mm_fault(). Probably this is to avoid races
70606 + *     with page write-out.
70607 + *
70608 + *     Dirty bit installed by hardware is only transferred to the struct page
70609 + *     later, when page is unmapped (in zap_pte_range(), or
70610 + *     try_to_unmap_one()).
70611 + *
70612 + *     So, with mmap(2) we have to handle following irksome situations:
70613 + *
70614 + *         1. there exists modified page (clean or dirty) without jnode
70615 + *
70616 + *         2. there exists modified page (clean or dirty) with clean jnode
70617 + *
70618 + *         3. clean page which is a part of atom can be transparently modified
70619 + *         at any moment through mapping without becoming dirty.
70620 + *
70621 + *     (1) and (2) can lead to the out-of-memory situation: ->writepage()
70622 + *     doesn't know what to do with such pages and ->sync_sb()/->writepages()
70623 + *     don't see them, because these methods operate on atoms.
70624 + *
70625 + *     (3) can lead to the loss of data: suppose we have dirty page with dirty
70626 + *     captured jnode captured by some atom. As part of early flush (for
70627 + *     example) page was written out. Dirty bit was cleared on both page and
70628 + *     jnode. After this page is modified through mapping, but kernel doesn't
70629 + *     notice and just discards page and jnode as part of commit. (XXX
70630 + *     actually it doesn't, because to reclaim page ->releasepage() has to be
70631 + *     called and before this dirty bit will be transferred to the struct
70632 + *     page).
70633 + *
70634 + */
70635 +
70636 +#include "debug.h"
70637 +#include "txnmgr.h"
70638 +#include "jnode.h"
70639 +#include "znode.h"
70640 +#include "block_alloc.h"
70641 +#include "tree.h"
70642 +#include "wander.h"
70643 +#include "ktxnmgrd.h"
70644 +#include "super.h"
70645 +#include "page_cache.h"
70646 +#include "reiser4.h"
70647 +#include "vfs_ops.h"
70648 +#include "inode.h"
70649 +#include "flush.h"
70650 +
70651 +#include <asm/atomic.h>
70652 +#include <linux/types.h>
70653 +#include <linux/fs.h>
70654 +#include <linux/mm.h>
70655 +#include <linux/slab.h>
70656 +#include <linux/pagemap.h>
70657 +#include <linux/writeback.h>
70658 +#include <linux/swap.h>                /* for totalram_pages */
70659 +
70660 +static void atom_free(txn_atom * atom);
70661 +
70662 +static int commit_txnh(txn_handle * txnh);
70663 +
70664 +static void wakeup_atom_waitfor_list(txn_atom * atom);
70665 +static void wakeup_atom_waiting_list(txn_atom * atom);
70666 +
70667 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70668 +
70669 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70670 +
70671 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70672 +
70673 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
70674 +                              txn_capture mode);
70675 +
70676 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70677 +
70678 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
70679 +
70680 +void reiser4_invalidate_list(struct list_head *);
70681 +
70682 +/* GENERIC STRUCTURES */
70683 +
70684 +typedef struct _txn_wait_links txn_wait_links;
70685 +
70686 +struct _txn_wait_links {
70687 +       lock_stack *_lock_stack;
70688 +       struct list_head _fwaitfor_link;
70689 +       struct list_head _fwaiting_link;
70690 +       int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70691 +       int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70692 +};
70693 +
70694 +/* FIXME: In theory, we should be using the slab cache init & destructor
70695 +   methods instead of, e.g., jnode_init, etc. */
70696 +static struct kmem_cache *_atom_slab = NULL;
70697 +/* this is for user-visible, cross system-call transactions. */
70698 +static struct kmem_cache *_txnh_slab = NULL;
70699 +
70700 +/**
70701 + * init_txnmgr_static - create transaction manager slab caches
70702 + *
70703 + * Creates the "txn_atom" and "txn_handle" slab caches. It is part of reiser4
70704 + * module initialization. Returns 0, or -ENOMEM if either cache is not created.
70705 + */
70706 +int init_txnmgr_static(void)
70707 +{
70708 +       assert("jmacd-600", _atom_slab == NULL);
70709 +       assert("jmacd-601", _txnh_slab == NULL);
70710 +
70711 +       ON_DEBUG(atomic_set(&flush_cnt, 0));
70712 +
70713 +       _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70714 +                                      SLAB_HWCACHE_ALIGN |
70715 +                                      SLAB_RECLAIM_ACCOUNT, NULL);
70716 +       if (_atom_slab == NULL)
70717 +               return RETERR(-ENOMEM);
70718 +
70719 +       _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70720 +                             SLAB_HWCACHE_ALIGN, NULL);
70721 +       if (_txnh_slab == NULL) {
70722 +               kmem_cache_destroy(_atom_slab);
70723 +               _atom_slab = NULL;
70724 +               return RETERR(-ENOMEM);
70725 +       }
70726 +
70727 +       return 0;
70728 +}
70729 +
70730 +/**
70731 + * done_txnmgr_static - delete txn_atom and txn_handle caches
70732 + *
70733 + * This is called on reiser4 module unloading or system shutdown.
70734 + */
70735 +void done_txnmgr_static(void)
70736 +{
70737 +       destroy_reiser4_cache(&_atom_slab);
70738 +       destroy_reiser4_cache(&_txnh_slab);
70739 +}
70740 +
70741 +/**
70742 + * reiser4_init_txnmgr - initialize a new transaction manager
70743 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70744 + *
70745 + * This is called on mount. Resets the atom counters, the atom list and the locks.
70746 + */
70747 +void reiser4_init_txnmgr(txn_mgr *mgr)
70748 +{
70749 +       assert("umka-169", mgr != NULL);
70750 +
70751 +       mgr->atom_count = 0;
70752 +       mgr->id_count = 1;
70753 +       INIT_LIST_HEAD(&mgr->atoms_list);
70754 +       spin_lock_init(&mgr->tmgr_lock);
70755 +       mutex_init(&mgr->commit_mutex);
70756 +}
70757 +
70758 +/**
70759 + * reiser4_done_txnmgr - stop transaction manager
70760 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70761 + *
70762 + * This is called on umount. Does sanity checks.
70763 + */
70764 +void reiser4_done_txnmgr(txn_mgr *mgr)
70765 +{
70766 +       assert("umka-170", mgr != NULL);
70767 +       assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70768 +       assert("umka-1702", mgr->atom_count == 0);
70769 +}
70770 +
70771 +/* Initialize a transaction handle. */
70772 +/* Audited by: umka (2002.06.13) */
70773 +static void txnh_init(txn_handle * txnh, txn_mode mode)
70774 +{
70775 +       assert("umka-171", txnh != NULL);
70776 +
70777 +       txnh->mode = mode;
70778 +       txnh->atom = NULL;
70779 +       reiser4_ctx_gfp_mask_set();
70780 +       txnh->flags = 0;
70781 +       spin_lock_init(&txnh->hlock);
70782 +       INIT_LIST_HEAD(&txnh->txnh_link);
70783 +}
70784 +
70785 +#if REISER4_DEBUG
70786 +/* Check if a transaction handle is clean. */
70787 +static int txnh_isclean(txn_handle * txnh)
70788 +{
70789 +       assert("umka-172", txnh != NULL);
70790 +       return txnh->atom == NULL &&
70791 +               LOCK_CNT_NIL(spin_locked_txnh);
70792 +}
70793 +#endif
70794 +
70795 +/* Initialize an atom. */
70796 +static void atom_init(txn_atom * atom)
70797 +{
70798 +       int level;
70799 +
70800 +       assert("umka-173", atom != NULL);
70801 +
70802 +       memset(atom, 0, sizeof(txn_atom));
70803 +
70804 +       atom->stage = ASTAGE_FREE;
70805 +       atom->start_time = jiffies;
70806 +
70807 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
70808 +               INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
70809 +
70810 +       INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70811 +       INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70812 +       INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70813 +       INIT_LIST_HEAD(&atom->inodes);
70814 +       spin_lock_init(&(atom->alock));
70815 +       /* list of transaction handles */
70816 +       INIT_LIST_HEAD(&atom->txnh_list);
70817 +       /* link to transaction manager's list of atoms */
70818 +       INIT_LIST_HEAD(&atom->atom_link);
70819 +       INIT_LIST_HEAD(&atom->fwaitfor_list);
70820 +       INIT_LIST_HEAD(&atom->fwaiting_list);
70821 +       blocknr_set_init(&atom->delete_set);
70822 +       blocknr_set_init(&atom->wandered_map);
70823 +
70824 +       init_atom_fq_parts(atom);
70825 +}
70826 +
70827 +#if REISER4_DEBUG
70828 +/* Check if an atom is clean. */
70829 +static int atom_isclean(txn_atom * atom)
70830 +{
70831 +       int level;
70832 +
70833 +       assert("umka-174", atom != NULL);
70834 +
70835 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
70836 +               if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
70837 +                       return 0;
70838 +               }
70839 +       }
70840 +
70841 +       return  atom->stage == ASTAGE_FREE &&
70842 +               atom->txnh_count == 0 &&
70843 +               atom->capture_count == 0 &&
70844 +               atomic_read(&atom->refcount) == 0 &&
70845 +               (&atom->atom_link == atom->atom_link.next &&
70846 +                &atom->atom_link == atom->atom_link.prev) &&
70847 +               list_empty_careful(&atom->txnh_list) &&
70848 +               list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70849 +               list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70850 +               list_empty_careful(ATOM_WB_LIST(atom)) &&
70851 +               list_empty_careful(&atom->fwaitfor_list) &&
70852 +               list_empty_careful(&atom->fwaiting_list) &&
70853 +               atom_fq_parts_are_clean(atom);
70854 +}
70855 +#endif
70856 +
70857 +/* Begin a transaction in this context.  Currently this uses the reiser4_context's
70858 +   trans_in_ctx, which means that transaction handles are stack-allocated.  Eventually
70859 +   this will be extended to allow transaction handles to span several contexts. */
70860 +/* Audited by: umka (2002.06.13) */
70861 +void reiser4_txn_begin(reiser4_context * context)
70862 +{
70863 +       assert("jmacd-544", context->trans == NULL);
70864 +
70865 +       context->trans = &context->trans_in_ctx;
70866 +
70867 +       /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
70868 +          transcrash.  Default should be TXN_WRITE_FUSING.  Also, the _trans variable is
70869 +          stack allocated right now, but we would like to allow for dynamically allocated
70870 +          transcrashes that span multiple system calls.
70871 +        */
70872 +       txnh_init(context->trans, TXN_WRITE_FUSING);
70873 +}
70874 +
70875 +/* Finish a transaction handle context: commit the handle if it has an atom.  Returns commit_txnh()'s result, or 0. */
70876 +int reiser4_txn_end(reiser4_context * context)
70877 +{
70878 +       int ret = 0;
70879 +       txn_handle *txnh;
70880 +
70881 +       assert("umka-283", context != NULL);
70882 +       assert("nikita-3012", reiser4_schedulable());
70883 +       assert("vs-24", context == get_current_context());
70884 +       assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
70885 +
70886 +       txnh = context->trans;
70887 +       if (txnh != NULL) {
70888 +               if (txnh->atom != NULL)
70889 +                       ret = commit_txnh(txnh);
70890 +               assert("jmacd-633", txnh_isclean(txnh));
70891 +               context->trans = NULL;
70892 +       }
70893 +       return ret;
70894 +}
70895 +
70896 +void reiser4_txn_restart(reiser4_context * context)
70897 +{
70898 +       reiser4_txn_end(context);
70899 +       reiser4_preempt_point();
70900 +       reiser4_txn_begin(context);
70901 +}
70902 +
70903 +void reiser4_txn_restart_current(void)
70904 +{
70905 +       reiser4_txn_restart(get_current_context());
70906 +}
70907 +
70908 +/* TXN_ATOM */
70909 +
70910 +/* Get the atom belonging to a txnh, which is not locked.  Return txnh locked. Locks atom, if atom
70911 +   is not NULL.  This performs the necessary spin_trylock to break the lock-ordering cycle.  May
70912 +   return NULL. */
70913 +static txn_atom *txnh_get_atom(txn_handle * txnh)
70914 +{
70915 +       txn_atom *atom;
70916 +
70917 +       assert("umka-180", txnh != NULL);
70918 +       assert_spin_not_locked(&(txnh->hlock));
70919 +
70920 +       while (1) {
70921 +               spin_lock_txnh(txnh);
70922 +               atom = txnh->atom;
70923 +
70924 +               if (atom == NULL)
70925 +                       break;
70926 +
70927 +               if (spin_trylock_atom(atom))
70928 +                       break;
70929 +
70930 +               atomic_inc(&atom->refcount);
70931 +
70932 +               spin_unlock_txnh(txnh);
70933 +               spin_lock_atom(atom);
70934 +               spin_lock_txnh(txnh);
70935 +
70936 +               if (txnh->atom == atom) {
70937 +                       atomic_dec(&atom->refcount);
70938 +                       break;
70939 +               }
70940 +
70941 +               spin_unlock_txnh(txnh);
70942 +               atom_dec_and_unlock(atom);
70943 +       }
70944 +
70945 +       return atom;
70946 +}
70947 +
70948 +/* Get the current atom and spinlock it if current atom present. May return NULL  */
70949 +txn_atom *get_current_atom_locked_nocheck(void)
70950 +{
70951 +       reiser4_context *cx;
70952 +       txn_atom *atom;
70953 +       txn_handle *txnh;
70954 +
70955 +       cx = get_current_context();
70956 +       assert("zam-437", cx != NULL);
70957 +
70958 +       txnh = cx->trans;
70959 +       assert("zam-435", txnh != NULL);
70960 +
70961 +       atom = txnh_get_atom(txnh);
70962 +
70963 +       spin_unlock_txnh(txnh);
70964 +       return atom;
70965 +}
70966 +
70967 +/* Get the atom belonging to a jnode, which is initially locked.  Return with
70968 +   both jnode and atom locked.  This performs the necessary spin_trylock to
70969 +   break the lock-ordering cycle.  Assumes the jnode is already locked, and
70970 +   returns NULL (with only the jnode locked) if atom is not set. */
70971 +txn_atom *jnode_get_atom(jnode * node)
70972 +{
70973 +       txn_atom *atom;
70974 +
70975 +       assert("umka-181", node != NULL);
70976 +
70977 +       while (1) {
70978 +               assert_spin_locked(&(node->guard));
70979 +
70980 +               atom = node->atom;
70981 +               /* node is not in any atom */
70982 +               if (atom == NULL)
70983 +                       break;
70984 +
70985 +               /* If atom is not locked, grab the lock and return */
70986 +               if (spin_trylock_atom(atom))
70987 +                       break;
70988 +
70989 +               /* At least one jnode belongs to this atom, which guarantees that
70990 +                * atom->refcount > 0, so we can safely increment refcount. */
70991 +               atomic_inc(&atom->refcount);
70992 +               spin_unlock_jnode(node);
70993 +
70994 +               /* re-acquire spin locks in the right order */
70995 +               spin_lock_atom(atom);
70996 +               spin_lock_jnode(node);
70997 +
70998 +               /* check if node still points to the same atom. */
70999 +               if (node->atom == atom) {
71000 +                       atomic_dec(&atom->refcount);
71001 +                       break;
71002 +               }
71003 +
71004 +               /* releasing of atom lock and reference requires not holding
71005 +                * locks on jnodes.  */
71006 +               spin_unlock_jnode(node);
71007 +
71008 +               /* We cannot be sure that this atom has references other than
71009 +                * ours, so we must call the proper function, which may free the
71010 +                * atom if the last reference is released. */
71011 +               atom_dec_and_unlock(atom);
71012 +
71013 +               /* lock jnode again for getting valid node->atom pointer
71014 +                * value. */
71015 +               spin_lock_jnode(node);
71016 +       }
71017 +
71018 +       return atom;
71019 +}
71020 +
71021 +/* Returns true if @check is dirty and belongs to the same atom as @node (and, for znodes,
71022 +   is connected).  If @alloc_check is set, jnode_is_flushprepped(@check) must also equal
71023 +   @alloc_value.  Used by flush code to decide whether a neighbor is suitable for flushing. */
71024 +int
71025 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
71026 +{
71027 +       int compat;
71028 +       txn_atom *atom;
71029 +
71030 +       assert("umka-182", node != NULL);
71031 +       assert("umka-183", check != NULL);
71032 +
71033 +       /* Not sure what this function is supposed to do if supplied with @check that is
71034 +          neither formatted nor unformatted (bitmap or so). */
71035 +       assert("nikita-2373", jnode_is_znode(check)
71036 +              || jnode_is_unformatted(check));
71037 +
71038 +       /* Need a lock on CHECK to get its atom and to check various state bits.
71039 +          Don't need a lock on NODE once we get the atom lock. */
71040 +       /* It is not enough to lock two nodes and check (node->atom ==
71041 +          check->atom) because atom could be locked and being fused at that
71042 +          moment, jnodes of the atom of that state (being fused) can point to
71043 +          different objects, but the atom is the same. */
71044 +       spin_lock_jnode(check);
71045 +
71046 +       atom = jnode_get_atom(check);
71047 +
71048 +       if (atom == NULL) {
71049 +               compat = 0;
71050 +       } else {
71051 +               compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
71052 +
71053 +               if (compat && jnode_is_znode(check)) {
71054 +                       compat &= znode_is_connected(JZNODE(check));
71055 +               }
71056 +
71057 +               if (compat && alloc_check) {
71058 +                       compat &= (alloc_value == jnode_is_flushprepped(check));
71059 +               }
71060 +
71061 +               spin_unlock_atom(atom);
71062 +       }
71063 +
71064 +       spin_unlock_jnode(check);
71065 +
71066 +       return compat;
71067 +}
71068 +
71069 +/* Decrement the atom's reference count and, if it falls to zero, free it.  Called with the atom lock held; the lock is released on return. */
71070 +void atom_dec_and_unlock(txn_atom * atom)
71071 +{
71072 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71073 +
71074 +       assert("umka-186", atom != NULL);
71075 +       assert_spin_locked(&(atom->alock));
71076 +       assert("zam-1039", atomic_read(&atom->refcount) > 0);
71077 +
71078 +       if (atomic_dec_and_test(&atom->refcount)) {
71079 +               /* take txnmgr lock and atom lock in proper order. */
71080 +               if (!spin_trylock_txnmgr(mgr)) {
71081 +                       /* This atom should exist after we re-acquire its
71082 +                        * spinlock, so we increment its reference counter. */
71083 +                       atomic_inc(&atom->refcount);
71084 +                       spin_unlock_atom(atom);
71085 +                       spin_lock_txnmgr(mgr);
71086 +                       spin_lock_atom(atom);
71087 +
71088 +                       if (!atomic_dec_and_test(&atom->refcount)) {
71089 +                               spin_unlock_atom(atom);
71090 +                               spin_unlock_txnmgr(mgr);
71091 +                               return;
71092 +                       }
71093 +               }
71094 +               assert_spin_locked(&(mgr->tmgr_lock));
71095 +               atom_free(atom);
71096 +               spin_unlock_txnmgr(mgr);
71097 +       } else
71098 +               spin_unlock_atom(atom);
71099 +}
71100 +
71101 +/* Create new atom and connect it to given transaction handle.  This adds the
71102 +   atom to the transaction manager's list and sets its reference count to 1, an
71103 +   artificial reference which is kept until it commits.  We play strange games
71104 +   to avoid allocation under jnode & txnh spinlocks.  Returns RETERR(-ENOMEM) if
71105 +   the atom cannot be allocated and -E_REPEAT in every other case. */
71106 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
71107 +{
71108 +       txn_atom *atom;
71109 +       txn_mgr *mgr;
71110 +
71111 +       if (REISER4_DEBUG && rofs_tree(current_tree)) {
71112 +               warning("nikita-3366", "Creating atom on rofs");
71113 +               dump_stack();
71114 +       }
71115 +
71116 +       if (*atom_alloc == NULL) {
71117 +               (*atom_alloc) = kmem_cache_alloc(_atom_slab,
71118 +                                                reiser4_ctx_gfp_mask_get());
71119 +
71120 +               if (*atom_alloc == NULL)
71121 +                       return RETERR(-ENOMEM);
71122 +       }
71123 +
71124 +       /* and, also, txnmgr spin lock should be taken before jnode and txnh
71125 +          locks. */
71126 +       mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71127 +       spin_lock_txnmgr(mgr);
71128 +       spin_lock_txnh(txnh);
71129 +
71130 +       /* Check whether new atom still needed */
71131 +       if (txnh->atom != NULL) {
71132 +               /* NOTE-NIKITA probably it is rather better to free
71133 +                * atom_alloc here than thread it up to reiser4_try_capture() */
71134 +
71135 +               spin_unlock_txnh(txnh);
71136 +               spin_unlock_txnmgr(mgr);
71137 +
71138 +               return -E_REPEAT;
71139 +       }
71140 +
71141 +       atom = *atom_alloc;
71142 +       *atom_alloc = NULL;
71143 +
71144 +       atom_init(atom);
71145 +
71146 +       assert("jmacd-17", atom_isclean(atom));
71147 +
71148 +        /*
71149 +        * lock ordering is broken here. It is ok, as long as @atom is new
71150 +        * and inaccessible for others. We can't use spin_lock_atom or
71151 +        * spin_lock(&atom->alock) because they care about locking
71152 +        * dependencies. spin_trylock_atom doesn't.
71153 +        */
71154 +       check_me("", spin_trylock_atom(atom));
71155 +
71156 +       /* add atom to the end of transaction manager's list of atoms */
71157 +       list_add_tail(&atom->atom_link, &mgr->atoms_list);
71158 +       atom->atom_id = mgr->id_count++;
71159 +       mgr->atom_count += 1;
71160 +
71161 +       /* Release txnmgr lock */
71162 +       spin_unlock_txnmgr(mgr);
71163 +
71164 +       /* One reference until it commits. */
71165 +       atomic_inc(&atom->refcount);
71166 +       atom->stage = ASTAGE_CAPTURE_FUSE;
71167 +       atom->super = reiser4_get_current_sb();
71168 +       capture_assign_txnh_nolock(atom, txnh);
71169 +
71170 +       spin_unlock_atom(atom);
71171 +       spin_unlock_txnh(txnh);
71172 +
71173 +       return -E_REPEAT;
71174 +}
71175 +
71176 +/* Return true if an atom is currently "open". */
71177 +static int atom_isopen(const txn_atom * atom)
71178 +{
71179 +       assert("umka-185", atom != NULL);
71180 +
71181 +       return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
71182 +}
71183 +
71184 +/* Return the number of pointers to this atom that must be updated during fusion.  This
71185 +   approximates the amount of work to be done.  Fusion chooses the atom with fewer
71186 +   pointers to fuse into the atom with more pointers. */
71187 +static int atom_pointer_count(const txn_atom * atom)
71188 +{
71189 +       assert("umka-187", atom != NULL);
71190 +
71191 +       /* This is a measure of the amount of work needed to fuse this atom
71192 +        * into another. */
71193 +       return atom->txnh_count + atom->capture_count;
71194 +}
71195 +
71196 +/* Called holding both the transaction manager lock and the atom lock: removes the atom
71197 +   from the transaction manager list, releases the atom lock and frees the atom. */
71198 +static void atom_free(txn_atom * atom)
71199 +{
71200 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
71201 +
71202 +       assert("umka-188", atom != NULL);
71203 +       assert_spin_locked(&(atom->alock));
71204 +
71205 +       /* Remove from the txn_mgr's atom list */
71206 +       assert_spin_locked(&(mgr->tmgr_lock));
71207 +       mgr->atom_count -= 1;
71208 +       list_del_init(&atom->atom_link);
71209 +
71210 +       /* Clean the atom */
71211 +       assert("jmacd-16",
71212 +              (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
71213 +       atom->stage = ASTAGE_FREE;
71214 +
71215 +       blocknr_set_destroy(&atom->delete_set);
71216 +       blocknr_set_destroy(&atom->wandered_map);
71217 +
71218 +       assert("jmacd-16", atom_isclean(atom));
71219 +
71220 +       spin_unlock_atom(atom);
71221 +
71222 +       kmem_cache_free(_atom_slab, atom);
71223 +}
71224 +
71225 +static int atom_is_dotard(const txn_atom * atom)
71226 +{
71227 +       return time_after(jiffies, atom->start_time +
71228 +                         get_current_super_private()->tmgr.atom_max_age);
71229 +}
71230 +
71231 +static int atom_can_be_committed(txn_atom * atom)
71232 +{
71233 +       assert_spin_locked(&(atom->alock));
71234 +       assert("zam-885", atom->txnh_count > atom->nr_waiters);
71235 +       return atom->txnh_count == atom->nr_waiters + 1;
71236 +}
71237 +
71238 +/* Return true if an atom should commit now.  This is determined by aging, atom
71239 +   size or atom flags. */
71240 +static int atom_should_commit(const txn_atom * atom)
71241 +{
71242 +       assert("umka-189", atom != NULL);
71243 +       return
71244 +           (atom->flags & ATOM_FORCE_COMMIT) ||
71245 +           ((unsigned)atom_pointer_count(atom) >
71246 +            get_current_super_private()->tmgr.atom_max_size)
71247 +           || atom_is_dotard(atom);
71248 +}
71249 +
71250 +/* return 1 if current atom exists and requires commit. */
71251 +int current_atom_should_commit(void)
71252 +{
71253 +       txn_atom *atom;
71254 +       int result = 0;
71255 +
71256 +       atom = get_current_atom_locked_nocheck();
71257 +       if (atom) {
71258 +               result = atom_should_commit(atom);
71259 +               spin_unlock_atom(atom);
71260 +       }
71261 +       return result;
71262 +}
71263 +
71264 +static int atom_should_commit_asap(const txn_atom * atom)
71265 +{
71266 +       unsigned int captured;
71267 +       unsigned int pinnedpages;
71268 +
71269 +       assert("nikita-3309", atom != NULL);
71270 +
71271 +       captured = (unsigned)atom->capture_count;
71272 +       pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
71273 +
71274 +       return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
71275 +}
71276 +
71277 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71278 +{
71279 +       jnode *first_dirty;
71280 +
71281 +       list_for_each_entry(first_dirty, head, capture_link) {
71282 +               if (!(flags & JNODE_FLUSH_COMMIT)) {
71283 +                       /*
71284 +                        * skip jnodes which "heard banshee" or having active
71285 +                        * I/O
71286 +                        */
71287 +                       if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
71288 +                           JF_ISSET(first_dirty, JNODE_WRITEBACK))
71289 +                               continue;
71290 +               }
71291 +               return first_dirty;
71292 +       }
71293 +       return NULL;
71294 +}
71295 +
71296 +/* Get the first suitable dirty node from the atom's per-level dirty lists (levels 1 and up
71297 +   first, then list #0); return NULL if none is found.  Called with the atom lock held. */
71298 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71299 +{
71300 +       jnode *first_dirty;
71301 +       tree_level level;
71302 +
71303 +       assert_spin_locked(&(atom->alock));
71304 +
71305 +       /* The flush starts from LEAF_LEVEL (=1). */
71306 +       for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
71307 +               if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
71308 +                       continue;
71309 +
71310 +               first_dirty =
71311 +                   find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
71312 +                                            flags);
71313 +               if (first_dirty)
71314 +                       return first_dirty;
71315 +       }
71316 +
71317 +       /* znode-above-root is on the list #0. */
71318 +       return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71319 +}
71320 +
71321 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq) /* walk @atom's writeback list: queue dirty nodes on @fq, move clean ones to the atom's clean list */
71322 +{
71323 +       jnode *cur;
71324 +
71325 +       assert("zam-905", atom_is_protected(atom));
71326 +
71327 +       cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
71328 +       while (ATOM_WB_LIST(atom) != &cur->capture_link) {
71329 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link); /* fetched up front because cur may be taken off this list below */
71330 +
71331 +               spin_lock_jnode(cur);
71332 +               if (!JF_ISSET(cur, JNODE_WRITEBACK)) { /* nodes still marked JNODE_WRITEBACK are left on the list untouched */
71333 +                       if (JF_ISSET(cur, JNODE_DIRTY)) {
71334 +                               queue_jnode(fq, cur);
71335 +                       } else {
71336 +                               /* move jnode to atom's clean list */
71337 +                               list_move_tail(&cur->capture_link,
71338 +                                             ATOM_CLEAN_LIST(atom));
71339 +                       }
71340 +               }
71341 +               spin_unlock_jnode(cur);
71342 +
71343 +               cur = next;
71344 +       }
71345 +}
71346 +
71347 +/* Scan the current atom's writeback list (ATOM_WB_LIST), re-submit dirty and !writeback
71348 + * jnodes to disk. Returns PTR_ERR() of a failed get_fq_for_current_atom(), otherwise the reiser4_write_fq() result. */
71349 +static int submit_wb_list(void)
71350 +{
71351 +       int ret;
71352 +       flush_queue_t *fq;
71353 +
71354 +       fq = get_fq_for_current_atom();
71355 +       if (IS_ERR(fq))
71356 +               return PTR_ERR(fq);
71357 +
71358 +       dispatch_wb_list(fq->atom, fq);
71359 +       spin_unlock_atom(fq->atom); /* presumably get_fq_for_current_atom() returned with the atom locked -- confirm */
71360 +
71361 +       ret = reiser4_write_fq(fq, NULL, 1);
71362 +       reiser4_fq_put(fq); /* the queue is released whether or not the write succeeded */
71363 +
71364 +       return ret;
71365 +}
71366 +
71367 +/* Wait for completion of all writes, re-submitting the atom's writeback list before and after the wait. Returns 0 or the first error met. */
71368 +static int current_atom_complete_writes(void)
71369 +{
71370 +       int ret;
71371 +
71372 +       /* Each jnode from that list was modified and dirtied when it had i/o
71373 +        * request running already. After i/o completion we have to resubmit
71374 +        * them to disk again.*/
71375 +       ret = submit_wb_list();
71376 +       if (ret < 0) /* only negative values are treated as errors here; a positive result is not */
71377 +               return ret;
71378 +
71379 +       /* Wait all i/o completion */
71380 +       ret = current_atom_finish_all_fq();
71381 +       if (ret) /* any non-zero result aborts */
71382 +               return ret;
71383 +
71384 +       /* Scan wb list again; all i/o should be completed, we re-submit dirty
71385 +        * nodes to disk */
71386 +       ret = submit_wb_list();
71387 +       if (ret < 0)
71388 +               return ret;
71389 +
71390 +       /* Wait all nodes we just submitted */
71391 +       return current_atom_finish_all_fq();
71392 +}
71393 +
71394 +#if REISER4_DEBUG
71395 +
71396 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom) /* debug build: printk() a one-line summary of @atom's fields, labelled with @prefix */
71397 +{
71398 +       if (atom == NULL) { /* a NULL atom is tolerated and reported as such */
71399 +               printk("%s: no atom\n", prefix);
71400 +               return;
71401 +       }
71402 +
71403 +       printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71404 +              " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71405 +              atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71406 +              atom->txnh_count, atom->capture_count, atom->stage,
71407 +              atom->start_time, atom->flushed);
71408 +}
71409 +
71410 +#else  /*  REISER4_DEBUG  */
71411 +
71412 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {} /* non-debug build: empty stub so callers need no #if */
71413 +
71414 +#endif  /*  REISER4_DEBUG  */
71415 +
71416 +#define TOOMANYFLUSHES (1 << 13) /* retry count above which commit_current_atom() may emit its "Flushing like mad" warning */
71417 +
71418 +/* Called with the atom locked and no open "active" transaction handlers except
71419 +   ours, this function calls flush_current_atom() until all dirty nodes are
71420 +   processed.  Then it initiates commit processing.
71421 +
71422 +   Called by the single remaining open "active" txnh, which is closing. Other
71423 +   open txnhs belong to processes which wait atom commit in commit_txnh()
71424 +   routine. They are counted as "waiters" in atom->nr_waiters.  Therefore as
71425 +   long as we hold the atom lock none of the jnodes can be captured and/or
71426 +   locked.
71427 +
71428 +   @nr_submitted is handed to flush_current_atom() and reiser4_write_logs(); @atom is in/out: *atom is re-read after every -E_REPEAT from the flush.
71429 +   Return value is an error code if commit fails; -E_REPEAT is returned with the atom unlocked when atom_can_be_committed() fails. */
71430 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71431 +{
71432 +       reiser4_super_info_data *sbinfo = get_current_super_private();
71433 +       long ret = 0; /* NOTE(review): long, although the function returns int */
71434 +       /* how many times jnode_flush() was called as a part of attempt to
71435 +        * commit this atom. */
71436 +       int flushiters;
71437 +
71438 +       assert("zam-888", atom != NULL && *atom != NULL);
71439 +       assert_spin_locked(&((*atom)->alock));
71440 +       assert("zam-887", get_current_context()->trans->atom == *atom);
71441 +       assert("jmacd-151", atom_isopen(*atom));
71442 +
71443 +       assert("nikita-3184",
71444 +              get_current_super_private()->delete_mutex_owner != current);
71445 +
71446 +       for (flushiters = 0;; ++flushiters) { /* loop ends only when the flush returns something other than -E_REPEAT */
71447 +               ret =
71448 +                   flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71449 +                                      JNODE_FLUSH_COMMIT,
71450 +                                      LONG_MAX /* nr_to_write */ ,
71451 +                                      nr_submitted, atom, NULL);
71452 +               if (ret != -E_REPEAT)
71453 +                       break;
71454 +
71455 +               /* if atom's dirty list contains one znode which is
71456 +                  HEARD_BANSHEE and is locked we have to allow lock owner to
71457 +                  continue and uncapture that znode */
71458 +               reiser4_preempt_point();
71459 +
71460 +               *atom = get_current_atom_locked(); /* refresh the caller's pointer before retrying */
71461 +               if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) { /* IS_POW presumably tests for a power of two, which would rate-limit the warning -- confirm */
71462 +                       warning("nikita-3176",
71463 +                               "Flushing like mad: %i", flushiters);
71464 +                       reiser4_info_atom("atom", *atom);
71465 +                       DEBUGON(flushiters > (1 << 20));
71466 +               }
71467 +       }
71468 +
71469 +       if (ret) /* flush failed with an error other than -E_REPEAT */
71470 +               return ret;
71471 +
71472 +       assert_spin_locked(&((*atom)->alock));
71473 +
71474 +       if (!atom_can_be_committed(*atom)) {
71475 +               spin_unlock_atom(*atom);
71476 +               return RETERR(-E_REPEAT);
71477 +       }
71478 +
71479 +       if ((*atom)->capture_count == 0) /* nothing captured: skip the write-out and go straight to ASTAGE_DONE with the atom still locked */
71480 +               goto done;
71481 +
71482 +       /* Up to this point we have been flushing and after flush is called we
71483 +          return -E_REPEAT.  Now we can commit.  We cannot return -E_REPEAT
71484 +          at this point, commit should be successful. */
71485 +       reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71486 +       ON_DEBUG(((*atom)->committer = current));
71487 +       spin_unlock_atom(*atom); /* the steps below run without the atom lock; it is re-taken before "done" */
71488 +
71489 +       ret = current_atom_complete_writes();
71490 +       if (ret)
71491 +               return ret;
71492 +
71493 +       assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71494 +
71495 +       /* isolate critical code path which should be executed by only one
71496 +        * thread using tmgr mutex */
71497 +       mutex_lock(&sbinfo->tmgr.commit_mutex);
71498 +
71499 +       ret = reiser4_write_logs(nr_submitted);
71500 +       if (ret < 0) /* a failed log write is treated as fatal */
71501 +               reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71502 +
71503 +       /* The atom->ovrwr_nodes list is processed under commit mutex held
71504 +          because of bitmap nodes which are captured by special way in
71505 +          reiser4_pre_commit_hook_bitmap(), that way does not include
71506 +          capture_fuse_wait() as a capturing of other nodes does -- the commit
71507 +          mutex is used for transaction isolation instead. */
71508 +       reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71509 +       mutex_unlock(&sbinfo->tmgr.commit_mutex);
71510 +
71511 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71512 +       reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71513 +       assert("zam-927", list_empty(&(*atom)->inodes));
71514 +
71515 +       spin_lock_atom(*atom);
71516 + done:
71517 +       reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71518 +       ON_DEBUG((*atom)->committer = NULL);
71519 +
71520 +       /* Atom's state changes, so wake up everybody waiting for this
71521 +          event. */
71522 +       wakeup_atom_waiting_list(*atom);
71523 +
71524 +       /* Decrement the "until commit" reference, at least one txnh (the caller) is
71525 +          still open. */
71526 +       atomic_dec(&(*atom)->refcount);
71527 +
71528 +       assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71529 +       assert("jmacd-1062", (*atom)->capture_count == 0);
71530 +       BUG_ON((*atom)->capture_count != 0);
71531 +       assert_spin_locked(&((*atom)->alock)); /* on this path the function returns with the atom still locked */
71532 +
71533 +       return ret; /* 0 via the "done" shortcut, otherwise the non-negative reiser4_write_logs() result */
71534 +}
71535 +
71536 +/* TXN_TXNH */
71537 +
71538 +/**
71539 + * force_commit_atom - commit current atom and wait commit completion
71540 + * @txnh: transaction handle whose atom (txnh->atom) is to be committed
71541 + *
71542 + * Commits current atom and waits for commit completion; current atom and @txnh have
71543 + * to be spinlocked before call, this function unlocks them on exit. Always returns 0.
71544 + */
71545 +int force_commit_atom(txn_handle *txnh)
71546 +{
71547 +       txn_atom *atom;
71548 +
71549 +       assert("zam-837", txnh != NULL);
71550 +       assert_spin_locked(&(txnh->hlock));
71551 +       assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71552 +
71553 +       atom = txnh->atom;
71554 +
71555 +       assert("zam-834", atom != NULL);
71556 +       assert_spin_locked(&(atom->alock));
71557 +
71558 +       /*
71559 +        * Set flags for atom and txnh: forcing atom commit and waiting for
71560 +        * commit completion
71561 +        */
71562 +       txnh->flags |= TXNH_WAIT_COMMIT;
71563 +       atom->flags |= ATOM_FORCE_COMMIT;
71564 +
71565 +       spin_unlock_txnh(txnh);
71566 +       spin_unlock_atom(atom);
71567 +
71568 +       /* commit is here */
71569 +       reiser4_txn_restart_current(); /* any result of the restart is not propagated to the caller */
71570 +       return 0;
71571 +}
71572 +
71573 +/* Called to force commit of any outstanding atoms.  @commit_all_atoms controls
71574 + * whether we commit all atoms, including new ones which are created after this
71575 + * function is called. Returns 0, or the error returned by force_commit_atom(). */
71576 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71577 +{
71578 +       int ret;
71579 +       txn_atom *atom;
71580 +       txn_mgr *mgr;
71581 +       txn_handle *txnh;
71582 +       unsigned long start_time = jiffies; /* cut-off used below when @commit_all_atoms is 0 */
71583 +       reiser4_context *ctx = get_current_context();
71584 +
71585 +       assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71586 +       assert("nikita-3058", reiser4_commit_check_locks());
71587 +
71588 +       reiser4_txn_restart_current();
71589 +
71590 +       mgr = &get_super_private(super)->tmgr;
71591 +
71592 +       txnh = ctx->trans;
71593 +
71594 +      again:
71595 +
71596 +       spin_lock_txnmgr(mgr);
71597 +
71598 +       list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71599 +               spin_lock_atom(atom);
71600 +
71601 +               /* Commit any atom which can be committed.  If @commit_all_atoms
71602 +                * is not set we commit only atoms which were created before
71603 +                * this call is started. */
71604 +               if (commit_all_atoms
71605 +                   || time_before_eq(atom->start_time, start_time)) {
71606 +                       if (atom->stage <= ASTAGE_POST_COMMIT) {
71607 +                               spin_unlock_txnmgr(mgr); /* the list lock is dropped here, hence the full rescan via "goto again" */
71608 +
71609 +                               if (atom->stage < ASTAGE_PRE_COMMIT) {
71610 +                                       spin_lock_txnh(txnh);
71611 +                                       /* Add force-context txnh */
71612 +                                       capture_assign_txnh_nolock(atom, txnh);
71613 +                                       ret = force_commit_atom(txnh); /* unlocks both txnh and atom */
71614 +                                       if (ret)
71615 +                                               return ret;
71616 +                               } else
71617 +                                       /* wait atom commit */
71618 +                                       reiser4_atom_wait_event(atom); /* entered with the atom locked; it releases the lock itself */
71619 +
71620 +                               goto again;
71621 +                       }
71622 +               }
71623 +
71624 +               spin_unlock_atom(atom);
71625 +       }
71626 +
71627 +#if REISER4_DEBUG
71628 +       if (commit_all_atoms) { /* debug-only check: no fake-allocated blocks may remain after committing everything */
71629 +               reiser4_super_info_data *sbinfo = get_super_private(super);
71630 +               spin_lock_reiser4_super(sbinfo);
71631 +               assert("zam-813",
71632 +                      sbinfo->blocks_fake_allocated_unformatted == 0);
71633 +               assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71634 +               spin_unlock_reiser4_super(sbinfo);
71635 +       }
71636 +#endif
71637 +
71638 +       spin_unlock_txnmgr(mgr);
71639 +
71640 +       return 0;
71641 +}
71642 +
71643 +/* check whether commit_some_atoms() can commit @atom: it is before ASTAGE_PRE_COMMIT, every open txnh is a waiter, and atom_should_commit() agrees.
71644 + * Locking is up to the caller */
71645 +static int atom_is_committable(txn_atom * atom)
71646 +{
71647 +       return
71648 +           atom->stage < ASTAGE_PRE_COMMIT &&
71649 +           atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71650 +}
71651 +
71652 +/* called periodically from ktxnmgrd to commit old atoms: picks the first committable atom on @mgr's list and forces its commit.
71653 + * Releases ktxnmgrd spin lock (mgr->daemon->guard) at exit. Always returns 0. */
71654 +int commit_some_atoms(txn_mgr * mgr)
71655 +{
71656 +       int ret = 0; /* used only as a "nothing found" flag, never returned */
71657 +       txn_atom *atom;
71658 +       txn_handle *txnh;
71659 +       reiser4_context *ctx;
71660 +       struct list_head *pos, *tmp;
71661 +
71662 +       ctx = get_current_context();
71663 +       assert("nikita-2444", ctx != NULL);
71664 +
71665 +       txnh = ctx->trans;
71666 +       spin_lock_txnmgr(mgr);
71667 +
71668 +       /*
71669 +        * this is to avoid gcc complain that atom might be used
71670 +        * uninitialized
71671 +        */
71672 +       atom = NULL;
71673 +
71674 +       /* look for atom to commit */
71675 +       list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71676 +               atom = list_entry(pos, txn_atom, atom_link);
71677 +               /*
71678 +                * first test without taking atom spin lock, whether it is
71679 +                * eligible for committing at all
71680 +                */
71681 +               if (atom_is_committable(atom)) {
71682 +                       /* now, take spin lock and re-check */
71683 +                       spin_lock_atom(atom);
71684 +                       if (atom_is_committable(atom))
71685 +                               break; /* leaves the loop with the chosen atom still locked */
71686 +                       spin_unlock_atom(atom);
71687 +               }
71688 +       }
71689 +
71690 +       ret = (&mgr->atoms_list == pos); /* true when the loop ran to the end without a break */
71691 +       spin_unlock_txnmgr(mgr);
71692 +
71693 +       if (ret) {
71694 +               /* nothing found */
71695 +               spin_unlock(&mgr->daemon->guard);
71696 +               return 0;
71697 +       }
71698 +
71699 +       spin_lock_txnh(txnh);
71700 +
71701 +       BUG_ON(atom == NULL);
71702 +       /* Set the atom to force committing */
71703 +       atom->flags |= ATOM_FORCE_COMMIT;
71704 +
71705 +       /* Add force-context txnh */
71706 +       capture_assign_txnh_nolock(atom, txnh);
71707 +
71708 +       spin_unlock_txnh(txnh);
71709 +       spin_unlock_atom(atom);
71710 +
71711 +       /* we are about to release daemon spin lock, notify daemon it
71712 +          has to rescan atoms */
71713 +       mgr->daemon->rescan = 1;
71714 +       spin_unlock(&mgr->daemon->guard);
71715 +       reiser4_txn_restart_current(); /* presumably this is where the forced commit actually happens, as in force_commit_atom() -- confirm */
71716 +       return 0;
71717 +}
71718 +
71719 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom) /* try to fuse @atom with another pre-commit atom of @tmgr; returns -E_REPEAT when the caller must start over, 0 otherwise */
71720 +{
71721 +       int atom_stage;
71722 +       txn_atom *atom_2;
71723 +       int repeat; /* set when the atom lock had to be dropped to take the txnmgr lock */
71724 +
71725 +       assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71726 +
71727 +       atom_stage = atom->stage;
71728 +       repeat = 0;
71729 +
71730 +       if (!spin_trylock_txnmgr(tmgr)) { /* slow path: drop the atom lock, then take txnmgr lock followed by atom lock */
71731 +               atomic_inc(&atom->refcount); /* reference held while the atom is unlocked */
71732 +               spin_unlock_atom(atom);
71733 +               spin_lock_txnmgr(tmgr);
71734 +               spin_lock_atom(atom);
71735 +               repeat = 1;
71736 +               if (atom->stage != atom_stage) { /* the stage changed while the atom was unlocked: give up */
71737 +                       spin_unlock_txnmgr(tmgr);
71738 +                       atom_dec_and_unlock(atom);
71739 +                       return -E_REPEAT;
71740 +               }
71741 +               atomic_dec(&atom->refcount);
71742 +       }
71743 +
71744 +       list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71745 +               if (atom == atom_2)
71746 +                       continue;
71747 +               /*
71748 +                * if trylock does not succeed we just do not fuse with that
71749 +                * atom.
71750 +                */
71751 +               if (spin_trylock_atom(atom_2)) {
71752 +                       if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71753 +                               spin_unlock_txnmgr(tmgr);
71754 +                               capture_fuse_into(atom_2, atom);
71755 +                               /* all locks are lost we can only repeat here */
71756 +                               return -E_REPEAT;
71757 +                       }
71758 +                       spin_unlock_atom(atom_2);
71759 +               }
71760 +       }
71761 +       atom->flags |= ATOM_CANCEL_FUSION; /* no partner found: record it so flush_some_atom() does not try fusion again for this atom */
71762 +       spin_unlock_txnmgr(tmgr);
71763 +       if (repeat) {
71764 +               spin_unlock_atom(atom);
71765 +               return -E_REPEAT;
71766 +       }
71767 +       return 0; /* on this path @atom is not unlocked here */
71768 +}
71769 +
71770 +/* Calls jnode_flush for current atom if it exists; if not, just take another
71771 +   atom and call jnode_flush() for it.  If current transaction handle has
71772 +   already assigned atom (current atom) we have to close current transaction
71773 +   prior to switch to another atom or do something with current atom. This
71774 +   code tries to flush current atom.
71775 +
71776 +   flush_some_atom() is called as part of memory clearing process. It is
71777 +   invoked from balance_dirty_pages(), pdflushd, and entd.
71778 +
71779 +   If we can flush no nodes, atom is committed, because this frees memory.
71780 +
71781 +   If atom is too large or too old it is committed also. Returns 0, or an error from flush_current_atom() other than -E_REPEAT.
71782 +*/
71783 +int
71784 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71785 +               int flags)
71786 +{
71787 +       reiser4_context *ctx = get_current_context();
71788 +       txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71789 +       txn_handle *txnh = ctx->trans;
71790 +       txn_atom *atom;
71791 +       int ret;
71792 +
71793 +       BUG_ON(wbc->nr_to_write == 0);
71794 +       BUG_ON(*nr_submitted != 0); /* the caller must pass in a zeroed counter */
71795 +       assert("zam-1042", txnh != NULL);
71796 +      repeat:
71797 +       if (txnh->atom == NULL) {
71798 +               /* current atom is not available, take first from txnmgr */
71799 +               spin_lock_txnmgr(tmgr);
71800 +
71801 +               /* traverse the list of all atoms */
71802 +               list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71803 +                       /* lock atom before checking its state */
71804 +                       spin_lock_atom(atom);
71805 +
71806 +                       /*
71807 +                        * we need an atom which is not being committed and
71808 +                        * which has no flushers (jnode_flush() add one flusher
71809 +                        * at the beginning and subtract one at the end).
71810 +                        */
71811 +                       if (atom->stage < ASTAGE_PRE_COMMIT &&
71812 +                           atom->nr_flushers == 0) {
71813 +                               spin_lock_txnh(txnh);
71814 +                               capture_assign_txnh_nolock(atom, txnh);
71815 +                               spin_unlock_txnh(txnh);
71816 +
71817 +                               goto found; /* the atom stays locked across the jump */
71818 +                       }
71819 +
71820 +                       spin_unlock_atom(atom);
71821 +               }
71822 +
71823 +               /*
71824 +                * Write throttling in case no atom can be
71825 +                * flushed/committed.
71826 +                */
71827 +               if (!current_is_flush_bd_task() && !wbc->nonblocking) {
71828 +                       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71829 +                               spin_lock_atom(atom);
71830 +                               /* Repeat the check from the above. */
71831 +                               if (atom->stage < ASTAGE_PRE_COMMIT
71832 +                                   && atom->nr_flushers == 0) {
71833 +                                       spin_lock_txnh(txnh);
71834 +                                       capture_assign_txnh_nolock(atom, txnh);
71835 +                                       spin_unlock_txnh(txnh);
71836 +
71837 +                                       goto found;
71838 +                               }
71839 +                               if (atom->stage <= ASTAGE_POST_COMMIT) {
71840 +                                       spin_unlock_txnmgr(tmgr);
71841 +                                       /*
71842 +                                        * we just wait until atom's flusher
71843 +                                        * makes a progress in flushing or
71844 +                                        * committing the atom
71845 +                                        */
71846 +                                       reiser4_atom_wait_event(atom); /* entered with the atom locked; it releases the lock itself */
71847 +                                       goto repeat;
71848 +                               }
71849 +                               spin_unlock_atom(atom);
71850 +                       }
71851 +               }
71852 +               spin_unlock_txnmgr(tmgr);
71853 +               return 0; /* no atom could be picked: nothing is flushed */
71854 +             found:
71855 +               spin_unlock_txnmgr(tmgr);
71856 +       } else
71857 +               atom = get_current_atom_locked();
71858 +
71859 +       BUG_ON(atom->super != ctx->super);
71860 +       assert("vs-35", atom->super == ctx->super);
71861 +       if (start) { /* @start is used as a hint only if it belongs to the chosen atom */
71862 +               spin_lock_jnode(start);
71863 +               ret = (atom == start->atom) ? 1 : 0;
71864 +               spin_unlock_jnode(start);
71865 +               if (ret == 0)
71866 +                       start = NULL;
71867 +       }
71868 +       ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
71869 +       if (ret == 0) {
71870 +               /* flush_current_atom returns 0 only if it submitted for write
71871 +                  nothing */
71872 +               BUG_ON(*nr_submitted != 0);
71873 +               if (*nr_submitted == 0 || atom_should_commit_asap(atom)) { /* NOTE(review): after the BUG_ON above the first test is always true, so atom_should_commit_asap() is never evaluated here */
71874 +                       if (atom->capture_count < tmgr->atom_min_size &&
71875 +                           !(atom->flags & ATOM_CANCEL_FUSION)) {
71876 +                               ret = txn_try_to_fuse_small_atom(tmgr, atom);
71877 +                               if (ret == -E_REPEAT) {
71878 +                                       reiser4_preempt_point();
71879 +                                       goto repeat;
71880 +                               }
71881 +                       }
71882 +                       /* if early flushing could not make more nodes clean,
71883 +                        * or atom is too old/large,
71884 +                        * we force current atom to commit */
71885 +                       /* wait for commit completion but only if this
71886 +                        * wouldn't stall pdflushd and ent thread. */
71887 +                       if (!wbc->nonblocking && !ctx->entd)
71888 +                               txnh->flags |= TXNH_WAIT_COMMIT;
71889 +                       atom->flags |= ATOM_FORCE_COMMIT;
71890 +               }
71891 +               spin_unlock_atom(atom);
71892 +       } else if (ret == -E_REPEAT) {
71893 +               if (*nr_submitted == 0) {
71894 +                       /* let others who hampers flushing (hold longterm locks,
71895 +                          for instance) to free the way for flush */
71896 +                       reiser4_preempt_point();
71897 +                       goto repeat;
71898 +               }
71899 +               ret = 0; /* something was submitted, so -E_REPEAT is reported as success */
71900 +       }
71901 +/*
71902 +       if (*nr_submitted > wbc->nr_to_write)
71903 +               warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
71904 +*/
71905 +       reiser4_txn_restart(ctx);
71906 +
71907 +       return ret;
71908 +}
71909 +
71910 +/* Uncapture every jnode on @head (e.g. an atom's clean list), thereby removing them from the transaction; loops until @head is empty. */
71911 +void reiser4_invalidate_list(struct list_head *head)
71912 +{
71913 +       while (!list_empty(head)) {
71914 +               jnode *node;
71915 +
71916 +               node = list_entry(head->next, jnode, capture_link);
71917 +               spin_lock_jnode(node);
71918 +               reiser4_uncapture_block(node); /* presumably unlinks node from @head and drops the jnode lock taken above (no unlock is visible here) -- confirm */
71919 +               jput(node);
71920 +       }
71921 +}
71922 +
71923 +static void init_wlinks(txn_wait_links * wlinks) /* prepare @wlinks for the current thread: record its lock stack, empty both list links, clear both callbacks */
71924 +{
71925 +       wlinks->_lock_stack = get_current_lock_stack();
71926 +       INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
71927 +       INIT_LIST_HEAD(&wlinks->_fwaiting_link);
71928 +       wlinks->waitfor_cb = NULL;
71929 +       wlinks->waiting_cb = NULL;
71930 +}
71931 +
71932 +/* Add the caller to @atom's fwaitfor_list and sleep until somebody wakes us up; must be called with the atom spin-locked, the lock is released before sleeping. */
71933 +void reiser4_atom_wait_event(txn_atom * atom)
71934 +{
71935 +       txn_wait_links _wlinks; /* lives on this stack frame, so it must be unlinked before returning */
71936 +
71937 +       assert_spin_locked(&(atom->alock));
71938 +       assert("nikita-3156",
71939 +              lock_stack_isclean(get_current_lock_stack()) ||
71940 +              atom->nr_running_queues > 0);
71941 +
71942 +       init_wlinks(&_wlinks);
71943 +       list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
71944 +       atomic_inc(&atom->refcount); /* reference held across the sleep, dropped by atom_dec_and_unlock() below */
71945 +       spin_unlock_atom(atom);
71946 +
71947 +       reiser4_prepare_to_sleep(_wlinks._lock_stack);
71948 +       reiser4_go_to_sleep(_wlinks._lock_stack);
71949 +
71950 +       spin_lock_atom(atom);
71951 +       list_del(&_wlinks._fwaitfor_link);
71952 +       atom_dec_and_unlock(atom);
71953 +}
71954 +
71955 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage) /* move the locked @atom to @stage and notify waiters; a no-op if it is already at @stage */
71956 +{
71957 +       assert("nikita-3535", atom != NULL);
71958 +       assert_spin_locked(&(atom->alock));
71959 +       assert("nikita-3536", stage <= ASTAGE_INVALID);
71960 +       /* Excelsior! (the stage may only move upwards, as asserted below) */
71961 +       assert("nikita-3537", stage >= atom->stage);
71962 +       if (atom->stage != stage) {
71963 +               atom->stage = stage;
71964 +               reiser4_atom_send_event(atom); /* waiters are woken only on an actual change */
71965 +       }
71966 +}
71967 +
71968 +/* wake all threads which wait for an event on @atom (those on its waitfor list); the atom must be spin-locked */
71969 +void reiser4_atom_send_event(txn_atom * atom)
71970 +{
71971 +       assert_spin_locked(&(atom->alock));
71972 +       wakeup_atom_waitfor_list(atom);
71973 +}
71974 +
71975 +/* Informs txn manager code that owner of this txn_handle should wait for atom commit completion (for
71976 +   example, because it does fsync(2)). Returns non-zero iff TXNH_WAIT_COMMIT is set in h->flags. */
71977 +static int should_wait_commit(txn_handle * h)
71978 +{
71979 +       return h->flags & TXNH_WAIT_COMMIT; /* the masked flag bit itself, not normalised to 0/1 */
71980 +}
71981 +
71982 +typedef struct commit_data { /* state carried across repeated try_commit_txnh() calls */
71983 +       txn_atom *atom; /* atom being committed; refreshed by try_commit_txnh() via txnh_get_atom() */
71984 +       txn_handle *txnh; /* transaction handle being closed */
71985 +       long nr_written; /* passed by address as the nr_submitted argument of flush_current_atom() */
71986 +       /* as an optimization we start committing atom by first trying to
71987 +        * flush it few times without switching into ASTAGE_CAPTURE_WAIT. This
71988 +        * allows to reduce stalls due to other threads waiting for atom in
71989 +        * ASTAGE_CAPTURE_WAIT stage. ->preflush is counter of these
71990 +        * preliminary flushes. */
71991 +       int preflush;
71992 +       /* have we waited on atom (i.e. is this thread counted in atom->nr_waiters). */
71993 +       int wait;
71994 +       int failed; /* when non-zero, try_commit_txnh() returns 0 without attempting the commit */
71995 +       int wake_ktxnmgrd_up; /* set when this was the atom's only open txnh and it has no waiters */
71996 +} commit_data;
71997 +
71998 +/*
71999 + * Called from commit_txnh() repeatedly, until either error happens, or atom
72000 + * commits successfully.
72001 + */
72002 +static int try_commit_txnh(commit_data * cd)
72003 +{
72004 +       int result;
72005 +
72006 +       assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
72007 +
72008 +       /* Get the atom and txnh locked. */
72009 +       cd->atom = txnh_get_atom(cd->txnh);
72010 +       assert("jmacd-309", cd->atom != NULL);
72011 +       spin_unlock_txnh(cd->txnh);
72012 +
72013 +       if (cd->wait) {
72014 +               cd->atom->nr_waiters--;
72015 +               cd->wait = 0;
72016 +       }
72017 +
72018 +       if (cd->atom->stage == ASTAGE_DONE)
72019 +               return 0;
72020 +
72021 +       if (cd->failed)
72022 +               return 0;
72023 +
72024 +       if (atom_should_commit(cd->atom)) {
72025 +               /* if atom is _very_ large schedule it for commit as soon as
72026 +                * possible. */
72027 +               if (atom_should_commit_asap(cd->atom)) {
72028 +                       /*
72029 +                        * When atom is in PRE_COMMIT or later stage following
72030 +                        * invariant (encoded   in    atom_can_be_committed())
72031 +                        * holds:  there is exactly one non-waiter transaction
72032 +                        * handle opened  on this atom.  When  thread wants to
72033 +                        * wait  until atom  commits (for  example  sync()) it
72034 +                        * waits    on    atom  event     after     increasing
72035 +                        * atom->nr_waiters (see below in  this  function). It
72036 +                        * cannot be guaranteed that atom is already committed
72037 +                        * after    receiving event,  so     loop has   to  be
72038 +                        * re-started. But  if  atom switched into  PRE_COMMIT
72039 +                        * stage and became  too  large, we cannot  change its
72040 +                        * state back   to CAPTURE_WAIT (atom  stage can  only
72041 +                        * increase monotonically), hence this check.
72042 +                        */
72043 +                       if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
72044 +                               reiser4_atom_set_stage(cd->atom,
72045 +                                                      ASTAGE_CAPTURE_WAIT);
72046 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
72047 +               }
72048 +               if (cd->txnh->flags & TXNH_DONT_COMMIT) {
72049 +                       /*
72050 +                        * this  thread (transaction  handle  that is) doesn't
72051 +                        * want to commit  atom. Notify waiters that handle is
72052 +                        * closed. This can happen, for  example, when we  are
72053 +                        * under  VFS directory lock  and don't want to commit
72054 +                        * atom  right   now to  avoid  stalling other threads
72055 +                        * working in the same directory.
72056 +                        */
72057 +
72058 +                       /* Wake  the ktxnmgrd up if  the ktxnmgrd is needed to
72059 +                        * commit this  atom: no  atom  waiters  and only  one
72060 +                        * (our) open transaction handle. */
72061 +                       cd->wake_ktxnmgrd_up =
72062 +                           cd->atom->txnh_count == 1 &&
72063 +                           cd->atom->nr_waiters == 0;
72064 +                       reiser4_atom_send_event(cd->atom);
72065 +                       result = 0;
72066 +               } else if (!atom_can_be_committed(cd->atom)) {
72067 +                       if (should_wait_commit(cd->txnh)) {
72068 +                               /* sync(): wait for commit */
72069 +                               cd->atom->nr_waiters++;
72070 +                               cd->wait = 1;
72071 +                               reiser4_atom_wait_event(cd->atom);
72072 +                               result = RETERR(-E_REPEAT);
72073 +                       } else {
72074 +                               result = 0;
72075 +                       }
72076 +               } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
72077 +                       /*
72078 +                        * optimization: flush  atom without switching it into
72079 +                        * ASTAGE_CAPTURE_WAIT.
72080 +                        *
72081 +                        * But don't  do this for  ktxnmgrd, because  ktxnmgrd
72082 +                        * should never block on atom fusion.
72083 +                        */
72084 +                       result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
72085 +                                                   LONG_MAX, &cd->nr_written,
72086 +                                                   &cd->atom, NULL);
72087 +                       if (result == 0) {
72088 +                               spin_unlock_atom(cd->atom);
72089 +                               cd->preflush = 0;
72090 +                               result = RETERR(-E_REPEAT);
72091 +                       } else  /* Atoms wasn't flushed
72092 +                                * completely. Rinse. Repeat. */
72093 +                               --cd->preflush;
72094 +               } else {
72095 +                       /* We change   atom state  to   ASTAGE_CAPTURE_WAIT to
72096 +                          prevent atom fusion and count  ourself as an active
72097 +                          flusher */
72098 +                       reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
72099 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
72100 +
72101 +                       result =
72102 +                           commit_current_atom(&cd->nr_written, &cd->atom);
72103 +                       if (result != 0 && result != -E_REPEAT)
72104 +                               cd->failed = 1;
72105 +               }
72106 +       } else
72107 +               result = 0;
72108 +
72109 +#if REISER4_DEBUG
72110 +       if (result == 0)
72111 +               assert_spin_locked(&(cd->atom->alock));
72112 +#endif
72113 +
72114 +       /* perfectly valid assertion, except that when atom/txnh is not locked
72115 +        * fusion can take place, and cd->atom points nowhere. */
72116 +       /*
72117 +          assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
72118 +        */
72119 +       return result;
72120 +}
72121 +
72122 +/* Called to commit a transaction handle.  Calls try_commit_txnh() until it returns 0,
72123 +   then decrements the atom's number of open handles, unlinks @txnh from the atom and
72124 +   calls atom_dec_and_unlock().  Always returns 0; cd.nr_written is not returned. */
72125 +static int commit_txnh(txn_handle * txnh)
72126 +{
72127 +       commit_data cd;
72128 +       assert("umka-192", txnh != NULL);
72129 +
72130 +       memset(&cd, 0, sizeof cd);
72131 +       cd.txnh = txnh;
72132 +       cd.preflush = 10;       /* pre-flush attempts left, see try_commit_txnh() */
72133 +
72134 +       /* repeat try_commit_txnh() until it returns 0; it then leaves
72135 +        * cd.atom spin-locked (asserted there under REISER4_DEBUG) */
72136 +       while (try_commit_txnh(&cd) != 0)
72137 +               reiser4_preempt_point();
72138 +
72139 +       spin_lock_txnh(txnh);
72140 +
72141 +       cd.atom->txnh_count -= 1;
72142 +       txnh->atom = NULL;
72143 +       /* remove transaction handle from atom's list of transaction handles */
72144 +       list_del_init(&txnh->txnh_link);
72145 +
72146 +       spin_unlock_txnh(txnh);
72147 +       atom_dec_and_unlock(cd.atom);
72148 +       /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
72149 +        * because it takes time) by current thread, we do that work
72150 +        * asynchronously by ktxnmgrd daemon. */
72151 +       if (cd.wake_ktxnmgrd_up)
72152 +               ktxnmgrd_kick(&get_current_super_private()->tmgr);
72153 +
72154 +       return 0;
72155 +}
72156 +
72157 +/* TRY_CAPTURE */
72158 +
72159 +/* This routine attempts a single block-capture request.  It may return -E_REPEAT if some
72160 +   condition indicates that the request should be retried, and it may block if the
72161 +   txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
72162 +
72163 +   This routine encodes the basic logic of block capturing described by:
72164 +
72165 +     http://namesys.com/v4/v4.html
72166 +
72167 +   Our goal here is to ensure that any two blocks that contain dependent modifications
72168 +   should commit at the same time.  This function enforces this discipline by initiating
72169 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
72170 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
72171 +
72172 +   In addition, this routine handles the initial assignment of atoms to blocks and
72173 +   transaction handles.  These are possible outcomes of this function:
72174 +
72175 +   1. The block and handle are already part of the same atom: return immediate success
72176 +
72177 +   2. The block is assigned but the handle is not: capture_assign_txnh_nolock assigns the
72178 +      handle to the block's atom (unless we must capture_fuse_wait), then -E_REPEAT.
72179 +
72180 +   3. The handle is assigned but the block is not: call capture_assign_block_nolock to
72181 +      assign the block to the handle's atom.
72182 +
72183 +   4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
72184 +      to fuse atoms, or return -E_NO_NEIGHBOR if TXN_CAPTURE_DONT_FUSE is set.
72185 +
72186 +   5. Neither block nor handle are assigned: give the handle a new atom; the request repeats.
72187 +
72188 +   6. A read request for a non-captured block: reiser4_try_capture returns success itself.
72189 +
72190 +   This function acquires and releases the handle's spinlock.  This function is called
72191 +   under the jnode lock and if the return value is 0, it returns with the jnode lock still
72192 +   held.  If the return is -E_REPEAT or some other error condition, the jnode lock is
72193 +   released.  The external interface (reiser4_try_capture) manages re-acquiring the jnode
72194 +   lock in the failure case.
72195 +*/
72196 +static int try_capture_block(
72197 +       txn_handle * txnh, jnode * node, txn_capture mode,
72198 +       txn_atom ** atom_alloc)
72199 +{
72200 +       txn_atom *block_atom;
72201 +       txn_atom *txnh_atom;
72202 +
72203 +       /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
72204 +       assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
72205 +
72206 +       /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
72207 +        * node->tree somewhere. */
72208 +       assert("umka-194", txnh != NULL);
72209 +       assert("umka-195", node != NULL);
72210 +
72211 +       /* The jnode is already locked!  Being called from reiser4_try_capture(). */
72212 +       assert_spin_locked(&(node->guard));
72213 +       block_atom = node->atom;
72214 +
72215 +       /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
72216 +          let us touch the atoms themselves. */
72217 +       spin_lock_txnh(txnh);
72218 +       txnh_atom = txnh->atom;
72219 +       /* Process of capturing continues into one of four branches depending
72220 +          on which of the atoms (block atom (node->atom), current atom
72221 +          (txnh->atom)) exist. */
72222 +       if (txnh_atom == NULL) {
72223 +               if (block_atom == NULL) {
72224 +                       spin_unlock_txnh(txnh);
72225 +                       spin_unlock_jnode(node);
72226 +                       /* assign empty atom to the txnh and repeat */
72227 +                       return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
72228 +               } else {
72229 +                       atomic_inc(&block_atom->refcount);      /* temporary reference, dropped below */
72230 +                       /* node spin-lock isn't needed anymore */
72231 +                       spin_unlock_jnode(node);
72232 +                       if (!spin_trylock_atom(block_atom)) {   /* else re-take in atom, txnh order */
72233 +                               spin_unlock_txnh(txnh);
72234 +                               spin_lock_atom(block_atom);
72235 +                               spin_lock_txnh(txnh);
72236 +                       }
72237 +                       /* re-check state after getting txnh and the node
72238 +                        * atom spin-locked */
72239 +                       if (node->atom != block_atom || txnh->atom != NULL) {
72240 +                               spin_unlock_txnh(txnh);
72241 +                               atom_dec_and_unlock(block_atom);
72242 +                               return RETERR(-E_REPEAT);
72243 +                       }
72244 +                       atomic_dec(&block_atom->refcount);
72245 +                       if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
72246 +                           (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
72247 +                            block_atom->txnh_count != 0))
72248 +                               return capture_fuse_wait(txnh, block_atom, NULL, mode);
72249 +                       capture_assign_txnh_nolock(block_atom, txnh);
72250 +                       spin_unlock_txnh(txnh);
72251 +                       spin_unlock_atom(block_atom);
72252 +                       return RETERR(-E_REPEAT);       /* handle assigned; jnode lock was dropped */
72253 +               }
72254 +       } else {
72255 +               /* It is time to perform deadlock prevention check over the
72256 +                  node we want to capture.  It is possible this node was locked
72257 +                  for read without capturing it. The optimization which allows
72258 +                  to do it helps us in keeping atoms independent as long as
72259 +                  possible but it may cause lock/fuse deadlock problems.
72260 +
72261 +                  A number of similar deadlock situations with locked but not
72262 +                  captured nodes were found.  In each situation there are two
72263 +                  or more threads: one of them does flushing while another one
72264 +                  does routine balancing or tree lookup.  The flushing thread
72265 +                  (F) sleeps in long term locking request for node (N), another
72266 +                  thread (A) sleeps in trying to capture some node already
72267 +                  belonging to the atom of F, and that atom is in a state which
72268 +                  prevents immediate fusion.
72269 +
72270 +                  Deadlocks of this kind cannot happen if node N was properly
72271 +                  captured by thread A. The F thread fuses atoms before locking,
72272 +                  therefore current atom of thread F and current atom of thread
72273 +                  A become the same atom and thread A may proceed.  This does
72274 +                  not work if node N was not captured because the fusion of
72275 +                  atoms does not happen.
72276 +
72277 +                  The following scheme solves the deadlock: If
72278 +                  longterm_lock_znode locks and does not capture a znode, that
72279 +                  znode is marked as MISSED_IN_CAPTURE.  A node marked this way
72280 +                  is processed by the code below which restores the missed
72281 +                  capture and fuses current atoms of all the node lock owners
72282 +                  by calling the fuse_not_fused_lock_owners() function. */
72283 +               if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72284 +                       JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72285 +                       if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72286 +                               spin_unlock_txnh(txnh);
72287 +                               spin_unlock_jnode(node);
72288 +                               fuse_not_fused_lock_owners(txnh, JZNODE(node));
72289 +                               return RETERR(-E_REPEAT);
72290 +                       }
72291 +               }
72292 +               if (block_atom == NULL) {
72293 +                       atomic_inc(&txnh_atom->refcount);       /* temporary reference, dropped below */
72294 +                       spin_unlock_txnh(txnh);
72295 +                       if (!spin_trylock_atom(txnh_atom)) {    /* else re-take in atom, jnode order */
72296 +                               spin_unlock_jnode(node);
72297 +                               spin_lock_atom(txnh_atom);
72298 +                               spin_lock_jnode(node);
72299 +                       }
72300 +                       if (txnh->atom != txnh_atom || node->atom != NULL
72301 +                               || JF_ISSET(node, JNODE_IS_DYING)) {
72302 +                               spin_unlock_jnode(node);
72303 +                               atom_dec_and_unlock(txnh_atom);
72304 +                               return RETERR(-E_REPEAT);
72305 +                       }
72306 +                       atomic_dec(&txnh_atom->refcount);
72307 +                       capture_assign_block_nolock(txnh_atom, node);
72308 +                       spin_unlock_atom(txnh_atom);
72309 +               } else {
72310 +                       if (txnh_atom != block_atom) {
72311 +                               if (mode & TXN_CAPTURE_DONT_FUSE) {
72312 +                                       spin_unlock_txnh(txnh);
72313 +                                       spin_unlock_jnode(node);
72314 +                                       /* we are in a "no-fusion" mode and @node is
72315 +                                        * already part of transaction. */
72316 +                                       return RETERR(-E_NO_NEIGHBOR);
72317 +                               }
72318 +                               return capture_init_fusion(node, txnh, mode);
72319 +                       }
72320 +                       spin_unlock_txnh(txnh);
72321 +               }
72322 +       }
72323 +       return 0;
72324 +}
72325 +/* Pick the capture mode for @node under the jnode lock; 0 means "nothing to capture". */
72326 +static txn_capture
72327 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72328 +{
72329 +       txn_capture cap_mode;
72330 +
72331 +       assert_spin_locked(&(node->guard));
72332 +
72333 +       /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72334 +
72335 +       if (lock_mode == ZNODE_WRITE_LOCK) {
72336 +               cap_mode = TXN_CAPTURE_WRITE;
72337 +       } else if (node->atom != NULL) {
72338 +               cap_mode = TXN_CAPTURE_WRITE;   /* node already belongs to an atom */
72339 +       } else if (0 &&         /* txnh->mode == TXN_READ_FUSING && */
72340 +                  jnode_get_level(node) == LEAF_LEVEL) {
72341 +               /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72342 +               /* We only need a READ_FUSING capture at the leaf level.  This
72343 +                  is because the internal levels of the tree (twigs included)
72344 +                  are redundant from the point of the user that asked for a
72345 +                  read-fusing transcrash.  The user only wants to read-fuse
72346 +                  atoms due to reading uncommitted data that another user has
72347 +                  written.  It is the file system that reads/writes the
72348 +                  internal tree levels, the user only reads/writes leaves. */
72349 +               cap_mode = TXN_CAPTURE_READ_ATOMIC;
72350 +       } else {
72351 +               /* In this case (not a write lock, node in no atom; the leaf
72352 +                * branch above is disabled) there's no reason to capture. */
72353 +               /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72354 +               return 0;
72355 +       }
72356 +
72357 +       cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72358 +       assert("nikita-3186", cap_mode != 0);
72359 +       return cap_mode;
72360 +}
72361 +
72362 +/* This is an external interface to try_capture_block(), it calls
72363 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
72364 +
72365 +   @node:         node to capture,
72366 +   @lock_mode:    read or write lock is used in capture mode calculation,
72367 +   @flags:        see txn_capture flags enumeration.
72368 +
72369 +   Called with the jnode spin lock held; the lock is held again on return.
72370 +   @return: 0 - node was successfully captured (or needs no capture),
72371 +            -E_REPEAT - capture request cannot be processed immediately as
72372 +            it was requested in flags, < 0 - other errors.
72373 +*/
72374 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72375 +                       txn_capture flags)
72376 +{
72377 +       txn_atom *atom_alloc = NULL;
72378 +       txn_capture cap_mode;
72379 +       txn_handle *txnh = get_current_context()->trans;
72380 +       int ret;
72381 +
72382 +       assert_spin_locked(&(node->guard));
72383 +
72384 +      repeat:
72385 +       if (JF_ISSET(node, JNODE_IS_DYING)) {
72386 +               ret = RETERR(-EINVAL);
72387 +               goto out;
72388 +       }
72389 +       if (node->atom != NULL && txnh->atom == node->atom) {
72390 +               ret = 0;
72391 +               goto out;
72392 +       }
72393 +       cap_mode = build_capture_mode(node, lock_mode, flags);
72394 +       if (cap_mode == 0 ||
72395 +           (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72396 +               /* Mark this node as "MISSED".  It helps in further deadlock
72397 +                * analysis */
72398 +               if (jnode_is_znode(node))
72399 +                       JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72400 +               ret = 0;
72401 +               goto out;
72402 +       }
72403 +       /* Repeat try_capture as long as -E_REPEAT is returned. */
72404 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72405 +       /* Regardless of non_blocking: if ret == 0 then jnode is still locked,
72406 +          if ret != 0 then jnode is unlocked. */
72407 +#if REISER4_DEBUG
72408 +       if (ret == 0)
72409 +               assert_spin_locked(&(node->guard));
72410 +       else
72411 +               assert_spin_not_locked(&(node->guard));
72412 +#endif
72413 +       assert_spin_not_locked(&(txnh->hlock));
72414 +
72415 +       if (ret == -E_REPEAT) {
72416 +               /* E_REPEAT implies all locks were released, therefore we need
72417 +                  to take the jnode's lock again. */
72418 +               spin_lock_jnode(node);
72419 +
72420 +               /* Although this may appear to be a busy loop, it is not.
72421 +                  There are several conditions that cause E_REPEAT to be
72422 +                  returned by the call to try_capture_block, all cases
72423 +                  indicating some kind of state change that means you should
72424 +                  retry the request and will get a different result.  In some
72425 +                  cases this could be avoided with some extra code, but
72426 +                  generally it is done because the necessary locks were
72427 +                  released as a result of the operation and repeating is the
72428 +                  simplest thing to do (less bug potential).  The cases are:
72429 +                  atom fusion returns E_REPEAT after it completes (jnode and
72430 +                  txnh were unlocked); race conditions in assign_block,
72431 +                  assign_txnh, and init_fusion return E_REPEAT (trylock
72432 +                  failure); after going to sleep in capture_fuse_wait
72433 +                  (request was blocked but may now succeed).  I'm not quite
72434 +                  sure how capture_copy works yet, but it may also return
72435 +                  E_REPEAT.  When the request is legitimately blocked, the
72436 +                  requestor goes to sleep in fuse_wait, so this is not a busy
72437 +                  loop. */
72438 +               /* NOTE-NIKITA: still don't understand:
72439 +
72440 +                  try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72441 +
72442 +                  looks like busy loop?
72443 +                */
72444 +               goto repeat;
72445 +       }
72446 +
72447 +       if (ret != 0) {
72448 +               if (ret == -E_BLOCK) {
72449 +                       assert("nikita-3360",
72450 +                              cap_mode & TXN_CAPTURE_NONBLOCKING);
72451 +                       ret = -E_REPEAT;
72452 +               }
72453 +
72454 +               /* Failure means jnode is not locked.  FIXME_LATER_JMACD May
72455 +                  want to fix the above code to avoid releasing the lock and
72456 +                  re-acquiring it, but there are cases where failure occurs
72457 +                  when the lock is not held, and those cases would need to be
72458 +                  modified to re-take the lock. */
72459 +               spin_lock_jnode(node);
72460 +       }
72461 +      out:
72462 +       /* Free the extra atom object that was possibly allocated by
72463 +          try_capture_block() and left unused.  Every exit passes here:
72464 +          a spare atom can survive "goto repeat", so the early exits
72465 +          above would leak it otherwise.  The jnode lock is held. */
72466 +       if (atom_alloc != NULL) {
72467 +               kmem_cache_free(_atom_slab, atom_alloc);
72468 +       }
72469 +       /* Jnode is still locked. */
72470 +       assert_spin_locked(&(node->guard));
72471 +       return ret;
72472 +}
72473 +/* Call atom_dec_and_unlock() on both atoms; both are spin-locked on entry. */
72474 +static void release_two_atoms(txn_atom *one, txn_atom *two)
72475 +{
72476 +       spin_unlock_atom(one);  /* NOTE(review): presumably so only one atom lock is held per release - confirm */
72477 +       atom_dec_and_unlock(two);
72478 +       spin_lock_atom(one);
72479 +       atom_dec_and_unlock(one);
72480 +}
72481 +
72482 +/* NOTE: this appears to describe reiser4_try_capture() above, not the function below.
72483 +   That function sets up a call to try_capture_block and repeats as long as -E_REPEAT is
72484 +   returned by that routine.  The txn_capture request mode is computed there depending on
72485 +   the transaction handle's type and the lock request.  It is called from the depths of
72486 +   the lock manager with the jnode lock held and always returns with the jnode lock held.
72487 +*/
72488 +
72489 +/* fuse all 'active' atoms of lock owners of given node with the atom of @txnh; no jnode/txnh lock held. */
72490 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72491 +{
72492 +       lock_handle *lh;
72493 +       int repeat;     /* flag; unrelated to the "repeat" label below */
72494 +       txn_atom *atomh, *atomf;
72495 +       reiser4_context *me = get_current_context();
72496 +       reiser4_context *ctx = NULL;
72497 +
72498 +       assert_spin_not_locked(&(ZJNODE(node)->guard));
72499 +       assert_spin_not_locked(&(txnh->hlock));
72500 +
72501 + repeat:
72502 +       repeat = 0;
72503 +       atomh = txnh_get_atom(txnh);    /* evidently leaves txnh and atomh spin-locked: both are unlocked below */
72504 +       spin_unlock_txnh(txnh);
72505 +       assert("zam-692", atomh != NULL);
72506 +
72507 +       spin_lock_zlock(&node->lock);
72508 +       /* inspect list of lock owners */
72509 +       list_for_each_entry(lh, &node->lock.owners, owners_link) {
72510 +               ctx = get_context_by_lock_stack(lh->owner);
72511 +               if (ctx == me)
72512 +                       continue;
72513 +               /* below we use two assumptions to avoid additional spin-locks
72514 +                  for checking the condition :
72515 +
72516 +                  1) if the lock stack has lock, the transaction should be
72517 +                  opened, i.e. ctx->trans != NULL;
72518 +
72519 +                  2) reading of well-aligned ctx->trans->atom is atomic, if it
72520 +                  equals to the address of spin-locked atomh, we take that
72521 +                  the atoms are the same, nothing has to be captured. */
72522 +               if (atomh != ctx->trans->atom) {
72523 +                       reiser4_wake_up(lh->owner);
72524 +                       repeat = 1;
72525 +                       break;
72526 +               }
72527 +       }
72528 +       if (repeat) {   /* ctx is the lock owner whose atom differs from atomh */
72529 +               if (!spin_trylock_txnh(ctx->trans)) {   /* busy: drop the locks and start over */
72530 +                       spin_unlock_zlock(&node->lock);
72531 +                       spin_unlock_atom(atomh);
72532 +                       goto repeat;
72533 +               }
72534 +               atomf = ctx->trans->atom;
72535 +               if (atomf == NULL) {
72536 +                       capture_assign_txnh_nolock(atomh, ctx->trans);
72537 +                       /* release zlock lock _after_ assigning the atom to the
72538 +                        * transaction handle, otherwise the lock owner thread
72539 +                        * may unlock all znodes, exit kernel context and here
72540 +                        * we would access an invalid transaction handle. */
72541 +                       spin_unlock_zlock(&node->lock);
72542 +                       spin_unlock_atom(atomh);
72543 +                       spin_unlock_txnh(ctx->trans);
72544 +                       goto repeat;
72545 +               }
72546 +               assert("zam-1059", atomf != atomh);
72547 +               spin_unlock_zlock(&node->lock);
72548 +               atomic_inc(&atomh->refcount);   /* temporary references on both atoms */
72549 +               atomic_inc(&atomf->refcount);
72550 +               spin_unlock_txnh(ctx->trans);
72551 +               if (atomf > atomh) {    /* lock both atoms, lower address first */
72552 +                       spin_lock_atom_nested(atomf);
72553 +               } else {
72554 +                       spin_unlock_atom(atomh);
72555 +                       spin_lock_atom(atomf);
72556 +                       spin_lock_atom_nested(atomh);
72557 +               }
72558 +               if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72559 +                       release_two_atoms(atomf, atomh);        /* nothing to fuse: drop both references, retry */
72560 +                       goto repeat;
72561 +               }
72562 +               atomic_dec(&atomh->refcount);   /* drop the temporary references */
72563 +               atomic_dec(&atomf->refcount);
72564 +               capture_fuse_into(atomf, atomh);
72565 +               goto repeat;
72566 +       }
72567 +       spin_unlock_zlock(&node->lock);
72568 +       spin_unlock_atom(atomh);
72569 +}
72570 +
72571 +/* This is the interface to capture unformatted nodes via their struct page reference.
72572 +   Currently it is only used in reiser4_invalidatepage.  @pg must be locked by the caller. */
72573 +int try_capture_page_to_invalidate(struct page *pg)
72574 +{
72575 +       int ret;
72576 +       jnode *node;
72577 +
72578 +       assert("umka-292", pg != NULL);
72579 +       assert("nikita-2597", PageLocked(pg));
72580 +
72581 +       if (IS_ERR(node = jnode_of_page(pg))) {
72582 +               return PTR_ERR(node);
72583 +       }
72584 +       /* the page lock is dropped for the capture and re-taken before return */
72585 +       spin_lock_jnode(node);
72586 +       unlock_page(pg);
72587 +
72588 +       ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72589 +       spin_unlock_jnode(node);
72590 +       jput(node);     /* NOTE(review): presumably pairs with jnode_of_page() above - confirm */
72591 +       lock_page(pg);
72592 +       return ret;
72593 +}
72594 +
72595 +/* This informs the transaction manager when a node is deleted: uncaptures the jnode of
72596 +   locked page @pg (the "add to the atom's delete set" step is not done here - TODO confirm).
72597 +
72598 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72599 +explanations.  find all the functions that use it, and unless there is some very
72600 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72601 +move the loop to inside the function.
72602 +
72603 +VS-FIXME-HANS: can this code be at all streamlined?  In particular, can you lock and unlock the jnode fewer times?
72604 +  */
72605 +void reiser4_uncapture_page(struct page *pg)
72606 +{
72607 +       jnode *node;
72608 +       txn_atom *atom;
72609 +
72610 +       assert("umka-199", pg != NULL);
72611 +       assert("nikita-3155", PageLocked(pg));
72612 +
72613 +       clear_page_dirty_for_io(pg);
72614 +
72615 +       reiser4_wait_page_writeback(pg);
72616 +
72617 +       node = jprivate(pg);
72618 +       BUG_ON(node == NULL);
72619 +
72620 +       spin_lock_jnode(node);
72621 +
72622 +       atom = jnode_get_atom(node);    /* a non-NULL atom evidently comes back spin-locked, see below */
72623 +       if (atom == NULL) {
72624 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72625 +               spin_unlock_jnode(node);
72626 +               return;
72627 +       }
72628 +
72629 +       /* We can remove jnode from transaction even if it is on flush queue
72630 +        * prepped list, we only need to be sure that flush queue is not being
72631 +        * written by reiser4_write_fq().  reiser4_write_fq() does not use atom
72632 +        * spin lock for protection of the prepped nodes list, instead
72633 +        * write_fq() increments atom's nr_running_queues counters for the time
72634 +        * when prepped list is not protected by spin lock.  Here we check this
72635 +        * counter if we want to remove jnode from flush queue and, if the
72636 +        * counter is not zero, wait for all reiser4_write_fq() for this atom
72637 +        * to complete. This is not significant overhead. */
72638 +       while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72639 +               spin_unlock_jnode(node);
72640 +               /*
72641 +                * at this moment we want to wait for "atom event", viz. wait
72642 +                * until @node can be removed from flush queue. But
72643 +                * reiser4_atom_wait_event() cannot be called with page locked,
72644 +                * because it deadlocks with jnode_extent_write(). Unlock page,
72645 +                * after making sure (through page_cache_get()) that it cannot
72646 +                * be released from memory.
72647 +                */
72648 +               page_cache_get(pg);
72649 +               unlock_page(pg);
72650 +               reiser4_atom_wait_event(atom);
72651 +               lock_page(pg);
72652 +               /*
72653 +                * page may have been detached by ->writepage()->releasepage().
72654 +                */
72655 +               reiser4_wait_page_writeback(pg);
72656 +               spin_lock_jnode(node);
72657 +               page_cache_release(pg);
72658 +               atom = jnode_get_atom(node);
72659 +/* VS-FIXME-HANS: improve the commenting in this function */
72660 +               if (atom == NULL) {
72661 +                       spin_unlock_jnode(node);
72662 +                       return;
72663 +               }
72664 +       }
72665 +       reiser4_uncapture_block(node);  /* NOTE(review): presumably drops the jnode lock (no unlock follows) - confirm */
72666 +       spin_unlock_atom(atom);
72667 +       jput(node);
72668 +}
72669 +
72670 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72671 + * inode's tree of jnodes.  Called with the jnode spin lock held, for a jnode without a page. */
72672 +void reiser4_uncapture_jnode(jnode * node)
72673 +{
72674 +       txn_atom *atom;
72675 +
72676 +       assert_spin_locked(&(node->guard));
72677 +       assert("", node->pg == 0);
72678 +
72679 +       atom = jnode_get_atom(node);    /* a non-NULL atom evidently comes back spin-locked, see below */
72680 +       if (atom == NULL) {
72681 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72682 +               spin_unlock_jnode(node);
72683 +               return;
72684 +       }
72685 +
72686 +       reiser4_uncapture_block(node);  /* NOTE(review): presumably drops the jnode lock (no unlock follows) - confirm */
72687 +       spin_unlock_atom(atom);
72688 +       jput(node);
72689 +}
72690 +
72691 +/* No-locking version of assign_txnh: the caller holds the txnh and atom spin locks.  Sets
72692 +   the handle's atom pointer, increases atom refcount and txnh_count, adds to txnh_list. */
72693 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72694 +{
72695 +       assert("umka-200", atom != NULL);
72696 +       assert("umka-201", txnh != NULL);
72697 +
72698 +       assert_spin_locked(&(txnh->hlock));
72699 +       assert_spin_locked(&(atom->alock));
72700 +       assert("jmacd-824", txnh->atom == NULL);
72701 +       assert("nikita-3540", atom_isopen(atom));
72702 +       BUG_ON(txnh->atom != NULL);     /* same condition as "jmacd-824" above */
72703 +
72704 +       atomic_inc(&atom->refcount);    /* the handle's pointer to the atom is counted */
72705 +       txnh->atom = atom;
72706 +       reiser4_ctx_gfp_mask_set();
72707 +       list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72708 +       atom->txnh_count += 1;
72709 +}
72710 +
72711 +/* No-locking version of assign_block.  Sets the block's atom pointer, references the
72712 +   block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
72713 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72714 +{
72715 +       assert("umka-202", atom != NULL);
72716 +       assert("umka-203", node != NULL);
72717 +       assert_spin_locked(&(node->guard));
72718 +       assert_spin_locked(&(atom->alock));
72719 +       assert("jmacd-323", node->atom == NULL);
72720 +       BUG_ON(!list_empty_careful(&node->capture_link));
72721 +       assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72722 +
72723 +       /* Pointer from jnode to atom is not counted in atom->refcount. */
72724 +       node->atom = atom;
72725 +
72726 +       list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72727 +       atom->capture_count += 1;
72728 +       /* reference to jnode is acquired by atom. */
72729 +       jref(node);
72730 +
72731 +       ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72732 +
72733 +       LOCK_CNT_INC(t_refs);
72734 +}
72735 +
72736 +/* Common code for dirtying jnodes and znodes; jnode and atom locks are held. */
72737 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72738 +{
72739 +       assert_spin_locked(&(node->guard));
72740 +       assert_spin_locked(&(atom->alock));
72741 +       assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72742 +
72743 +       JF_SET(node, JNODE_DIRTY);
72744 +
72745 +       if (!JF_ISSET(node, JNODE_CLUSTER_PAGE))
72746 +               get_current_context()->nr_marked_dirty++;
72747 +
72748 +       /* Call grabbed2flush_reserved_nolock() for one block only if the
72749 +          node is a leaf, is not a cluster page, was not CREATED and
72750 +          jnode_flush has sorted it into neither the relocate set nor the
72751 +          overwrite set. If node is in one of those sets we assume that
72752 +          atom's flush reserved counter was already adjusted. */
72753 +       if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72754 +           && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72755 +           && !jnode_is_cluster_page(node)) {
72756 +               assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72757 +               assert("vs-1506", *jnode_get_block(node) != 0);
72758 +               grabbed2flush_reserved_nolock(atom, (__u64) 1);
72759 +               JF_SET(node, JNODE_FLUSH_RESERVED);
72760 +       }
72761 +
72762 +       if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72763 +               /* If the atom is not set yet, it will be added to the appropriate list in
72764 +                  capture_assign_block_nolock. */
72765 +               /* Sometimes a node is set dirty before being captured -- the case for new
72766 +                  jnodes.  In that case the jnode will be added to the appropriate list
72767 +                  in capture_assign_block_nolock. Another reason not to re-link jnode is
72768 +                  that jnode is on a flush queue (see flush.c for details) */
72769 +
72770 +               int level = jnode_get_level(node);
72771 +
72772 +               assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72773 +               assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72774 +               assert("nikita-2607", 0 <= level);
72775 +               assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72776 +
72777 +               /* move node to atom's dirty list */
72778 +               list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72779 +               ON_DEBUG(count_jnode
72780 +                        (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72781 +       }
72782 +}
72783 +
72784 +/* Set the dirty status for this (spin locked) jnode. */
72785 +void jnode_make_dirty_locked(jnode * node)
72786 +{
72787 +       assert("umka-204", node != NULL);
72788 +       assert_spin_locked(&(node->guard));
72789 +
72790 +       if (REISER4_DEBUG && rofs_jnode(node)) {
72791 +               warning("nikita-3365", "Dirtying jnode on rofs");
72792 +               dump_stack();
72793 +       }
72794 +
72795 +       /* Fast check for already dirty node */
72796 +       if (!JF_ISSET(node, JNODE_DIRTY)) {
72797 +               txn_atom *atom;
72798 +
72799 +               atom = jnode_get_atom(node);
72800 +               assert("vs-1094", atom);
72801 +               /* Check jnode dirty status again because node spin lock might
72802 +                * be released inside jnode_get_atom(). */
72803 +               if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72804 +                       do_jnode_make_dirty(node, atom);
72805 +               spin_unlock_atom(atom);
72806 +       }
72807 +}
72808 +
72809 +/* Set the dirty status for this znode; the caller holds its write lock. */
72810 +void znode_make_dirty(znode * z)
72811 +{
72812 +       jnode *node;
72813 +       struct page *page;
72814 +
72815 +       assert("umka-204", z != NULL);
72816 +       assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72817 +       assert("nikita-3560", znode_is_write_locked(z));
72818 +
72819 +       node = ZJNODE(z);
72820 +       /* znode is longterm locked, we can check dirty bit without spinlock */
72821 +       if (JF_ISSET(node, JNODE_DIRTY)) {
72822 +               /* znode is dirty already. All we have to do is to change znode version */
72823 +               z->version = znode_build_version(jnode_get_tree(node));
72824 +               return;
72825 +       }
72826 +
72827 +       spin_lock_jnode(node);
72828 +       jnode_make_dirty_locked(node);
72829 +       page = jnode_page(node);
72830 +       if (page != NULL) {
72831 +               /* this is useful assertion (allows one to check that no
72832 +                * modifications are lost due to update of in-flight page),
72833 +                * but it requires locking on page to check PG_writeback
72834 +                * bit. */
72835 +               /* assert("nikita-3292",
72836 +                  !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72837 +               page_cache_get(page);
72838 +
72839 +               /* jnode lock is not needed for the rest of
72840 +                * znode_make_dirty(). */
72841 +               spin_unlock_jnode(node);
72842 +               /* reiser4 file write code calls set_page_dirty for
72843 +                * unformatted nodes, for formatted nodes we do it here. */
72844 +               set_page_dirty_notag(page);
72845 +               page_cache_release(page);
72846 +               /* bump version counter in znode */
72847 +               z->version = znode_build_version(jnode_get_tree(node));
72848 +       } else {
72849 +               assert("zam-596", znode_above_root(JZNODE(node)));
72850 +               spin_unlock_jnode(node);
72851 +       }
72852 +
72853 +       assert("nikita-1900", znode_is_write_locked(z));
72854 +       assert("jmacd-9777", node->atom != NULL);
72855 +}
72856 +
72857 +int reiser4_sync_atom(txn_atom * atom)
72858 +{
72859 +       int result;
72860 +       txn_handle *txnh;
72861 +
72862 +       txnh = get_current_context()->trans;
72863 +
72864 +       result = 0;
72865 +       if (atom != NULL) {
72866 +               if (atom->stage < ASTAGE_PRE_COMMIT) {
72867 +                       spin_lock_txnh(txnh);
72868 +                       capture_assign_txnh_nolock(atom, txnh);
72869 +                       result = force_commit_atom(txnh);
72870 +               } else if (atom->stage < ASTAGE_POST_COMMIT) {
72871 +                       /* wait atom commit */
72872 +                       reiser4_atom_wait_event(atom);
72873 +                       /* try once more */
72874 +                       result = RETERR(-E_REPEAT);
72875 +               } else
72876 +                       spin_unlock_atom(atom);
72877 +       }
72878 +       return result;
72879 +}
72880 +
72881 +#if REISER4_DEBUG
72882 +
72883 +/* Debug accounting for moving a jnode from one list to another;
72884 +   call this after atom->capture_count is updated. */
72885 +void
72886 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
72887 +           atom_list new_list, int check_lists)
72888 +{
72889 +       struct list_head *pos;
72890 +
72891 +       assert("zam-1018", atom_is_protected(atom));
72892 +       assert_spin_locked(&(node->guard));
72893 +       assert("", NODE_LIST(node) == old_list);
72894 +
72895 +       switch (NODE_LIST(node)) {
72896 +       case NOT_CAPTURED:
72897 +               break;
72898 +       case DIRTY_LIST:
72899 +               assert("", atom->dirty > 0);
72900 +               atom->dirty--;
72901 +               break;
72902 +       case CLEAN_LIST:
72903 +               assert("", atom->clean > 0);
72904 +               atom->clean--;
72905 +               break;
72906 +       case FQ_LIST:
72907 +               assert("", atom->fq > 0);
72908 +               atom->fq--;
72909 +               break;
72910 +       case WB_LIST:
72911 +               assert("", atom->wb > 0);
72912 +               atom->wb--;
72913 +               break;
72914 +       case OVRWR_LIST:
72915 +               assert("", atom->ovrwr > 0);
72916 +               atom->ovrwr--;
72917 +               break;
72918 +       default:
72919 +               impossible("", "");
72920 +       }
72921 +
72922 +       switch (new_list) {
72923 +       case NOT_CAPTURED:
72924 +               break;
72925 +       case DIRTY_LIST:
72926 +               atom->dirty++;
72927 +               break;
72928 +       case CLEAN_LIST:
72929 +               atom->clean++;
72930 +               break;
72931 +       case FQ_LIST:
72932 +               atom->fq++;
72933 +               break;
72934 +       case WB_LIST:
72935 +               atom->wb++;
72936 +               break;
72937 +       case OVRWR_LIST:
72938 +               atom->ovrwr++;
72939 +               break;
72940 +       default:
72941 +               impossible("", "");
72942 +       }
72943 +       ASSIGN_NODE_LIST(node, new_list);
72944 +       if (0 && check_lists) {
72945 +               int count;
72946 +               tree_level level;
72947 +
72948 +               count = 0;
72949 +
72950 +               /* flush queue list */
72951 +               /* reiser4_check_fq(atom); */
72952 +
72953 +               /* dirty list */
72954 +               count = 0;
72955 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72956 +                       list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
72957 +                               count++;
72958 +               }
72959 +               if (count != atom->dirty)
72960 +                       warning("", "dirty counter %d, real %d\n", atom->dirty,
72961 +                               count);
72962 +
72963 +               /* clean list */
72964 +               count = 0;
72965 +               list_for_each(pos, ATOM_CLEAN_LIST(atom))
72966 +                       count++;
72967 +               if (count != atom->clean)
72968 +                       warning("", "clean counter %d, real %d\n", atom->clean,
72969 +                               count);
72970 +
72971 +               /* wb list */
72972 +               count = 0;
72973 +               list_for_each(pos, ATOM_WB_LIST(atom))
72974 +                       count++;
72975 +               if (count != atom->wb)
72976 +                       warning("", "wb counter %d, real %d\n", atom->wb,
72977 +                               count);
72978 +
72979 +               /* overwrite list */
72980 +               count = 0;
72981 +               list_for_each(pos, ATOM_OVRWR_LIST(atom))
72982 +                       count++;
72983 +
72984 +               if (count != atom->ovrwr)
72985 +                       warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
72986 +                               count);
72987 +       }
72988 +       assert("vs-1624", atom->num_queued == atom->fq);
72989 +       if (atom->capture_count !=
72990 +           atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
72991 +               printk
72992 +                   ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
72993 +                    atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
72994 +                    atom->wb, atom->fq);
72995 +               assert("vs-1622",
72996 +                      atom->capture_count ==
72997 +                      atom->dirty + atom->clean + atom->ovrwr + atom->wb +
72998 +                      atom->fq);
72999 +       }
73000 +}
73001 +
73002 +#endif
73003 +
73004 +/* Make node OVRWR and put it on atom->overwrite_nodes list, atom lock and jnode
73005 + * lock should be taken before calling this function. */
73006 +void jnode_make_wander_nolock(jnode * node)
73007 +{
73008 +       txn_atom *atom;
73009 +
73010 +       assert("nikita-2431", node != NULL);
73011 +       assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
73012 +       assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
73013 +       assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
73014 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
73015 +
73016 +       atom = node->atom;
73017 +
73018 +       assert("zam-895", atom != NULL);
73019 +       assert("zam-894", atom_is_protected(atom));
73020 +
73021 +       JF_SET(node, JNODE_OVRWR);
73022 +       /* move node to atom's overwrite list */
73023 +       list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
73024 +       ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
73025 +}
73026 +
73027 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
73028 + * this function. */
73029 +void jnode_make_wander(jnode * node)
73030 +{
73031 +       txn_atom *atom;
73032 +
73033 +       spin_lock_jnode(node);
73034 +       atom = jnode_get_atom(node);
73035 +       assert("zam-913", atom != NULL);
73036 +       assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
73037 +
73038 +       jnode_make_wander_nolock(node);
73039 +       spin_unlock_atom(atom);
73040 +       spin_unlock_jnode(node);
73041 +}
73042 +
73043 +/* this just sets RELOC bit  */
73044 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
73045 +{
73046 +       assert_spin_locked(&(node->guard));
73047 +       assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
73048 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
73049 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
73050 +       assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
73051 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
73052 +       jnode_set_reloc(node);
73053 +}
73054 +
73055 +/* Make znode RELOC and put it on flush queue */
73056 +void znode_make_reloc(znode * z, flush_queue_t * fq)
73057 +{
73058 +       jnode *node;
73059 +       txn_atom *atom;
73060 +
73061 +       node = ZJNODE(z);
73062 +       spin_lock_jnode(node);
73063 +
73064 +       atom = jnode_get_atom(node);
73065 +       assert("zam-919", atom != NULL);
73066 +
73067 +       jnode_make_reloc_nolock(fq, node);
73068 +       queue_jnode(fq, node);
73069 +
73070 +       spin_unlock_atom(atom);
73071 +       spin_unlock_jnode(node);
73072 +
73073 +}
73074 +
73075 +/* Make unformatted node RELOC and put it on flush queue */
73076 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
73077 +{
73078 +       assert("vs-1479", jnode_is_unformatted(node));
73079 +
73080 +       jnode_make_reloc_nolock(fq, node);
73081 +       queue_jnode(fq, node);
73082 +}
73083 +
73084 +int reiser4_capture_super_block(struct super_block *s)
73085 +{
73086 +       int result;
73087 +       znode *uber;
73088 +       lock_handle lh;
73089 +
73090 +       init_lh(&lh);
73091 +       result = get_uber_znode(reiser4_get_tree(s),
73092 +                               ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
73093 +       if (result)
73094 +               return result;
73095 +
73096 +       uber = lh.node;
73097 +       /* Grab one block for the superblock; dirty the uber znode on success */
73098 +       result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
73099 +       if (result == 0)
73100 +               znode_make_dirty(uber);
73101 +
73102 +       /* Always release the lock handle, even if the reservation failed. */
73103 +       done_lh(&lh);
73104 +
73105 +       return result;
73106 +}
73107 +
73108 +/* Wakeup every handle on the atom's WAITFOR list */
73109 +static void wakeup_atom_waitfor_list(txn_atom * atom)
73110 +{
73111 +       txn_wait_links *wlinks;
73112 +
73113 +       assert("umka-210", atom != NULL);
73114 +
73115 +       /* atom is locked */
73116 +       list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
73117 +               if (wlinks->waitfor_cb == NULL ||
73118 +                   wlinks->waitfor_cb(atom, wlinks))
73119 +                       /* Wake up. */
73120 +                       reiser4_wake_up(wlinks->_lock_stack);
73121 +       }
73122 +}
73123 +
73124 +/* Wakeup every handle on the atom's WAITING list */
73125 +static void wakeup_atom_waiting_list(txn_atom * atom)
73126 +{
73127 +       txn_wait_links *wlinks;
73128 +
73129 +       assert("umka-211", atom != NULL);
73130 +
73131 +       /* atom is locked */
73132 +       list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
73133 +               if (wlinks->waiting_cb == NULL ||
73134 +                   wlinks->waiting_cb(atom, wlinks))
73135 +                       /* Wake up. */
73136 +                       reiser4_wake_up(wlinks->_lock_stack);
73137 +       }
73138 +}
73139 +
73140 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
73141 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
73142 +{
73143 +       assert("nikita-3330", atom != NULL);
73144 +       assert_spin_locked(&(atom->alock));
73145 +
73146 +       /* atom->txnh_count == 1 is for waking waiters up if we are releasing
73147 +        * last transaction handle. */
73148 +       return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
73149 +}
73150 +
73151 +/* The general purpose of this function is to wait on the first of two possible events.
73152 +   The situation is that a handle (and its atom atomh) is blocked trying to capture a
73153 +   block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state.  The
73154 +   handle's atom (atomh) is not in the CAPTURE_WAIT state.  However, atomh could fuse with
73155 +   another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
73156 +   needs to unblock the handle to avoid deadlock.  When the txnh is unblocked it will
73157 +   proceed and fuse the two atoms in the CAPTURE_WAIT state.
73158 +
73159 +   In other words, if either atomh or atomf change state, the handle will be awakened,
73160 +   thus there are two lists per atom: WAITING and WAITFOR.
73161 +
73162 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
73163 +   close but it is not assigned to an atom of its own.
73164 +
73165 +   Locking: TXNH_LOCK, atomf's lock and (when atomh != NULL) atomh's lock are held
73166 +   on entry and are all released here; this function takes no jnode to unlock.
73167 +*/
73168 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
73169 +                   txn_atom * atomh, txn_capture mode)
73170 +{
73171 +       int ret;
73172 +       txn_wait_links wlinks;
73173 +
73174 +       assert("umka-213", txnh != NULL);
73175 +       assert("umka-214", atomf != NULL);
73176 +
73177 +       if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
73178 +               spin_unlock_txnh(txnh);
73179 +               spin_unlock_atom(atomf);
73180 +
73181 +               if (atomh) {
73182 +                       spin_unlock_atom(atomh);
73183 +               }
73184 +
73185 +               return RETERR(-E_BLOCK);
73186 +       }
73187 +
73188 +       /* Initialize the waiting list links. */
73189 +       init_wlinks(&wlinks);
73190 +
73191 +       /* Add txnh to atomf's waitfor list, unlock atomf. */
73192 +       list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
73193 +       wlinks.waitfor_cb = wait_for_fusion;
73194 +       atomic_inc(&atomf->refcount);
73195 +       spin_unlock_atom(atomf);
73196 +
73197 +       if (atomh) {
73198 +               /* Add txnh to atomh's waiting list, unlock atomh. */
73199 +               list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
73200 +               atomic_inc(&atomh->refcount);
73201 +               spin_unlock_atom(atomh);
73202 +       }
73203 +
73204 +       /* Go to sleep. */
73205 +       spin_unlock_txnh(txnh);
73206 +
73207 +       ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
73208 +       if (ret == 0) {
73209 +               reiser4_go_to_sleep(wlinks._lock_stack);
73210 +               ret = RETERR(-E_REPEAT);
73211 +       }
73212 +
73213 +       /* Remove from the waitfor list. */
73214 +       spin_lock_atom(atomf);
73215 +
73216 +       list_del(&wlinks._fwaitfor_link);
73217 +       atom_dec_and_unlock(atomf);
73218 +
73219 +       if (atomh) {
73220 +               /* Remove from the waiting list. */
73221 +               spin_lock_atom(atomh);
73222 +               list_del(&wlinks._fwaiting_link);
73223 +               atom_dec_and_unlock(atomh);
73224 +       }
73225 +       return ret;
73226 +}
73227 +
73228 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
73229 +{
73230 +       assert("zam-1067", one != two);
73231 +
73232 +       /* lock the atom with lesser address first */
73233 +       if (one < two) {
73234 +               spin_lock_atom(one);
73235 +               spin_lock_atom_nested(two);
73236 +       } else {
73237 +               spin_lock_atom(two);
73238 +               spin_lock_atom_nested(one);
73239 +       }
73240 +}
73241 +
73242 +/* Perform the necessary work to prepare for fusing two atoms, which involves
73243 + * acquiring two atom locks in the proper order.  If one of the node's atom is
73244 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
73245 + * atom is not then the handle's request is put to sleep.  If the node's atom
73246 + * is committing, then the node can be copy-on-captured.  Otherwise, pick the
73247 + * atom with fewer pointers to be fused into the atom with more pointers and
73248 + * call capture_fuse_into.
73249 + */
73250 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
73251 +{
73252 +       txn_atom * txnh_atom = txnh->atom;
73253 +       txn_atom * block_atom = node->atom;
73254 +
73255 +       atomic_inc(&txnh_atom->refcount);
73256 +       atomic_inc(&block_atom->refcount);
73257 +
73258 +       spin_unlock_txnh(txnh);
73259 +       spin_unlock_jnode(node);
73260 +
73261 +       lock_two_atoms(txnh_atom, block_atom);
73262 +
73263 +       if (txnh->atom != txnh_atom || node->atom != block_atom ) {
73264 +               release_two_atoms(txnh_atom, block_atom);
73265 +               return RETERR(-E_REPEAT);
73266 +       }
73267 +
73268 +       atomic_dec(&txnh_atom->refcount);
73269 +       atomic_dec(&block_atom->refcount);
73270 +
73271 +       assert ("zam-1066", atom_isopen(txnh_atom));
73272 +
73273 +       if (txnh_atom->stage >= block_atom->stage ||
73274 +           (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
73275 +               capture_fuse_into(txnh_atom, block_atom);
73276 +               return RETERR(-E_REPEAT);
73277 +       }
73278 +       spin_lock_txnh(txnh);
73279 +       return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73280 +}
73281 +
73282 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
73283 +   the small list to point to the large atom.  Returns the length of the list. */
73284 +static int
73285 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73286 +                        struct list_head *small_head)
73287 +{
73288 +       int count = 0;
73289 +       jnode *node;
73290 +
73291 +       assert("umka-218", large != NULL);
73292 +       assert("umka-219", large_head != NULL);
73293 +       assert("umka-220", small_head != NULL);
73294 +       /* small atom should be locked also. */
73295 +       assert_spin_locked(&(large->alock));
73296 +
73297 +       /* For every jnode on small's capture list... */
73298 +       list_for_each_entry(node, small_head, capture_link) {
73299 +               count += 1;
73300 +
73301 +               /* With the jnode lock held, update atom pointer. */
73302 +               spin_lock_jnode(node);
73303 +               node->atom = large;
73304 +               spin_unlock_jnode(node);
73305 +       }
73306 +
73307 +       /* Splice the lists. */
73308 +       list_splice_init(small_head, large_head->prev);
73309 +
73310 +       return count;
73311 +}
73312 +
73313 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
73314 +   the small list to point to the large atom.  Returns the length of the list. */
73315 +static int
73316 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73317 +                       struct list_head *small_head)
73318 +{
73319 +       int count = 0;
73320 +       txn_handle *txnh;
73321 +
73322 +       assert("umka-221", large != NULL);
73323 +       assert("umka-222", large_head != NULL);
73324 +       assert("umka-223", small_head != NULL);
73325 +
73326 +       /* Adjust every txnh to the new atom. */
73327 +       list_for_each_entry(txnh, small_head, txnh_link) {
73328 +               count += 1;
73329 +
73330 +               /* With the txnh lock held, update atom pointer. */
73331 +               spin_lock_txnh(txnh);
73332 +               txnh->atom = large;
73333 +               spin_unlock_txnh(txnh);
73334 +       }
73335 +
73336 +       /* Splice the txn_handle list. */
73337 +       list_splice_init(small_head, large_head->prev);
73338 +
73339 +       return count;
73340 +}
73341 +
73342 +/* This function fuses two atoms.  The captured nodes and handles belonging to SMALL are
73343 +   added to LARGE and their ->atom pointers are all updated.  The associated counts are
73344 +   updated as well, and any waiting handles belonging to either are awakened.  Both atoms
73345 +   must be locked by the caller; both are unlocked here and SMALL's refcount is dropped.
73346 +*/
73347 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
73348 +{
73349 +       int level;
73350 +       unsigned zcount = 0;
73351 +       unsigned tcount = 0;
73352 +
73353 +       assert("umka-224", small != NULL);
73354 +       assert("umka-225", large != NULL);
73355 +
73356 +       assert_spin_locked(&(large->alock));
73357 +       assert_spin_locked(&(small->alock));
73358 +
73359 +       assert("jmacd-201", atom_isopen(small));
73360 +       assert("jmacd-202", atom_isopen(large));
73361 +
73362 +       /* Splice and update the per-level dirty jnode lists */
73363 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73364 +               zcount +=
73365 +                   capture_fuse_jnode_lists(large,
73366 +                                            ATOM_DIRTY_LIST(large, level),
73367 +                                            ATOM_DIRTY_LIST(small, level));
73368 +       }
73369 +
73370 +       /* Splice and update the remaining jnode lists and the txnh list */
73371 +       zcount +=
73372 +           capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73373 +                                    ATOM_CLEAN_LIST(small));
73374 +       zcount +=
73375 +           capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73376 +                                    ATOM_OVRWR_LIST(small));
73377 +       zcount +=
73378 +           capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73379 +                                    ATOM_WB_LIST(small));
73380 +       zcount +=
73381 +           capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73382 +       tcount +=
73383 +           capture_fuse_txnh_lists(large, &large->txnh_list,
73384 +                                   &small->txnh_list);
73385 +
73386 +       /* Check our accounting. */
73387 +       assert("jmacd-1063",
73388 +              zcount + small->num_queued == small->capture_count);
73389 +       assert("jmacd-1065", tcount == small->txnh_count);
73390 +
73391 +       /* sum numbers of waiters threads */
73392 +       large->nr_waiters += small->nr_waiters;
73393 +       small->nr_waiters = 0;
73394 +
73395 +       /* splice flush queues */
73396 +       reiser4_fuse_fq(large, small);
73397 +
73398 +       /* update counters of jnodes on every atom's list */
73399 +       ON_DEBUG(large->dirty += small->dirty;
73400 +                small->dirty = 0;
73401 +                large->clean += small->clean;
73402 +                small->clean = 0;
73403 +                large->ovrwr += small->ovrwr;
73404 +                small->ovrwr = 0;
73405 +                large->wb += small->wb;
73406 +                small->wb = 0;
73407 +                large->fq += small->fq;
73408 +                small->fq = 0;);
73409 +
73410 +       /* count flushers in result atom */
73411 +       large->nr_flushers += small->nr_flushers;
73412 +       small->nr_flushers = 0;
73413 +
73414 +       /* update counts of flushed nodes */
73415 +       large->flushed += small->flushed;
73416 +       small->flushed = 0;
73417 +
73418 +       /* Transfer list counts to large. */
73419 +       large->txnh_count += small->txnh_count;
73420 +       large->capture_count += small->capture_count;
73421 +
73422 +       /* Add all txnh references to large. */
73423 +       atomic_add(small->txnh_count, &large->refcount);
73424 +       atomic_sub(small->txnh_count, &small->refcount);
73425 +
73426 +       /* Reset small counts */
73427 +       small->txnh_count = 0;
73428 +       small->capture_count = 0;
73429 +
73430 +       /* Assign the oldest start_time, merge flags. */
73431 +       large->start_time = min(large->start_time, small->start_time);
73432 +       large->flags |= small->flags;
73433 +
73434 +       /* Merge blocknr sets. */
73435 +       blocknr_set_merge(&small->delete_set, &large->delete_set);
73436 +       blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73437 +
73438 +       /* Merge allocated/deleted file counts */
73439 +       large->nr_objects_deleted += small->nr_objects_deleted;
73440 +       large->nr_objects_created += small->nr_objects_created;
73441 +
73442 +       small->nr_objects_deleted = 0;
73443 +       small->nr_objects_created = 0;
73444 +
73445 +       /* Merge allocated blocks counts */
73446 +       large->nr_blocks_allocated += small->nr_blocks_allocated;
73447 +
73448 +       large->nr_running_queues += small->nr_running_queues;
73449 +       small->nr_running_queues = 0;
73450 +
73451 +       /* Merge blocks reserved for overwrite set. */
73452 +       large->flush_reserved += small->flush_reserved;
73453 +       small->flush_reserved = 0;
73454 +
73455 +       if (large->stage < small->stage) {
73456 +               /* Large only needs to notify if it has changed state. */
73457 +               reiser4_atom_set_stage(large, small->stage);
73458 +               wakeup_atom_waiting_list(large);
73459 +       }
73460 +
73461 +       reiser4_atom_set_stage(small, ASTAGE_INVALID);
73462 +
73463 +       /* Notify any waiters--small needs to unload its wait lists.  Waiters
73464 +          actually remove themselves from the list before returning from the
73465 +          fuse_wait function. */
73466 +       wakeup_atom_waiting_list(small);
73467 +
73468 +       /* Unlock atoms */
73469 +       spin_unlock_atom(large);
73470 +       atom_dec_and_unlock(small);
73471 +}
73472 +
73473 +/* TXNMGR STUFF */
73474 +
73475 +/* Release a block from the atom, reversing the effects of being captured.
73476 +   The atom's reference to the jnode is not released here because spin
73477 +   locks are held.  Currently this is only called when the atom commits.
73478 +
73479 +   NOTE: because of that locking optimization the (journal) reference to the
73480 +   jnode stays held; call jput() somewhere after reiser4_uncapture_block().
73481 +   The jnode must be spin locked on entry and is unlocked before return. */
73482 +void reiser4_uncapture_block(jnode * node)
73483 +{
73484 +       txn_atom *atom;
73485 +
73486 +       assert("umka-226", node != NULL);
73487 +       atom = node->atom;
73488 +       assert("umka-228", atom != NULL);
73489 +
73490 +       assert("jmacd-1021", node->atom == atom);
73491 +       assert_spin_locked(&(node->guard));
73492 +       assert("jmacd-1023", atom_is_protected(atom));
73493 +       /* reset the jnode state bits listed below */
73494 +       JF_CLR(node, JNODE_DIRTY);
73495 +       JF_CLR(node, JNODE_RELOC);
73496 +       JF_CLR(node, JNODE_OVRWR);
73497 +       JF_CLR(node, JNODE_CREATED);
73498 +       JF_CLR(node, JNODE_WRITEBACK);
73499 +       JF_CLR(node, JNODE_REPACK);
73500 +
73501 +       list_del_init(&node->capture_link);
73502 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73503 +               assert("zam-925", atom_isopen(atom));
73504 +               assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73505 +               ON_DEBUG(atom->num_queued--);
73506 +               JF_CLR(node, JNODE_FLUSH_QUEUED);
73507 +       }
73508 +       atom->capture_count -= 1;
73509 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73510 +       node->atom = NULL;
73511 +       /* releases the jnode lock held by the caller (see assert above) */
73512 +       spin_unlock_jnode(node);
73513 +       LOCK_CNT_DEC(t_refs);
73514 +}
73515 +
73516 +/* Unconditional insert of jnode into atom's overwrite list, taking a jnode
73517 +   reference (jref).  Currently used in bitmap-based allocator code for adding
73518 +   modified bitmap blocks to the transaction. @atom and @node are spin locked */
73519 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73520 +{
73521 +       assert("zam-538", atom_is_protected(atom));
73522 +       assert_spin_locked(&(node->guard));
73523 +       assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73524 +       assert("zam-543", node->atom == NULL);
73525 +       assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73526 +       /* link the node, reference it and account for it in the atom */
73527 +       list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73528 +       jref(node);
73529 +       node->atom = atom;
73530 +       atom->capture_count++;
73531 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73532 +}
73533 +
73534 +static int count_deleted_blocks_actor(txn_atom * atom,
73535 +                                     const reiser4_block_nr * a,
73536 +                                     const reiser4_block_nr * b, void *data)
73537 +{
73538 +       reiser4_block_nr *counter = data;       /* running total supplied by caller */
73539 +       /* blocknr_set_iterator() actor: adds one entry's count to *@data */
73540 +       assert("zam-995", data != NULL);
73541 +       assert("zam-996", a != NULL);
73542 +       if (b == NULL)
73543 +               *counter += 1;  /* entry without a second value counts as one */
73544 +       else
73545 +               *counter += *b; /* NOTE(review): presumably @b is an extent length */
73546 +       return 0;       /* always 0; @atom and the value of @a are not used */
73547 +}
73548 +
73549 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
73550 +{
73551 +       reiser4_block_nr result;
73552 +       txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73553 +       txn_atom *atom;
73554 +       /* Sum the delete sets of all open atoms of the current super block. */
73555 +       result = 0;
73556 +       /* atoms_list is walked under the txnmgr lock; each atom is locked in turn */
73557 +       spin_lock_txnmgr(tmgr);
73558 +       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73559 +               spin_lock_atom(atom);
73560 +               if (atom_isopen(atom))
73561 +                       blocknr_set_iterator(
73562 +                               atom, &atom->delete_set,
73563 +                               count_deleted_blocks_actor, &result, 0); /* 0: @delete off */
73564 +               spin_unlock_atom(atom);
73565 +       }
73566 +       spin_unlock_txnmgr(tmgr);
73567 +
73568 +       return result;
73569 +}
73570 +
73571 +/*
73572 + * Local variables:
73573 + * c-indentation-style: "K&R"
73574 + * mode-name: "LC"
73575 + * c-basic-offset: 8
73576 + * tab-width: 8
73577 + * fill-column: 79
73578 + * End:
73579 + */
73580 diff -urN linux-2.6.35.orig/fs/reiser4/txnmgr.h linux-2.6.35/fs/reiser4/txnmgr.h
73581 --- linux-2.6.35.orig/fs/reiser4/txnmgr.h       1970-01-01 01:00:00.000000000 +0100
73582 +++ linux-2.6.35/fs/reiser4/txnmgr.h    2010-08-04 15:44:57.000000000 +0200
73583 @@ -0,0 +1,701 @@
73584 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73585 + * reiser4/README */
73586 +
73587 +/* data-types and function declarations for transaction manager. See txnmgr.c
73588 + * for details. */
73589 +
73590 +#ifndef __REISER4_TXNMGR_H__
73591 +#define __REISER4_TXNMGR_H__
73592 +
73593 +#include "forward.h"
73594 +#include "dformat.h"
73595 +
73596 +#include <linux/fs.h>
73597 +#include <linux/mm.h>
73598 +#include <linux/types.h>
73599 +#include <linux/spinlock.h>
73600 +#include <asm/atomic.h>
73601 +#include <linux/wait.h>
73602 +
73603 +/* TYPE DECLARATIONS */
73604 +
73605 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73606 +   A capture request dynamically assigns a block to the calling thread's transaction
73607 +   handle. */
73608 +typedef enum {
73609 +       /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73610 +          atom should fuse in order to ensure that the block commits atomically with the
73611 +          caller. */
73612 +       TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73613 +
73614 +       /* A READ_NONCOM request indicates that a block will be read and that the caller is
73615 +          willing to read a non-committed block without causing atoms to fuse. */
73616 +       TXN_CAPTURE_READ_NONCOM = (1 << 1),
73617 +
73618 +       /* A READ_MODIFY request indicates that a block will be read but that the caller
73619 +          wishes for the block to be captured as it will be written.  This capture request
73620 +          mode is not currently used, but eventually it will be useful for preventing
73621 +          deadlock in read-modify-write cycles. */
73622 +       TXN_CAPTURE_READ_MODIFY = (1 << 2),
73623 +
73624 +       /* A WRITE capture request indicates that a block will be modified and that atoms
73625 +          should fuse to make the commit atomic. */
73626 +       TXN_CAPTURE_WRITE = (1 << 3),
73627 +
73628 +       /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73629 +          exclusive type designation from extra bits that may be supplied -- see
73630 +          below. */
73631 +       TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73632 +                            TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73633 +                            TXN_CAPTURE_WRITE),
73634 +
73635 +       /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73636 +          indicate modification will occur. */
73637 +       TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73638 +
73639 +       /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73640 +          prefer not to sleep waiting for an aging atom to commit. */
73641 +       TXN_CAPTURE_NONBLOCKING = (1 << 4),
73642 +
73643 +       /* An option to reiser4_try_capture to prevent atom fusion, just simple
73644 +          capturing is allowed */
73645 +       TXN_CAPTURE_DONT_FUSE = (1 << 5)
73646 +
73647 +       /* This macro selects only the exclusive capture request types, stripping out any
73648 +          options that were supplied (i.e., NONBLOCKING). */
73649 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73650 +} txn_capture;
73651 +
73652 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
73653 +   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
73654 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
73655 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73656 +typedef enum {
73657 +       TXN_WRITE_FUSING = (1 << 0),
73658 +       TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,  /* READ implies WRITE */
73659 +} txn_mode;
73660 +
73661 +/* Every atom has a stage, which is one of these exclusive values: */
73662 +typedef enum {
73663 +       /* Initially an atom is free. */
73664 +       ASTAGE_FREE = 0,
73665 +
73666 +       /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73667 +          blocks and fuse with other atoms. */
73668 +       ASTAGE_CAPTURE_FUSE = 1,
73669 +
73670 +       /* We need to have a ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
73671 +
73672 +       /* When an atom reaches a certain age it must do all it can to commit.  An atom in
73673 +          the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73674 +          atoms in the CAPTURE_FUSE stage. */
73675 +       ASTAGE_CAPTURE_WAIT = 2,
73676 +
73677 +       /* Waiting for I/O before commit.  Copy-on-capture (see
73678 +          http://namesys.com/v4/v4.html). */
73679 +       ASTAGE_PRE_COMMIT = 3,
73680 +
73681 +       /* Post-commit overwrite I/O.  Steal-on-capture. */
73682 +       ASTAGE_POST_COMMIT = 4,
73683 +
73684 +       /* Atom which waits for the last reference to it to be removed, so
73685 +        * that it can be deleted from memory */
73686 +       ASTAGE_DONE = 5,
73687 +
73688 +       /* invalid atom. */
73689 +       ASTAGE_INVALID = 6,
73690 +
73691 +} txn_stage;
73692 +
73693 +/* Certain flags may be set in the txn_atom->flags field. */
73694 +typedef enum {
73695 +       /* Indicates that the atom should commit as soon as possible. */
73696 +       ATOM_FORCE_COMMIT = (1 << 0),
73697 +       /* to avoid endless loop, mark the atom (which was considered as too
73698 +        * small) after failed attempt to fuse it. */
73699 +       ATOM_CANCEL_FUSION = (1 << 1)
73700 +} txn_flags;
73701 +
73702 +/* Flags for controlling commit_txnh */
73703 +typedef enum {
73704 +       /* Wait commit atom completion in commit_txnh */
73705 +       TXNH_WAIT_COMMIT = 0x2,
73706 +       /* Don't commit atom when this handle is closed */
73707 +       TXNH_DONT_COMMIT = 0x4
73708 +} txn_handle_flags_t;
73709 +
73710 +/* TYPE DEFINITIONS */
73711 +
73712 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
73713 +   fields, so typically an operation on the atom through either of these objects must (1)
73714 +   lock the object, (2) read the atom pointer, (3) lock the atom.
73715 +
73716 +   During atom fusion, the process holds locks on both atoms at once.  Then, it iterates
73717 +   through the list of handles and pages held by the smaller of the two atoms.  For each
73718 +   handle and page referencing the smaller atom, the fusing process must: (1) lock the
73719 +   object, and (2) update the atom pointer.
73720 +
73721 +   You can see that there is a conflict of lock ordering here, so the more-complex
73722 +   procedure should have priority, i.e., the fusing process has priority so that it is
73723 +   guaranteed to make progress and to avoid restarts.
73724 +
73725 +   This decision, however, means additional complexity for acquiring the atom lock in the
73726 +   first place.
73727 +
73728 +   The general original procedure followed in the code was:
73729 +
73730 +       TXN_OBJECT *obj = ...;
73731 +       TXN_ATOM   *atom;
73732 +
73733 +       spin_lock (& obj->_lock);
73734 +
73735 +       atom = obj->_atom;
73736 +
73737 +       if (! spin_trylock_atom (atom))
73738 +         {
73739 +           spin_unlock (& obj->_lock);
73740 +           RESTART OPERATION, THERE WAS A RACE;
73741 +         }
73742 +
73743 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73744 +
73745 +   It has however been found that this wastes a lot of CPU in a manner that
73746 +   is hard to profile. So, proper refcounting was added to atoms, and the new
73747 +   standard locking sequence is as follows:
73748 +
73749 +       TXN_OBJECT *obj = ...;
73750 +       TXN_ATOM   *atom;
73751 +
73752 +       spin_lock (& obj->_lock);
73753 +
73754 +       atom = obj->_atom;
73755 +
73756 +       if (! spin_trylock_atom (atom))
73757 +         {
73758 +           atomic_inc (& atom->refcount);
73759 +           spin_unlock (& obj->_lock);
73760 +           spin_lock (&atom->_lock);
73761 +           atomic_dec (& atom->refcount);
73762 +           // HERE atom is locked
73763 +           spin_unlock (&atom->_lock);
73764 +           RESTART OPERATION, THERE WAS A RACE;
73765 +         }
73766 +
73767 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73768 +
73769 +   (core of this is implemented in trylock_throttle() function)
73770 +
73771 +   See the jnode_get_atom() function for a common case.
73772 +
73773 +   As an additional (and important) optimization that helps avoid restarts,
73774 +   it is possible to re-check required pre-conditions at the HERE point in
73775 +   code above and proceed without restarting if they are still satisfied.
73776 +*/
73777 +
73778 +/* An atomic transaction: this is the underlying system representation
73779 +   of a transaction, not the one seen by clients.
73780 +
73781 +   Invariants involving this data-type:
73782 +
73783 +      [sb-fake-allocated]
73784 +*/
73785 +struct txn_atom {
73786 +       /* The spinlock protecting the atom, held during fusion and various other state
73787 +          changes. */
73788 +       spinlock_t alock;
73789 +
73790 +       /* The atom's reference counter, increasing (in case of a duplication
73791 +          of an existing reference or when we are sure that some other
73792 +          reference exists) may be done without taking spinlock, decrementing
73793 +          of the ref. counter requires a spinlock to be held.
73794 +
73795 +          Each transaction handle counts in ->refcount. All jnodes count as
73796 +          one reference acquired in atom_begin_andlock(), released in
73797 +          commit_current_atom().
73798 +        */
73799 +       atomic_t refcount;
73800 +
73801 +       /* The atom_id identifies the atom in persistent records such as the log. */
73802 +       __u32 atom_id;
73803 +
73804 +       /* Flags holding any of the txn_flags enumerated values (e.g.,
73805 +          ATOM_FORCE_COMMIT). */
73806 +       __u32 flags;
73807 +
73808 +       /* Number of open handles. */
73809 +       __u32 txnh_count;
73810 +
73811 +       /* The number of znodes captured by this atom.  Equal to the sum of lengths of the
73812 +          dirty_nodes[level] and clean_nodes lists. */
73813 +       __u32 capture_count;
73814 +
73815 +#if REISER4_DEBUG
73816 +       int clean;
73817 +       int dirty;
73818 +       int ovrwr;
73819 +       int wb;
73820 +       int fq;
73821 +#endif
73822 +
73823 +       __u32 flushed;
73824 +
73825 +       /* Current transaction stage. */
73826 +       txn_stage stage;
73827 +
73828 +       /* Start time. */
73829 +       unsigned long start_time;
73830 +
73831 +       /* The atom's delete set. It collects block numbers of the nodes
73832 +          which were deleted during the transaction. */
73833 +       struct list_head delete_set;
73834 +
73835 +       /* The atom's wandered_block mapping. */
73836 +       struct list_head wandered_map;
73837 +
73838 +       /* The transaction's list of dirty captured nodes--per level.  Index
73839 +          by (level). dirty_nodes[0] is for znode-above-root */
73840 +       struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73841 +
73842 +       /* The transaction's list of clean captured nodes. */
73843 +       struct list_head clean_nodes;
73844 +
73845 +       /* The atom's overwrite set */
73846 +       struct list_head ovrwr_nodes;
73847 +
73848 +       /* nodes which are being written to disk */
73849 +       struct list_head writeback_nodes;
73850 +
73851 +       /* list of inodes */
73852 +       struct list_head inodes;
73853 +
73854 +       /* List of handles associated with this atom. */
73855 +       struct list_head txnh_list;
73856 +
73857 +       /* Transaction list link: list of atoms in the transaction manager. */
73858 +       struct list_head atom_link;
73859 +
73860 +       /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
73861 +       struct list_head fwaitfor_list;
73862 +
73863 +       /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
73864 +       struct list_head fwaiting_list;
73865 +
73866 +       /* Numbers of objects which were deleted/created in this transaction,
73867 +          and thereby numbers of object IDs which were released/allocated. */
73868 +       int nr_objects_deleted;
73869 +       int nr_objects_created;
73870 +       /* number of blocks allocated during the transaction */
73871 +       __u64 nr_blocks_allocated;
73872 +       /* All atom's flush queue objects are on this list  */
73873 +       struct list_head flush_queues;
73874 +#if REISER4_DEBUG
73875 +       /* number of flush queues for this atom. */
73876 +       int nr_flush_queues;
73877 +       /* Number of jnodes which were removed from atom's lists and put
73878 +          on flush_queue */
73879 +       int num_queued;
73880 +#endif
73881 +       /* number of threads who wait for this atom to complete commit */
73882 +       int nr_waiters;
73883 +       /* number of threads which do jnode_flush() over this atom */
73884 +       int nr_flushers;
73885 +       /* number of flush queues which are IN_USE and jnodes from fq->prepped
73886 +          are submitted to disk by the reiser4_write_fq() routine. */
73887 +       int nr_running_queues;
73888 +       /* A counter of grabbed unformatted nodes, see a description of the
73889 +        * reiser4 space reservation scheme at block_alloc.c */
73890 +       reiser4_block_nr flush_reserved;
73891 +#if REISER4_DEBUG
73892 +       void *committer;
73893 +#endif
73894 +       struct super_block *super;
73895 +};
73896 +
73897 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
73898 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
73899 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
73900 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
73901 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
73902 +
73903 +#define NODE_LIST(node) (node)->list
73904 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
73905 +ON_DEBUG(void
73906 +        count_jnode(txn_atom *, jnode *, atom_list old_list,
73907 +                    atom_list new_list, int check_lists));
73908 +
73909 +/* A transaction handle: the client obtains and commits this handle which is assigned by
73910 +   the system to a txn_atom. */
73911 +struct txn_handle {
73912 +       /* Spinlock protecting ->atom pointer */
73913 +       spinlock_t hlock;
73914 +
73915 +       /* Flags for controlling commit_txnh() behavior */
73916 +       /* from txn_handle_flags_t */
73917 +       txn_handle_flags_t flags;
73918 +
73919 +       /* Whether it is READ_FUSING or WRITE_FUSING. */
73920 +       txn_mode mode;
73921 +
73922 +       /* If assigned, the atom it is part of. */
73923 +       txn_atom *atom;
73924 +
73925 +       /* Transaction list link. Head is in txn_atom. */
73926 +       struct list_head txnh_link;
73927 +};
73928 +
73929 +/* The transaction manager: one is contained in the reiser4_super_info_data */
73930 +struct txn_mgr {
73931 +       /* A spinlock protecting the atom list, id_count, flush_control */
73932 +       spinlock_t tmgr_lock;
73933 +
73934 +       /* List of atoms. */
73935 +       struct list_head atoms_list;
73936 +
73937 +       /* Number of atoms. */
73938 +       int atom_count;
73939 +
73940 +       /* A counter used to assign atom->atom_id values. */
73941 +       __u32 id_count;
73942 +
73943 +       /* a mutex object for commit serialization */
73944 +       struct mutex commit_mutex;
73945 +
73946 +       /* a list of all txnmgrs served by a particular daemon. */
73947 +       struct list_head linkage;
73948 +
73949 +       /* description of daemon for this txnmgr */
73950 +       ktxnmgrd_context *daemon;
73951 +
73952 +       /* parameters. Adjustable through mount options. */
73953 +       unsigned int atom_max_size;
73954 +       unsigned int atom_max_age;
73955 +       unsigned int atom_min_size;
73956 +       /* max number of concurrent flushers for one atom, 0 - unlimited.  */
73957 +       unsigned int atom_max_flushers;
73958 +       struct dentry *debugfs_atom_count;
73959 +       struct dentry *debugfs_id_count;
73960 +};
73961 +
73962 +/* FUNCTION DECLARATIONS */
73963 +
73964 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
73965 +   are prefixed with "txn_".  For comments, see txnmgr.c. */
73966 +
73967 +extern int init_txnmgr_static(void);
73968 +extern void done_txnmgr_static(void);
73969 +
73970 +extern void reiser4_init_txnmgr(txn_mgr *);
73971 +extern void reiser4_done_txnmgr(txn_mgr *);
73972 +
73973 +extern int reiser4_txn_reserve(int reserved);
73974 +
73975 +extern void reiser4_txn_begin(reiser4_context * context);
73976 +extern int reiser4_txn_end(reiser4_context * context);
73977 +
73978 +extern void reiser4_txn_restart(reiser4_context * context);
73979 +extern void reiser4_txn_restart_current(void);
73980 +
73981 +extern int txnmgr_force_commit_all(struct super_block *, int);
73982 +extern int current_atom_should_commit(void);
73983 +
73984 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
73985 +
73986 +extern int commit_some_atoms(txn_mgr *);
73987 +extern int force_commit_atom(txn_handle *);
73988 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
73989 +
73990 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
73991 +
73992 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
73993 +
73994 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
73995 +                          int alloc_value);
73996 +extern void atom_dec_and_unlock(txn_atom * atom);
73997 +
73998 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
73999 +extern int try_capture_page_to_invalidate(struct page *pg);
74000 +
74001 +extern void reiser4_uncapture_page(struct page *pg);
74002 +extern void reiser4_uncapture_block(jnode *);
74003 +extern void reiser4_uncapture_jnode(jnode *);
74004 +
74005 +extern int reiser4_capture_inode(struct inode *);
74006 +extern int reiser4_uncapture_inode(struct inode *);
74007 +
74008 +extern txn_atom *get_current_atom_locked_nocheck(void);
74009 +
74010 +#if REISER4_DEBUG
74011 +
74012 +/**
74013 + * atom_is_protected - make sure that nobody but us can do anything with atom
74014 + * @atom: atom to be checked
74015 + *
74016 + * Asserts that atom either entered commit stages or is spin locked.  Always
74017 + * returns 1 (a violation trips assert_spin_locked()), so it fits in assert().
74018 + */
74019 +static inline int atom_is_protected(txn_atom *atom)
74020 +{
74021 +       if (atom->stage >= ASTAGE_PRE_COMMIT)
74022 +               return 1;
74023 +       assert_spin_locked(&(atom->alock));
74024 +       return 1;
74025 +}
74026 +
74027 +#endif
74028 +
74029 +/* Get the current atom and spinlock it if current atom present. May not return NULL */
74030 +static inline txn_atom *get_current_atom_locked(void)
74031 +{
74032 +       txn_atom *atom;
74033 +
74034 +       atom = get_current_atom_locked_nocheck();
74035 +       assert("zam-761", atom != NULL);
74036 +
74037 +       return atom;
74038 +}
74039 +
74040 +extern txn_atom *jnode_get_atom(jnode *);
74041 +
74042 +extern void reiser4_atom_wait_event(txn_atom *);
74043 +extern void reiser4_atom_send_event(txn_atom *);
74044 +
74045 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
74046 +extern int reiser4_capture_super_block(struct super_block *s);
74047 +int capture_bulk(jnode **, int count);
74048 +
74049 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
74050 +   calling convention of these three routines. */
74051 +extern void blocknr_set_init(struct list_head * bset);
74052 +extern void blocknr_set_destroy(struct list_head * bset);
74053 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
74054 +extern int blocknr_set_add_extent(txn_atom * atom,
74055 +                                 struct list_head * bset,
74056 +                                 blocknr_set_entry ** new_bsep,
74057 +                                 const reiser4_block_nr * start,
74058 +                                 const reiser4_block_nr * len);
74059 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
74060 +                               blocknr_set_entry ** new_bsep,
74061 +                               const reiser4_block_nr * a,
74062 +                               const reiser4_block_nr * b);
74063 +
74064 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
74065 +                                   const reiser4_block_nr *, void *);
74066 +
74067 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
74068 +                               blocknr_set_actor_f actor, void *data,
74069 +                               int delete);
74070 +
74071 +/* flush code takes care about how to fuse flush queues */
74072 +extern void flush_init_atom(txn_atom * atom);
74073 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
74074 +
74075 +static inline void spin_lock_atom(txn_atom *atom)
74076 +{
74077 +       /* check that spinlocks of lower priorities are not held */
74078 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
74079 +                   LOCK_CNT_NIL(spin_locked_atom) &&
74080 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
74081 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
74082 +                   LOCK_CNT_NIL(rw_locked_dk) &&
74083 +                   LOCK_CNT_NIL(rw_locked_tree)));
74084 +
74085 +       spin_lock(&(atom->alock));
74086 +
74087 +       LOCK_CNT_INC(spin_locked_atom);
74088 +       LOCK_CNT_INC(spin_locked);
74089 +}
74090 +
74091 +static inline void spin_lock_atom_nested(txn_atom *atom)
74092 +{
74093 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
74094 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
74095 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
74096 +                   LOCK_CNT_NIL(rw_locked_dk) &&
74097 +                   LOCK_CNT_NIL(rw_locked_tree)));
74098 +
74099 +       spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
74100 +
74101 +       LOCK_CNT_INC(spin_locked_atom);
74102 +       LOCK_CNT_INC(spin_locked);
74103 +}
74104 +
74105 +static inline int spin_trylock_atom(txn_atom *atom)
74106 +{
74107 +       if (spin_trylock(&(atom->alock))) {
74108 +               LOCK_CNT_INC(spin_locked_atom);
74109 +               LOCK_CNT_INC(spin_locked);
74110 +               return 1;
74111 +       }
74112 +       return 0;
74113 +}
74114 +
74115 +static inline void spin_unlock_atom(txn_atom *atom)
74116 +{
74117 +       assert_spin_locked(&(atom->alock));
74118 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
74119 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74120 +
74121 +       LOCK_CNT_DEC(spin_locked_atom);
74122 +       LOCK_CNT_DEC(spin_locked);
74123 +
74124 +       spin_unlock(&(atom->alock));
74125 +}
74126 +
74127 +static inline void spin_lock_txnh(txn_handle *txnh)
74128 +{
74129 +       /* check that spinlocks of lower priorities are not held */
74130 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
74131 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
74132 +                   LOCK_CNT_NIL(rw_locked_tree)));
74133 +
74134 +       spin_lock(&(txnh->hlock));
74135 +
74136 +       LOCK_CNT_INC(spin_locked_txnh);
74137 +       LOCK_CNT_INC(spin_locked);
74138 +}
74139 +
74140 +static inline int spin_trylock_txnh(txn_handle *txnh)
74141 +{
74142 +       if (spin_trylock(&(txnh->hlock))) {
74143 +               LOCK_CNT_INC(spin_locked_txnh);
74144 +               LOCK_CNT_INC(spin_locked);
74145 +               return 1;
74146 +       }
74147 +       return 0;
74148 +}
74149 +
74150 +static inline void spin_unlock_txnh(txn_handle *txnh)
74151 +{
74152 +       assert_spin_locked(&(txnh->hlock));
74153 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
74154 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74155 +
74156 +       LOCK_CNT_DEC(spin_locked_txnh);
74157 +       LOCK_CNT_DEC(spin_locked);
74158 +
74159 +       spin_unlock(&(txnh->hlock));
74160 +}
74161 +
74162 +#define spin_ordering_pred_txnmgr(tmgr) /* mirrors spin_lock_txnmgr() */ \
74163 +       ( LOCK_CNT_NIL(spin_locked_atom) &&     \
74164 +         LOCK_CNT_NIL(spin_locked_txnh) &&     \
74165 +         LOCK_CNT_NIL(spin_locked_jnode) &&    \
74166 +         LOCK_CNT_NIL(spin_locked_zlock) &&    \
74167 +         LOCK_CNT_NIL(rw_locked_dk) &&         \
74168 +         LOCK_CNT_NIL(rw_locked_tree) )
74169 +
74170 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
74171 +{
74172 +       /* check that spinlocks of lower priorities are not held */
74173 +       assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
74174 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
74175 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
74176 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
74177 +                   LOCK_CNT_NIL(rw_locked_dk) &&
74178 +                   LOCK_CNT_NIL(rw_locked_tree)));
74179 +
74180 +       spin_lock(&(mgr->tmgr_lock));
74181 +
74182 +       LOCK_CNT_INC(spin_locked_txnmgr);
74183 +       LOCK_CNT_INC(spin_locked);
74184 +}
74185 +
74186 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
74187 +{
74188 +       if (spin_trylock(&(mgr->tmgr_lock))) {
74189 +               LOCK_CNT_INC(spin_locked_txnmgr);
74190 +               LOCK_CNT_INC(spin_locked);
74191 +               return 1;
74192 +       }
74193 +       return 0;
74194 +}
74195 +
74196 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
74197 +{
74198 +       assert_spin_locked(&(mgr->tmgr_lock));
74199 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
74200 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
74201 +
74202 +       LOCK_CNT_DEC(spin_locked_txnmgr);
74203 +       LOCK_CNT_DEC(spin_locked);
74204 +
74205 +       spin_unlock(&(mgr->tmgr_lock));
74206 +}
74207 +
74208 +typedef enum {
74209 +       FQ_IN_USE = 0x1
74210 +} flush_queue_state_t;
74211 +
74212 +typedef struct flush_queue flush_queue_t;
74213 +
74214 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
74215 +   is filled by the jnode_flush() routine, and written to disk under memory
74216 +   pressure or at atom commit time. */
74217 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
74218 +   field and fq->prepped list can be modified if atom is spin-locked and fq
74219 +   object is "in-use" state.  For read-only traversal of the fq->prepped list
74220 +   and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
74221 +   only have atom spin-locked. */
74222 +struct flush_queue {
74223 +       /* linkage element is the first in this structure to make debugging
74224 +          easier.  See field in atom struct for description of list. */
74225 +       struct list_head alink;
74226 +       /* A spinlock to protect changes of fq state and fq->atom pointer */
74227 +       spinlock_t guard;
74228 +       /* flush_queue state: [in_use | ready]; see FQ_IN_USE above */
74229 +       flush_queue_state_t state;
74230 +       /* A list which contains queued nodes, queued nodes are removed from any
74231 +        * atom's list and put on this ->prepped one. */
74232 +       struct list_head prepped;
74233 +       /* number of submitted i/o requests */
74234 +       atomic_t nr_submitted;
74235 +       /* number of i/o errors */
74236 +       atomic_t nr_errors;
74237 +       /* An atom this flush queue is attached to (changes guarded by ->guard) */
74238 +       txn_atom *atom;
74239 +       /* A wait queue head to wait on i/o completion */
74240 +       wait_queue_head_t wait;
74241 +#if REISER4_DEBUG
74242 +       /* A thread which took this fq in exclusive use, NULL if fq is free,
74243 +        * used for debugging. */
74244 +       struct task_struct *owner;
74245 +#endif
74246 +};
74247 +
74248 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
74249 +extern void reiser4_fq_put_nolock(flush_queue_t *);
74250 +extern void reiser4_fq_put(flush_queue_t *);
74251 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
74252 +extern void queue_jnode(flush_queue_t *, jnode *);
74253 +
74254 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
74255 +extern int current_atom_finish_all_fq(void);
74256 +extern void init_atom_fq_parts(txn_atom *);
74257 +
74258 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
74259 +
74260 +extern void znode_make_dirty(znode * node);
74261 +extern void jnode_make_dirty_locked(jnode * node);
74262 +
74263 +extern int reiser4_sync_atom(txn_atom * atom);
74264 +
74265 +#if REISER4_DEBUG
74266 +extern int atom_fq_parts_are_clean(txn_atom *);
74267 +#endif
74268 +
74269 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
74270 +extern flush_queue_t *get_fq_for_current_atom(void);
74271 +
74272 +void reiser4_invalidate_list(struct list_head * head);
74273 +
74274 +# endif                                /* __REISER4_TXNMGR_H__ */
74275 +
74276 +/* Make Linus happy.
74277 +   Local variables:
74278 +   c-indentation-style: "K&R"
74279 +   mode-name: "LC"
74280 +   c-basic-offset: 8
74281 +   tab-width: 8
74282 +   fill-column: 120
74283 +   End:
74284 +*/
74285 diff -urN linux-2.6.35.orig/fs/reiser4/type_safe_hash.h linux-2.6.35/fs/reiser4/type_safe_hash.h
74286 --- linux-2.6.35.orig/fs/reiser4/type_safe_hash.h       1970-01-01 01:00:00.000000000 +0100
74287 +++ linux-2.6.35/fs/reiser4/type_safe_hash.h    2010-08-04 15:44:57.000000000 +0200
74288 @@ -0,0 +1,320 @@
74289 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74290 + * reiser4/README */
74291 +
74292 +/* A hash table class that uses hash chains (singly-linked) and is
74293 +   parametrized to provide type safety.  */
74294 +
74295 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
74296 +#define __REISER4_TYPE_SAFE_HASH_H__
74297 +
74298 +#include "debug.h"
74299 +
74300 +#include <asm/errno.h>
74301 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
74302 +   based on the object type.  You need to declare the item type before
74303 +   this definition, define it after this definition. */
74304 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE)                                                     \
74305 +/* typedef names PREFIX_hash_table and PREFIX_hash_link for the two structs below */          \
74306 +typedef struct PREFIX##_hash_table_  PREFIX##_hash_table;                                     \
74307 +typedef struct PREFIX##_hash_link_   PREFIX##_hash_link;                                      \
74308 +/* the table: an array of _buckets chain heads, each a pointer to ITEM_TYPE */                \
74309 +struct PREFIX##_hash_table_                                                                   \
74310 +{                                                                                             \
74311 +  ITEM_TYPE  **_table;                                                                        \
74312 +  __u32        _buckets;                                                                      \
74313 +};                                                                                            \
74314 +/* the per-item link: pointer to the next item of the same hash chain */                      \
74315 +struct PREFIX##_hash_link_                                                                    \
74316 +{                                                                                             \
74317 +  ITEM_TYPE *_next;                                                                           \
74318 +}
74319 +
74320 +/* Step 2: Define the object type of the hash: give it field of type
74321 +   PREFIX_hash_link. */
74322 +
74323 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74324 +   the type and field name used in step 2.  The arguments are:
74325 +
74326 +   ITEM_TYPE    The item type being hashed
74327 +   KEY_TYPE     The type of key being hashed
74328 +   KEY_NAME     The name of the key field within the item
74329 +   LINK_NAME    The name of the link field within the item (which you must make type PREFIX_hash_link)
74330 +   HASH_FUNC    The name of the hash function (or macro, takes const pointer to key)
74331 +   EQ_FUNC      The name of the equality function (or macro, takes const pointer to two keys)
74332 +
74333 +   It implements these functions:
74334 +
74335 +   prefix_hash_init           Initialize the table given its size.
74336 +   prefix_hash_insert         Insert an item
74337 +   prefix_hash_insert_index   Insert an item w/ precomputed hash_index
74338 +   prefix_hash_find           Find an item by key
74339 +   prefix_hash_find_index     Find an item w/ precomputed hash_index
74340 +   prefix_hash_remove         Remove an item, returns 1 if found, 0 if not found
74341 +   prefix_hash_remove_index   Remove an item w/ precomputed hash_index
74342 +
74343 +   If you'd like something to be done differently, feel free to ask me
74344 +   for modifications.  Additional features that could be added but
74345 +   have not been:
74346 +
74347 +   prefix_hash_remove_key           Find and remove an item by key
74348 +   prefix_hash_remove_key_index     Find and remove an item by key w/ precomputed hash_index
74349 +
74350 +   The hash_function currently receives only the key as an argument,
74351 +   meaning it must somehow know the number of buckets.  If this is a
74352 +   problem let me know.
74353 +
74354 +   This hash table uses a single-linked hash chain.  This means
74355 +   insertion is fast but deletion requires searching the chain.
74356 +
74357 +   There is also the doubly-linked hash chain approach, under which
74358 +   deletion requires no search but the code is longer and it takes two
74359 +   pointers per item.
74360 +
74361 +   The circularly-linked approach has the shortest code but requires
74362 +   two pointers per bucket, doubling the size of the bucket array (in
74363 +   addition to two pointers per item).
74364 +*/
74365 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC)  \
74366 +/* Debug helper: asserts that `hash` is a valid bucket index for `table`. */           \
74367 +static __inline__ void                                                                 \
74368 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG,                            \
74369 +                    __u32                hash UNUSED_ARG)                              \
74370 +{                                                                                      \
74371 +       assert("nikita-2780", hash < table->_buckets);                                  \
74372 +}                                                                                      \
74373 +/* Allocate and zero `buckets` chain heads; returns 0 or RETERR(-ENOMEM). */           \
74374 +static __inline__ int                                                                  \
74375 +PREFIX##_hash_init (PREFIX##_hash_table *hash,                                         \
74376 +                   __u32                buckets)                                       \
74377 +{                                                                                      \
74378 +  hash->_table   = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets);              \
74379 +  hash->_buckets = buckets;                                                            \
74380 +  if (hash->_table == NULL)                                                            \
74381 +    {                                                                                  \
74382 +      return RETERR(-ENOMEM);                                                          \
74383 +    }                                                                                  \
74384 +  memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets);                             \
74385 +  ON_DEBUG(printk(#PREFIX "_hash_table: %u buckets\n", buckets));                      \
74386 +  return 0;                                                                            \
74387 +}                                                                                      \
74388 +/* Free the bucket array; if REISER4_DEBUG, first assert that every chain is empty. */ \
74389 +static __inline__ void                                                                 \
74390 +PREFIX##_hash_done (PREFIX##_hash_table *hash)                                         \
74391 +{                                                                                      \
74392 +  if (REISER4_DEBUG && hash->_table != NULL) {                                          \
74393 +           __u32 i;                                                                    \
74394 +           for (i = 0 ; i < hash->_buckets ; ++ i)                                     \
74395 +                   assert("nikita-2905", hash->_table[i] == NULL);                     \
74396 +  }                                                                                     \
74397 +  if (hash->_table != NULL)                                                            \
74398 +    KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets);                                \
74399 +  hash->_table = NULL;                                                                 \
74400 +}                                                                                      \
74401 +/* Prefetch the item that follows `item` in its chain. */                              \
74402 +static __inline__ void                                                                 \
74403 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item)                                          \
74404 +{                                                                                      \
74405 +       prefetch(item->LINK_NAME._next);                                                \
74406 +}                                                                                      \
74407 +/* Prefetch the head item of bucket `index`. */                                        \
74408 +static __inline__ void                                                                 \
74409 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash,                              \
74410 +                              __u32                index)                              \
74411 +{                                                                                      \
74412 +       prefetch(hash->_table[index]);                                                  \
74413 +}                                                                                      \
74414 +/* Walk chain `hash_index`; return the first item whose key EQ_FUNC-matches, or NULL. */ \
74415 +static __inline__ ITEM_TYPE*                                                           \
74416 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash,                                   \
74417 +                         __u32                hash_index,                              \
74418 +                         KEY_TYPE const      *find_key)                                \
74419 +{                                                                                      \
74420 +  ITEM_TYPE *item;                                                                     \
74421 +                                                                                       \
74422 +  PREFIX##_check_hash(hash, hash_index);                                               \
74423 +                                                                                       \
74424 +  for (item  = hash->_table[hash_index];                                               \
74425 +       item != NULL;                                                                   \
74426 +       item  = item->LINK_NAME._next)                                                  \
74427 +    {                                                                                  \
74428 +      prefetch(item->LINK_NAME._next);                                                 \
74429 +      prefetch((char *)item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME));         \
74430 +      if (EQ_FUNC (& item->KEY_NAME, find_key))                                                \
74431 +        {                                                                              \
74432 +          return item;                                                                 \
74433 +        }                                                                              \
74434 +    }                                                                                  \
74435 +                                                                                       \
74436 +  return NULL;                                                                         \
74437 +}                                                                                      \
74438 +/* Like _hash_find_index, but a hit is also moved to the head of its chain. */         \
74439 +static __inline__ ITEM_TYPE*                                                           \
74440 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash,                               \
74441 +                             __u32                hash_index,                          \
74442 +                             KEY_TYPE const      *find_key)                            \
74443 +{                                                                                      \
74444 +  ITEM_TYPE ** item = &hash->_table[hash_index];                                        \
74445 +                                                                                       \
74446 +  PREFIX##_check_hash(hash, hash_index);                                               \
74447 +                                                                                        \
74448 +  while (*item != NULL) {                                                               \
74449 +    prefetch(&(*item)->LINK_NAME._next);                                               \
74450 +    if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) {                                       \
74451 +      ITEM_TYPE *found;                                                                \
74452 +                                                                                       \
74453 +      found = *item;                                                                   \
74454 +      *item = found->LINK_NAME._next;                                                   \
74455 +      found->LINK_NAME._next = hash->_table[hash_index];                               \
74456 +      hash->_table[hash_index] = found;                                                        \
74457 +      return found;                                                                     \
74458 +    }                                                                                   \
74459 +    item = &(*item)->LINK_NAME._next;                                                   \
74460 +  }                                                                                    \
74461 +  return NULL;                                                                         \
74462 +}                                                                                      \
74463 +/* Unlink `del_item` (matched by pointer) from chain `hash_index`; 1 if found, else 0. */ \
74464 +static __inline__ int                                                                  \
74465 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash,                                 \
74466 +                           __u32                hash_index,                            \
74467 +                           ITEM_TYPE           *del_item)                              \
74468 +{                                                                                      \
74469 +  ITEM_TYPE ** hash_item_p = &hash->_table[hash_index];                                 \
74470 +                                                                                       \
74471 +  PREFIX##_check_hash(hash, hash_index);                                               \
74472 +                                                                                        \
74473 +  while (*hash_item_p != NULL) {                                                        \
74474 +    prefetch(&(*hash_item_p)->LINK_NAME._next);                                                \
74475 +    if (*hash_item_p == del_item) {                                                     \
74476 +      *hash_item_p = (*hash_item_p)->LINK_NAME._next;                                   \
74477 +      return 1;                                                                         \
74478 +    }                                                                                   \
74479 +    hash_item_p = &(*hash_item_p)->LINK_NAME._next;                                     \
74480 +  }                                                                                    \
74481 +  return 0;                                                                            \
74482 +}                                                                                      \
74483 +/* Push `ins_item` at the head of chain `hash_index`. */                               \
74484 +static __inline__ void                                                                 \
74485 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash,                                 \
74486 +                           __u32                hash_index,                            \
74487 +                           ITEM_TYPE           *ins_item)                              \
74488 +{                                                                                      \
74489 +  PREFIX##_check_hash(hash, hash_index);                                               \
74490 +                                                                                       \
74491 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
74492 +  hash->_table[hash_index]  = ins_item;                                                        \
74493 +}                                                                                      \
74494 +/* Same, with smp_wmb() between setting the item's _next and making it the head. */    \
74495 +static __inline__ void                                                                 \
74496 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash,                             \
74497 +                               __u32                hash_index,                        \
74498 +                               ITEM_TYPE           *ins_item)                          \
74499 +{                                                                                      \
74500 +  PREFIX##_check_hash(hash, hash_index);                                               \
74501 +                                                                                       \
74502 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
74503 +  smp_wmb();                                                                           \
74504 +  hash->_table[hash_index]  = ins_item;                                                        \
74505 +}                                                                                      \
74506 +/* The wrappers below obtain the bucket index from HASH_FUNC(hash, key). */            \
74507 +static __inline__ ITEM_TYPE*                                                           \
74508 +PREFIX##_hash_find (PREFIX##_hash_table *hash,                                         \
74509 +                   KEY_TYPE const      *find_key)                                      \
74510 +{                                                                                      \
74511 +  return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key);         \
74512 +}                                                                                      \
74513 +                                                                                       \
74514 +static __inline__ ITEM_TYPE*                                                           \
74515 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash,                                     \
74516 +                       KEY_TYPE const      *find_key)                                  \
74517 +{                                                                                      \
74518 +  return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key);     \
74519 +}                                                                                      \
74520 +                                                                                       \
74521 +static __inline__ int                                                                  \
74522 +PREFIX##_hash_remove (PREFIX##_hash_table *hash,                                       \
74523 +                     ITEM_TYPE           *del_item)                                    \
74524 +{                                                                                      \
74525 +  return PREFIX##_hash_remove_index (hash,                                             \
74526 +                                     HASH_FUNC(hash, &del_item->KEY_NAME), del_item);  \
74527 +}                                                                                      \
74528 +/* Plain forwarder to _hash_remove; it issues no barrier of its own. */                \
74529 +static __inline__ int                                                                  \
74530 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash,                                   \
74531 +                     ITEM_TYPE           *del_item)                                    \
74532 +{                                                                                      \
74533 +  return PREFIX##_hash_remove (hash, del_item);                                                \
74534 +}                                                                                      \
74535 +                                                                                       \
74536 +static __inline__ void                                                                 \
74537 +PREFIX##_hash_insert (PREFIX##_hash_table *hash,                                       \
74538 +                     ITEM_TYPE           *ins_item)                                    \
74539 +{                                                                                      \
74540 +  PREFIX##_hash_insert_index (hash,                                                    \
74541 +                              HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item);         \
74542 +}                                                                                      \
74543 +                                                                                       \
74544 +static __inline__ void                                                                 \
74545 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash,                                   \
74546 +                         ITEM_TYPE           *ins_item)                                \
74547 +{                                                                                      \
74548 +  PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME),          \
74549 +                                  ins_item);                                           \
74550 +}                                                                                      \
74551 +/* First item found scanning buckets from `ind` upwards, or NULL if all are empty. */  \
74552 +static __inline__ ITEM_TYPE *                                                          \
74553 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind)                             \
74554 +{                                                                                      \
74555 +  ITEM_TYPE *first;                                                                    \
74556 +                                                                                       \
74557 +  for (first = NULL; ind < hash->_buckets; ++ ind) {                                   \
74558 +    first = hash->_table[ind];                                                         \
74559 +    if (first != NULL)                                                                 \
74560 +      break;                                                                           \
74561 +  }                                                                                    \
74562 +  return first;                                                                                \
74563 +}                                                                                      \
74564 +/* Successor of `item`: next in its chain, else the first item of a later bucket. */   \
74565 +static __inline__ ITEM_TYPE *                                                          \
74566 +PREFIX##_hash_next (PREFIX##_hash_table *hash,                                         \
74567 +                   ITEM_TYPE           *item)                                          \
74568 +{                                                                                      \
74569 +  ITEM_TYPE  *next;                                                                    \
74570 +                                                                                       \
74571 +  if (item == NULL)                                                                    \
74572 +    return NULL;                                                                       \
74573 +  next = item->LINK_NAME._next;                                                                \
74574 +  if (next == NULL)                                                                    \
74575 +    next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1);           \
74576 +  return next;                                                                         \
74577 +}                                                                                      \
74578 +/* Trailing typedef: lets the macro invocation be terminated with a ';'. */            \
74579 +typedef struct {} PREFIX##_hash_dummy
74580 +/* Iterate `head` over the address of every bucket head slot of `table`. */
74581 +#define for_all_ht_buckets(table, head)                                        \
74582 +for ((head) = &(table) -> _table[ 0 ] ;                                        \
74583 +     (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74584 +/* Walk the chain at *bucket; `next` is read before the loop body runs. */
74585 +#define for_all_in_bucket(bucket, item, next, field)                           \
74586 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ;      \
74587 +     (item) != NULL ;                                                          \
74588 +     (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74589 +/* Walk every item of `table`; `next` is computed before the loop body runs. */
74590 +#define for_all_in_htable(table, prefix, item, next)   \
74591 +for ((item) = prefix ## _hash_first ((table), 0),      \
74592 +     (next) = prefix ## _hash_next ((table), (item)) ; \
74593 +     (item) != NULL ;                                  \
74594 +     (item) = (next),                                  \
74595 +     (next) = prefix ## _hash_next ((table), (item)))
74596 +
74597 +/* __REISER4_TYPE_SAFE_HASH_H__ */
74598 +#endif
74599 +
74600 +/* Make Linus happy.
74601 +   Local variables:
74602 +   c-indentation-style: "K&R"
74603 +   mode-name: "LC"
74604 +   c-basic-offset: 8
74605 +   tab-width: 8
74606 +   fill-column: 120
74607 +   End:
74608 +*/
74609 diff -urN linux-2.6.35.orig/fs/reiser4/vfs_ops.c linux-2.6.35/fs/reiser4/vfs_ops.c
74610 --- linux-2.6.35.orig/fs/reiser4/vfs_ops.c      1970-01-01 01:00:00.000000000 +0100
74611 +++ linux-2.6.35/fs/reiser4/vfs_ops.c   2010-08-04 15:44:57.000000000 +0200
74612 @@ -0,0 +1,267 @@
74613 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74614 + * reiser4/README */
74615 +
74616 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74617 +   here. */
74618 +
74619 +#include "forward.h"
74620 +#include "debug.h"
74621 +#include "dformat.h"
74622 +#include "coord.h"
74623 +#include "plugin/item/item.h"
74624 +#include "plugin/file/file.h"
74625 +#include "plugin/security/perm.h"
74626 +#include "plugin/disk_format/disk_format.h"
74627 +#include "plugin/plugin.h"
74628 +#include "plugin/plugin_set.h"
74629 +#include "plugin/object.h"
74630 +#include "txnmgr.h"
74631 +#include "jnode.h"
74632 +#include "znode.h"
74633 +#include "block_alloc.h"
74634 +#include "tree.h"
74635 +#include "vfs_ops.h"
74636 +#include "inode.h"
74637 +#include "page_cache.h"
74638 +#include "ktxnmgrd.h"
74639 +#include "super.h"
74640 +#include "reiser4.h"
74641 +#include "entd.h"
74642 +#include "status_flags.h"
74643 +#include "flush.h"
74644 +#include "dscale.h"
74645 +
74646 +#include <linux/profile.h>
74647 +#include <linux/types.h>
74648 +#include <linux/mount.h>
74649 +#include <linux/vfs.h>
74650 +#include <linux/mm.h>
74651 +#include <linux/buffer_head.h>
74652 +#include <linux/dcache.h>
74653 +#include <linux/list.h>
74654 +#include <linux/pagemap.h>
74655 +#include <linux/slab.h>
74656 +#include <linux/seq_file.h>
74657 +#include <linux/init.h>
74658 +#include <linux/module.h>
74659 +#include <linux/writeback.h>
74660 +#include <linux/blkdev.h>
74661 +#include <linux/quotaops.h>
74662 +#include <linux/security.h>
74663 +#include <linux/reboot.h>
74664 +#include <linux/rcupdate.h>
74665 +
74666 +/* update inode stat-data by calling plugin */
74667 +int reiser4_update_sd(struct inode *object)
74668 +{
74669 +       file_plugin *plug;
74670 +
74671 +       assert("nikita-2338", object != NULL);
74672 +       /* on a read-only file system nothing is written: report success */
74673 +       if (!IS_RDONLY(object)) {
74674 +               plug = inode_file_plugin(object);
74675 +               assert("nikita-2339", plug != NULL);
74676 +               return plug->write_sd_by_inode(object);
74677 +       }
74678 +       return 0;
74679 +}
74680 +
74681 +/* helper function: increase inode nlink count and call plugin method to save
74682 +   updated stat-data.
74683 +
74684 +   Used by link/create and during creation of dot and dotdot in mkdir
74685 +*/
74686 +int reiser4_add_nlink(struct inode *object /* object that gains a link */ ,
74687 +                     struct inode *parent /* parent where the new entry
74688 +                                           * will be */ ,
74689 +                     int write_sd_p    /* non-zero: also save the updated
74690 +                                        * stat-data */ )
74691 +{
74692 +       file_plugin *plug;
74693 +       int ret;
74694 +
74695 +       assert("nikita-1351", object != NULL);
74696 +
74697 +       plug = inode_file_plugin(object);
74698 +       assert("nikita-1445", plug != NULL);
74699 +
74700 +       /* the plugin decides whether one more link to this object is allowed */
74701 +       if (!plug->can_add_link(object))
74702 +               return RETERR(-EMLINK);
74703 +
74704 +       assert("nikita-2211", plug->add_link != NULL);
74705 +       /* the plugin performs the actual addition of the link */
74706 +       ret = plug->add_link(object, parent);
74707 +       if (ret != 0)
74708 +               return ret;
74709 +       /* the stat-data update is optional */
74710 +       if (!write_sd_p)
74711 +               return 0;
74712 +       return plug->write_sd_by_inode(object);
74713 +}
74714 +
74715 +/* helper function: decrease inode nlink count and call plugin method to save
74716 +   updated stat-data. Returns the result of ->rem_link() or, when that
74717 +   succeeded and @write_sd_p is set, the result of the stat-data write.
74718 +   Used by unlink/create
74719 +*/
74720 +int reiser4_del_nlink(struct inode *object     /* object from which link is
74721 +                                                * removed */ ,
74722 +                     struct inode *parent /* parent where entry was */ ,
74723 +                     int write_sd_p    /* true if stat-data has to be
74724 +                                        * updated */ )
74725 +{
74726 +       file_plugin *fplug;
74727 +       int result;
74728 +
74729 +       assert("nikita-1349", object != NULL);
74730 +
74731 +       fplug = inode_file_plugin(object);
74732 +       assert("nikita-1350", fplug != NULL);
74733 +       assert("nikita-1446", object->i_nlink > 0);
74734 +       assert("nikita-2210", fplug->rem_link != NULL);
74735 +
74736 +       /* call plugin to do actual deletion of link */
74737 +       result = fplug->rem_link(object, parent);
74738 +
74739 +       /* optionally update stat data (only if the link was removed) */
74740 +       if (result == 0 && write_sd_p)
74741 +               result = fplug->write_sd_by_inode(object);
74742 +       return result;
74743 +}
74744 +
74745 +/* Release reiser4 dentry by freeing its fs-private data. This is the d_op->d_release() method (see reiser4_dentry_operations). */
74746 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74747 +{
74748 +       reiser4_free_dentry_fsdata(dentry);
74749 +}
74750 +
74751 +/*
74752 + * Called by reiser4_sync_inodes(), during speculative write-back (through
74753 + * pdflush, or balance_dirty_pages()). Flush errors are only logged.
74754 + */
74755 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74756 +{
74757 +       long written = 0; /* NOTE(review): accumulated below but never read */
74758 +       int repeats = 0; /* NOTE(review): incremented below but never read */
74759 +       int result;
74760 +       struct address_space *mapping;
74761 +
74762 +       /*
74763 +        * Performs early flushing, trying to free some memory. If there is
74764 +        * nothing to flush, commits some atoms.
74765 +        */
74766 +
74767 +       /* Commit all atoms when sync_mode is not WB_SYNC_NONE, i.e. when
74768 +          called on behalf of sys_sync() or sys_fsync(). */
74769 +       if (wbc->sync_mode != WB_SYNC_NONE) {
74770 +               txnmgr_force_commit_all(sb, 0);
74771 +               return;
74772 +       }
74773 +
74774 +       BUG_ON(reiser4_get_super_fake(sb) == NULL);
74775 +       mapping = reiser4_get_super_fake(sb)->i_mapping;
74776 +       do {
74777 +               long nr_submitted = 0;
74778 +               jnode *node = NULL;
74779 +
74780 +               /* non-blocking caller: stop rather than overload a congested write queue */
74781 +               if (wbc->nonblocking &&
74782 +                   bdi_write_congested(mapping->backing_dev_info)) {
74783 +                       blk_run_address_space(mapping);
74784 +                       wbc->encountered_congestion = 1;
74785 +                       break;
74786 +               }
74787 +               repeats++;
74788 +               BUG_ON(wbc->nr_to_write <= 0);
74789 +
74790 +               if (get_current_context()->entd) {
74791 +                       entd_context *ent = get_entd_context(sb);
74792 +
74793 +                       if (ent->cur_request->node)
74794 +                               /*
74795 +                                * this is the ent thread and it managed to
74796 +                                * capture the requested page itself - start
74797 +                                * the flush from that page
74798 +                                */
74799 +                               node = ent->cur_request->node;
74800 +               }
74801 +
74802 +               result = flush_some_atom(node, &nr_submitted, wbc,
74803 +                                        JNODE_FLUSH_WRITE_BLOCKS);
74804 +               if (result != 0)
74805 +                       warning("nikita-31001", "Flush failed: %i", result);
74806 +               if (node)
74807 +                       /* drop the reference acquired
74808 +                          in find_or_create_extent() */
74809 +                       jput(node);
74810 +               if (!nr_submitted)
74811 +                       break;
74812 +
74813 +               wbc->nr_to_write -= nr_submitted;
74814 +               written += nr_submitted;
74815 +       } while (wbc->nr_to_write > 0);
74816 +}
74817 +
74818 +/* tell VM how many pages (@nrpages) were dirtied in @inode's mapping; restarts the current transaction first */
74819 +void reiser4_throttle_write(struct inode *inode, int nrpages)
74820 +{
74821 +       reiser4_context *ctx;
74822 +
74823 +       ctx = get_current_context();
74824 +       reiser4_txn_restart(ctx);
74825 +       current->journal_info = NULL; /* cleared for the duration of the VM call, restored below */
74826 +       balance_dirty_pages_ratelimited_nr(inode->i_mapping, nrpages);
74827 +       current->journal_info = ctx;
74828 +}
74829 +
74830 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74831 +const int REISER4_MAGIC_OFFSET = 16 * 4096;    /* offset (65536) to magic string
74832 +                                                * from the beginning of device */
74833 +
74834 +/*
74835 + * Reiser4 initialization/shutdown.
74836 + *
74837 + * Code below performs global reiser4 initialization that is done either as
74838 + * part of kernel initialization (when reiser4 is statically built-in), or
74839 + * during reiser4 module load (when compiled as module).
74840 + */
74841 +
74842 +void reiser4_handle_error(void)
74843 +{
74844 +       struct super_block *sb = reiser4_get_current_sb();
74845 +
74846 +       if (!sb) /* no current super block: nothing to mark or remount */
74847 +               return;
74848 +       reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74849 +                            "Filesystem error occured");
74850 +       switch (get_super_private(sb)->onerror) { /* per-fs error policy */
74851 +       case 0: /* NOTE(review): no break below; presumably reiser4_panic() does not return - confirm */
74852 +               reiser4_panic("foobar-42", "Filesystem error occured\n");
74853 +       case 1:
74854 +       default: /* any other policy: flag the super block read-only, once */
74855 +               if (sb->s_flags & MS_RDONLY)
74856 +                       return;
74857 +               sb->s_flags |= MS_RDONLY;
74858 +               break;
74859 +       }
74860 +}
74861 +
74862 +struct dentry_operations reiser4_dentry_operations = { /* only ->d_release is provided */
74863 +       .d_revalidate = NULL,
74864 +       .d_hash = NULL,
74865 +       .d_compare = NULL,
74866 +       .d_delete = NULL,
74867 +       .d_release = reiser4_d_release, /* frees the dentry's fs-private data */
74868 +       .d_iput = NULL,
74869 +};
74870 +
74871 +/* Make Linus happy.
74872 +   Local variables:
74873 +   c-indentation-style: "K&R"
74874 +   mode-name: "LC"
74875 +   c-basic-offset: 8
74876 +   tab-width: 8
74877 +   fill-column: 120
74878 +   End:
74879 +*/
74880 diff -urN linux-2.6.35.orig/fs/reiser4/vfs_ops.h linux-2.6.35/fs/reiser4/vfs_ops.h
74881 --- linux-2.6.35.orig/fs/reiser4/vfs_ops.h      1970-01-01 01:00:00.000000000 +0100
74882 +++ linux-2.6.35/fs/reiser4/vfs_ops.h   2010-08-04 15:44:57.000000000 +0200
74883 @@ -0,0 +1,53 @@
74884 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74885 + * reiser4/README */
74886 +
74887 +/* vfs_ops.c's exported symbols */
74888 +
74889 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
74890 +#define __FS_REISER4_VFS_OPS_H__
74891 +
74892 +#include "forward.h"
74893 +#include "coord.h"
74894 +#include "seal.h"
74895 +#include "plugin/file/file.h"
74896 +#include "super.h"
74897 +#include "readahead.h"
74898 +
74899 +#include <linux/types.h>       /* for loff_t */
74900 +#include <linux/fs.h>          /* for struct address_space */
74901 +#include <linux/dcache.h>      /* for struct dentry */
74902 +#include <linux/mm.h>
74903 +#include <linux/backing-dev.h>
74904 +
74905 +/* address space operations */
74906 +int reiser4_writepage(struct page *, struct writeback_control *);
74907 +int reiser4_set_page_dirty(struct page *);
74908 +void reiser4_invalidatepage(struct page *, unsigned long offset);
74909 +int reiser4_releasepage(struct page *, gfp_t);
74910 +
74911 +extern int reiser4_update_sd(struct inode *); /* write stat-data via the file plugin */
74912 +extern int reiser4_add_nlink(struct inode *, struct inode *, int); /* (object, parent, write_sd_p) */
74913 +extern int reiser4_del_nlink(struct inode *, struct inode *, int); /* (object, parent, write_sd_p) */
74914 +
74915 +extern int reiser4_start_up_io(struct page *page);
74916 +extern void reiser4_throttle_write(struct inode *, int nrpages);
74917 +extern int jnode_is_releasable(jnode *);
74918 +
74919 +#define CAPTURE_APAGE_BURST (1024l) /* NOTE: the suffix is a lowercase 'L' (long constant 1024) */
74920 +void reiser4_writeout(struct super_block *, struct writeback_control *);
74921 +
74922 +extern void reiser4_handle_error(void); /* panic or flag the fs read-only, per its onerror setting */
74923 +
74924 +/* __FS_REISER4_VFS_OPS_H__ */
74925 +#endif
74926 +
74927 +/* Make Linus happy.
74928 +   Local variables:
74929 +   c-indentation-style: "K&R"
74930 +   mode-name: "LC"
74931 +   c-basic-offset: 8
74932 +   tab-width: 8
74933 +   fill-column: 120
74934 +   scroll-step: 1
74935 +   End:
74936 +*/
74937 diff -urN linux-2.6.35.orig/fs/reiser4/wander.c linux-2.6.35/fs/reiser4/wander.c
74938 --- linux-2.6.35.orig/fs/reiser4/wander.c       1970-01-01 01:00:00.000000000 +0100
74939 +++ linux-2.6.35/fs/reiser4/wander.c    2010-08-04 15:44:57.000000000 +0200
74940 @@ -0,0 +1,1798 @@
74941 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74942 + * reiser4/README */
74943 +
74944 +/* Reiser4 Wandering Log */
74945 +
74946 +/* You should read http://www.namesys.com/txn-doc.html
74947 +
74948 +   That describes how filesystem operations are performed as atomic
74949 +   transactions, and how we try to arrange it so that we can write most of the
74950 +   data only once while performing the operation atomically.
74951 +
74952 +   For the purposes of this code, it is enough for it to understand that it
74953 +   has been told a given block should be written either once, or twice (if
74954 +   twice then once to the wandered location and once to the real location).
74955 +
74956 +   This code guarantees that those blocks that are defined to be part of an
74957 +   atom either all take effect or none of them take effect.
74958 +
74959 +   The "relocate set" of nodes are submitted to write by the jnode_flush()
74960 +   routine, and the "overwrite set" is submitted by reiser4_write_log().
74961 +   This is because with the overwrite set we seek to optimize writes, and
74962 +   with the relocate set we seek to cause disk order to correlate with the
74963 +   "parent first order" (preorder).
74964 +
74965 +   reiser4_write_log() allocates and writes wandered blocks and maintains
74966 +   additional on-disk structures of the atom as wander records (each wander
74967 +   record occupies one block) for storing of the "wandered map" (a table which
74968 +   contains a relation between wandered and real block numbers) and other
74969 +   information which might be needed at transaction recovery time.
74970 +
74971 +   The wander records are unidirectionally linked into a circle: each wander
74972 +   record contains a block number of the next wander record, the last wander
74973 +   record points to the first one.
74974 +
74975 +   One wander record (named "tx head" in this file) has a format which is
74976 +   different from the other wander records. The "tx head" has a reference to the
74977 +   "tx head" block of the previously committed atom.  Also, "tx head" contains
74978 +   fs information (the free blocks counter, and the oid allocator state) which
74979 +   is logged in a special way.
74980 +
74981 +   There are two journal control blocks, named journal header and journal
74982 +   footer which have fixed on-disk locations.  The journal header has a
74983 +   reference to the "tx head" block of the last committed atom.  The journal
74984 +   footer points to the "tx head" of the last flushed atom.  The atom is
74985 +   "played" when all blocks from its overwrite set are written to disk the
74986 +   second time (i.e. written to their real locations).
74987 +
74988 +   NOTE: People who know reiserfs internals and its journal structure might be
74989 +   confused with these terms journal footer and journal header. There is a table
74990 +   with terms of similar semantics in reiserfs (reiser3) and reiser4:
74991 +
74992 +   REISER3 TERM        |  REISER4 TERM         | DESCRIPTION
74993 +   --------------------+-----------------------+----------------------------
74994 +   commit record       |  journal header       | atomic write of this record
74995 +                       |                       | ends transaction commit
74996 +   --------------------+-----------------------+----------------------------
74997 +   journal header      |  journal footer       | atomic write of this record
74998 +                       |                       | ends post-commit writes.
74999 +                       |                       | After this record is
75000 +                       |                       | written successfully,
75001 +                       |                       | journal blocks (reiser3) or
75002 +                       |                       | wandered blocks/records are
75003 +                       |                       | free for re-use.
75004 +   --------------------+-----------------------+----------------------------
75005 +
75006 +   The atom commit process is the following:
75007 +
75008 +   1. The overwrite set is taken from atom's clean list, and its size is
75009 +      counted.
75010 +
75011 +   2. The number of necessary wander records (including tx head) is calculated,
75012 +      and the wander record blocks are allocated.
75013 +
75014 +   3. Allocate wandered blocks and populate wander records by wandered map.
75015 +
75016 +   4. submit write requests for wander records and wandered blocks.
75017 +
75018 +   5. wait until submitted write requests complete.
75019 +
75020 +   6. update journal header: change the pointer to the block number of just
75021 +   written tx head, submit an i/o for modified journal header block and wait
75022 +   for i/o completion.
75023 +
75024 +   NOTE: The special logging for bitmap blocks and some reiser4 super block
75025 +   fields makes processes of atom commit, flush and recovering a bit more
75026 +   complex (see comments in the source code for details).
75027 +
75028 +   The atom playing process is the following:
75029 +
75030 +   1. Write atom's overwrite set in-place.
75031 +
75032 +   2. Wait on i/o.
75033 +
75034 +   3. Update journal footer: change the pointer to the block number of the tx
75035 +   head block of the atom we are currently flushing, submit an i/o, wait on i/o
75036 +   completion.
75037 +
75038 +   4. Free disk space which was used for wandered blocks and wander records.
75039 +
75040 +   After the wandered blocks and wander records are freed, the journal footer
75041 +   points to an on-disk structure which might be overwritten soon.
75042 +   Neither the log writer nor the journal recovery procedure use that pointer
75043 +   for accessing the data.  When the journal recovery procedure finds the oldest
75044 +   transaction it compares the journal footer pointer value with the "prev_tx"
75045 +   pointer value in tx head, if values are equal the oldest not flushed
75046 +   transaction is found.
75047 +
75048 +   NOTE on disk space leakage: the information about which blocks, and how many
75049 +   blocks, are allocated for wandered blocks and wander records is not written
75050 +   to the disk because of special logging for bitmaps and some super block
75051 +   counters.  After a system crash reiser4 does not remember the allocation of
75052 +   those objects, thus there is no disk space leakage of this kind.
75053 +*/
75054 +
75055 +/* Special logging of reiser4 super block fields. */
75056 +
75057 +/* There are some reiser4 super block fields (free block count and OID allocator
75058 +   state (number of files and next free OID) which are logged separately from
75059 +   super block to avoid unnecessary atom fusion.
75060 +
75061 +   So, the reiser4 super block need not be captured by a transaction which
75062 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
75063 +   the reiser4 on-disk super block is not touched when such a transaction is
75064 +   committed and flushed.  Those "counters logged specially" are logged in "tx
75065 +   head" blocks and in the journal footer block.
75066 +
75067 +   A step-by-step description of special logging:
75068 +
75069 +   0. The per-atom information about deleted or created files and allocated or
75070 +   freed blocks is collected during the transaction.  The atom's
75071 +   ->nr_objects_created and ->nr_objects_deleted are for object
75072 +   deletion/creation tracking, the numbers of allocated and freed blocks are
75073 +   calculated using atom's delete set and atom's capture list -- all new and
75074 +   relocated nodes should be on atom's clean list and should have JNODE_RELOC
75075 +   bit set.
75076 +
75077 +   1. The "logged specially" reiser4 super block fields have their "committed"
75078 +   versions in the reiser4 in-memory super block.  They get modified only at
75079 +   atom commit time.  The atom's commit thread has an exclusive access to those
75080 +   "committed" fields because the log writer implementation supports only one
75081 +   atom commit at a time (there is a per-fs "commit" mutex).  At
75082 +   that time "committed" counters are modified using per-atom information
75083 +   collected during the transaction. These counters are stored on disk as a
75084 +   part of tx head block when atom is committed.
75085 +
75086 +   2. When the atom is flushed the value of the free block counter and the OID
75087 +   allocator state get written to the journal footer block.  A special journal
75088 +   procedure (journal_recover_sb_data()) takes those values from the journal
75089 +   footer and updates the reiser4 in-memory super block.
75090 +
75091 +   NOTE: That means free block count and OID allocator state are logged
75092 +   separately from the reiser4 super block regardless of the fact that the
75093 +   reiser4 super block has fields to store both the free block counter and the
75094 +   OID allocator.
75095 +
75096 +   Writing the whole super block at commit time requires knowing true values of
75097 +   all its fields without changes made by not yet committed transactions. It is
75098 +   possible by having their "committed" version of the super block like the
75099 +   reiser4 bitmap blocks have "committed" and "working" versions.  However,
75100 +   another scheme was implemented which stores special logged values in the
75101 +   unused free space inside transaction head block.  In my opinion it has an
75102 +   advantage of not writing whole super block when only part of it was
75103 +   modified. */
75104 +
75105 +#include "debug.h"
75106 +#include "dformat.h"
75107 +#include "txnmgr.h"
75108 +#include "jnode.h"
75109 +#include "znode.h"
75110 +#include "block_alloc.h"
75111 +#include "page_cache.h"
75112 +#include "wander.h"
75113 +#include "reiser4.h"
75114 +#include "super.h"
75115 +#include "vfs_ops.h"
75116 +#include "writeout.h"
75117 +#include "inode.h"
75118 +#include "entd.h"
75119 +
75120 +#include <linux/types.h>
75121 +#include <linux/fs.h>          /* for struct super_block  */
75122 +#include <linux/mm.h>          /* for struct page */
75123 +#include <linux/pagemap.h>
75124 +#include <linux/bio.h>         /* for struct bio */
75125 +#include <linux/blkdev.h>
75126 +
75127 +static int write_jnodes_to_disk_extent( /* forward declaration: used by the journal header/footer writers below */
75128 +       jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
75129 +
75130 +/* The commit_handle is a container for objects needed at atom commit time  */
75131 +struct commit_handle {
75132 +       /* A pointer to atom's list of OVRWR nodes */
75133 +       struct list_head *overwrite_set;
75134 +       /* atom's overwrite set size (must be non-zero for get_tx_size()) */
75135 +       int overwrite_set_size;
75136 +       /* jnodes for wander record blocks; the first entry is the tx head */
75137 +       struct list_head tx_list;
75138 +       /* number of wander records, tx head included (see get_tx_size()) */
75139 +       __u32 tx_size;
75140 +       /* 'committed' sb counters are saved here until atom is completely
75141 +          flushed; they are stored in the tx head and the journal footer */
75142 +       __u64 free_blocks;
75143 +       __u64 nr_files;
75144 +       __u64 next_oid;
75145 +       /* A pointer to the atom which is being committed */
75146 +       txn_atom *atom;
75147 +       /* A pointer to current super block */
75148 +       struct super_block *super;
75149 +       /* The counter of modified bitmaps */
75150 +       reiser4_block_nr nr_bitmap;
75151 +};
75152 +
75153 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
75154 +{ /* zero @ch, give it an empty tx_list, bind it to @atom and the current super block */
75155 +       memset(ch, 0, sizeof(struct commit_handle)); /* also zeroes tx_size and overwrite_set_size */
75156 +       INIT_LIST_HEAD(&ch->tx_list);
75157 +
75158 +       ch->atom = atom;
75159 +       ch->super = reiser4_get_current_sb();
75160 +}
75161 +
75162 +static void done_commit_handle(struct commit_handle *ch)
75163 +{ /* releases nothing: only asserts that tx_list has already been emptied */
75164 +       assert("zam-690", list_empty(&ch->tx_list));
75165 +}
75166 +
75167 +static inline int reiser4_use_write_barrier(struct super_block * s)
75168 +{ /* non-zero unless REISER4_NO_WRITE_BARRIER is set for @s (see disable_write_barrier()) */
75169 +       return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
75170 +}
75171 +
75172 +static void disable_write_barrier(struct super_block * s)
75173 +{ /* log a notice naming the device, then set REISER4_NO_WRITE_BARRIER in the fs flags */
75174 +       notice("zam-1055", "%s does not support write barriers,"
75175 +              " using synchronous write instead.", s->s_id);
75176 +       set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
75177 +}
75178 +
75179 +/* fill journal header block data: store the tx head's block number (little-endian) in ->last_committed_tx */
75180 +static void format_journal_header(struct commit_handle *ch)
75181 +{
75182 +       struct reiser4_super_info_data *sbinfo;
75183 +       struct journal_header *header;
75184 +       jnode *txhead;
75185 +
75186 +       sbinfo = get_super_private(ch->super);
75187 +       assert("zam-479", sbinfo != NULL);
75188 +       assert("zam-480", sbinfo->journal_header != NULL);
75189 +
75190 +       txhead = list_entry(ch->tx_list.next, jnode, capture_link); /* first record on tx_list is the tx head */
75191 +
75192 +       jload(sbinfo->journal_header); /* NOTE(review): result ignored here, while format_journal_footer() wraps its jload() in check_me() */
75193 +
75194 +       header = (struct journal_header *)jdata(sbinfo->journal_header);
75195 +       assert("zam-484", header != NULL);
75196 +
75197 +       put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
75198 +                     &header->last_committed_tx);
75199 +
75200 +       jrelse(sbinfo->journal_header);
75201 +}
75202 +
75203 +/* fill journal footer block data: the tx head's block number plus the saved free_blocks/nr_files/next_oid counters */
75204 +static void format_journal_footer(struct commit_handle *ch)
75205 +{
75206 +       struct reiser4_super_info_data *sbinfo;
75207 +       struct journal_footer *footer;
75208 +       jnode *tx_head;
75209 +
75210 +       sbinfo = get_super_private(ch->super);
75211 +
75212 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link); /* first record on tx_list is the tx head */
75213 +
75214 +       assert("zam-493", sbinfo != NULL);
75215 +       assert("zam-494", sbinfo->journal_footer != NULL); /* the footer is what is loaded and filled below */
75216 +
75217 +       check_me("zam-691", jload(sbinfo->journal_footer) == 0);
75218 +
75219 +       footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
75220 +       assert("zam-495", footer != NULL);
75221 +
75222 +       /* all fields are stored little-endian, with unaligned-safe stores */
75223 +       put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
75224 +                     &footer->last_flushed_tx);
75225 +       put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
75226 +
75227 +       put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
75228 +       put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
75229 +
75230 +       jrelse(sbinfo->journal_footer);
75231 +}
75231 +
75232 +/* wander record capacity depends on current block size: number of wander_entry slots that fit after the record header */
75233 +static int wander_record_capacity(const struct super_block *super)
75234 +{
75235 +       return (super->s_blocksize -
75236 +               sizeof(struct wander_record_header)) /
75237 +           sizeof(struct wander_entry);
75238 +}
75239 +
75240 +/* Fill the first wander record (tx head) from the commit handle: magic, record count, prev tx, next record and sb counters */
75241 +static void format_tx_head(struct commit_handle *ch)
75242 +{
75243 +       jnode *tx_head;
75244 +       jnode *next;
75245 +       struct tx_header *header;
75246 +
75247 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
75248 +       assert("zam-692", &ch->tx_list != &tx_head->capture_link); /* tx_list must not be empty */
75249 +
75250 +       next = list_entry(tx_head->capture_link.next, jnode, capture_link);
75251 +       if (&ch->tx_list == &next->capture_link) /* tx head is the only record: it links to itself */
75252 +               next = tx_head;
75253 +
75254 +       header = (struct tx_header *)jdata(tx_head);
75255 +
75256 +       assert("zam-460", header != NULL);
75257 +       assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
75258 +
75259 +       memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize); /* clear the whole block before filling it */
75260 +       memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
75261 +
75262 +       put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
75263 +       put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
75264 +                     &header->prev_tx);
75265 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
75266 +       put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
75267 +       put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
75268 +       put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
75269 +}
75270 +
75271 +/* prepare ordinary wander record block (fill all service fields): magic, total record count, @serial and next-record link */
75272 +static void
75273 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
75274 +{
75275 +       struct wander_record_header *LRH;
75276 +       jnode *next;
75277 +
75278 +       assert("zam-464", node != NULL);
75279 +
75280 +       LRH = (struct wander_record_header *)jdata(node);
75281 +       next = list_entry(node->capture_link.next, jnode, capture_link);
75282 +
75283 +       if (&ch->tx_list == &next->capture_link) /* last record: wrap around to the first one on tx_list */
75284 +               next = list_entry(ch->tx_list.next, jnode, capture_link);
75285 +
75286 +       assert("zam-465", LRH != NULL);
75287 +       assert("zam-463",
75288 +              ch->super->s_blocksize > sizeof(struct wander_record_header));
75289 +
75290 +       memset(jdata(node), 0, (size_t) ch->super->s_blocksize); /* clear the whole block before filling it */
75291 +       memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75292 +
75293 +       put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75294 +       put_unaligned(cpu_to_le32(serial), &LRH->serial);
75295 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75296 +}
75297 +
75298 +/* add one wandered map entry to formatted wander record: slot @index gets *@a as ->original and *@b as ->wandered */
75299 +static void
75300 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
75301 +           const reiser4_block_nr * b)
75302 +{
75303 +       char *data;
75304 +       struct wander_entry *pairs;
75305 +
75306 +       data = jdata(node);
75307 +       assert("zam-451", data != NULL);
75308 +
75309 +       pairs = /* the entry array starts right after the record header */
75310 +           (struct wander_entry *)(data + sizeof(struct wander_record_header));
75311 +
75312 +       put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75313 +       put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75314 +}
75315 +
75316 +/* currently, wander records contain only the wandered map, so their number
75317 +   depends on the overwrite set size; the result is stored in ch->tx_size */
75318 +static void get_tx_size(struct commit_handle *ch)
75319 +{
75320 +       assert("zam-440", ch->overwrite_set_size != 0);
75321 +       assert("zam-695", ch->tx_size == 0);
75322 +
75323 +       /* count all ordinary wander records
75324 +          (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75325 +          for tx head block */
75326 +       ch->tx_size =
75327 +           (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75328 +           2; /* 1 to round the division up + 1 for the tx head */
75329 +}
75330 +
75331 +/* A special structure for using in store_wmap_actor() for saving its state
75332 +   between calls */
75333 +struct store_wmap_params {
75334 +       jnode *cur;             /* jnode of current wander record to fill */
75335 +       int idx;                /* free element index in wander record  */
75336 +       int capacity;           /* max entries per record; idx is reset when it reaches this */
75337 +
75338 +#if REISER4_DEBUG
75339 +       struct list_head *tx_list; /* list head, used only to assert we never run past the last record */
75340 +#endif
75341 +};
75342 +
75343 +/* an actor for use in blocknr_set_iterator routine which populates the list
75344 +   of pre-formatted wander records by wandered map info; always returns 0 */
75345 +static int
75346 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75347 +                const reiser4_block_nr * b, void *data)
75348 +{
75349 +       struct store_wmap_params *params = data;
75350 +
75351 +       if (params->idx >= params->capacity) {
75352 +               /* current record is full: take the next wander record from the tx_list */
75353 +               params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75354 +               assert("zam-454", /* ->tx_list exists only under REISER4_DEBUG */
75355 +                      params->tx_list != &params->cur->capture_link);
75356 +
75357 +               params->idx = 0;
75358 +       }
75359 +
75360 +       store_entry(params->cur, params->idx, a, b);
75361 +       params->idx++;
75362 +
75363 +       return 0;
75364 +}
75365 +
75366 +/* This function is called after the Relocate set gets written to disk, the
75367 +   Overwrite set is written to wandered locations and all wander records are
75368 +   written too. The updated journal header block contains a pointer (block
75369 +   number) to the first wander record of the just written transaction */
75370 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
75371 +{
75372 +       struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75373 +       jnode *jh = sbinfo->journal_header;
75374 +       jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75375 +       int ret;
75376 +
75377 +       format_journal_header(ch);
75378 +
75379 +       ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75380 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
75381 +       if (ret)
75382 +               return ret;
75383 +
75384 +       /* blk_run_address_space(sbinfo->fake->i_mapping);
75385 +        * blk_run_queues(); */
75386 +
75387 +       ret = jwait_io(jh, WRITE);
75388 +
75389 +       if (ret)
75390 +               return ret;
75391 +
75392 +       sbinfo->last_committed_tx = *jnode_get_block(head); /* recorded only after the write and wait succeeded */
75393 +
75394 +       return 0;
75395 +}
75396 +
75397 +/* This function is called after write-back is finished. It formats and writes
75398 +   the journal footer block and waits for the i/o; it does not itself free the
75399 +   wandered blocks or wander records (see dealloc_wmap(), dealloc_tx_list()) */
75400 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75401 +{
75402 +       reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75403 +
75404 +       jnode *jf = sbinfo->journal_footer;
75405 +
75406 +       int ret;
75407 +
75408 +       format_journal_footer(ch);
75409 +
75410 +       ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75411 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
75412 +       if (ret)
75413 +               return ret;
75414 +
75415 +       /* blk_run_address_space(sbinfo->fake->i_mapping);
75416 +        * blk_run_queue(); */
75417 +
75418 +       ret = jwait_io(jf, WRITE);
75419 +       if (ret)
75420 +               return ret;
75421 +
75422 +       return 0;
75423 +}
75424 +
75425 +/* free wander record blocks of a transaction already written in place */
75426 +static void dealloc_tx_list(struct commit_handle *ch)
75427 +{
75428 +       while (!list_empty(&ch->tx_list)) {
75429 +               jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75430 +               list_del(&cur->capture_link);
75431 +               ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75432 +               reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75433 +                                     BA_FORMATTED);
75434 +
75435 +               unpin_jnode_data(cur);
75436 +               reiser4_drop_io_head(cur);
75437 +       }
75438 +}
75439 +
75440 +/* An actor for blocknr_set_iterator(): frees the wandered block *@b of a
75441 +   wandered map entry. @atom, @a and @data are unused. Always returns 0. */
75442 +static int
75443 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75444 +                  const reiser4_block_nr * a UNUSED_ARG,
75445 +                  const reiser4_block_nr * b, void *data UNUSED_ARG)
75446 +{
75447 +
75448 +       assert("zam-499", b != NULL);
75449 +       assert("zam-500", *b != 0);
75450 +       assert("zam-501", !reiser4_blocknr_is_fake(b));
75451 +
75452 +       reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75453 +       return 0;
75454 +}
75455 +
75456 +/* free wandered block locations of a transaction already written in place */
75457 +static void dealloc_wmap(struct commit_handle *ch)
75458 +{
75459 +       assert("zam-696", ch->atom != NULL);
75460 +
75461 +       blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75462 +                            dealloc_wmap_actor, NULL, 1);
75463 +}
75464 +
75465 +/* helper for alloc_wandered_blocks(): requests @count blocks and stores the
75466 +   start and length reported by reiser4_alloc_blocks() in *@start and *@len */
75467 +static int
75468 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75469 +{
75470 +       reiser4_blocknr_hint hint;
75471 +       int ret;
75472 +
75473 +       reiser4_block_nr wide_len = count;
75474 +
75475 +       /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75476 +          ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75477 +          reserved allocation area so as to get the best qualities of fixed
75478 +          journals? */
75479 +       reiser4_blocknr_hint_init(&hint);
75480 +       hint.block_stage = BLOCK_GRABBED;
75481 +
75482 +       ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75483 +                                  BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75484 +       *len = (int)wide_len;
75485 +
75486 +       return ret;
75487 +}
75488 +
75489 +/*
75490 + * roll back page/jnode state set up for a BIO that will not be submitted.
75491 + */
75492 +static void undo_bio(struct bio *bio)
75493 +{
75494 +       int i;
75495 +
75496 +       for (i = 0; i < bio->bi_vcnt; ++i) {
75497 +               struct page *pg;
75498 +               jnode *node;
75499 +
75500 +               pg = bio->bi_io_vec[i].bv_page;
75501 +               end_page_writeback(pg);
75502 +               node = jprivate(pg);
75503 +               spin_lock_jnode(node);
75504 +               JF_CLR(node, JNODE_WRITEBACK);
75505 +               JF_SET(node, JNODE_DIRTY);
75506 +               spin_unlock_jnode(node);
75507 +       }
75508 +       bio_put(bio);
75509 +}
75510 +
75511 +/* jrelse_tail() every jnode on the commit handle's overwrite set list */
75512 +static void put_overwrite_set(struct commit_handle *ch)
75513 +{
75514 +       jnode *cur;
75515 +
75516 +       list_for_each_entry(cur, ch->overwrite_set, capture_link)
75517 +               jrelse_tail(cur);
75518 +}
75519 +
75520 +/* Count overwrite set size, grab disk space for wandered blocks allocation.
75521 +   Since the atom has a separate overwrite set list we just scan that list and
75522 +   count the bitmap and other non-leaf nodes whose wandered blocks we have to
75523 +   grab space for. Returns the overwrite set size, or an error code. */
75524 +static int get_overwrite_set(struct commit_handle *ch)
75525 +{
75526 +       int ret;
75527 +       jnode *cur;
75528 +       __u64 nr_not_leaves = 0;
75529 +#if REISER4_DEBUG
75530 +       __u64 nr_formatted_leaves = 0;
75531 +       __u64 nr_unformatted_leaves = 0;
75532 +#endif
75533 +
75534 +       assert("zam-697", ch->overwrite_set_size == 0);
75535 +
75536 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75537 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75538 +
75539 +       while (ch->overwrite_set != &cur->capture_link) {
75540 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75541 +
75542 +               /* Count bitmap nodes to get correct statistics on how many
75543 +                * blocks were cleared by the transaction commit. */
75544 +               if (jnode_get_type(cur) == JNODE_BITMAP)
75545 +                       ch->nr_bitmap++;
75546 +
75547 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75548 +                      || jnode_get_type(cur) == JNODE_BITMAP);
75549 +
75550 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75551 +                       /* we replace fake znode by another (real)
75552 +                          znode which is suggested by disk_layout
75553 +                          plugin */
75554 +
75555 +                       /* FIXME: it looks like fake znode should be
75556 +                          replaced by jnode supplied by
75557 +                          disk_layout. */
75558 +
75559 +                       struct super_block *s = reiser4_get_current_sb();
75560 +                       reiser4_super_info_data *sbinfo =
75561 +                           get_current_super_private();
75562 +
75563 +                       if (sbinfo->df_plug->log_super) {
75564 +                               jnode *sj = sbinfo->df_plug->log_super(s);
75565 +
75566 +                               assert("zam-593", sj != NULL);
75567 +
75568 +                               if (IS_ERR(sj))
75569 +                                       return PTR_ERR(sj);
75570 +
75571 +                               spin_lock_jnode(sj);
75572 +                               JF_SET(sj, JNODE_OVRWR);
75573 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
75574 +                               spin_unlock_jnode(sj);
75575 +
75576 +                               /* jload it like the rest; result not checked */
75577 +                               jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75578 +
75579 +                               ch->overwrite_set_size++;
75580 +                       }
75581 +                       spin_lock_jnode(cur);
75582 +                       reiser4_uncapture_block(cur);
75583 +                       jput(cur);
75584 +
75585 +               } else {
75586 +                       int ret;
75587 +                       ch->overwrite_set_size++;
75588 +                       ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75589 +                       if (ret)
75590 +                               reiser4_panic("zam-783",
75591 +                                             "cannot load e-flushed jnode back (ret = %d)\n",
75592 +                                             ret);
75593 +               }
75594 +
75595 +               /* Count non-leaf nodes here because we have to grab disk space
75596 +                * for their wandered blocks. They were not counted as "flush
75597 +                * reserved". Counting should be done _after_ nodes are pinned
75598 +                * into memory by jload(). */
75599 +               if (!jnode_is_leaf(cur))
75600 +                       nr_not_leaves++;
75601 +               else {
75602 +#if REISER4_DEBUG
75603 +                       /* at this point @cur either has JNODE_FLUSH_RESERVED
75604 +                        * or is eflushed. Locking is not strong enough to
75605 +                        * write an assertion checking for this. */
75606 +                       if (jnode_is_znode(cur))
75607 +                               nr_formatted_leaves++;
75608 +                       else
75609 +                               nr_unformatted_leaves++;
75610 +#endif
75611 +                       JF_CLR(cur, JNODE_FLUSH_RESERVED);
75612 +               }
75613 +
75614 +               cur = next;
75615 +       }
75616 +
75617 +       /* Grab space for the wandered blocks of the non-leaf nodes found in
75618 +        * the overwrite set. */
75619 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75620 +       if (ret)
75621 +               return ret;
75622 +
75623 +       /* Disk space for allocation of wandered blocks of leaf nodes already
75624 +        * reserved as "flush reserved", move it to grabbed space counter. */
75625 +       spin_lock_atom(ch->atom);
75626 +       assert("zam-940",
75627 +              nr_formatted_leaves + nr_unformatted_leaves <=
75628 +              ch->atom->flush_reserved);
75629 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75630 +       spin_unlock_atom(ch->atom);
75631 +
75632 +       return ch->overwrite_set_size;
75633 +}
75634 +
75635 +/**
75636 + * write_jnodes_to_disk_extent - submit write request
75637 + * @first: first jnode of the list
75638 + * @nr: number of jnodes on the list
75639 + * @block_p: points to the number of the first disk block to write to
75640 + * @fq: flush queue passed to add_fq_to_bio() for each bio, may be NULL
75641 + * @flags: WRITEOUT_* flags; only WRITEOUT_BARRIER is looked at here, it makes
75642 + *         the bios go out as WRITE_BARRIER instead of WRITE
75643 + *
75644 + * Submits a write request for @nr jnodes beginning from the @first, other
75645 + * jnodes are after the @first on the double-linked "capture" list.  All jnodes
75646 + * will be written to the disk region of @nr blocks starting with @block_p block
75647 + * number.  If @fq is not NULL it means that waiting for i/o completion will be
75648 + * done more efficiently by using flush_queue_t objects.
75649 + * This function is the one which writes list of jnodes in batch mode. It does
75650 + * all low-level things such as bio construction and page state manipulation.
75651 + *
75652 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75653 + * aggregated in this function instead of being left to the layers below
75654 + *
75655 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75656 + * Why that layer needed? Why BIOs cannot be constructed here?
75657 + */
75658 +static int write_jnodes_to_disk_extent(
75659 +       jnode *first, int nr, const reiser4_block_nr *block_p,
75660 +       flush_queue_t *fq, int flags)
75661 +{
75662 +       struct super_block *super = reiser4_get_current_sb();
75663 +       int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75664 +       int max_blocks;
75665 +       jnode *cur = first;
75666 +       reiser4_block_nr block;
75667 +
75668 +       assert("zam-571", first != NULL);
75669 +       assert("zam-572", block_p != NULL);
75670 +       assert("zam-570", nr > 0);
75671 +
75672 +       block = *block_p;
75673 +       max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75674 +
75675 +       while (nr > 0) {
75676 +               struct bio *bio;
75677 +               int nr_blocks = min(nr, max_blocks);
75678 +               int i;
75679 +               int nr_used;
75680 +
75681 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
75682 +               if (!bio)
75683 +                       return RETERR(-ENOMEM);
75684 +
75685 +               bio->bi_bdev = super->s_bdev;
75686 +               bio->bi_sector = block * (super->s_blocksize >> 9);
75687 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75688 +                       struct page *pg;
75689 +
75690 +                       pg = jnode_page(cur);
75691 +                       assert("zam-573", pg != NULL);
75692 +
75693 +                       page_cache_get(pg);
75694 +
75695 +                       lock_and_wait_page_writeback(pg);
75696 +
75697 +                       if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75698 +                               /*
75699 +                                * underlying device is satiated. Stop adding
75700 +                                * pages to the bio.
75701 +                                */
75702 +                               unlock_page(pg);
75703 +                               page_cache_release(pg);
75704 +                               break;
75705 +                       }
75706 +
75707 +                       spin_lock_jnode(cur);
75708 +                       assert("nikita-3166",
75709 +                              pg->mapping == jnode_get_mapping(cur));
75710 +                       assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75711 +#if REISER4_DEBUG
75712 +                       spin_lock(&cur->load);
75713 +                       assert("nikita-3165", !jnode_is_releasable(cur));
75714 +                       spin_unlock(&cur->load);
75715 +#endif
75716 +                       JF_SET(cur, JNODE_WRITEBACK);
75717 +                       JF_CLR(cur, JNODE_DIRTY);
75718 +                       ON_DEBUG(cur->written++);
75719 +                       spin_unlock_jnode(cur);
75720 +
75721 +                       ClearPageError(pg);
75722 +                       set_page_writeback(pg);
75723 +
75724 +                       if (get_current_context()->entd) {
75725 +                               /* this is ent thread */
75726 +                               entd_context *ent = get_entd_context(super);
75727 +                               struct wbq *rq, *next;
75728 +
75729 +                               spin_lock(&ent->guard);
75730 +
75731 +                               if (pg == ent->cur_request->page) {
75732 +                                       /*
75733 +                                        * entd is called for this page. This
75734 +                                        * request is not in the todo list
75735 +                                        */
75736 +                                       ent->cur_request->written = 1;
75737 +                               } else {
75738 +                                       /*
75739 +                                        * if we have written a page for which writepage
75740 +                                        * is called for - move request to another list.
75741 +                                        */
75742 +                                       list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75743 +                                               assert("", rq->magic == WBQ_MAGIC);
75744 +                                               if (pg == rq->page) {
75745 +                                                       /*
75746 +                                                        * remove request from
75747 +                                                        * entd's queue, but do
75748 +                                                        * not wake up a thread
75749 +                                                        * which put this
75750 +                                                        * request
75751 +                                                        */
75752 +                                                       list_del_init(&rq->link);
75753 +                                                       ent->nr_todo_reqs --;
75754 +                                                       list_add_tail(&rq->link, &ent->done_list);
75755 +                                                       ent->nr_done_reqs ++;
75756 +                                                       rq->written = 1;
75757 +                                                       break;
75758 +                                               }
75759 +                                       }
75760 +                               }
75761 +                               spin_unlock(&ent->guard);
75762 +                       }
75763 +
75764 +                       clear_page_dirty_for_io(pg);
75765 +
75766 +                       unlock_page(pg);
75767 +
75768 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75769 +                       nr_used++;
75770 +               }
75771 +               if (nr_used > 0) {
75772 +                       assert("nikita-3453",
75773 +                              bio->bi_size == super->s_blocksize * nr_used);
75774 +                       assert("nikita-3454", bio->bi_vcnt == nr_used);
75775 +
75776 +                       /* Check if we are allowed to write at all */
75777 +                       if (super->s_flags & MS_RDONLY)
75778 +                               undo_bio(bio);
75779 +                       else {
75780 +                               int not_supported;
75781 +
75782 +                               add_fq_to_bio(fq, bio);
75783 +                               bio_get(bio);
75784 +                               reiser4_submit_bio(write_op, bio);
75785 +                               not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75786 +                               bio_put(bio);
75787 +                               if (not_supported)
75788 +                                       return -EOPNOTSUPP;
75789 +                       }
75790 +
75791 +                       block += nr_used - 1;
75792 +                       update_blocknr_hint_default(super, &block);
75793 +                       block += 1;
75794 +               } else {
75795 +                       bio_put(bio);
75796 +               }
75797 +               nr -= nr_used;
75798 +       }
75799 +
75800 +       return 0;
75801 +}
75802 +
75803 +/* This procedure finds runs of consecutive disk block numbers in the given
75804 +   list of j-nodes and calls write_jnodes_to_disk_extent() for each run; the
75805 +   number of j-nodes written is added to *@nr_submitted when it is not NULL */
75806 +int
75807 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
75808 +                long *nr_submitted, int flags)
75809 +{
75810 +       int ret;
75811 +       jnode *beg = list_entry(head->next, jnode, capture_link);
75812 +
75813 +       while (head != &beg->capture_link) {
75814 +               int nr = 1;
75815 +               jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75816 +
75817 +               while (head != &cur->capture_link) {
75818 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75819 +                               break;
75820 +                       ++nr;
75821 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75822 +               }
75823 +
75824 +               ret = write_jnodes_to_disk_extent(
75825 +                       beg, nr, jnode_get_block(beg), fq, flags);
75826 +               if (ret)
75827 +                       return ret;
75828 +
75829 +               if (nr_submitted)
75830 +                       *nr_submitted += nr;
75831 +
75832 +               beg = cur;
75833 +       }
75834 +
75835 +       return 0;
75836 +}
75837 +
75838 +/* add @len (original, wandered) block pairs to the atom's wandered map */
75839 +static int
75840 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75841 +{
75842 +       int ret;
75843 +       blocknr_set_entry *new_bsep = NULL;
75844 +       reiser4_block_nr block;
75845 +
75846 +       txn_atom *atom;
75847 +
75848 +       assert("zam-568", block_p != NULL);
75849 +       block = *block_p;
75850 +       assert("zam-569", len > 0);
75851 +
75852 +       while ((len--) > 0) {
75853 +               do {
75854 +                       atom = get_current_atom_locked();
75855 +                       assert("zam-536",
75856 +                              !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75857 +                       ret =
75858 +                           blocknr_set_add_pair(atom, &atom->wandered_map,
75859 +                                                &new_bsep,
75860 +                                                jnode_get_block(cur), &block);
75861 +               } while (ret == -E_REPEAT);
75862 +
75863 +               if (ret) {
75864 +                       /* free blocks not added to the map. NOTE(review): len
75865 +                          was already decremented - one block short? confirm */
75866 +                       reiser4_block_nr wide_len = len;
75867 +
75868 +                       reiser4_dealloc_blocks(&block, &wide_len,
75869 +                                              BLOCK_NOT_COUNTED,
75870 +                                              BA_FORMATTED
75871 +                                              /* formatted, without defer */ );
75872 +
75873 +                       return ret;
75874 +               }
75875 +
75876 +               spin_unlock_atom(atom);
75877 +
75878 +               cur = list_entry(cur->capture_link.next, jnode, capture_link);
75879 +               ++block;
75880 +       }
75881 +
75882 +       return 0;
75883 +}
75884 +
75885 +/* Allocate wandered blocks for current atom's OVERWRITE SET and immediately
75886 +   submit IO for allocated blocks.  We assume that the current atom is in a
75887 +   stage where atom fusion is impossible, so using it unlocked is safe. */
75888 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
75889 +{
75890 +       reiser4_block_nr block;
75891 +
75892 +       int rest;
75893 +       int len;
75894 +       int ret;
75895 +
75896 +       jnode *cur;
75897 +
75898 +       assert("zam-534", ch->overwrite_set_size > 0);
75899 +
75900 +       rest = ch->overwrite_set_size;
75901 +
75902 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75903 +       while (ch->overwrite_set != &cur->capture_link) {
75904 +               assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
75905 +
75906 +               ret = get_more_wandered_blocks(rest, &block, &len);
75907 +               if (ret)
75908 +                       return ret;
75909 +
75910 +               rest -= len;
75911 +
75912 +               ret = add_region_to_wmap(cur, len, &block);
75913 +               if (ret)
75914 +                       return ret;
75915 +
75916 +               ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
75917 +               if (ret)
75918 +                       return ret;
75919 +
75920 +               while ((len--) > 0) {
75921 +                       assert("zam-604",
75922 +                              ch->overwrite_set != &cur->capture_link);
75923 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75924 +               }
75925 +       }
75926 +
75927 +       return 0;
75928 +}
75929 +
75930 +/* allocate ch->tx_size wander record blocks, link their jnodes into
75931 +   ch->tx_list, fill them in and submit them; returns 0 or an error code */
75932 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
75933 +{
75934 +       reiser4_blocknr_hint hint;
75935 +       reiser4_block_nr allocated = 0;
75936 +       reiser4_block_nr first, len;
75937 +       jnode *cur;
75938 +       jnode *txhead;
75939 +       int ret;
75940 +       reiser4_context *ctx;
75941 +       reiser4_super_info_data *sbinfo;
75942 +
75943 +       assert("zam-698", ch->tx_size > 0);
75944 +       assert("zam-699", list_empty_careful(&ch->tx_list));
75945 +
75946 +       ctx = get_current_context();
75947 +       sbinfo = get_super_private(ctx->super);
75948 +
75949 +       while (allocated < (unsigned)ch->tx_size) {
75950 +               len = (ch->tx_size - allocated);
75951 +
75952 +               reiser4_blocknr_hint_init(&hint);
75953 +
75954 +               hint.block_stage = BLOCK_GRABBED;
75955 +
75956 +               /* FIXME: there should be some block allocation policy for
75957 +                  nodes which contain wander records */
75958 +
75959 +               /* We assume that disk space for wander record blocks can be
75960 +                * taken from reserved area. */
75961 +               ret = reiser4_alloc_blocks(&hint, &first, &len,
75962 +                                          BA_FORMATTED | BA_RESERVED |
75963 +                                          BA_USE_DEFAULT_SEARCH_START);
75964 +               reiser4_blocknr_hint_done(&hint);
75965 +
75966 +               if (ret)
75967 +                       return ret;
75968 +
75969 +               allocated += len;
75970 +
75971 +               /* create jnodes for all wander records */
75972 +               while (len--) {
75973 +                       cur = reiser4_alloc_io_head(&first);
75974 +
75975 +                       if (cur == NULL) {
75976 +                               ret = RETERR(-ENOMEM);
75977 +                               goto free_not_assigned;
75978 +                       }
75979 +
75980 +                       ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
75981 +
75982 +                       if (ret != 0) {
75983 +                               jfree(cur);
75984 +                               goto free_not_assigned;
75985 +                       }
75986 +
75987 +                       pin_jnode_data(cur);
75988 +
75989 +                       list_add_tail(&cur->capture_link, &ch->tx_list);
75990 +
75991 +                       first++;
75992 +               }
75993 +       }
75994 +
75995 +       { /* format an on-disk linked list of wander records */
75996 +               int serial = 1;
75997 +
75998 +               txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75999 +               format_tx_head(ch);
76000 +
76001 +               cur = list_entry(txhead->capture_link.next, jnode, capture_link);
76002 +               while (&ch->tx_list != &cur->capture_link) {
76003 +                       format_wander_record(ch, cur, serial++);
76004 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
76005 +               }
76006 +       }
76007 +
76008 +       { /* Fill wander records with Wandered Set */
76009 +               struct store_wmap_params params;
76010 +               txn_atom *atom;
76011 +
76012 +               params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
76013 +
76014 +               params.idx = 0;
76015 +               params.capacity =
76016 +                   wander_record_capacity(reiser4_get_current_sb());
76017 +
76018 +               atom = get_current_atom_locked();
76019 +               blocknr_set_iterator(atom, &atom->wandered_map,
76020 +                                    &store_wmap_actor, &params, 0);
76021 +               spin_unlock_atom(atom);
76022 +       }
76023 +
76024 +       { /* jrelse all jnodes from tx_list */
76025 +               cur = list_entry(ch->tx_list.next, jnode, capture_link);
76026 +               while (&ch->tx_list != &cur->capture_link) {
76027 +                       jrelse(cur);
76028 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
76029 +               }
76030 +       }
76031 +
76032 +       ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
76033 +
76034 +       return ret;
76035 +
76036 +      free_not_assigned:
76037 +       /* Deallocate blocks not yet assigned to jnodes on tx_list; the caller
76038 +          invalidates tx_list. NOTE(review): len looks one short - confirm */
76039 +       reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
76040 +
76041 +       return ret;
76042 +}
76043 +
76044 +static int commit_tx(struct commit_handle *ch)
76045 +{
76046 +       flush_queue_t *fq;
76047 +       int barrier;
76048 +       int ret;
76049 +
76050 +       /* Grab more space for the wander records (ch->tx_size blocks). */
76051 +       ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
76052 +       if (ret)
76053 +               return ret;
76054 +
76055 +       fq = get_fq_for_current_atom();
76056 +       if (IS_ERR(fq))
76057 +               return PTR_ERR(fq);
76058 +
76059 +       spin_unlock_atom(fq->atom);
76060 +       do { /* single pass; break is used to bail out on error */
76061 +               ret = alloc_wandered_blocks(ch, fq);
76062 +               if (ret)
76063 +                       break;
76064 +               ret = alloc_tx(ch, fq);
76065 +               if (ret)
76066 +                       break;
76067 +       } while (0);
76068 +
76069 +       reiser4_fq_put(fq);
76070 +       if (ret)
76071 +               return ret;
76072 + repeat_wo_barrier:
76073 +       barrier = reiser4_use_write_barrier(ch->super);
76074 +       if (!barrier) { /* no barriers: finish fqs before the header write */
76075 +               ret = current_atom_finish_all_fq();
76076 +               if (ret)
76077 +                       return ret;
76078 +       }
76079 +       ret = update_journal_header(ch, barrier);
76080 +       if (barrier) { /* barriers: finish fqs after the header write */
76081 +               if (ret) {
76082 +                       if (ret == -EOPNOTSUPP) { /* retry without barrier */
76083 +                               disable_write_barrier(ch->super);
76084 +                               goto repeat_wo_barrier;
76085 +                       }
76086 +                       return ret;
76087 +               }
76088 +               ret = current_atom_finish_all_fq();
76089 +       }
76090 +       return ret;
76091 +}
76092 +
76093 +static int write_tx_back(struct commit_handle * ch)
76094 +{
76095 +       flush_queue_t *fq;
76096 +       int ret;
76097 +       int barrier;
76098 +
76099 +       reiser4_post_commit_hook();
76100 +       fq = get_fq_for_current_atom();
76101 +       if (IS_ERR(fq))
76102 +               return  PTR_ERR(fq);
76103 +       spin_unlock_atom(fq->atom);
76104 +       ret = write_jnode_list(
76105 +               ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
76106 +       reiser4_fq_put(fq);
76107 +       if (ret)
76108 +               return ret;
76109 + repeat_wo_barrier:
76110 +       barrier = reiser4_use_write_barrier(ch->super);
76111 +       if (!barrier) { /* no barriers: finish fqs before the footer write */
76112 +               ret = current_atom_finish_all_fq();
76113 +               if (ret)
76114 +                       return ret;
76115 +       }
76116 +       ret = update_journal_footer(ch, barrier);
76117 +       if (barrier) { /* barriers: finish fqs after the footer write */
76118 +               if (ret) {
76119 +                       if (ret == -EOPNOTSUPP) { /* retry without barrier */
76120 +                               disable_write_barrier(ch->super);
76121 +                               goto repeat_wo_barrier;
76122 +                       }
76123 +                       return ret;
76124 +               }
76125 +               ret = current_atom_finish_all_fq();
76126 +       }
76127 +       if (ret)
76128 +               return ret;
76129 +       reiser4_post_write_back_hook();
76130 +       return 0;
76131 +}
76132 +
76133 +/* We assume that at this moment all captured blocks are marked as RELOC or
76134 +   WANDER (belong to the Relocate or Overwrite set) and all nodes from the
76135 +   Relocate set are submitted for write.
76136 +*/
76137 +
76138 +int reiser4_write_logs(long *nr_submitted)
76139 +{
76140 +       txn_atom *atom;
76141 +       struct super_block *super = reiser4_get_current_sb();
76142 +       reiser4_super_info_data *sbinfo = get_super_private(super);
76143 +       struct commit_handle ch;
76144 +       int ret;
76145 +
76146 +       writeout_mode_enable();
76147 +
76148 +       /* block allocator may add j-nodes to the clean_list */
76149 +       ret = reiser4_pre_commit_hook();
76150 +       if (ret)
76151 +               return ret;
76152 +
76153 +       /* No locks are required if we take atom which stage >=
76154 +        * ASTAGE_PRE_COMMIT */
76155 +       atom = get_current_context()->trans->atom;
76156 +       assert("zam-965", atom != NULL);
76157 +
76158 +       /* relocate set is on the atom->clean_nodes list after
76159 +        * current_atom_complete_writes() finishes. It can be safely
76160 +        * uncaptured after commit_mutex is locked, because any atom that
76161 +        * captures these nodes is guaranteed to commit after current one.
76162 +        *
76163 +        * This can only be done after reiser4_pre_commit_hook(), because it is where
76164 +        * early flushed jnodes with CREATED bit are transferred to the
76165 +        * overwrite list. */
76166 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
76167 +       spin_lock_atom(atom);
76168 +       /* There might be waiters for the relocate nodes which we have
76169 +        * released, wake them up. */
76170 +       reiser4_atom_send_event(atom);
76171 +       spin_unlock_atom(atom);
76172 +
76173 +       if (REISER4_DEBUG) {
76174 +               int level;
76175 +
76176 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
76177 +                       assert("nikita-3352",
76178 +                              list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
76179 +       }
76180 +
76181 +       sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
76182 +       sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
76183 +
76184 +       init_commit_handle(&ch, atom);
76185 +
76186 +       ch.free_blocks = sbinfo->blocks_free_committed;
76187 +       ch.nr_files = sbinfo->nr_files_committed;
76188 +       /* ZAM-FIXME-HANS: email me what the contention level is for the super
76189 +        * lock. */
76190 +       ch.next_oid = oid_next(super);
76191 +
76192 +       /* count overwrite set and place it in a separate list */
76193 +       ret = get_overwrite_set(&ch);
76194 +
76195 +       if (ret <= 0) {
76196 +               /* It is possible that overwrite set is empty here, it means
76197 +                  all captured nodes are clean */
76198 +               goto up_and_ret;
76199 +       }
76200 +
76201 +       /* Inform the caller about what number of dirty pages will be
76202 +        * submitted to disk. */
76203 +       *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
76204 +
76205 +       /* count all records needed for storing of the wandered set */
76206 +       get_tx_size(&ch);
76207 +
76208 +       ret = commit_tx(&ch);
76209 +       if (ret)
76210 +               goto up_and_ret;
76211 +
76212 +       spin_lock_atom(atom);
76213 +       reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
76214 +       spin_unlock_atom(atom);
76215 +
76216 +       ret = write_tx_back(&ch);
76217 +       reiser4_post_write_back_hook();
76218 +
76219 +      up_and_ret:
76220 +       if (ret) {
76221 +               /* there could be fq attached to current atom; the only way to
76222 +                  remove them is: */
76223 +               current_atom_finish_all_fq();
76224 +       }
76225 +
76226 +       /* free blocks of flushed transaction */
76227 +       dealloc_tx_list(&ch);
76228 +       dealloc_wmap(&ch);
76229 +
76230 +       put_overwrite_set(&ch);
76231 +
76232 +       done_commit_handle(&ch);
76233 +
76234 +       writeout_mode_disable();
76235 +
76236 +       return ret;
76237 +}
76238 +
76239 +/* consistency checks for journal data/control blocks: header, footer, log
76240 +   records, transactions head blocks. All functions return zero on success. */
76241 +
76242 +static int check_journal_header(const jnode * node UNUSED_ARG)
76243 +{
76244 +       /* FIXME: journal header has no magic field yet; accept any block. */
76245 +       return 0;
76246 +}
76247 +
76248 +/* Wait until writeback of the page of every jnode on @head has completed.
76249 + * Returns the number of those pages that have PageError set. */
76250 +static int wait_on_jnode_list(struct list_head *head)
76251 +{
76252 +       int nr_failed = 0;
76253 +       jnode *cur;
76254 +
76255 +       list_for_each_entry(cur, head, capture_link) {
76256 +               struct page *page = jnode_page(cur);
76257 +
76258 +               if (page == NULL)
76259 +                       continue;       /* no page attached, nothing to wait for */
76260 +               if (PageWriteback(page))
76261 +                       wait_on_page_writeback(page);
76262 +               if (PageError(page))
76263 +                       nr_failed++;
76264 +       }
76265 +
76266 +       return nr_failed;
76267 +}
76268 +
76269 +static int check_journal_footer(const jnode * node UNUSED_ARG)
76270 +{
76271 +       /* FIXME: journal footer has no magic field yet; accept any block. */
76272 +       return 0;
76273 +}
76274 +
76275 +static int check_tx_head(const jnode * node)
76276 +{
76277 +       const struct tx_header *txh = (struct tx_header *)jdata(node);
76278 +
76279 +       /* a transaction head must start with TX_HEADER_MAGIC */
76280 +       if (!memcmp(txh->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE))
76281 +               return 0;
76282 +
76283 +       warning("zam-627", "tx head at block %s corrupted\n",
76284 +               sprint_address(jnode_get_block(node)));
76285 +       return RETERR(-EIO);
76286 +}
76287 +
76288 +static int check_wander_record(const jnode * node)
76289 +{
76290 +       const struct wander_record_header *rec =
76291 +           (struct wander_record_header *)jdata(node);
76292 +
76293 +       /* a wander record must start with WANDER_RECORD_MAGIC; anything
76294 +        * else is reported as corruption and fails with -EIO */
76295 +       if (!memcmp(rec->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE))
76296 +               return 0;
76297 +
76298 +       warning("zam-628", "wander record at block %s corrupted\n",
76299 +               sprint_address(jnode_get_block(node)));
76300 +       return RETERR(-EIO);
76301 +}
76302 +
76303 +/* Fill @ch with what update_journal_footer() needs: the free block, file
76304 + * and oid counters stored in the transaction head @tx_head. @tx_head is
76305 + * also linked into ch->tx_list. Returns 0 or the error from jload(). */
76306 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76307 +{
76308 +       struct tx_header *txh;
76309 +       int err;
76310 +
76311 +       err = jload(tx_head);
76312 +       if (err != 0)
76313 +               return err;
76314 +
76315 +       /* copy the logged counters out while the tx head data is loaded */
76316 +       txh = (struct tx_header *)jdata(tx_head);
76317 +       ch->free_blocks = le64_to_cpu(get_unaligned(&txh->free_blocks));
76318 +       ch->nr_files = le64_to_cpu(get_unaligned(&txh->nr_files));
76319 +       ch->next_oid = le64_to_cpu(get_unaligned(&txh->next_oid));
76320 +       jrelse(tx_head);
76321 +
76322 +       list_add(&tx_head->capture_link, &ch->tx_list);
76323 +       return 0;
76324 +}
76325 +
76326 +/* Replay one transaction: rebuild its overwrite set from the wander records
76327 + * chained from *@log_rec_block_p up to *@end_block (@nr_wander_records of
76328 + * them), write it in place, update the journal footer. Returns 0 or error. */
76329 +static int replay_transaction(const struct super_block *s,
76330 +                             jnode * tx_head,
76331 +                             const reiser4_block_nr * log_rec_block_p,
76332 +                             const reiser4_block_nr * end_block,
76333 +                             unsigned int nr_wander_records)
76334 +{
76335 +       reiser4_block_nr log_rec_block = *log_rec_block_p;
76336 +       struct commit_handle ch;
76337 +       LIST_HEAD(overwrite_set);
76338 +       jnode *log;
76339 +       int ret;
76340 +
76341 +       init_commit_handle(&ch, NULL);
76342 +       ch.overwrite_set = &overwrite_set;
76343 +
76344 +       /* without the counters saved in the tx head the journal footer
76345 +        * cannot be rewritten correctly, so do not replay at all */
76346 +       ret = restore_commit_handle(&ch, tx_head);
76347 +       if (ret)
76348 +               goto free_ow_set;
76349 +
76350 +       while (log_rec_block != *end_block) {
76351 +               struct wander_record_header *header;
76352 +               struct wander_entry *entry;
76353 +               int i;
76354 +
76355 +               if (nr_wander_records == 0) {
76356 +                       warning("zam-631",
76357 +                               "number of wander records in the linked list"
76358 +                               " greater than number stored in tx head.\n");
76359 +                       ret = RETERR(-EIO);
76360 +                       goto free_ow_set;
76361 +               }
76362 +
76363 +               log = reiser4_alloc_io_head(&log_rec_block);
76364 +               if (log == NULL) {
76365 +                       ret = RETERR(-ENOMEM);
76366 +                       goto free_ow_set;
76367 +               }
76368 +
76369 +               ret = jload(log);
76370 +               if (ret < 0) {
76371 +                       reiser4_drop_io_head(log);
76372 +                       goto free_ow_set;
76373 +               }
76374 +
76375 +               ret = check_wander_record(log);
76376 +               if (ret) {
76377 +                       jrelse(log);
76378 +                       reiser4_drop_io_head(log);
76379 +                       goto free_ow_set;
76380 +               }
76381 +
76382 +               header = (struct wander_record_header *)jdata(log);
76383 +               log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76384 +
76385 +               entry = (struct wander_entry *)(header + 1);
76386 +
76387 +               /* restore overwrite set from wander record content */
76388 +               for (i = 0; i < wander_record_capacity(s); i++) {
76389 +                       reiser4_block_nr block;
76390 +                       jnode *node;
76391 +
76392 +                       block = le64_to_cpu(get_unaligned(&entry->wandered));
76393 +                       if (block == 0)
76394 +                               break;
76395 +
76396 +                       node = reiser4_alloc_io_head(&block);
76397 +                       if (node == NULL) {
76398 +                               ret = RETERR(-ENOMEM);
76399 +                               jrelse(log);
76400 +                               reiser4_drop_io_head(log);
76401 +                               goto free_ow_set;
76402 +                       }
76403 +
76404 +                       ret = jload(node);
76405 +                       if (ret < 0) {
76406 +                               reiser4_drop_io_head(node);
76407 +                               jrelse(log);
76408 +                               reiser4_drop_io_head(log);
76409 +                               goto free_ow_set;
76410 +                       }
76411 +
76412 +                       block = le64_to_cpu(get_unaligned(&entry->original));
76413 +
76414 +                       assert("zam-603", block != 0);
76415 +
76416 +                       jnode_set_block(node, &block);
76417 +
76418 +                       list_add_tail(&node->capture_link, ch.overwrite_set);
76419 +
76420 +                       ++entry;
76421 +               }
76422 +
76423 +               jrelse(log);
76424 +               reiser4_drop_io_head(log);
76425 +
76426 +               --nr_wander_records;
76427 +       }
76428 +
76429 +       if (nr_wander_records != 0) {
76430 +               warning("zam-632", "number of wander records in the linked list"
76431 +                       " less than number stored in tx head.\n");
76432 +               ret = RETERR(-EIO);
76433 +               goto free_ow_set;
76434 +       }
76435 +
76436 +       {                       /* write wandered set in place */
76437 +               write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76438 +               ret = wait_on_jnode_list(ch.overwrite_set);
76439 +
76440 +               if (ret) {
76441 +                       ret = RETERR(-EIO);
76442 +                       goto free_ow_set;
76443 +               }
76444 +       }
76445 +
76446 +       ret = update_journal_footer(&ch, 0);
76447 +
76448 +      free_ow_set:
76449 +
76450 +       while (!list_empty(ch.overwrite_set)) {
76451 +               jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76452 +               list_del_init(&cur->capture_link);
76453 +               jrelse(cur);
76454 +               reiser4_drop_io_head(cur);
76455 +       }
76456 +
76457 +       list_del_init(&tx_head->capture_link);
76458 +
76459 +       done_commit_handle(&ch);
76460 +
76461 +       return ret;
76462 +}
76463 +
76464 +/* Find the oldest committed but not yet replayed transaction and replay it.
76465 + * Such a transaction was committed (the journal header was updated), but
76466 + * writing its overwrite set in place and updating the journal footer did not
76467 + * complete. Returns 0 when sbinfo->last_committed_tx already equals the
76468 + * footer's last_flushed_tx, -E_REPEAT after one transaction was replayed
76469 + * (the caller loops on that), or a negative error code. */
76470 +static int replay_oldest_transaction(struct super_block *s)
76471 +{
76472 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76473 +       jnode *jf = sbinfo->journal_footer;
76474 +       unsigned int total;
76475 +       struct journal_footer *F;
76476 +       struct tx_header *T;
76477 +
76478 +       reiser4_block_nr prev_tx;
76479 +       reiser4_block_nr last_flushed_tx;
76480 +       reiser4_block_nr log_rec_block = 0;
76481 +
76482 +       jnode *tx_head;
76483 +
76484 +       int ret;
76485 +
76486 +       if ((ret = jload(jf)) < 0)
76487 +               return ret;
76488 +       /* read the last flushed transaction from the journal footer */
76489 +       F = (struct journal_footer *)jdata(jf);
76490 +
76491 +       last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76492 +
76493 +       jrelse(jf);
76494 +
76495 +       if (sbinfo->last_committed_tx == last_flushed_tx) {
76496 +               /* all transactions are replayed */
76497 +               return 0;
76498 +       }
76499 +
76500 +       prev_tx = sbinfo->last_committed_tx;
76501 +
76502 +       /* walk the prev_tx links back to the oldest not flushed transaction */
76503 +       while (1) {
76504 +               tx_head = reiser4_alloc_io_head(&prev_tx);
76505 +               if (!tx_head)
76506 +                       return RETERR(-ENOMEM);
76507 +
76508 +               ret = jload(tx_head);
76509 +               if (ret < 0) {
76510 +                       reiser4_drop_io_head(tx_head);
76511 +                       return ret;
76512 +               }
76513 +
76514 +               ret = check_tx_head(tx_head);
76515 +               if (ret) {
76516 +                       jrelse(tx_head);
76517 +                       reiser4_drop_io_head(tx_head);
76518 +                       return ret;
76519 +               }
76520 +
76521 +               T = (struct tx_header *)jdata(tx_head);
76522 +
76523 +               prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76524 +
76525 +               if (prev_tx == last_flushed_tx)
76526 +                       break;
76527 +               /* not this one: drop the head, step to the previous transaction */
76528 +               jrelse(tx_head);
76529 +               reiser4_drop_io_head(tx_head);
76530 +       }
76531 +       /* tx_head is still loaded here; T is read before the jrelse() below */
76532 +       total = le32_to_cpu(get_unaligned(&T->total));
76533 +       log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76534 +
76535 +       pin_jnode_data(tx_head);
76536 +       jrelse(tx_head);
76537 +       /* total counts the tx head itself (see struct tx_header), hence - 1 */
76538 +       ret =
76539 +           replay_transaction(s, tx_head, &log_rec_block,
76540 +                              jnode_get_block(tx_head), total - 1);
76541 +
76542 +       unpin_jnode_data(tx_head);
76543 +       reiser4_drop_io_head(tx_head);
76544 +       /* on success ask the caller to call again for the next transaction */
76545 +       if (ret)
76546 +               return ret;
76547 +       return -E_REPEAT;
76548 +}
76549 +
76550 +/* The current reiser4 journal implementation is optimized not to capture the
76551 +   super block when only certain super block fields are modified. Currently
76552 +   that set is (<free block count>, <OID allocator>). These fields are logged
76553 +   in a special way: they are stored in each transaction head block at atom
76554 +   commit time and written to the journal footer block at atom flush time.
76555 +   To get this information from the journal footer block into the in-memory
76556 +   super block there is a special function,
76557 +   reiser4_journal_recover_sb_data(), which should be called after the disk
76558 +   format plugin re-reads the super block after journal replaying.
76559 +*/
76560 +
76561 +/* Copy the counters kept in the journal footer (free block count and oid
76562 + * allocator state) into the in-memory super block of @s. */
76563 +int reiser4_journal_recover_sb_data(struct super_block *s)
76564 +{
76565 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76566 +       struct journal_footer *jf;
76567 +       int ret;
76568 +
76569 +       assert("zam-673", sbinfo->journal_footer != NULL);
76570 +
76571 +       ret = jload(sbinfo->journal_footer);
76572 +       if (ret != 0)
76573 +               return ret;
76574 +
76575 +       ret = check_journal_footer(sbinfo->journal_footer);
76576 +       if (ret == 0) {
76577 +               jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
76578 +
76579 +               /* the counters are restored only if at least one
76580 +                * transaction has been flushed */
76581 +               if (jf->last_flushed_tx) {
76582 +                       /* free block counter logged in this transaction */
76583 +                       reiser4_set_free_blocks(s,
76584 +                               le64_to_cpu(get_unaligned(&jf->free_blocks)));
76585 +                       /* oid allocator state */
76586 +                       oid_init_allocator(s,
76587 +                               le64_to_cpu(get_unaligned(&jf->nr_files)),
76588 +                               le64_to_cpu(get_unaligned(&jf->next_oid)));
76589 +               }
76590 +       }
76591 +
76592 +       jrelse(sbinfo->journal_footer);
76593 +       return ret;
76594 +}
76595 +
76596 +/* Replay the journal of @s: committed transactions that were not flushed
76597 + * are replayed one by one, oldest first. Returns 0 or a negative error. */
76598 +int reiser4_journal_replay(struct super_block *s)
76599 +{
76600 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76601 +       jnode *jh, *jf;
76602 +       struct journal_header *header;
76603 +       int nr_tx_replayed = 0;
76604 +       int ret;
76605 +
76606 +       assert("zam-582", sbinfo != NULL);
76607 +
76608 +       jh = sbinfo->journal_header;
76609 +       jf = sbinfo->journal_footer;
76610 +
76611 +       if (jh == NULL || jf == NULL) {
76612 +               /* it is possible that disk layout does not support journal
76613 +                  structures, we just warn about this */
76614 +               warning("zam-583",
76615 +                       "journal control blocks were not loaded by disk layout plugin.  "
76616 +                       "journal replaying is not possible.\n");
76617 +               return 0;
76618 +       }
76619 +
76620 +       /* make sure the journal footer block can be loaded and passes its
76621 +          consistency check */
76622 +       ret = jload(jf);
76623 +       if (ret < 0)
76624 +               return ret;
76625 +       ret = check_journal_footer(jf);
76626 +       jrelse(jf);
76627 +       if (ret)
76628 +               return ret;
76629 +
76630 +       /* store last committed transaction info in reiser4 in-memory super
76631 +          block */
76632 +       ret = jload(jh);
76633 +       if (ret < 0)
76634 +               return ret;
76635 +       ret = check_journal_header(jh);
76636 +       if (ret == 0) {
76637 +               header = (struct journal_header *)jdata(jh);
76638 +               sbinfo->last_committed_tx =
76639 +                   le64_to_cpu(get_unaligned(&header->last_committed_tx));
76640 +       }
76641 +       jrelse(jh);
76642 +       if (ret)
76643 +               return ret;
76644 +
76645 +       /* replay committed transactions; replay_oldest_transaction() returns
76646 +          -E_REPEAT after each transaction it has replayed */
76647 +       for (;;) {
76648 +               ret = replay_oldest_transaction(s);
76649 +               if (ret != -E_REPEAT)
76650 +                       break;
76651 +               nr_tx_replayed++;
76652 +       }
76653 +
76654 +       return ret;
76655 +}
76656 +
76657 +/* Load a journal control block (journal header or journal footer) located
76658 + * at *@block. On success *@node is the new io-head jnode, pinned with
76659 + * pin_jnode_data(); on failure *@node is NULL and an error is returned. */
76660 +static int
76661 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76662 +{
76663 +       jnode *head = reiser4_alloc_io_head(block);
76664 +       int ret;
76665 +
76666 +       *node = head;
76667 +       if (head == NULL)
76668 +               return RETERR(-ENOMEM);
76669 +
76670 +       ret = jload(head);
76671 +       if (ret == 0) {
76672 +               pin_jnode_data(head);
76673 +               jrelse(head);
76674 +               return 0;
76675 +       }
76676 +       reiser4_drop_io_head(head);
76677 +       *node = NULL;
76678 +       return ret;
76679 +}
76680 +
76681 +/* unpin and drop the journal control block *@node, if any, and clear *@node */
76682 +static void unload_journal_control_block(jnode ** node)
76683 +{
76684 +       if (*node == NULL)
76685 +               return;
76686 +       unpin_jnode_data(*node);
76687 +       reiser4_drop_io_head(*node);
76688 +       *node = NULL;
76689 +}
76690 +
76691 +/* release the journal control blocks (header and footer) of super block @s */
76692 +void reiser4_done_journal_info(struct super_block *s)
76693 +{
76694 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76695 +
76696 +       assert("zam-476", sbinfo != NULL);
76697 +       /* unload_journal_control_block() skips a block that is not loaded */
76698 +       unload_journal_control_block(&sbinfo->journal_header);
76699 +       unload_journal_control_block(&sbinfo->journal_footer);
76700 +       rcu_barrier();  /* wait for pending RCU callbacks to complete */
76701 +}
76702 +
76703 +/* Load both journal control blocks, header and footer, from the block
76704 + * numbers recorded in sbinfo->jloc.
76705 + * Returns 0 on success; on failure neither block stays loaded and the
76706 + * error of the failed load is returned. */
76707 +int reiser4_init_journal_info(struct super_block *s)
76708 +{
76709 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76710 +       journal_location *loc = &sbinfo->jloc;
76711 +       int ret;
76712 +
76713 +       assert("zam-651", loc != NULL);
76714 +       assert("zam-652", loc->header != 0);
76715 +       assert("zam-653", loc->footer != 0);
76716 +
76717 +       ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76718 +       if (ret != 0)
76719 +               return ret;
76720 +
76721 +       ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76722 +       if (ret == 0)
76723 +               return 0;
76724 +
76725 +       /* the footer could not be loaded: give the header back as well */
76726 +       unload_journal_control_block(&sbinfo->journal_header);
76727 +       return ret;
76728 +}
76729 +
76730 +/* Make Linus happy.
76731 +   Local variables:
76732 +   c-indentation-style: "K&R"
76733 +   mode-name: "LC"
76734 +   c-basic-offset: 8
76735 +   tab-width: 8
76736 +   fill-column: 80
76737 +   End:
76738 +*/
76739 diff -urN linux-2.6.35.orig/fs/reiser4/wander.h linux-2.6.35/fs/reiser4/wander.h
76740 --- linux-2.6.35.orig/fs/reiser4/wander.h       1970-01-01 01:00:00.000000000 +0100
76741 +++ linux-2.6.35/fs/reiser4/wander.h    2010-08-04 15:44:57.000000000 +0200
76742 @@ -0,0 +1,135 @@
76743 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76744 +
76745 +#if !defined (__FS_REISER4_WANDER_H__)
76746 +#define __FS_REISER4_WANDER_H__
76747 +
76748 +#include "dformat.h"
76749 +
76750 +#include <linux/fs.h>          /* for struct super_block  */
76751 +
76752 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES   */
76753 +
76754 +#define TX_HEADER_MAGIC  "TxMagic4"
76755 +#define WANDER_RECORD_MAGIC "LogMagc4"
76756 +
76757 +#define TX_HEADER_MAGIC_SIZE  (8)
76758 +#define WANDER_RECORD_MAGIC_SIZE (8)
76759 +
76760 +/* journal header block format (on-disk, little-endian) */
76761 +struct journal_header {
76762 +       /* block number of the head of the last committed transaction */
76763 +       d64 last_committed_tx;
76764 +};
76765 +
76766 +typedef struct journal_location {      /* where the journal control blocks are */
76767 +       reiser4_block_nr footer;        /* block number of the journal footer */
76768 +       reiser4_block_nr header;        /* block number of the journal header */
76769 +} journal_location;
76770 +
76771 +/* The wander.c head comment describes usage and semantics of all these structures */
76772 +/* journal footer block format (on-disk, little-endian) */
76773 +struct journal_footer {
76774 +       /* last flushed transaction location. */
76775 +       /* This block number is no longer valid after the transaction it points
76776 +          to gets flushed; it is used only at journal replay time to detect
76777 +          the end of the on-disk list of committed transactions which were
76778 +          not flushed completely */
76779 +       d64 last_flushed_tx;
76780 +
76781 +       /* the free block counter is written to the journal footer at
76782 +          transaction flush, not to the super block, because it is logged
76783 +          in a different way than super block fields (root pointer, for
76784 +          example). */
76785 +       d64 free_blocks;
76786 +
76787 +       /* number of used OIDs and maximal used OID are logged separately from
76788 +          super block */
76789 +       d64 nr_files;
76790 +       d64 next_oid;
76791 +};
76792 +
76793 +/* Each wander record (except the first one, the transaction head) has a
76794 +   unified format: a wander record header followed by an array of log entries */
76795 +struct wander_record_header {
76796 +       /* holds WANDER_RECORD_MAGIC; as there is no predefined location for
76797 +          wander records, this magic string should also help reiser4fsck. */
76798 +       char magic[WANDER_RECORD_MAGIC_SIZE];
76799 +
76800 +       /* transaction id */
76801 +       d64 id;
76802 +
76803 +       /* total number of wander records in current transaction  */
76804 +       d32 total;
76805 +
76806 +       /* number of this block within the transaction */
76807 +       d32 serial;
76808 +       /* block number of the next record in the chain that journal replay
76809 +          follows (originally described as: previous block in commit) */
76810 +       d64 next_block;
76811 +};
76812 +
76813 +/* The first wander record (transaction head) of a written transaction has a
76814 +   special format */
76815 +struct tx_header {
76816 +       /* magic string (TX_HEADER_MAGIC) makes the first block of a transaction
76817 +          different from other logged blocks; it should help fsck. */
76818 +       char magic[TX_HEADER_MAGIC_SIZE];
76819 +
76820 +       /* transaction id */
76821 +       d64 id;
76822 +
76823 +       /* total number of records (including this first tx head) in the
76824 +          transaction */
76825 +       d32 total;
76826 +
76827 +       /* aligns the next field to an 8-byte boundary; this field is always zero */
76828 +       d32 padding;
76829 +
76830 +       /* block number of previous transaction head */
76831 +       d64 prev_tx;
76832 +
76833 +       /* block number of the first wander record of this transaction */
76834 +       d64 next_block;
76835 +
76836 +       /* committed value of the free blocks counter */
76837 +       d64 free_blocks;
76838 +
76839 +       /* number of used OIDs (nr_files) and maximal used OID are logged
76840 +          separately from super block */
76841 +       d64 nr_files;
76842 +       d64 next_oid;
76843 +};
76844 +
76845 +/* A transaction gets written to disk as a set of wander records (each wander
76846 +   record size is fs block) */
76847 +
76848 +/* As said above, the rest of a wander record after its header is filled with
76849 +   these log entries; unused space is filled with zeroes */
76850 +struct wander_entry {
76851 +       d64 original;           /* block original location */
76852 +       d64 wandered;           /* block wandered location; 0 ends the list at replay */
76853 +};
76854 +
76855 +/* REISER4 JOURNAL WRITER FUNCTIONS   */
76856 +
76857 +extern int reiser4_write_logs(long *);
76858 +extern int reiser4_journal_replay(struct super_block *);
76859 +extern int reiser4_journal_recover_sb_data(struct super_block *);
76860 +
76861 +extern int reiser4_init_journal_info(struct super_block *);
76862 +extern void reiser4_done_journal_info(struct super_block *);
76863 +
76864 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
76865 +
76866 +#endif                         /* __FS_REISER4_WANDER_H__ */
76867 +
76868 +/* Make Linus happy.
76869 +   Local variables:
76870 +   c-indentation-style: "K&R"
76871 +   mode-name: "LC"
76872 +   c-basic-offset: 8
76873 +   tab-width: 8
76874 +   fill-column: 80
76875 +   scroll-step: 1
76876 +   End:
76877 +*/
76878 diff -urN linux-2.6.35.orig/fs/reiser4/writeout.h linux-2.6.35/fs/reiser4/writeout.h
76879 --- linux-2.6.35.orig/fs/reiser4/writeout.h     1970-01-01 01:00:00.000000000 +0100
76880 +++ linux-2.6.35/fs/reiser4/writeout.h  2010-08-04 15:44:57.000000000 +0200
76881 @@ -0,0 +1,21 @@
76882 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
76883 +
76884 +#if !defined (__FS_REISER4_WRITEOUT_H__)
76885 +#define __FS_REISER4_WRITEOUT_H__      /* was missing: the guard never took effect */
76886 +
76887 +#define WRITEOUT_SINGLE_STREAM (0x1)
76888 +#define WRITEOUT_FOR_PAGE_RECLAIM  (0x2)
76889 +#define WRITEOUT_BARRIER (0x4)
76890 +
76891 +extern int reiser4_get_writeout_flags(void);
76892 +#endif                         /* __FS_REISER4_WRITEOUT_H__ */
76893 +
76894 +/* Make Linus happy.
76895 +   Local variables:
76896 +   c-indentation-style: "K&R"
76897 +   mode-name: "LC"
76898 +   c-basic-offset: 8
76899 +   tab-width: 8
76900 +   fill-column: 80
76901 +   End:
76902 +*/
76903 diff -urN linux-2.6.35.orig/fs/reiser4/znode.c linux-2.6.35/fs/reiser4/znode.c
76904 --- linux-2.6.35.orig/fs/reiser4/znode.c        1970-01-01 01:00:00.000000000 +0100
76905 +++ linux-2.6.35/fs/reiser4/znode.c     2010-08-04 15:44:57.000000000 +0200
76906 @@ -0,0 +1,1029 @@
76907 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76908 + * reiser4/README */
76909 +/* Znode manipulation functions. */
76910 +/* Znode is the in-memory header for a tree node. It is stored
76911 +   separately from the node itself so that it does not get written to
76912 +   disk.  In this respect znode is like buffer head or page head. We
76913 +   also use znodes for additional reiser4 specific purposes:
76914 +
76915 +    . they are organized into tree structure which is a part of whole
76916 +      reiser4 tree.
76917 +    . they are used to implement node grained locking
76918 +    . they are used to keep additional state associated with a
76919 +      node
76920 +    . they contain links to lists used by the transaction manager
76921 +
76922 +   Znode is attached to some variable "block number" which is instance of
76923 +   fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
76924 +   appropriate node being actually loaded in memory. Existence of znode itself
76925 +   is regulated by reference count (->x_count) in it. Each time thread
76926 +   acquires reference to znode through call to zget(), ->x_count is
76927 +   incremented and decremented on call to zput().  Data (content of node) are
76928 +   brought in memory through call to zload(), which also increments ->d_count
76929 +   reference counter.  zload can block waiting on IO.  Call to zrelse()
76930 +   decreases this counter. Also, ->c_count keeps track of number of child
76931 +   znodes and prevents parent znode from being recycled until all of its
76932 +   children are. ->c_count is decremented whenever child goes out of existence
76933 +   (being actually recycled in zdestroy()) which can be some time after last
76934 +   reference to this child dies if we support some form of LRU cache for
76935 +   znodes.
76936 +
76937 +*/
76938 +/* EVERY ZNODE'S STORY
76939 +
76940 +   1. His infancy.
76941 +
76942 +   Once upon a time, the znode was born deep inside of zget() by call to
76943 +   zalloc(). At the return from zget() znode had:
76944 +
76945 +    . reference counter (x_count) of 1
76946 +    . assigned block number, marked as used in bitmap
76947 +    . pointer to parent znode. Root znode parent pointer points
76948 +      to its father: "fake" znode. This, in turn, has NULL parent pointer.
76949 +    . hash table linkage
76950 +    . no data loaded from disk
76951 +    . no node plugin
76952 +    . no sibling linkage
76953 +
76954 +   2. His childhood
76955 +
76956 +   Each node is either brought into memory as a result of tree traversal, or
76957 +   created afresh, creation of the root being a special case of the latter. In
76958 +   either case it's inserted into sibling list. This will typically require
76959 +   some ancillary tree traversing, but ultimately both sibling pointers will
76960 +   exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
76961 +   zjnode.state.
76962 +
76963 +   3. His youth.
76964 +
76965 +   If znode is bound to already existing node in a tree, its content is read
76966 +   from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
76967 +   in zjnode.state and zdata() function starts to return non null for this
76968 +   znode. zload() further calls zparse() that determines which node layout
76969 +   this node is rendered in, and sets ->nplug on success.
76970 +
76971 +   If znode is for new node just created, memory for it is allocated and
76972 +   zinit_new() function is called to initialise data, according to selected
76973 +   node layout.
76974 +
76975 +   4. His maturity.
76976 +
76977 +   After this point, znode lingers in memory for some time. Threads can
76978 +   acquire references to znode either by blocknr through call to zget(), or by
76979 +   following a pointer to unallocated znode from internal item. Each time
76980 +   reference to znode is obtained, x_count is increased. Thread can read/write
76981 +   lock znode. Znode data can be loaded through calls to zload(), d_count will
76982 +   be increased appropriately. If all references to znode are released
76983 +   (x_count drops to 0), znode is not recycled immediately. Rather, it is
76984 +   still cached in the hash table in the hope that it will be accessed
76985 +   shortly.
76986 +
76987 +   There are two ways in which znode existence can be terminated:
76988 +
76989 +    . sudden death: node bound to this znode is removed from the tree
76990 +    . overpopulation: znode is purged out of memory due to memory pressure
76991 +
76992 +   5. His death.
76993 +
76994 +   Death is a complex process.
76995 +
76996 +   When we irrevocably commit ourselves to decision to remove node from the
76997 +   tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
76998 +   znode. This is done either in ->kill_hook() of internal item or in
76999 +   reiser4_kill_root() function when tree root is removed.
77000 +
77001 +   At this moment znode still has:
77002 +
77003 +    . locks held on it, necessarily write ones
77004 +    . references to it
77005 +    . disk block assigned to it
77006 +    . data loaded from the disk
77007 +    . pending requests for lock
77008 +
77009 +   But once JNODE_HEARD_BANSHEE bit is set, last call to unlock_znode() does
77010 +   node deletion. Node deletion includes two phases. First all ways to get
77011 +   references to that znode (sibling and parent links and hash lookup using
77012 +   block number stored in parent node) should be deleted -- it is done through
77013 +   sibling_list_remove(), also we assume that nobody uses down link from
77014 +   parent node due to its nonexistence or proper parent node locking and
77015 +   nobody uses parent pointers from children due to absence of them. Second we
77016 +   invalidate all pending lock requests which still are on znode's lock
77017 +   request queue, this is done by reiser4_invalidate_lock(). Another
77018 +   JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
77019 +   Once it is set all requesters are forced to return -EINVAL from
77020 +   longterm_lock_znode(). Future locking attempts are not possible because all
77021 +   ways to get references to that znode are removed already. Last, node is
77022 +   uncaptured from transaction.
77023 +
77024 +   When last reference to the dying znode is just about to be released,
77025 +   block number for this znode is released and znode is removed from the
77026 +   hash table.
77027 +
77028 +   Now znode can be recycled.
77029 +
77030 +   [it's possible to free bitmap block and remove znode from the hash
77031 +   table when last lock is released. This will result in having
77032 +   referenced but completely orphaned znode]
77033 +
77034 +   6. Limbo
77035 +
77036 +   As has been mentioned above znodes with reference counter 0 are
77037 +   still cached in a hash table. Once memory pressure increases they are
77038 +   purged out of there [this requires something like LRU list for
77039 +   efficient implementation. LRU list would also greatly simplify
77040 +   implementation of coord cache that would in this case morph to just
77041 +   scanning some initial segment of LRU list]. Data loaded into
77042 +   unreferenced znode are flushed back to the durable storage if
77043 +   necessary and memory is freed. Znodes themselves can be recycled at
77044 +   this point too.
77045 +
77046 +*/
77047 +
77048 +#include "debug.h"
77049 +#include "dformat.h"
77050 +#include "key.h"
77051 +#include "coord.h"
77052 +#include "plugin/plugin_header.h"
77053 +#include "plugin/node/node.h"
77054 +#include "plugin/plugin.h"
77055 +#include "txnmgr.h"
77056 +#include "jnode.h"
77057 +#include "znode.h"
77058 +#include "block_alloc.h"
77059 +#include "tree.h"
77060 +#include "tree_walk.h"
77061 +#include "super.h"
77062 +#include "reiser4.h"
77063 +
77064 +#include <linux/pagemap.h>
77065 +#include <linux/spinlock.h>
77066 +#include <linux/slab.h>
77067 +#include <linux/err.h>
77068 +
77069 +static z_hash_table *get_htable(reiser4_tree *,
77070 +                               const reiser4_block_nr * const blocknr);
77071 +static z_hash_table *znode_get_htable(const znode *);
77072 +static void zdrop(znode *);
77073 +
77074 +/* hash table support */
77075 +
77076 +/* compare two block numbers for equality. Used by hash-table macros */
77077 +static inline int
77078 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
77079 +{
77080 +       assert("nikita-534", b1 != NULL);
77081 +       assert("nikita-535", b2 != NULL);
77082 +
77083 +       return *b1 == *b2;
77084 +}
77085 +
77086 +/* Hash znode by block number. Used by hash-table macros */
77087 +/* Audited by: umka (2002.06.11) */
77088 +static inline __u32
77089 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
77090 +{
77091 +       assert("nikita-536", b != NULL);
77092 +
77093 +       return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
77094 +}
77095 +
77096 +/* The hash table definition */
77097 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
77098 +#define KFREE(ptr, size) kfree(ptr)
77099 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
77100 +                     blknrhashfn, blknreq);
77101 +#undef KFREE
77102 +#undef KMALLOC
77103 +
77104 +/* slab for znodes */
77105 +static struct kmem_cache *znode_cache;
77106 +
77107 +int znode_shift_order;
77108 +
77109 +/**
77110 + * init_znodes - create znode cache
77111 + *
77112 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
77113 + */
77114 +int init_znodes(void)
77115 +{
77116 +       znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
77117 +                                       SLAB_HWCACHE_ALIGN |
77118 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
77119 +       if (znode_cache == NULL)
77120 +               return RETERR(-ENOMEM);
77121 +
77122 +       for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
77123 +            ++znode_shift_order);
77124 +       --znode_shift_order;
77125 +       return 0;
77126 +}
77127 +
77128 +/**
77129 + * done_znodes - delete znode cache
77130 + *
77131 + * This is called on reiser4 module unloading or system shutdown.
77132 + */
77133 +void done_znodes(void)
77134 +{
77135 +       destroy_reiser4_cache(&znode_cache);
77136 +}
77137 +
77138 +/* call this to initialise tree of znodes */
77139 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
77140 +{
77141 +       int result;
77142 +       assert("umka-050", tree != NULL);
77143 +
77144 +       rwlock_init(&tree->dk_lock);
77145 +
77146 +       result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
77147 +       if (result != 0)
77148 +               return result;
77149 +       result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
77150 +       return result;
77151 +}
77152 +
77153 +/* free this znode */
77154 +void zfree(znode * node /* znode to free */ )
77155 +{
77156 +       assert("nikita-465", node != NULL);
77157 +       assert("nikita-2120", znode_page(node) == NULL);
77158 +       assert("nikita-2301", list_empty_careful(&node->lock.owners));
77159 +       assert("nikita-2302", list_empty_careful(&node->lock.requestors));
77160 +       assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
77161 +                              NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
77162 +       assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
77163 +       assert("nikita-3293", !znode_is_right_connected(node));
77164 +       assert("nikita-3294", !znode_is_left_connected(node));
77165 +       assert("nikita-3295", node->left == NULL);
77166 +       assert("nikita-3296", node->right == NULL);
77167 +
77168 +       /* not yet phash_jnode_destroy(ZJNODE(node)); */
77169 +
77170 +       kmem_cache_free(znode_cache, node);
77171 +}
77172 +
77173 +/* call this to free tree of znodes */
77174 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
77175 +{
77176 +       znode *node;
77177 +       znode *next;
77178 +       z_hash_table *ztable;
77179 +
77180 +       /* scan znode hash-tables and kill all znodes, then free hash tables
77181 +        * themselves. */
77182 +
77183 +       assert("nikita-795", tree != NULL);
77184 +
77185 +       ztable = &tree->zhash_table;
77186 +
77187 +       if (ztable->_table != NULL) {
77188 +               for_all_in_htable(ztable, z, node, next) {
77189 +                       node->c_count = 0;
77190 +                       node->in_parent.node = NULL;
77191 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
77192 +                       zdrop(node);
77193 +               }
77194 +
77195 +               z_hash_done(&tree->zhash_table);
77196 +       }
77197 +
77198 +       ztable = &tree->zfake_table;
77199 +
77200 +       if (ztable->_table != NULL) {
77201 +               for_all_in_htable(ztable, z, node, next) {
77202 +                       node->c_count = 0;
77203 +                       node->in_parent.node = NULL;
77204 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
77205 +                       zdrop(node);
77206 +               }
77207 +
77208 +               z_hash_done(&tree->zfake_table);
77209 +       }
77210 +}
77211 +
77212 +/* ZNODE STRUCTURES */
77213 +
77214 +/* allocate fresh znode */
77215 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
77216 +{
77217 +       znode *node;
77218 +
77219 +       node = kmem_cache_alloc(znode_cache, gfp_flag);
77220 +       return node;
77221 +}
77222 +
77223 +/* Initialize fields of znode
77224 +   @node:    znode to initialize;
77225 +   @parent:  parent znode;
77226 +   @tree:    tree we are in. */
77227 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
77228 +{
77229 +       assert("nikita-466", node != NULL);
77230 +       assert("umka-268", current_tree != NULL);
77231 +
77232 +       memset(node, 0, sizeof *node);
77233 +
77234 +       assert("umka-051", tree != NULL);
77235 +
77236 +       jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
77237 +       reiser4_init_lock(&node->lock);
77238 +       init_parent_coord(&node->in_parent, parent);
77239 +}
77240 +
77241 +/*
77242 + * remove znode from indices. This is called by jput() when last reference on
77243 + * znode is released.
77244 + */
77245 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
77246 +{
77247 +       assert("nikita-2108", node != NULL);
77248 +       assert("nikita-470", node->c_count == 0);
77249 +       assert_rw_write_locked(&(tree->tree_lock));
77250 +
77251 +       /* remove reference to this znode from cbk cache */
77252 +       cbk_cache_invalidate(node, tree);
77253 +
77254 +       /* update c_count of parent */
77255 +       if (znode_parent(node) != NULL) {
77256 +               assert("nikita-472", znode_parent(node)->c_count > 0);
77257 +               /* father, onto your hands I forward my spirit... */
77258 +               znode_parent(node)->c_count--;
77259 +               node->in_parent.node = NULL;
77260 +       } else {
77261 +               /* orphaned znode?! Root? */
77262 +       }
77263 +
77264 +       /* remove znode from hash-table */
77265 +       z_hash_remove_rcu(znode_get_htable(node), node);
77266 +}
77267 +
77268 +/* zdrop() -- Remove znode from the tree.
77269 +
77270 +   This is called when znode is removed from the memory. */
77271 +static void zdrop(znode * node /* znode to finish with */ )
77272 +{
77273 +       jdrop(ZJNODE(node));
77274 +}
77275 +
77276 +/*
77277 + * put znode into right place in the hash table. This is called by relocate
77278 + * code.
77279 + */
77280 +int znode_rehash(znode * node /* node to rehash */ ,
77281 +                const reiser4_block_nr * new_block_nr /* new block number */ )
77282 +{
77283 +       z_hash_table *oldtable;
77284 +       z_hash_table *newtable;
77285 +       reiser4_tree *tree;
77286 +
77287 +       assert("nikita-2018", node != NULL);
77288 +
77289 +       tree = znode_get_tree(node);
77290 +       oldtable = znode_get_htable(node);
77291 +       newtable = get_htable(tree, new_block_nr);
77292 +
77293 +       write_lock_tree(tree);
77294 +       /* remove znode from hash-table */
77295 +       z_hash_remove_rcu(oldtable, node);
77296 +
77297 +       /* assertion no longer valid due to RCU */
77298 +       /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77299 +
77300 +       /* update blocknr */
77301 +       znode_set_block(node, new_block_nr);
77302 +       node->zjnode.key.z = *new_block_nr;
77303 +
77304 +       /* insert it into hash */
77305 +       z_hash_insert_rcu(newtable, node);
77306 +       write_unlock_tree(tree);
77307 +       return 0;
77308 +}
77309 +
77310 +/* ZNODE LOOKUP, GET, PUT */
77311 +
77312 +/* zlook() - get znode with given block_nr in a hash table or return NULL
77313 +
77314 +   If result is non-NULL then the znode's x_count is incremented.  Internal version
77315 +   accepts pre-computed hash index.  The hash table is searched under
77316 +   rcu_read_lock().
77317 +*/
77318 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77319 +{
77320 +       znode *result;
77321 +       __u32 hash;
77322 +       z_hash_table *htable;
77323 +
77324 +       assert("jmacd-506", tree != NULL);
77325 +       assert("jmacd-507", blocknr != NULL);
77326 +
77327 +       htable = get_htable(tree, blocknr);
77328 +       hash = blknrhashfn(htable, blocknr);
77329 +
77330 +       rcu_read_lock();
77331 +       result = z_hash_find_index(htable, hash, blocknr);
77332 +
77333 +       if (result != NULL) {
77334 +               add_x_ref(ZJNODE(result));
77335 +               result = znode_rip_check(tree, result);
77336 +       }
77337 +       rcu_read_unlock();
77338 +
77339 +       return result;
77340 +}
77341 +
77342 +/* return hash table where znode with block @blocknr is (or should be)
77343 + * stored */
77344 +static z_hash_table *get_htable(reiser4_tree * tree,
77345 +                               const reiser4_block_nr * const blocknr)
77346 +{
77347 +       z_hash_table *table;
77348 +       if (is_disk_addr_unallocated(blocknr))
77349 +               table = &tree->zfake_table;
77350 +       else
77351 +               table = &tree->zhash_table;
77352 +       return table;
77353 +}
77354 +
77355 +/* return hash table where znode @node is (or should be) stored */
77356 +static z_hash_table *znode_get_htable(const znode * node)
77357 +{
77358 +       return get_htable(znode_get_tree(node), znode_get_block(node));
77359 +}
77360 +
77361 +/* zget() - get znode from hash table, allocating it if necessary.
77362 +
77363 +   First a call to zlook, locating a x-referenced znode if one
77364 +   exists.  If znode is not found, allocate new one and return.  Result
77365 +   is returned with x_count reference increased.
77366 +
77367 +   LOCKS TAKEN:   TREE_LOCK, ZNODE_LOCK
77368 +   LOCK ORDERING: NONE
77369 +*/
77370 +znode *zget(reiser4_tree * tree,
77371 +           const reiser4_block_nr * const blocknr,
77372 +           znode * parent, tree_level level, gfp_t gfp_flag)
77373 +{
77374 +       znode *result;
77375 +       __u32 hashi;
77376 +
77377 +       z_hash_table *zth;
77378 +
77379 +       assert("jmacd-512", tree != NULL);
77380 +       assert("jmacd-513", blocknr != NULL);
77381 +       assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77382 +
77383 +       zth = get_htable(tree, blocknr);
77384 +       hashi = blknrhashfn(zth, blocknr);
77385 +
77386 +       /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77387 +          implemented. */
77388 +
77389 +       z_hash_prefetch_bucket(zth, hashi);
77390 +
77391 +       rcu_read_lock();
77392 +       /* Find a matching BLOCKNR in the hash table.  If the znode is found,
77393 +          we obtain an reference (x_count) but the znode remains unlocked.
77394 +          Have to worry about race conditions later. */
77395 +       result = z_hash_find_index(zth, hashi, blocknr);
77396 +       /* According to the current design, the hash table lock protects new
77397 +          znode references. */
77398 +       if (result != NULL) {
77399 +               add_x_ref(ZJNODE(result));
77400 +               /* NOTE-NIKITA it should be so, but special case during
77401 +                  creation of new root makes such assertion highly
77402 +                  complicated.  */
77403 +               assert("nikita-2131", 1 || znode_parent(result) == parent ||
77404 +                      (ZF_ISSET(result, JNODE_ORPHAN)
77405 +                       && (znode_parent(result) == NULL)));
77406 +               result = znode_rip_check(tree, result);
77407 +       }
77408 +
77409 +       rcu_read_unlock();
77410 +
77411 +       if (!result) {
77412 +               znode *shadow;
77413 +
77414 +               result = zalloc(gfp_flag);
77415 +               if (!result) {
77416 +                       return ERR_PTR(RETERR(-ENOMEM));
77417 +               }
77418 +
77419 +               zinit(result, parent, tree);
77420 +               ZJNODE(result)->blocknr = *blocknr;
77421 +               ZJNODE(result)->key.z = *blocknr;
77422 +               result->level = level;
77423 +
77424 +               write_lock_tree(tree);
77425 +
77426 +               shadow = z_hash_find_index(zth, hashi, blocknr);
77427 +               if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77428 +                       jnode_list_remove(ZJNODE(result));
77429 +                       zfree(result);
77430 +                       result = shadow;
77431 +               } else {
77432 +                       result->version = znode_build_version(tree);
77433 +                       z_hash_insert_index_rcu(zth, hashi, result);
77434 +
77435 +                       if (parent != NULL)
77436 +                               ++parent->c_count;
77437 +               }
77438 +
77439 +               add_x_ref(ZJNODE(result));
77440 +
77441 +               write_unlock_tree(tree);
77442 +       }
77443 +#if REISER4_DEBUG
77444 +       if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77445 +               reiser4_check_block(blocknr, 1);
77446 +#endif
77447 +       /* Check for invalid tree level, return -EIO */
77448 +       if (unlikely(znode_get_level(result) != level)) {
77449 +               warning("jmacd-504",
77450 +                       "Wrong level for cached block %llu: %i expecting %i",
77451 +                       (unsigned long long)(*blocknr), znode_get_level(result),
77452 +                       level);
77453 +               zput(result);
77454 +               return ERR_PTR(RETERR(-EIO));
77455 +       }
77456 +
77457 +       assert("nikita-1227", znode_invariant(result));
77458 +
77459 +       return result;
77460 +}
77461 +
77462 +/* ZNODE PLUGINS/DATA */
77463 +
77464 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
77465 +   stored at the fixed offset from the beginning of the node. */
77466 +static node_plugin *znode_guess_plugin(const znode * node      /* znode to guess
77467 +                                                                * plugin of */ )
77468 +{
77469 +       reiser4_tree *tree;
77470 +
77471 +       assert("nikita-1053", node != NULL);
77472 +       assert("nikita-1055", zdata(node) != NULL);
77473 +
77474 +       tree = znode_get_tree(node);
77475 +       assert("umka-053", tree != NULL);
77476 +
77477 +       if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77478 +               return tree->nplug;
77479 +       } else {
77480 +               return node_plugin_by_disk_id
77481 +                   (tree, &((common_node_header *) zdata(node))->plugin_id);
77482 +#ifdef GUESS_EXISTS
77483 +               reiser4_plugin *plugin;
77484 +
77485 +               /* NOTE-NIKITA add locking here when dynamic plugins will be
77486 +                * implemented */
77487 +               for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77488 +                       if ((plugin->u.node.guess != NULL)
77489 +                           && plugin->u.node.guess(node))
77490 +                               return plugin;
77491 +               }
77492 +               warning("nikita-1057", "Cannot guess node plugin");
77493 +               print_znode("node", node);
77494 +               return NULL;
77495 +#endif
77496 +       }
77497 +}
77498 +
77499 +/* parse node header and install ->nplug */
77500 +int zparse(znode * node /* znode to parse */ )
77501 +{
77502 +       int result;
77503 +
77504 +       assert("nikita-1233", node != NULL);
77505 +       assert("nikita-2370", zdata(node) != NULL);
77506 +
77507 +       if (node->nplug == NULL) {
77508 +               node_plugin *nplug;
77509 +
77510 +               nplug = znode_guess_plugin(node);
77511 +               if (likely(nplug != NULL)) {
77512 +                       result = nplug->parse(node);
77513 +                       if (likely(result == 0))
77514 +                               node->nplug = nplug;
77515 +               } else {
77516 +                       result = RETERR(-EIO);
77517 +               }
77518 +       } else
77519 +               result = 0;
77520 +       return result;
77521 +}
77522 +
77523 +/* zload with readahead */
77524 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77525 +{
77526 +       int result;
77527 +
77528 +       assert("nikita-484", node != NULL);
77529 +       assert("nikita-1377", znode_invariant(node));
77530 +       assert("jmacd-7771", !znode_above_root(node));
77531 +       assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77532 +       assert("nikita-3016", reiser4_schedulable());
77533 +
77534 +       if (info)
77535 +               formatted_readahead(node, info);
77536 +
77537 +       result = jload(ZJNODE(node));
77538 +       assert("nikita-1378", znode_invariant(node));
77539 +       return result;
77540 +}
77541 +
77542 +/* load content of node into memory */
77543 +int zload(znode * node)
77544 +{
77545 +       return zload_ra(node, NULL);
77546 +}
77547 +
77548 +/* call node plugin to initialise newly allocated node. */
77549 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77550 +{
77551 +       return jinit_new(ZJNODE(node), gfp_flags);
77552 +}
77553 +
77554 +/* drop reference to node data. When last reference is dropped, data are
77555 +   unloaded. */
77556 +void zrelse(znode * node /* znode to release references to */ )
77557 +{
77558 +       assert("nikita-1381", znode_invariant(node));
77559 +
77560 +       jrelse(ZJNODE(node));
77561 +}
77562 +
77563 +/* returns free space in node */
77564 +unsigned znode_free_space(znode * node /* znode to query */ )
77565 +{
77566 +       assert("nikita-852", node != NULL);
77567 +       return node_plugin_by_node(node)->free_space(node);
77568 +}
77569 +
77570 +/* right delimiting key of znode */
77571 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77572 +{
77573 +       assert("nikita-958", node != NULL);
77574 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77575 +       assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77576 +       assert("nikita-30671", node->rd_key_version != 0);
77577 +       return &node->rd_key;
77578 +}
77579 +
77580 +/* left delimiting key of znode */
77581 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77582 +{
77583 +       assert("nikita-974", node != NULL);
77584 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77585 +       assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77586 +       assert("nikita-30681", node->ld_key_version != 0);
77587 +       return &node->ld_key;
77588 +}
77589 +
77590 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77591 +    )
77592 +
77593 +/* update right-delimiting key of @node */
77594 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77595 +{
77596 +       assert("nikita-2937", node != NULL);
77597 +       assert("nikita-2939", key != NULL);
77598 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77599 +       assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77600 +       assert("nikita-2944",
77601 +              znode_is_any_locked(node) ||
77602 +              znode_get_level(node) != LEAF_LEVEL ||
77603 +              keyge(key, &node->rd_key) ||
77604 +              keyeq(&node->rd_key, reiser4_min_key()) ||
77605 +              ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77606 +
77607 +       node->rd_key = *key;
77608 +       ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77609 +       return &node->rd_key;
77610 +}
77611 +
77612 +/* update left-delimiting key of @node */
77613 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77614 +{
77615 +       assert("nikita-2940", node != NULL);
77616 +       assert("nikita-2941", key != NULL);
77617 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77618 +       assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77619 +       assert("nikita-2943",
77620 +              znode_is_any_locked(node) || keyeq(&node->ld_key,
77621 +                                                 reiser4_min_key()));
77622 +
77623 +       node->ld_key = *key;
77624 +       ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77625 +       return &node->ld_key;
77626 +}
77627 +
77628 +/* true if @key is inside key range for @node */
77629 +int znode_contains_key(znode * node /* znode to look in */ ,
77630 +                      const reiser4_key * key /* key to look for */ )
77631 +{
77632 +       assert("nikita-1237", node != NULL);
77633 +       assert("nikita-1238", key != NULL);
77634 +
77635 +       /* left_delimiting_key <= key <= right_delimiting_key */
77636 +       return keyle(znode_get_ld_key(node), key)
77637 +           && keyle(key, znode_get_rd_key(node));
77638 +}
77639 +
77640 +/* same as znode_contains_key(), but lock dk lock */
77641 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
77642 +                           const reiser4_key * key /* key to look for */ )
77643 +{
77644 +       int result;
77645 +
77646 +       assert("umka-056", node != NULL);
77647 +       assert("umka-057", key != NULL);
77648 +
77649 +       read_lock_dk(znode_get_tree(node));
77650 +       result = znode_contains_key(node, key);
77651 +       read_unlock_dk(znode_get_tree(node));
77652 +       return result;
77653 +}
77654 +
77655 +/* get parent pointer, assuming tree is not locked */
77656 +znode *znode_parent_nolock(const znode * node /* child znode */ )
77657 +{
77658 +       assert("nikita-1444", node != NULL);
77659 +       return node->in_parent.node;
77660 +}
77661 +
77662 +/* get parent pointer of znode */
77663 +znode *znode_parent(const znode * node /* child znode */ )
77664 +{
77665 +       assert("nikita-1226", node != NULL);
77666 +       assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77667 +       return znode_parent_nolock(node);
77668 +}
77669 +
77670 +/* detect uber znode used to protect in-superblock tree root pointer */
77671 +int znode_above_root(const znode * node /* znode to query */ )
77672 +{
77673 +       assert("umka-059", node != NULL);
77674 +
77675 +       return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77676 +}
77677 +
77678 +/* check that @node is root---that its block number is recorded in the tree as
77679 +   that of root node */
77680 +#if REISER4_DEBUG
77681 +static int znode_is_true_root(const znode * node /* znode to query */ )
77682 +{
77683 +       assert("umka-060", node != NULL);
77684 +       assert("umka-061", current_tree != NULL);
77685 +
77686 +       return disk_addr_eq(znode_get_block(node),
77687 +                           &znode_get_tree(node)->root_block);
77688 +}
77689 +#endif
77690 +
77691 +/* check that @node is root */
77692 +int znode_is_root(const znode * node /* znode to query */ )
77693 +{
77694 +       assert("nikita-1206", node != NULL);
77695 +
77696 +       return znode_get_level(node) == znode_get_tree(node)->height;
77697 +}
77698 +
77699 +/* Returns true if @node was just created by zget() and wasn't ever loaded
77700 +   into memory. */
77701 +/* NIKITA-HANS: yes */
77702 +int znode_just_created(const znode * node)
77703 +{
77704 +       assert("nikita-2188", node != NULL);
77705 +       return (znode_page(node) == NULL);
77706 +}
77707 +
77708 +/* obtain updated ->znode_epoch. See seal.c for description. */
77709 +__u64 znode_build_version(reiser4_tree * tree)
77710 +{
77711 +       __u64 result;
77712 +
77713 +       spin_lock(&tree->epoch_lock);
77714 +       result = ++tree->znode_epoch;
77715 +       spin_unlock(&tree->epoch_lock);
77716 +       return result;
77717 +}
77718 +
77719 +void init_load_count(load_count * dh)
77720 +{
77721 +       assert("nikita-2105", dh != NULL);
77722 +       memset(dh, 0, sizeof *dh);
77723 +}
77724 +
77725 +void done_load_count(load_count * dh)
77726 +{
77727 +       assert("nikita-2106", dh != NULL);
77728 +       if (dh->node != NULL) {
77729 +               for (; dh->d_ref > 0; --dh->d_ref)
77730 +                       zrelse(dh->node);
77731 +               dh->node = NULL;
77732 +       }
77733 +}
77734 +
77735 +static int incr_load_count(load_count * dh)
77736 +{
77737 +       int result;
77738 +
77739 +       assert("nikita-2110", dh != NULL);
77740 +       assert("nikita-2111", dh->node != NULL);
77741 +
77742 +       result = zload(dh->node);
77743 +       if (result == 0)
77744 +               ++dh->d_ref;
77745 +       return result;
77746 +}
77747 +
77748 +int incr_load_count_znode(load_count * dh, znode * node)
77749 +{
77750 +       assert("nikita-2107", dh != NULL);
77751 +       assert("nikita-2158", node != NULL);
77752 +       assert("nikita-2109",
77753 +              ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77754 +
77755 +       dh->node = node;
77756 +       return incr_load_count(dh);
77757 +}
77758 +
77759 +int incr_load_count_jnode(load_count * dh, jnode * node)
77760 +{
77761 +       if (jnode_is_znode(node)) {
77762 +               return incr_load_count_znode(dh, JZNODE(node));
77763 +       }
77764 +       return 0;
77765 +}
77766 +
77767 +void copy_load_count(load_count * new, load_count * old)
77768 +{
77769 +       int ret = 0;
77770 +       done_load_count(new);
77771 +       new->node = old->node;
77772 +       new->d_ref = 0;
77773 +
77774 +       while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77775 +       }
77776 +
77777 +       assert("jmacd-87589", ret == 0);
77778 +}
77779 +
77780 +void move_load_count(load_count * new, load_count * old)
77781 +{
77782 +       done_load_count(new);
77783 +       new->node = old->node;
77784 +       new->d_ref = old->d_ref;
77785 +       old->node = NULL;
77786 +       old->d_ref = 0;
77787 +}
77788 +
77789 +/* convert parent pointer into coord */
77790 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77791 +{
77792 +       assert("nikita-3204", pcoord != NULL);
77793 +       assert("nikita-3205", coord != NULL);
77794 +
77795 +       coord_init_first_unit_nocheck(coord, pcoord->node);
77796 +       coord_set_item_pos(coord, pcoord->item_pos);
77797 +       coord->between = AT_UNIT;
77798 +}
77799 +
77800 +/* pack coord into parent_coord_t: only ->node and ->item_pos are copied */
77801 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77802 +{
77803 +       assert("nikita-3206", pcoord != NULL);
77804 +       assert("nikita-3207", coord != NULL);
77805 +
77806 +       pcoord->node = coord->node;
77807 +       pcoord->item_pos = coord->item_pos;
77808 +}
77809 +
77810 +/* Initialize a parent hint pointer (a field in znode, look for comments
77811 +   there): record @node and set the item_pos hint to (unsigned short)~0 */
77812 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77813 +{
77814 +       pcoord->node = (znode *) node;
77815 +       pcoord->item_pos = (unsigned short)~0;
77816 +}
77817 +
77818 +#if REISER4_DEBUG
77819 +
77820 +/* debugging aid: znode invariant; non-zero iff every check below holds */
77821 +static int znode_invariant_f(const znode * node /* znode to check */ ,
77822 +                            char const **msg   /* overwritten with the text
77823 +                                                * of each check made here */ )
77824 +{
77825 +#define _ergo(ant, con)                                                \
77826 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77827 +
77828 +#define _equi(e1, e2)                                          \
77829 +       ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77830 +
77831 +#define _check(exp) ((*msg) = #exp, (exp))
77832 +
77833 +       return jnode_invariant_f(ZJNODE(node), msg) &&
77834 +           /* [znode-fake] invariant */
77835 +           /* fake znode doesn't have a parent, and */
77836 +           _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77837 +           /* there is another way to express this very check, and */
77838 +           _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77839 +           /* it has special block number, and */
77840 +           _ergo(znode_get_level(node) == 0,
77841 +                 disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77842 +           /* it is the only znode with such block number, and */
77843 +           _ergo(!znode_above_root(node) && znode_is_loaded(node),
77844 +                 !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77845 +           /* it is parent of the tree root node */
77846 +           _ergo(znode_is_true_root(node),
77847 +                 znode_above_root(znode_parent(node))) &&
77848 +           /* [znode-level] invariant */
77849 +           /* level of parent znode is one larger than that of child,
77850 +              except for the fake znode, and */
77851 +           _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77852 +                 znode_get_level(znode_parent(node)) ==
77853 +                 znode_get_level(node) + 1) &&
77854 +           /* left neighbor is at the same level, and */
77855 +           _ergo(znode_is_left_connected(node) && node->left != NULL,
77856 +                 znode_get_level(node) == znode_get_level(node->left)) &&
77857 +           /* right neighbor is at the same level */
77858 +           _ergo(znode_is_right_connected(node) && node->right != NULL,
77859 +                 znode_get_level(node) == znode_get_level(node->right)) &&
77860 +           /* [znode-connected] invariant */
77861 +           _ergo(node->left != NULL, znode_is_left_connected(node)) &&
77862 +           _ergo(node->right != NULL, znode_is_right_connected(node)) &&
77863 +           _ergo(!znode_is_root(node) && node->left != NULL,
77864 +                 znode_is_right_connected(node->left) &&
77865 +                 node->left->right == node) &&
77866 +           _ergo(!znode_is_root(node) && node->right != NULL,
77867 +                 znode_is_left_connected(node->right) &&
77868 +                 node->right->left == node) &&
77869 +           /* [znode-c_count] invariant */
77870 +           /* for any znode, c_count of its parent is greater than 0 */
77871 +           _ergo(znode_parent(node) != NULL &&
77872 +                 !znode_above_root(znode_parent(node)),
77873 +                 znode_parent(node)->c_count > 0) &&
77874 +           /* leaves don't have children */
77875 +           _ergo(znode_get_level(node) == LEAF_LEVEL,
77876 +                 node->c_count == 0) &&
77877 +           _check(node->zjnode.jnodes.prev != NULL) &&
77878 +           _check(node->zjnode.jnodes.next != NULL) &&
77879 +           /* orphan doesn't have a parent */
77880 +           _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
77881 +           /* [znode-modify] invariant */
77882 +           /* if znode is not write-locked, its checksum remains
77883 +            * invariant */
77884 +           /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
77885 +            * cannot check this. */
77886 +           /* [znode-refs] invariant */
77887 +           /* only referenced znode can be long-term locked */
77888 +           _ergo(znode_is_locked(node),
77889 +                 atomic_read(&ZJNODE(node)->x_count) != 0);
77890 +}
77891 +
77892 +/* debugging aid: check znode invariant; warn and return 0 if it fails */
77893 +int znode_invariant(znode * node /* znode to check */ )
77894 +{
77895 +       char const *failed_msg;
77896 +       int result;
77897 +
77898 +       assert("umka-063", node != NULL);
77899 +       assert("umka-064", current_tree != NULL);
77900 +
77901 +       spin_lock_znode(node);
77902 +       read_lock_tree(znode_get_tree(node));
77903 +       result = znode_invariant_f(node, &failed_msg);
77904 +       if (!result) {
77905 +               /* print_znode("corrupted node", node); */
77906 +               warning("jmacd-555", "Condition %s failed", failed_msg);
77907 +       }
77908 +       read_unlock_tree(znode_get_tree(node));
77909 +       spin_unlock_znode(node);
77910 +       return result;
77911 +}
77912 +
77913 +/* return non-0 iff data are loaded into znode (asks the embedded jnode) */
77914 +int znode_is_loaded(const znode * node /* znode to query */ )
77915 +{
77916 +       assert("nikita-497", node != NULL);
77917 +       return jnode_is_loaded(ZJNODE(node));
77918 +}
77919 +
77920 +unsigned long znode_times_locked(const znode * z)
77921 +{
77922 +       return z->times_locked; /* field exists only under REISER4_DEBUG */
77923 +}
77924 +
77925 +#endif                         /* REISER4_DEBUG */
77926 +
77927 +/* Make Linus happy.
77928 +   Local variables:
77929 +   c-indentation-style: "K&R"
77930 +   mode-name: "LC"
77931 +   c-basic-offset: 8
77932 +   tab-width: 8
77933 +   fill-column: 120
77934 +   End:
77935 +*/
77936 diff -urN linux-2.6.35.orig/fs/reiser4/znode.h linux-2.6.35/fs/reiser4/znode.h
77937 --- linux-2.6.35.orig/fs/reiser4/znode.h        1970-01-01 01:00:00.000000000 +0100
77938 +++ linux-2.6.35/fs/reiser4/znode.h     2010-08-04 15:44:57.000000000 +0200
77939 @@ -0,0 +1,433 @@
77940 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
77941 + * reiser4/README */
77942 +
77943 +/* Declaration of znode (Zam's node). See znode.c for more details. */
77944 +
77945 +#ifndef __ZNODE_H__
77946 +#define __ZNODE_H__
77947 +
77948 +#include "forward.h"
77949 +#include "debug.h"
77950 +#include "dformat.h"
77951 +#include "key.h"
77952 +#include "coord.h"
77953 +#include "plugin/node/node.h"
77954 +#include "jnode.h"
77955 +#include "lock.h"
77956 +#include "readahead.h"
77957 +
77958 +#include <linux/types.h>
77959 +#include <linux/spinlock.h>
77960 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
77961 +#include <asm/atomic.h>
77962 +
77963 +/* znode tracks its position within parent (internal item in a parent node,
77964 + * that contains znode's block number). */
77965 +typedef struct parent_coord {
77966 +       znode *node;            /* the parent znode */
77967 +       pos_in_node_t item_pos; /* cached position of the item in ->node */
77968 +} parent_coord_t;
77969 +
77970 +/* &znode - node in a reiser4 tree.
77971 +
77972 +   NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
77973 +   cacheline pressure.
77974 +
77975 +   Locking:
77976 +
77977 +   Long term: data in a disk node attached to this znode are protected
77978 +   by long term, deadlock aware lock ->lock;
77979 +
77980 +   Spin lock: the following fields are protected by the spin lock:
77981 +
77982 +    ->lock
77983 +
77984 +   Following fields are protected by the global tree lock:
77985 +
77986 +    ->left
77987 +    ->right
77988 +    ->in_parent
77989 +    ->c_count
77990 +
77991 +   Following fields are protected by the global delimiting key lock (dk_lock):
77992 +
77993 +    ->ld_key (to update ->ld_key long-term lock on the node is also required)
77994 +    ->rd_key
77995 +
77996 +   Following fields are protected by the long term lock:
77997 +
77998 +    ->nr_items
77999 +
78000 +   ->node_plugin is never changed once set. This means that after code made
78001 +   itself sure that field is valid it can be accessed without any additional
78002 +   locking.
78003 +
78004 +   ->level is immutable.
78005 +
78006 +   Invariants involving this data-type:
78007 +
78008 +      [znode-fake]
78009 +      [znode-level]
78010 +      [znode-connected]
78011 +      [znode-c_count]
78012 +      [znode-refs]
78013 +      [jnode-refs]
78014 +      [jnode-queued]
78015 +      [znode-modify]
78016 +
78017 +    For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
78018 +    Suggestions for how to do that are desired.*/
78019 +struct znode {
78020 +       /* Embedded jnode. */
78021 +       jnode zjnode;
78022 +
78023 +       /* contains two subfields, node and item_pos.
78024 +
78025 +          item_pos is only a hint that is cached to speed up lookups during
78026 +          balancing. It is not required to be up to date. Synched in
78027 +          find_child_ptr().
78028 +
78029 +          This value allows us to avoid expensive binary searches.
78030 +
78031 +          in_parent.node points to the parent of this node, and is NOT a
78032 +          hint.
78033 +        */
78034 +       parent_coord_t in_parent;
78035 +
78036 +       /*
78037 +        * sibling list pointers
78038 +        */
78039 +
78040 +       /* left-neighbor */
78041 +       znode *left;
78042 +       /* right-neighbor */
78043 +       znode *right;
78044 +
78045 +       /* long term lock on node content. This lock supports deadlock
78046 +          detection. See lock.c
78047 +        */
78048 +       zlock lock;
78049 +
78050 +       /* You cannot remove from memory a node that has children in
78051 +          memory. This is because we rely on the fact that parent of given
78052 +          node can always be reached without blocking for io. When reading a
78053 +          node into memory you must increase the c_count of its parent, when
78054 +          removing it from memory you must decrease the c_count.  This makes
78055 +          the code simpler, and the cases where it is suboptimal are truly
78056 +          obscure.
78057 +        */
78058 +       int c_count;
78059 +
78060 +       /* plugin of node attached to this znode. NULL if znode is not
78061 +          loaded. */
78062 +       node_plugin *nplug;
78063 +
78064 +       /* version of znode data. This is increased on each modification. This
78065 +        * is necessary to implement seals (see seal.[ch]) efficiently. */
78066 +       __u64 version;
78067 +
78068 +       /* left delimiting key. Necessary to efficiently perform
78069 +          balancing with node-level locking. Kept in memory only. */
78070 +       reiser4_key ld_key;
78071 +       /* right delimiting key. */
78072 +       reiser4_key rd_key;
78073 +
78074 +       /* znode's tree level */
78075 +       __u16 level;
78076 +       /* number of items in this node. This field is modified by node
78077 +        * plugin. */
78078 +       __u16 nr_items;
78079 +
78080 +#if REISER4_DEBUG
78081 +       void *creator;
78082 +       reiser4_key first_key;
78083 +       unsigned long times_locked;
78084 +       int left_version;       /* when node->left was updated */
78085 +       int right_version;      /* when node->right was updated */
78086 +       int ld_key_version;     /* when node->ld_key was updated */
78087 +       int rd_key_version;     /* when node->rd_key was updated */
78088 +#endif
78089 +
78090 +} __attribute__ ((aligned(16)));
78091 +
78092 +ON_DEBUG(extern atomic_t delim_key_version;
78093 +    )
78094 +
78095 +/* Lock-state queries on ->lock; ideally not exposed. Args are parenthesized. */
78096 +#define znode_is_locked(node)          (lock_is_locked(&(node)->lock))
78097 +#define znode_is_rlocked(node)         (lock_is_rlocked(&(node)->lock))
78098 +#define znode_is_wlocked(node)         (lock_is_wlocked(&(node)->lock))
78099 +#define znode_is_wlocked_once(node)    (lock_is_wlocked_once(&(node)->lock))
78100 +#define znode_can_be_rlocked(node)     (lock_can_be_rlocked(&(node)->lock))
78101 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&(node)->lock, (mode)))
78102 +/* Macros for accessing the znode state. */
78103 +#define        ZF_CLR(p,f)             JF_CLR  (ZJNODE(p), (f))
78104 +#define        ZF_ISSET(p,f)           JF_ISSET(ZJNODE(p), (f))
78105 +#define        ZF_SET(p,f)             JF_SET  (ZJNODE(p), (f))
78106 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
78107 +                  znode * parent, tree_level level, gfp_t gfp_flag);
78108 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
78109 +extern int zload(znode * node);
78110 +extern int zload_ra(znode * node, ra_info_t * info);
78111 +extern int zinit_new(znode * node, gfp_t gfp_flags);
78112 +extern void zrelse(znode * node);
78113 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
78114 +
78115 +/* size of data in znode: always PAGE_CACHE_SIZE, @node is only asserted on */
78116 +static inline unsigned
78117 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
78118 +{
78119 +       assert("nikita-1416", node != NULL);
78120 +       return PAGE_CACHE_SIZE;
78121 +}
78122 +
78123 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
78124 +                                 coord_t * coord);
78125 +extern void coord_to_parent_coord(const coord_t * coord,
78126 +                                 parent_coord_t * pcoord);
78127 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
78128 +
78129 +extern unsigned znode_free_space(znode * node);
78130 +
78131 +extern reiser4_key *znode_get_rd_key(znode * node);
78132 +extern reiser4_key *znode_get_ld_key(znode * node);
78133 +
78134 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
78135 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
78136 +
78137 +/* `connected' state checks */
78138 +static inline int znode_is_right_connected(const znode * node)
78139 +{
78140 +       return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
78141 +}
78142 +
78143 +static inline int znode_is_left_connected(const znode * node)
78144 +{
78145 +       return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
78146 +}
78147 +
78148 +static inline int znode_is_connected(const znode * node)
78149 +{
78150 +       return znode_is_right_connected(node) && znode_is_left_connected(node);
78151 +}
78152 +
78153 +extern int znode_shift_order;
78154 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
78155 +extern void znode_remove(znode *, reiser4_tree *);
78156 +extern znode *znode_parent(const znode * node);
78157 +extern znode *znode_parent_nolock(const znode * node);
78158 +extern int znode_above_root(const znode * node);
78159 +extern int init_znodes(void);
78160 +extern void done_znodes(void);
78161 +extern int znodes_tree_init(reiser4_tree * ztree);
78162 +extern void znodes_tree_done(reiser4_tree * ztree);
78163 +extern int znode_contains_key(znode * node, const reiser4_key * key);
78164 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
78165 +extern unsigned znode_save_free_space(znode * node);
78166 +extern unsigned znode_recover_free_space(znode * node);
78167 +extern znode *zalloc(gfp_t gfp_flag);
78168 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
78169 +extern int zparse(znode * node);
78170 +
78171 +extern int znode_just_created(const znode * node);
78172 +
78173 +extern void zfree(znode * node);
78174 +
78175 +#if REISER4_DEBUG
78176 +extern void print_znode(const char *prefix, const znode * node);
78177 +#else
78178 +#define print_znode( p, n ) noop
78179 +#endif
78180 +
78181 +/* Make it look like various znode functions exist instead of treating znodes as
78182 +   jnodes in znode-specific code. */
78183 +#define znode_page(x)               jnode_page ( ZJNODE(x) )
78184 +#define zdata(x)                    jdata ( ZJNODE(x) )
78185 +#define znode_get_block(x)          jnode_get_block ( ZJNODE(x) )
78186 +#define znode_created(x)            jnode_created ( ZJNODE(x) )
78187 +#define znode_set_created(x)        jnode_set_created ( ZJNODE(x) )
78188 +#define znode_convertible(x)        jnode_convertible (ZJNODE(x))
78189 +#define znode_set_convertible(x)    jnode_set_convertible (ZJNODE(x))
78190 +
78191 +#define znode_is_dirty(x)           jnode_is_dirty    ( ZJNODE(x) )
78192 +#define znode_check_dirty(x)        jnode_check_dirty ( ZJNODE(x) )
78193 +#define znode_make_clean(x)         jnode_make_clean   ( ZJNODE(x) )
78194 +#define znode_set_block(x, b)       jnode_set_block ( ZJNODE(x), (b) )
78195 +
78196 +#define spin_lock_znode(x)          spin_lock_jnode ( ZJNODE(x) )
78197 +#define spin_unlock_znode(x)        spin_unlock_jnode ( ZJNODE(x) )
78198 +#define spin_trylock_znode(x)       spin_trylock_jnode ( ZJNODE(x) )
78199 +#define spin_znode_is_locked(x)     spin_jnode_is_locked ( ZJNODE(x) )
78200 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
78201 +
78202 +#if REISER4_DEBUG
78203 +extern int znode_x_count_is_protected(const znode * node);
78204 +extern int znode_invariant(znode * node);
78205 +#endif
78206 +
78207 +/* acquire reference to @node, by jref() on its embedded jnode */
78208 +static inline znode *zref(znode * node)
78209 +{
78210 +       /* change of x_count from 0 to 1 is protected by tree spin-lock */
78211 +       return JZNODE(jref(ZJNODE(node)));
78212 +}
78213 +
78214 +/* release reference to @node, after asserting the znode invariant */
78215 +static inline void zput(znode * node)
78216 +{
78217 +       assert("nikita-3564", znode_invariant(node));
78218 +       jput(ZJNODE(node));
78219 +}
78220 +
78221 +/* get the level field for a znode */
78222 +static inline tree_level znode_get_level(const znode * node)
78223 +{
78224 +       return node->level;
78225 +}
78226 +
78227 +/* tree level of a jnode: taken from the znode when @node is formatted */
78228 +static inline tree_level jnode_get_level(const jnode * node)
78229 +{
78230 +       /* unformatted nodes are all at the LEAF_LEVEL and for
78231 +          "semi-formatted" nodes like bitmaps, level doesn't matter. */
78232 +       if (!jnode_is_znode(node))
78233 +               return LEAF_LEVEL;
78234 +
78235 +       return znode_get_level(JZNODE(node));
78236 +}
78237 +
78238 +/* 1 if jnode is on leaf level, 0 otherwise */
78239 +static inline int jnode_is_leaf(const jnode * node)
78240 +{
78241 +       /* a znode is judged by its level; of the other jnode types only
78242 +          unformatted blocks are leaves */
78243 +       if (!jnode_is_znode(node))
78244 +               return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
78245 +       return znode_get_level(JZNODE(node)) == LEAF_LEVEL;
78246 +}
78247 +
78248 +/* return znode's tree */
78249 +static inline reiser4_tree *znode_get_tree(const znode * node)
78250 +{
78251 +       assert("nikita-2692", node != NULL);
78252 +       return jnode_get_tree(ZJNODE(node));
78253 +}
78254 +
78255 +/* resolve race with zput */
78256 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
78257 +{
78258 +       jnode *synced;
78259 +
78260 +       /* jnode_rip_sync() answers with a jnode or NULL; translate that
78261 +          answer back into znode terms */
78262 +       synced = jnode_rip_sync(tree, ZJNODE(node));
78263 +       if (likely(synced != NULL))
78264 +               return JZNODE(synced);
78265 +       return NULL;
78266 +}
78267 +
78268 +#if defined(REISER4_DEBUG)
78269 +int znode_is_loaded(const znode * node /* znode to query */ );
78270 +#endif
78271 +
78272 +extern __u64 znode_build_version(reiser4_tree * tree);
78273 +
78274 +/* Data-handles.  A data handle object manages pairing calls to zload() and zrelse().  We
78275 +   must load the data for a node in many places.  We could do this by simply calling
78276 +   zload() everywhere, the difficulty arises when we must release the loaded data by
78277 +   calling zrelse.  In a function with many possible error/return paths, it requires extra
78278 +   work to figure out which exit paths must call zrelse and those which do not.  The data
78279 +   handle automatically calls zrelse for every zload that it is responsible for.  In that
78280 +   sense, it acts much like a lock_handle.
78281 +*/
78282 +typedef struct load_count {
78283 +       znode *node;    /* current node of the handle, NULL when unset */
78284 +       int d_ref;      /* successful zload() calls recorded by the handle */
78285 +} load_count;
78286 +
78287 +extern void init_load_count(load_count * lc);  /* Initialize a load_count: set the current node to NULL. */
78288 +extern void done_load_count(load_count * dh);  /* Finalize a load_count: call zrelse() if necessary */
78289 +extern int incr_load_count_znode(load_count * dh, znode * node);       /* Set the argument znode to the current node, call zload(). */
78290 +extern int incr_load_count_jnode(load_count * dh, jnode * node);       /* If the argument jnode is formatted, do the same as
78291 +                                                                        * incr_load_count_znode, otherwise do nothing (unformatted nodes
78292 +                                                                        * don't require zload/zrelse treatment). */
78293 +extern void move_load_count(load_count * new, load_count * old);       /* Move the contents of a load_count.  Old handle is left empty (node NULL, d_ref 0). */
78294 +extern void copy_load_count(load_count * new, load_count * old);       /* Copy the contents of a load_count.  Old handle remains held. */
78295 +
78296 +/* Variable initializers for load_count: compound literals of that type. */
78297 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
78298 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
78299 +/* A convenience macro for assertions or debug-only code that needs loaded
78300 +   data just for a check: @exp is evaluated between zload() and zrelse().
78301 +   If zload() fails, its result is the value and @exp is not evaluated. */
78302 +#define WITH_DATA( node, exp )                         \
78303 +({                                                     \
78304 +       long __with_dh_result;                          \
78305 +       znode *__with_dh_node;                          \
78306 +                                                       \
78307 +       __with_dh_node = ( node );                      \
78308 +       __with_dh_result = zload( __with_dh_node );     \
78309 +       if( __with_dh_result == 0 ) {                   \
78310 +               __with_dh_result = ( long )( exp );     \
78311 +               zrelse( __with_dh_node );               \
78312 +       }                                               \
78313 +       __with_dh_result;                               \
78314 +})
78315 +
78316 +/* Same as above, but yields @ret if zload() fails; the value is an int. */
78317 +#define WITH_DATA_RET( node, ret, exp )                        \
78318 +({                                                     \
78319 +       int __with_dh_result;                           \
78320 +       znode *__with_dh_node;                          \
78321 +                                                       \
78322 +       __with_dh_node = ( node );                      \
78323 +       __with_dh_result = zload( __with_dh_node );     \
78324 +       if( __with_dh_result == 0 ) {                   \
78325 +               __with_dh_result = ( int )( exp );      \
78326 +               zrelse( __with_dh_node );               \
78327 +       } else                                          \
78328 +               __with_dh_result = ( ret );             \
78329 +       __with_dh_result;                               \
78330 +})
78331 +
78332 +#define WITH_COORD(coord, exp)                 \
78333 +({                                             \
78334 +       coord_t *__coord;                       \
78335 +       /* coord_clear_iplug() first, then @exp under WITH_DATA */ \
78336 +       __coord = (coord);                      \
78337 +       coord_clear_iplug(__coord);             \
78338 +       WITH_DATA(__coord->node, exp);          \
78339 +})
78340 +
78341 +#if REISER4_DEBUG /* STORE_ snapshots lock counters; CHECK_ asserts all but x/t/d_refs still match */
78342 +#define STORE_COUNTERS                                         \
78343 +       reiser4_lock_cnt_info __entry_counters =                \
78344 +               *reiser4_lock_counters()
78345 +#define CHECK_COUNTERS                                                 \
78346 +ON_DEBUG_CONTEXT(                                                      \
78347 +({                                                                     \
78348 +       __entry_counters.x_refs = reiser4_lock_counters() -> x_refs;    \
78349 +       __entry_counters.t_refs = reiser4_lock_counters() -> t_refs;    \
78350 +       __entry_counters.d_refs = reiser4_lock_counters() -> d_refs;    \
78351 +       assert("nikita-2159",                                           \
78352 +              !memcmp(&__entry_counters, reiser4_lock_counters(),      \
78353 +                      sizeof __entry_counters));                       \
78354 +}) )
78355 +
78356 +#else
78357 +#define STORE_COUNTERS
78358 +#define CHECK_COUNTERS noop
78359 +#endif
78360 +
78361 +/* __ZNODE_H__ */
78362 +#endif
78363 +
78364 +/* Make Linus happy.
78365 +   Local variables:
78366 +   c-indentation-style: "K&R"
78367 +   mode-name: "LC"
78368 +   c-basic-offset: 8
78369 +   tab-width: 8
78370 +   fill-column: 120
78371 +   End:
78372 +*/
78373 diff -urN linux-2.6.35.orig/include/linux/fs.h linux-2.6.35/include/linux/fs.h
78374 --- linux-2.6.35.orig/include/linux/fs.h        2010-08-02 00:11:14.000000000 +0200
78375 +++ linux-2.6.35/include/linux/fs.h     2010-08-04 18:01:31.000000000 +0200
78376 @@ -514,6 +514,7 @@
78377  struct page;
78378  struct address_space;
78379  struct writeback_control;
78380 +struct bdi_writeback;
78381
78382  struct iov_iter {
78383         const struct iovec *iov;
78384 @@ -1572,7 +1573,12 @@
78385         int (*remount_fs) (struct super_block *, int *, char *);
78386         void (*clear_inode) (struct inode *);
78387         void (*umount_begin) (struct super_block *);
78388 -
78389 +       int (*writeback_inodes)(struct super_block *sb,
78390 +                               struct bdi_writeback *wb,
78391 +                               struct writeback_control *wbc,
78392 +                               bool only_this_sb);
78393 +       void (*sync_inodes) (struct super_block *sb,
78394 +                               struct writeback_control *wbc);
78395         int (*show_options)(struct seq_file *, struct vfsmount *);
78396         int (*show_stats)(struct seq_file *, struct vfsmount *);
78397  #ifdef CONFIG_QUOTA
78398 @@ -2088,6 +2094,12 @@
78399  extern int invalidate_inode_pages2_range(struct address_space *mapping,
78400                                          pgoff_t start, pgoff_t end);
78401  extern int write_inode_now(struct inode *, int);
78402 +extern void writeback_skip_sb_inodes(struct super_block *sb,
78403 +                                    struct bdi_writeback *wb);
78404 +extern int generic_writeback_sb_inodes(struct super_block *sb,
78405 +                                      struct bdi_writeback *wb,
78406 +                                      struct writeback_control *wbc,
78407 +                                      bool only_this_sb);
78408  extern int filemap_fdatawrite(struct address_space *);
78409  extern int filemap_flush(struct address_space *);
78410  extern int filemap_fdatawait(struct address_space *);
78411 diff -urN linux-2.6.35.orig/include/linux/mm.h linux-2.6.35/include/linux/mm.h
78412 --- linux-2.6.35.orig/include/linux/mm.h        2010-08-02 00:11:14.000000000 +0200
78413 +++ linux-2.6.35/include/linux/mm.h     2010-08-04 15:44:57.000000000 +0200
78414 @@ -857,6 +857,7 @@
78415  void account_page_dirtied(struct page *page, struct address_space *mapping);
78416  int set_page_dirty(struct page *page);
78417  int set_page_dirty_lock(struct page *page);
78418 +int set_page_dirty_notag(struct page *page);
78419  int clear_page_dirty_for_io(struct page *page);
78420
78421  extern unsigned long move_page_tables(struct vm_area_struct *vma,
78422 diff -urN linux-2.6.35.orig/include/linux/writeback.h linux-2.6.35/include/linux/writeback.h
78423 --- linux-2.6.35.orig/include/linux/writeback.h 2010-08-02 00:11:14.000000000 +0200
78424 +++ linux-2.6.35/include/linux/writeback.h      2010-08-04 15:44:57.000000000 +0200
78425 @@ -13,6 +13,12 @@
78426  extern struct list_head inode_in_use;
78427  extern struct list_head inode_unused;
78428
78429 +static inline int is_flush_bd_task(struct task_struct *task)
78430 +{
78431 +       return task->flags & PF_FLUSHER;
78432 +}
78433 +#define current_is_flush_bd_task()  is_flush_bd_task(current)
78434 +
78435  /*
78436   * fs/fs-writeback.c
78437   */
78438 diff -urN linux-2.6.35.orig/mm/filemap.c linux-2.6.35/mm/filemap.c
78439 --- linux-2.6.35.orig/mm/filemap.c      2010-08-02 00:11:14.000000000 +0200
78440 +++ linux-2.6.35/mm/filemap.c   2010-08-04 17:20:40.000000000 +0200
78441 @@ -139,6 +139,7 @@
78442                 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
78443         }
78444  }
78445 +EXPORT_SYMBOL(__remove_from_page_cache);
78446
78447  void remove_from_page_cache(struct page *page)
78448  {
78449 @@ -955,6 +956,7 @@
78450  {
78451         ra->ra_pages /= 4;
78452  }
78453 +EXPORT_SYMBOL(find_get_pages);
78454
78455  /**
78456   * do_generic_file_read - generic file read routine
78457 diff -urN linux-2.6.35.orig/mm/page-writeback.c linux-2.6.35/mm/page-writeback.c
78458 --- linux-2.6.35.orig/mm/page-writeback.c       2010-08-02 00:11:14.000000000 +0200
78459 +++ linux-2.6.35/mm/page-writeback.c    2010-08-04 15:44:57.000000000 +0200
78460 @@ -1142,6 +1142,32 @@
78461  EXPORT_SYMBOL(__set_page_dirty_nobuffers);
78462
78463  /*
78464 + * set_page_dirty_notag() -- similar to __set_page_dirty_nobuffers()
78465 + * except it doesn't tag the page dirty in the page-cache radix tree.
78466 + * This means that the address space using this cannot use the regular
78467 + * filemap ->writepages() helpers and must provide its own means of
78468 + * tracking and finding non-tagged dirty pages.
78469 + * Returns 1 if the page was newly dirtied, 0 if it was already dirty.
78470 + * NOTE: no truncate-race handling: page->mapping is used without NULL check.
78471 + */
78472 +int set_page_dirty_notag(struct page *page)
78473 +{
78474 +       struct address_space *mapping = page->mapping;
78475 +
78476 +       if (!TestSetPageDirty(page)) {
78477 +               unsigned long flags;
78478 +               WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
78479 +               local_irq_save(flags);
78480 +               account_page_dirtied(page, mapping);
78481 +               local_irq_restore(flags);
78482 +               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
78483 +               return 1;
78484 +       }
78485 +       return 0;
78486 +}
78487 +EXPORT_SYMBOL(set_page_dirty_notag);
78488 +
78489 +/*
78490   * When a writepage implementation decides that it doesn't want to write this
78491   * page for some reason, it should redirty the locked page via
78492   * redirty_page_for_writepage() and it should then unlock the page and return 0