kernel-reiser4.patch

   1 diff -urN linux-2.6.27.orig/Documentation/Changes linux-2.6.27/Documentation/Changes
   2 --- linux-2.6.27.orig/Documentation/Changes     2008-07-14 01:51:29.000000000 +0400
   3 +++ linux-2.6.27/Documentation/Changes  2008-10-12 18:20:00.000000000 +0400
   4 @@ -36,6 +36,7 @@
   5  o  e2fsprogs              1.29                    # tune2fs
   6  o  jfsutils               1.1.3                   # fsck.jfs -V
   7  o  reiserfsprogs          3.6.3                   # reiserfsck -V 2>&1|grep reiserfsprogs
   8 +o  reiser4progs           1.0.0                   # fsck.reiser4 -V
   9  o  xfsprogs               2.6.0                   # xfs_db -V
  10  o  pcmciautils            004                     # pccardctl -V
  11  o  quota-tools            3.09                    # quota -V
  12 @@ -145,6 +146,13 @@
  13  versions of mkreiserfs, resize_reiserfs, debugreiserfs and
  14  reiserfsck. These utils work on both i386 and alpha platforms.
  15
  16 +Reiser4progs
  17 +------------
  18 +
  19 +The reiser4progs package contains utilities for the reiser4 file system.
  20 +Detailed instructions are provided in the README file located at:
  21 +<ftp://ftp.namesys.com/pub/reiser4progs/README>.
  22 +
  23  Xfsprogs
  24  --------
  25
  26 @@ -323,6 +331,10 @@
  27  -------------
  28  o  <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
  29
  30 +Reiser4progs
  31 +------------
  32 +o  <ftp://ftp.namesys.com/pub/reiser4progs/>
  33 +
  34  Xfsprogs
  35  --------
  36  o  <ftp://oss.sgi.com/projects/xfs/download/>
  37 diff -urN linux-2.6.27.orig/Documentation/filesystems/reiser4.txt linux-2.6.27/Documentation/filesystems/reiser4.txt
  38 --- linux-2.6.27.orig/Documentation/filesystems/reiser4.txt     1970-01-01 03:00:00.000000000 +0300
  39 +++ linux-2.6.27/Documentation/filesystems/reiser4.txt  2008-10-12 18:20:00.000000000 +0400
  40 @@ -0,0 +1,75 @@
  41 +Reiser4 filesystem
  42 +==================
  43 +Reiser4 is a file system based on dancing tree algorithms, and is
  44 +described at http://www.namesys.com
  45 +
  46 +
  47 +References
  48 +==========
  49 +web page               http://namesys.com/v4/v4.html
  50 +source code            ftp://ftp.namesys.com/pub/reiser4-for-2.6/
  51 +userland tools         ftp://ftp.namesys.com/pub/reiser4progs/
  52 +install page           http://www.namesys.com/install_v4.html
  53 +
  54 +Compile options
  55 +===============
  56 +Enable reiser4 debug mode
  57 +       This checks everything imaginable while reiser4
  58 +       runs
  59 +
  60 +Mount options
  61 +=============
  62 +tmgr.atom_max_size=N
  63 +       Atoms containing more than N blocks will be forced to commit.
  64 +       N is decimal.
  65 +       Default is nr_free_pagecache_pages() / 2 at mount time.
  66 +
  67 +tmgr.atom_max_age=N
  68 +       Atoms older than N seconds will be forced to commit. N is decimal.
  69 +       Default is 600.
  70 +
  71 +tmgr.atom_max_flushers=N
  72 +       Limit of concurrent flushers for one atom. 0 means no limit.
  73 +       Default is 0.
  74 +
  75 +tree.cbk_cache.nr_slots=N
  76 +       Number of slots in the cbk cache.
  77 +
  78 +flush.relocate_threshold=N
  79 +       If flush finds more than N adjacent dirty leaf-level blocks it
  80 +       will force them to be relocated.
  81 +       Default is 64.
  82 +
  83 +flush.relocate_distance=N
  84 +       If flush can find a block allocation closer than at most
  85 +       N from the preceder it will relocate to that position.
  86 +       Default is 64.
  87 +
  88 +flush.scan_maxnodes=N
  89 +       The maximum number of nodes to scan left on a level during
  90 +       flush.
  91 +       Default is 10000.
  92 +
  93 +optimal_io_size=N
  94 +       Preferred IO size. This value is used to set st_blksize of
  95 +       struct stat.
  96 +       Default is 65536.
  97 +
  98 +bsdgroups
  99 +       Turn on BSD-style gid assignment.
 100 +
 101 +32bittimes
 102 +       By default files in reiser4 have 64 bit timestamps. Files
 103 +       created when filesystem is mounted with 32bittimes mount
 104 +       option will get 32 bit timestamps.
 105 +
 106 +mtflush
 107 +       Turn off concurrent flushing.
 108 +
 109 +nopseudo
 110 +       Disable pseudo files support. See
 111 +       http://namesys.com/v4/pseudo.html for more about pseudo files.
 112 +
 113 +dont_load_bitmap
 114 +       Don't load all bitmap blocks at mount time; this is useful for
 115 +       machines with tiny RAM and large disks.
 116 diff -urN linux-2.6.27.orig/fs/fs-writeback.c linux-2.6.27/fs/fs-writeback.c
 117 --- linux-2.6.27.orig/fs/fs-writeback.c 2008-10-13 01:35:38.000000000 +0400
 118 +++ linux-2.6.27/fs/fs-writeback.c      2008-10-12 18:20:00.000000000 +0400
 119 @@ -531,7 +531,10 @@
 120  static void sync_sb_inodes(struct super_block *sb,
 121                                 struct writeback_control *wbc)
 122  {
 123 -       generic_sync_sb_inodes(sb, wbc);
 124 +       if (sb->s_op->sync_inodes)
 125 +               sb->s_op->sync_inodes(sb, wbc);
 126 +       else
 127 +               generic_sync_sb_inodes(sb, wbc);
 128  }
 129
 130  /*
 131 diff -urN linux-2.6.27.orig/fs/Kconfig linux-2.6.27/fs/Kconfig
 132 --- linux-2.6.27.orig/fs/Kconfig        2008-10-13 01:35:38.000000000 +0400
 133 +++ linux-2.6.27/fs/Kconfig     2008-10-12 18:20:00.000000000 +0400
 134 @@ -274,6 +274,8 @@
 135         default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 136         default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 137
 138 +source "fs/reiser4/Kconfig"
 139 +
 140  config REISERFS_FS
 141         tristate "Reiserfs support"
 142         help
 143 diff -urN linux-2.6.27.orig/fs/Makefile linux-2.6.27/fs/Makefile
 144 --- linux-2.6.27.orig/fs/Makefile       2008-10-13 01:35:38.000000000 +0400
 145 +++ linux-2.6.27/fs/Makefile    2008-10-12 18:20:00.000000000 +0400
 146 @@ -68,6 +68,7 @@
 147
 148  # Do not add any filesystems before this line
 149  obj-$(CONFIG_REISERFS_FS)      += reiserfs/
 150 +obj-$(CONFIG_REISER4_FS)       += reiser4/
 151  obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
 152  obj-$(CONFIG_EXT4DEV_FS)       += ext4/ # Before ext2 so root fs can be ext4dev
 153  obj-$(CONFIG_JBD)              += jbd/
 154 diff -urN linux-2.6.27.orig/fs/reiser4/as_ops.c linux-2.6.27/fs/reiser4/as_ops.c
 155 --- linux-2.6.27.orig/fs/reiser4/as_ops.c       1970-01-01 03:00:00.000000000 +0300
 156 +++ linux-2.6.27/fs/reiser4/as_ops.c    2008-10-13 02:23:37.000000000 +0400
 157 @@ -0,0 +1,356 @@
 158 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
 159 +
 160 +/* Interface to VFS. Reiser4 address_space_operations are defined here. */
 161 +
 162 +#include "forward.h"
 163 +#include "debug.h"
 164 +#include "dformat.h"
 165 +#include "coord.h"
 166 +#include "plugin/item/item.h"
 167 +#include "plugin/file/file.h"
 168 +#include "plugin/security/perm.h"
 169 +#include "plugin/disk_format/disk_format.h"
 170 +#include "plugin/plugin.h"
 171 +#include "plugin/plugin_set.h"
 172 +#include "plugin/object.h"
 173 +#include "txnmgr.h"
 174 +#include "jnode.h"
 175 +#include "znode.h"
 176 +#include "block_alloc.h"
 177 +#include "tree.h"
 178 +#include "vfs_ops.h"
 179 +#include "inode.h"
 180 +#include "page_cache.h"
 181 +#include "ktxnmgrd.h"
 182 +#include "super.h"
 183 +#include "reiser4.h"
 184 +#include "entd.h"
 185 +
 186 +#include <linux/profile.h>
 187 +#include <linux/types.h>
 188 +#include <linux/mount.h>
 189 +#include <linux/vfs.h>
 190 +#include <linux/mm.h>
 191 +#include <linux/buffer_head.h>
 192 +#include <linux/dcache.h>
 193 +#include <linux/list.h>
 194 +#include <linux/pagemap.h>
 195 +#include <linux/slab.h>
 196 +#include <linux/seq_file.h>
 197 +#include <linux/init.h>
 198 +#include <linux/module.h>
 199 +#include <linux/writeback.h>
 200 +#include <linux/backing-dev.h>
 201 +#include <linux/quotaops.h>
 202 +#include <linux/security.h>
 203 +
 204 +/* address space operations */
 205 +
  206 +/**
  207 + * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
  208 + * @page: page to be dirtied
  209 + *
  210 + * Operation of struct address_space_operations. This implementation is used by
  211 + * unix and cryptcompress file plugins.
  212 + *
  213 + * This is called when reiser4 page gets dirtied outside of reiser4, for
  214 + * example, when dirty bit is moved from pte to physical page.
  215 + *
  216 + * Tags page in the mapping's page tree with special tag so that it is possible
  217 + * to do all the reiser4 specific work wrt dirty pages (jnode creation,
  218 + * capturing by an atom) later because it can not be done in the contexts where
  219 + * set_page_dirty is called. Returns __set_page_dirty_nobuffers() result.
  220 + */
  221 +int reiser4_set_page_dirty(struct page *page)
  222 +{
  223 +       /* unformatted pages only: host is not the super, cc or bitmap fake */
  224 +       assert("vs-1734", (page->mapping &&
  225 +                          page->mapping->host &&
  226 +                          reiser4_get_super_fake(page->mapping->host->i_sb) !=
  227 +                          page->mapping->host
  228 +                          && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
  229 +                          page->mapping->host
  230 +                          && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
  231 +                          page->mapping->host));
  232 +       return __set_page_dirty_nobuffers(page);
  233 +}
 234 +
 235 +/* ->invalidatepage method for reiser4 */
 236 +
 237 +/*
 238 + * this is called for each truncated page from
 239 + * truncate_inode_pages()->truncate_{complete,partial}_page().
 240 + *
 241 + * At the moment of call, page is under lock, and outstanding io (if any) has
 242 + * completed.
 243 + */
 244 +
  245 +/**
  246 + * reiser4_invalidatepage - detach jnode from a page being truncated
  247 + * @page: page to invalidate
  248 + * @offset: starting offset for partial invalidation
  249 + * No error is returned: capture failure is only reported via warning().
  250 + */
  251 +void reiser4_invalidatepage(struct page *page, unsigned long offset)
  252 +{
  253 +       int ret = 0;
  254 +       reiser4_context *ctx;
  255 +       struct inode *inode;
  256 +       jnode *node;
  257 +
  258 +       /*
  259 +        * This is called to truncate file's page.
  260 +        *
  261 +        * Originally, reiser4 implemented truncate in a standard way
  262 +        * (vmtruncate() calls ->invalidatepage() on all truncated pages
  263 +        * first, then file system ->truncate() call-back is invoked).
  264 +        *
  265 +        * This led to the problem when ->invalidatepage() was called on a
  266 +        * page with jnode that was captured into atom in ASTAGE_PRE_COMMIT
  267 +        * process. That is, truncate was bypassing transactions. To avoid
  268 +        * this, try_capture_page_to_invalidate() call was added here.
  269 +        *
  270 +        * After many troubles with vmtruncate() based truncate (including
  271 +        * races with flush, tail conversion, etc.) it was re-written in the
  272 +        * top-to-bottom style: items are killed in reiser4_cut_tree_object()
  273 +        * and pages belonging to extent are invalidated in kill_hook_extent().
  274 +        * So probably now additional call to capture is not needed here.
  275 +        */
  276 +
  277 +       assert("nikita-3137", PageLocked(page));
  278 +       assert("nikita-3138", !PageWriteback(page));
  279 +       inode = page->mapping->host;
  280 +
  281 +       /*
  282 +        * ->invalidatepage() should only be called for the unformatted
  283 +        * jnodes. Destruction of all other types of jnodes is performed
  284 +        * separately. But, during some corner cases (like handling errors
  285 +        * during mount) it is simpler to let ->invalidatepage to be called on
  286 +        * them. Check for this, and do nothing.
  287 +        */
  288 +       if (reiser4_get_super_fake(inode->i_sb) == inode)
  289 +               return;
  290 +       if (reiser4_get_cc_fake(inode->i_sb) == inode)
  291 +               return;
  292 +       if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
  293 +               return;
  294 +       assert("vs-1426", PagePrivate(page));
  295 +       assert("vs-1427",
  296 +              page->mapping == jnode_get_mapping(jnode_by_page(page)));
  297 +       assert("", jprivate(page) != NULL);
  298 +       assert("", ergo(inode_file_plugin(inode) !=
  299 +                       file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
  300 +                       offset == 0));
  301 +
  302 +       ctx = reiser4_init_context(inode->i_sb);
  303 +       if (IS_ERR(ctx))
  304 +               return;
  305 +
  306 +       node = jprivate(page);
  307 +       spin_lock_jnode(node);
  308 +       if (!(node->state & ((1 << JNODE_DIRTY) | (1<< JNODE_FLUSH_QUEUED) |
  309 +                         (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
  310 +               /* no need to capture (uncapture presumably drops jnode lock) */
  311 +               jref(node);
  312 +               JF_SET(node, JNODE_HEARD_BANSHEE);
  313 +               page_clear_jnode(page, node);
  314 +               reiser4_uncapture_jnode(node);
  315 +               unhash_unformatted_jnode(node);
  316 +               jput(node);
  317 +               reiser4_exit_context(ctx);
  318 +               return;
  319 +       }
  320 +       spin_unlock_jnode(node);
  321 +
  322 +       /* capture page being truncated. */
  323 +       ret = try_capture_page_to_invalidate(page);
  324 +       if (ret != 0)
  325 +               warning("nikita-3141", "Cannot capture: %i", ret);
  326 +
  327 +       if (offset == 0) {
  328 +               /* remove jnode from transaction and detach it from page. */
  329 +               jref(node);
  330 +               JF_SET(node, JNODE_HEARD_BANSHEE);
  331 +               /* page cannot be detached from jnode concurrently, because it
  332 +                * is locked */
  333 +               reiser4_uncapture_page(page);
  334 +
  335 +               /* this detaches page from jnode, so that jdelete will not try
  336 +                * to lock page which is already locked */
  337 +               spin_lock_jnode(node);
  338 +               page_clear_jnode(page, node);
  339 +               spin_unlock_jnode(node);
  340 +               unhash_unformatted_jnode(node);
  341 +
  342 +               jput(node);
  343 +       }
  344 +
  345 +       reiser4_exit_context(ctx);
  346 +}
 347 +
  348 +/* helper function called from reiser4_releasepage(). Returns 1 if jnode can be
  349 + * detached from its page and the page released, 0 otherwise. */
  350 +int jnode_is_releasable(jnode * node /* node to check */ )
  351 +{
  352 +       assert("nikita-2781", node != NULL);
  353 +       assert_spin_locked(&(node->guard));
  354 +       assert_spin_locked(&(node->load));
  355 +
  356 +       /* if some thread is currently using jnode page, the latter cannot be
  357 +        * detached */
  358 +       if (atomic_read(&node->d_count) != 0) {
  359 +               return 0;
  360 +       }
  361 +
  362 +       assert("vs-1214", !jnode_is_loaded(node));
  363 +
  364 +       /*
  365 +        * can only release page if real block number is assigned to it. Simple
  366 +        * check for ->atom wouldn't do, because it is possible for node to be
  367 +        * clean, not in atom yet, and still having fake block number. For
  368 +        * example, node just created in jinit_new().
  369 +        */
  370 +       if (reiser4_blocknr_is_fake(jnode_get_block(node)))
  371 +               return 0;
  372 +
  373 +       /*
  374 +        * pages prepared for write can not be released anyway, so avoid
  375 +        * detaching jnode from the page
  376 +        */
  377 +       if (JF_ISSET(node, JNODE_WRITE_PREPARED))
  378 +               return 0;
  379 +
  380 +       /*
  381 +        * dirty jnode cannot be released. It can however be submitted to disk
  382 +        * as part of early flushing, but only after getting flush-prepped.
  383 +        */
  384 +       if (JF_ISSET(node, JNODE_DIRTY))
  385 +               return 0;
  386 +
  387 +       /* overwrite set is only written by log writer. */
  388 +       if (JF_ISSET(node, JNODE_OVRWR))
  389 +               return 0;
  390 +
  391 +       /* jnode is already under writeback */
  392 +       if (JF_ISSET(node, JNODE_WRITEBACK))
  393 +               return 0;
  394 +
  395 +       /* only znodes and unformatted jnodes may be released here */
  396 +       if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
  397 +               return 0;
  398 +
  399 +       return 1;
  400 +}
 401 +
  402 +/*
  403 + * ->releasepage method for reiser4
  404 + *
  405 + * This is called by VM scanner when it comes across clean page.  What we have
  406 + * to do here is to check whether page can really be released (freed that is)
  407 + * and if so, detach jnode from it and remove page from the page cache.
  408 + * Releasability is checked by jnode_is_releasable().
  409 + * Returns 1 if the jnode was detached from the page and 0 otherwise.
  410 + */
  411 +int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
  412 +{
  413 +       jnode *node;
  414 +
  415 +       assert("nikita-2257", PagePrivate(page));
  416 +       assert("nikita-2259", PageLocked(page));
  417 +       assert("nikita-2892", !PageWriteback(page));
  418 +       assert("nikita-3019", reiser4_schedulable());
  419 +
  420 +       /* NOTE-NIKITA: this can be called in the context of reiser4 call. It
  421 +          is not clear what to do in this case. A lot of deadlocks seem to be
  422 +          possible. */
  423 +
  424 +       node = jnode_by_page(page);
  425 +       assert("nikita-2258", node != NULL);
  426 +       assert("reiser4-4", page->mapping != NULL);
  427 +       assert("reiser4-5", page->mapping->host != NULL);
  428 +
  429 +       if (PageDirty(page))
  430 +               return 0;
  431 +
  432 +       /* extra page reference is used by reiser4 to protect
  433 +        * jnode<->page link from this ->releasepage(). */
  434 +       if (page_count(page) > 3)
  435 +               return 0;
  436 +
  437 +       /* jnode_is_releasable() needs jnode lock, because it looks at the jnode
  438 +        * fields and we need jload_lock here to avoid races with jload(). */
  439 +       spin_lock_jnode(node);
  440 +       spin_lock(&(node->load));
  441 +       if (jnode_is_releasable(node)) {
  442 +               struct address_space *mapping;
  443 +
  444 +               mapping = page->mapping;
  445 +               jref(node);
  446 +               /* there is no need to synchronize against
  447 +                * jnode_extent_write() here, because pages seen by
  448 +                * jnode_extent_write() are not releasable. */
  449 +               page_clear_jnode(page, node);
  450 +               spin_unlock(&(node->load));
  451 +               spin_unlock_jnode(node);
  452 +
  453 +               /* we are under memory pressure so release jnode also. */
  454 +               jput(node);
  455 +
  456 +               return 1;
  457 +       } else {
  458 +               spin_unlock(&(node->load));
  459 +               spin_unlock_jnode(node);
  460 +               assert("nikita-3020", reiser4_schedulable());
  461 +               return 0;
  462 +       }
  463 +}
 464 +
  465 +int reiser4_readpage(struct file *file, struct page *page)
  466 +{
  467 +       assert("edward-1533", PageLocked(page));
  468 +       assert("edward-1534", !PageUptodate(page));
  469 +       assert("edward-1535", page->mapping && page->mapping->host);
  470 +       /* delegate to ->readpage of the file plugin of the page's host inode */
  471 +       return inode_file_plugin(page->mapping->host)->readpage(file, page);
  472 +}
  473 +/* forwards to ->readpages of the file plugin of mapping->host */
  474 +int reiser4_readpages(struct file *file, struct address_space *mapping,
  475 +                     struct list_head *pages, unsigned nr_pages)
  476 +{
  477 +       return inode_file_plugin(mapping->host)->readpages(file, mapping,
  478 +                                                          pages, nr_pages);
  479 +}
  480 +/* forwards to ->writepages of the file plugin of mapping->host */
  481 +int reiser4_writepages(struct address_space *mapping,
  482 +                      struct writeback_control *wbc)
  483 +{
  484 +       return inode_file_plugin(mapping->host)->writepages(mapping, wbc);
  485 +}
  486 +/* forwards to ->prepare_write of the file plugin of file->f_dentry->d_inode */
  487 +int reiser4_prepare_write(struct file *file, struct page *page,
  488 +                         unsigned from, unsigned to)
  489 +{
  490 +       return inode_file_plugin(file->f_dentry->d_inode)->prepare_write(file,
  491 +                                                                        page,
  492 +                                                                        from,
  493 +                                                                        to);
  494 +}
  495 +/* forwards to ->commit_write of the file plugin of file->f_dentry->d_inode */
  496 +int reiser4_commit_write(struct file *file, struct page *page,
  497 +                        unsigned from, unsigned to)
  498 +{
  499 +       return inode_file_plugin(file->f_dentry->d_inode)->commit_write(file,
  500 +                                                                       page,
  501 +                                                                       from,
  502 +                                                                       to);
  503 +}
 504 +
 505 +/* Make Linus happy.
 506 +   Local variables:
 507 +   c-indentation-style: "K&R"
 508 +   mode-name: "LC"
 509 +   c-basic-offset: 8
 510 +   tab-width: 8
 511 +   fill-column: 120
 512 +   End:
 513 +*/
 514 diff -urN linux-2.6.27.orig/fs/reiser4/block_alloc.c linux-2.6.27/fs/reiser4/block_alloc.c
 515 --- linux-2.6.27.orig/fs/reiser4/block_alloc.c  1970-01-01 03:00:00.000000000 +0300
 516 +++ linux-2.6.27/fs/reiser4/block_alloc.c       2008-10-12 18:20:00.000000000 +0400
 517 @@ -0,0 +1,1137 @@
 518 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
 519 +
 520 +#include "debug.h"
 521 +#include "dformat.h"
 522 +#include "plugin/plugin.h"
 523 +#include "txnmgr.h"
 524 +#include "znode.h"
 525 +#include "block_alloc.h"
 526 +#include "tree.h"
 527 +#include "super.h"
 528 +
 529 +#include <linux/types.h>       /* for __u??  */
 530 +#include <linux/fs.h>          /* for struct super_block  */
 531 +#include <linux/spinlock.h>
 532 +
 533 +/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
 534 +
 535 +/* We need to be able to reserve enough disk space to ensure that an atomic
 536 +   operation will have enough disk space to flush (see flush.c and
 537 +   http://namesys.com/v4/v4.html) and commit it once it is started.
 538 +
 539 +   In our design a call for reserving disk space may fail but not an actual
 540 +   block allocation.
 541 +
 542 +   All free blocks, already allocated blocks, and all kinds of reserved blocks
 543 +   are counted in different per-fs block counters.
 544 +
 545 +   A reiser4 super block's set of block counters currently is:
 546 +
 547 +   free -- free blocks,
 548 +   used -- already allocated blocks,
 549 +
 550 +   grabbed -- initially reserved for performing an fs operation, those blocks
 551 +          are taken from free blocks, then grabbed disk space leaks from grabbed
 552 +          blocks counter to other counters like "fake allocated", "flush
 553 +          reserved", "used", the rest of not used grabbed space is returned to
 554 +          free space at the end of fs operation;
 555 +
 556 +   fake allocated -- counts all nodes without real disk block numbers assigned,
 557 +                     we have separate accounting for formatted and unformatted
 558 +                     nodes (for easier debugging);
 559 +
 560 +   flush reserved -- disk space needed for flushing and committing an atom.
 561 +                     Each dirty already allocated block could be written as a
 562 +                     part of atom's overwrite set or as a part of atom's
 563 +                     relocate set.  In both cases one additional block is needed,
 564 +                     it is used as a wandered block if we do overwrite or as a
 565 +                    new location for a relocated block.
 566 +
 567 +   In addition, blocks in some states are counted on per-thread and per-atom
 568 +   basis.  A reiser4 context has a counter of blocks grabbed by this transaction
 569 +   and the sb's grabbed blocks counter is a sum of grabbed blocks counter values
 570 +   of each reiser4 context.  Each reiser4 atom has a counter of "flush reserved"
 571 +   blocks, which are reserved for flush processing and atom commit. */
 572 +
 573 +/* AN EXAMPLE: suppose we insert new item to the reiser4 tree.  We estimate
 574 +   number of blocks to grab for most expensive case of balancing when the leaf
 575 +   node we insert new item to gets split and new leaf node is allocated.
 576 +
 577 +   So, we need to grab blocks for
 578 +
 579 +   1) one block for possible dirtying the node we insert an item to. That block
 580 +      would be used for node relocation at flush time or for allocating of a
 581 +      wandered one, it depends what will be a result (what set, relocate or
 582 +      overwrite the node gets assigned to) of the node processing by the flush
 583 +      algorithm.
 584 +
 585 +   2) one block for either allocating a new node, or dirtying of right or left
 586 +      clean neighbor, only one case may happen.
 587 +
 588 +   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying of left neighbor, right neighbor, current
 589 +   node, and creation of new node.  have I forgotten something?  email me.
 590 +
 591 +   These grabbed blocks are counted in both reiser4 context "grabbed blocks"
 592 +   counter and in the fs-wide one (both ctx->grabbed_blocks and
 593 +   sbinfo->blocks_grabbed get incremented by 2), sb's free blocks counter is
 594 +   decremented by 2.
 595 +
 596 +   Suppose both two blocks were spent for dirtying of an already allocated clean
 597 +   node (one block went from "grabbed" to "flush reserved") and for new block
 598 +   allocating (one block went from "grabbed" to "fake allocated formatted").
 599 +
 600 +   Inserting of a child pointer to the parent node caused parent node to be
 601 +   split, the balancing code takes care of this by grabbing necessary space
 602 +   immediately by calling reiser4_grab with BA_RESERVED flag set which means
 603 +   "can use the 5% reserved disk space".
 604 +
 605 +   At this moment insertion completes and grabbed blocks (if they were not used)
 606 +   should be returned to the free space counter.
 607 +
 608 +   However the atom life-cycle is not completed.  The atom had one "flush
 609 +   reserved" block added by our insertion and the new fake allocated node is
 610 +   counted as a "fake allocated formatted" one.  The atom has to be fully
 611 +   processed by flush before commit.  Suppose that the flush moved the first,
 612 +   already allocated node to the atom's overwrite list, the new fake allocated
 613 +   node, obviously, went into the atom relocate set.  The reiser4 flush
 614 +   allocates the new node using one unit from "fake allocated formatted"
 615 +   counter, the log writer uses one from "flush reserved" for wandered block
 616 +   allocation.
 617 +
 618 +   And, it is not the end.  When the wandered block is deallocated after the
 619 +   atom gets fully played (see wander.c for term description), the disk space
 620 +   occupied for it is returned to free blocks. */
 621 +
 622 +/* BLOCK NUMBERS */
 623 +
 624 +/* Any reiser4 node has a block number assigned to it.  We use these numbers for
 625 +   indexing in hash tables, so if a block has not yet been assigned a location
 626 +   on disk we need to give it a temporary fake block number.
 627 +
 628 +   Current implementation of reiser4 uses 64-bit integers for block numbers. We
 629 +   use highest bit in 64-bit block number to distinguish fake and real block
 630 +   numbers. So, only 63 bits may be used for addressing of real device
 631 +   blocks. That "fake" block numbers space is divided into subspaces of fake
 632 +   block numbers for data blocks and for shadow (working) bitmap blocks.
 633 +
 634 +   Fake block numbers for data blocks are generated by a cyclic counter, which
 635 +   gets incremented after each real block allocation. We assume that it is
 636 +   impossible to overload this counter during one transaction life. */
 637 +
  638 +/* Initialize a blocknr hint: the whole structure is zeroed. */
  639 +void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
  640 +{
  641 +       memset(hint, 0, sizeof(reiser4_blocknr_hint));
  642 +}
 643 +
  644 +/* Release any resources of a blocknr hint. Currently this does nothing. */
  645 +void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
  646 +{
  647 +       /* The current blocknr_hint implementation holds nothing to free. */
  648 +}
 649 +
  650 +/* Returns 1 if *da is a fake block number (explained above), 0 otherwise. */
  651 +/* Audited by: green(2002.06.11) */
  652 +int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
  653 +{
  654 +       /* The reason for not simply returning result of '&' operation is that
  655 +          while return value is (possibly 32bit) int,  the reiser4_block_nr is
  656 +          at least 64 bits long, and high bit (which is the only possible
  657 +          non zero bit after the masking) would be stripped off */
  658 +       return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
  659 +}
 660 +
  661 +/* Static functions for <reiser4 super block>/<reiser4 context> block counters
  662 +   arithmetic. They are kept separate mostly to avoid repeating the same
  663 +   assertions in several places. */
  664 +static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
  665 +{
  666 +       BUG_ON(ctx->grabbed_blocks < count);
  667 +       assert("zam-527", ctx->grabbed_blocks >= count);
  668 +       ctx->grabbed_blocks -= count;
  669 +}
  670 +/* Add count to the context's grabbed blocks counter. */
  671 +static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
  672 +{
  673 +       ctx->grabbed_blocks += count;
  674 +}
  675 +/* Subtract count from sbinfo->blocks_grabbed; it must not underflow. */
  676 +static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
  677 +{
  678 +       assert("zam-525", sbinfo->blocks_grabbed >= count);
  679 +       sbinfo->blocks_grabbed -= count;
  680 +}
 681 +
  682 +/* Decrease the counter of blocks reserved for flush in super block. */
  683 +static void
  684 +sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
  685 +{
  686 +       assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
  687 +       sbinfo->blocks_flush_reserved -= count;
  688 +}
  689 +/* Decrease fake allocated counter: formatted if BA_FORMATTED, else unformatted */
  690 +static void
  691 +sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
  692 +                          reiser4_ba_flags_t flags)
  693 +{
  694 +       if (flags & BA_FORMATTED) {
  695 +               assert("zam-806", sbinfo->blocks_fake_allocated >= count);
  696 +               sbinfo->blocks_fake_allocated -= count;
  697 +       } else {
  698 +               assert("zam-528",
  699 +                      sbinfo->blocks_fake_allocated_unformatted >= count);
  700 +               sbinfo->blocks_fake_allocated_unformatted -= count;
  701 +       }
  702 +}
  703 +/* Decrease used blocks counter; it must not drop below min_blocks_used. */
  704 +static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
  705 +{
  706 +       assert("zam-530",
  707 +              sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
  708 +       sbinfo->blocks_used -= count;
  709 +}
  710 +/* Subtract count from sbinfo->blocks_clustered; it must not underflow. */
  711 +static void
  712 +sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
  713 +{
  714 +       assert("edward-501", sbinfo->blocks_clustered >= count);
  715 +       sbinfo->blocks_clustered -= count;
  716 +}
 717 +
  718 +/* Increase atom's flush reserved counter; atom->alock must be held. */
  719 +static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
  720 +{
  721 +       assert("zam-772", atom != NULL);
  722 +       assert_spin_locked(&(atom->alock));
  723 +       atom->flush_reserved += count;
  724 +}
 725 +
  726 +/* Decrease atom's flush reserved counter; atom->alock must be held. */
  727 +static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
  728 +{
  729 +       assert("zam-774", atom != NULL);
  730 +       assert_spin_locked(&(atom->alock));
  731 +       assert("nikita-2790", atom->flush_reserved >= count);
  732 +       atom->flush_reserved -= count;
  733 +}
 734 +
 735 +/* The super block keeps 7 counters: free, used, grabbed, fake allocated
 736 +   (formatted and unformatted), flush reserved and clustered. Their sum must
 737 +   equal the block count; returns 1 if it does, else prints them, returns 0 */
 738 +int reiser4_check_block_counters(const struct super_block *super)
 739 +{
 740 +       __u64 sum;
 741 +
 742 +       sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
 743 +           reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
 744 +           reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
 745 +           reiser4_clustered_blocks(super);
 746 +       if (reiser4_block_count(super) != sum) {
 747 +               printk("super block counters: "
 748 +                      "used %llu, free %llu, "
 749 +                      "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
 750 +                      "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
 751 +                      (unsigned long long)reiser4_data_blocks(super),
 752 +                      (unsigned long long)reiser4_free_blocks(super),
 753 +                      (unsigned long long)reiser4_grabbed_blocks(super),
 754 +                      (unsigned long long)reiser4_fake_allocated(super),
 755 +                      (unsigned long long)
 756 +                      reiser4_fake_allocated_unformatted(super),
 757 +                      (unsigned long long)reiser4_flush_reserved(super),
 758 +                      (unsigned long long)reiser4_clustered_blocks(super),
 759 +                      (unsigned long long)sum,
 760 +                      (unsigned long long)reiser4_block_count(super));
 761 +               return 0;
 762 +       }
 763 +       return 1;
 764 +}
 765 +
 766 +/* Adjust "working" free blocks counter for number of blocks we are going to
 767 +   allocate.  Record number of grabbed blocks in fs-wide and per-thread
 768 +   counters.  This function should be called before bitmap scanning or
 769 +   allocating fake block numbers.
 770 +
 771 +   @ctx             -- current reiser4 context, its super block is used;
 772 +   @count           -- number of blocks we reserve;
 773 +   @flags           -- BA_RESERVED allows dipping into the reserved area;
 774 +   @return          -- 0 on success (also on read-only fs), -ENOSPC if all
 775 +                       free blocks are reserved or already allocated.
 776 +*/
 777 +
 778 +static int
 779 +reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
 780 +{
 781 +       __u64 free_blocks;
 782 +       int ret = 0, use_reserved = flags & BA_RESERVED;
 783 +       reiser4_super_info_data *sbinfo;
 784 +
 785 +       assert("vs-1276", ctx == get_current_context());
 786 +
 787 +       /* Do not grab anything on ro-mounted fs. */
 788 +       if (rofs_super(ctx->super)) {
 789 +               ctx->grab_enabled = 0;
 790 +               return 0;
 791 +       }
 792 +
 793 +       sbinfo = get_super_private(ctx->super);
 794 +
 795 +       spin_lock_reiser4_super(sbinfo);
 796 +
 797 +       free_blocks = sbinfo->blocks_free;
 798 +
 799 +       if ((use_reserved && free_blocks < count) ||
 800 +           (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
 801 +               ret = RETERR(-ENOSPC);
 802 +               goto unlock_and_ret;
 803 +       }
 804 +
 805 +       add_to_ctx_grabbed(ctx, count);
 806 +
 807 +       sbinfo->blocks_grabbed += count;
 808 +       sbinfo->blocks_free -= count;
 809 +
 810 +#if REISER4_DEBUG
 811 +       if (ctx->grabbed_initially == 0)
 812 +               ctx->grabbed_initially = count;
 813 +#endif
 814 +
 815 +       assert("nikita-2986", reiser4_check_block_counters(ctx->super));
 816 +
 817 +       /* disable grab space in current context */
 818 +       ctx->grab_enabled = 0;
 819 +
 820 +      unlock_and_ret:
 821 +       spin_unlock_reiser4_super(sbinfo);
 822 +
 823 +       return ret;
 824 +}
 825 +
 826 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
 827 +{
 828 +       int ret;
 829 +       reiser4_context *ctx;
 830 +
 831 +       assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
 832 +                                  lock_stack_isclean(get_current_lock_stack
 833 +                                                     ())));
 834 +       ctx = get_current_context();
 835 +       if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
 836 +               return 0;
 837 +       }
 838 +
 839 +       ret = reiser4_grab(ctx, count, flags);
 840 +       if (ret == -ENOSPC) {
 841 +
 842 +               /* Out of space: with BA_CAN_COMMIT, commit all transactions and retry once */
 843 +               if (flags & BA_CAN_COMMIT) {
 844 +                       txnmgr_force_commit_all(ctx->super, 0);
 845 +                       ctx->grab_enabled = 1;
 846 +                       ret = reiser4_grab(ctx, count, flags);
 847 +               }
 848 +       }
 849 +       /*
 850 +        * Allocation from the reserved pool must not fail; that is a severe error.
 851 +        */
 852 +       assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
 853 +       return ret;
 854 +}
 855 +
 856 +/*
 857 + * SPACE RESERVED FOR UNLINK/TRUNCATE
 858 + *
 859 + * Unlink and truncate require space in transaction (to update stat data, at
 860 + * least). But we don't want rm(1) to fail with "No space on device" error.
 861 + *
 862 + * Solution is to reserve 5% of disk space for truncates and
 863 + * unlinks. Specifically, normal space grabbing requests don't grab space from
 864 + * reserved area. Only requests with BA_RESERVED bit in flags are allowed to
 865 + * drain it. Per super block delete mutex is used to allow only one
 866 + * thread at a time to grab from reserved area.
 867 + *
 868 + * Grabbing from reserved area should always be performed with BA_CAN_COMMIT
 869 + * flag.
 870 + *
 871 + */
 872 +
 873 +int reiser4_grab_reserved(struct super_block *super,
 874 +                         __u64 count, reiser4_ba_flags_t flags)
 875 +{
 876 +       reiser4_super_info_data *sbinfo = get_super_private(super);
 877 +
 878 +       assert("nikita-3175", flags & BA_CAN_COMMIT);
 879 +
 880 +       /* Check whether the delete mutex is already taken by us (nested
 881 +        * call); we assume that reading of a machine word is atomic. */
 882 +       if (sbinfo->delete_mutex_owner == current) {
 883 +               if (reiser4_grab_space
 884 +                   (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
 885 +                       warning("zam-1003",
 886 +                               "nested call of grab_reserved fails count=(%llu)",
 887 +                               (unsigned long long)count);
 888 +                       reiser4_release_reserved(super);
 889 +                       return RETERR(-ENOSPC);
 890 +               }
 891 +               return 0;
 892 +       }
 893 +
 894 +       if (reiser4_grab_space(count, flags)) {
 895 +               mutex_lock(&sbinfo->delete_mutex);
 896 +               assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
 897 +               sbinfo->delete_mutex_owner = current;
 898 +
 899 +               if (reiser4_grab_space(count, flags | BA_RESERVED)) {
 900 +                       warning("zam-833",
 901 +                               "reserved space is not enough (%llu)",
 902 +                               (unsigned long long)count);
 903 +                       reiser4_release_reserved(super);
 904 +                       return RETERR(-ENOSPC);
 905 +               }
 906 +       }
 907 +       return 0;
 908 +}
 909 +
 910 +void reiser4_release_reserved(struct super_block *super)
 911 +{
 912 +       reiser4_super_info_data *info;
 913 +
 914 +       info = get_super_private(super);
 915 +       if (info->delete_mutex_owner == current) {
 916 +               info->delete_mutex_owner = NULL;
 917 +               mutex_unlock(&info->delete_mutex);
 918 +       }
 919 +}
 920 +
 921 +static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
 922 +{
 923 +       reiser4_context *ctx;
 924 +       reiser4_super_info_data *sbinfo;
 925 +
 926 +       ctx = get_current_context();
 927 +       sub_from_ctx_grabbed(ctx, count);
 928 +
 929 +       sbinfo = get_super_private(ctx->super);
 930 +       spin_lock_reiser4_super(sbinfo);
 931 +
 932 +       sub_from_sb_grabbed(sbinfo, count);
 933 +       /* return sbinfo locked */
 934 +       return sbinfo;
 935 +}
 936 +
 937 +/* is called after one fake block number is allocated and a pointer to
 938 +   that block is inserted into the tree. */
 939 +static void grabbed2fake_allocated_formatted(void)
 940 +{
 941 +       reiser4_super_info_data *sbinfo;
 942 +
 943 +       sbinfo = grabbed2fake_allocated_head(1);
 944 +       sbinfo->blocks_fake_allocated++;
 945 +
 946 +       assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
 947 +
 948 +       spin_unlock_reiser4_super(sbinfo);
 949 +}
 950 +
 951 +/**
 952 + * grabbed2fake_allocated_unformatted
 953 + * @count: number of blocks moved from grabbed to fake allocated unformatted
 954 + *
 955 + */
 956 +static void grabbed2fake_allocated_unformatted(int count)
 957 +{
 958 +       reiser4_super_info_data *sbinfo;
 959 +
 960 +       sbinfo = grabbed2fake_allocated_head(count);
 961 +       sbinfo->blocks_fake_allocated_unformatted += count;
 962 +
 963 +       assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
 964 +
 965 +       spin_unlock_reiser4_super(sbinfo);
 966 +}
 967 +
 968 +void grabbed2cluster_reserved(int count)
 969 +{
 970 +       reiser4_context *ctx;
 971 +       reiser4_super_info_data *sbinfo;
 972 +
 973 +       ctx = get_current_context();
 974 +       sub_from_ctx_grabbed(ctx, count);
 975 +
 976 +       sbinfo = get_super_private(ctx->super);
 977 +       spin_lock_reiser4_super(sbinfo);
 978 +
 979 +       sub_from_sb_grabbed(sbinfo, count);
 980 +       sbinfo->blocks_clustered += count;
 981 +
 982 +       assert("edward-504", reiser4_check_block_counters(ctx->super));
 983 +
 984 +       spin_unlock_reiser4_super(sbinfo);
 985 +}
 986 +
 987 +void cluster_reserved2grabbed(int count)
 988 +{
 989 +       reiser4_context *ctx;
 990 +       reiser4_super_info_data *sbinfo;
 991 +
 992 +       ctx = get_current_context();
 993 +
 994 +       sbinfo = get_super_private(ctx->super);
 995 +       spin_lock_reiser4_super(sbinfo);
 996 +
 997 +       sub_from_cluster_reserved(sbinfo, count);
 998 +       sbinfo->blocks_grabbed += count;
 999 +
1000 +       assert("edward-505", reiser4_check_block_counters(ctx->super));
1001 +
1002 +       spin_unlock_reiser4_super(sbinfo);
1003 +       add_to_ctx_grabbed(ctx, count);
1004 +}
1005 +
1006 +void cluster_reserved2free(int count)
1007 +{
1008 +       reiser4_context *ctx;
1009 +       reiser4_super_info_data *sbinfo;
1010 +
1011 +       ctx = get_current_context();
1012 +       sbinfo = get_super_private(ctx->super);
1013 +
1014 +       cluster_reserved2grabbed(count);
1015 +       grabbed2free(ctx, sbinfo, count);
1016 +}
1017 +
1018 +static DEFINE_SPINLOCK(fake_lock);
1019 +static reiser4_block_nr fake_gen = 0;
1020 +
1021 +/**
1022 + * assign_fake_blocknr
1023 + * @blocknr: where the generated fake block number is stored
1024 + * @count: how many consecutive numbers to take from the fake generator
1025 + *
1026 + * Obtain a fake block number for a new node which will be used to refer to
1027 + * this newly allocated node until real allocation is done.
1028 + */
1029 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1030 +{
1031 +       spin_lock(&fake_lock);
1032 +       *blocknr = fake_gen;
1033 +       fake_gen += count;
1034 +       spin_unlock(&fake_lock);
1035 +
1036 +       BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1037 +       /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1038 +       *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1039 +       assert("zam-394", zlook(current_tree, blocknr) == NULL);
1040 +}
1041 +
1042 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1043 +{
1044 +       assign_fake_blocknr(blocknr, 1);
1045 +       grabbed2fake_allocated_formatted();
1046 +       return 0;
1047 +}
1048 +
1049 +/**
1050 + * fake_blocknr_unformatted
1051 + * @count: number of fake numbers to get
1052 + *
1053 + * Allocates @count fake block numbers which will be assigned to jnodes
1054 + */
1055 +reiser4_block_nr fake_blocknr_unformatted(int count)
1056 +{
1057 +       reiser4_block_nr blocknr;
1058 +
1059 +       assign_fake_blocknr(&blocknr, count);
1060 +       grabbed2fake_allocated_unformatted(count);
1061 +
1062 +       return blocknr;
1063 +}
1064 +
1065 +/* adjust sb block counters, if real (on-disk) block allocation immediately
1066 +   follows grabbing of free disk space. */
1067 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1068 +                        __u64 count)
1069 +{
1070 +       sub_from_ctx_grabbed(ctx, count);
1071 +
1072 +       spin_lock_reiser4_super(sbinfo);
1073 +
1074 +       sub_from_sb_grabbed(sbinfo, count);
1075 +       sbinfo->blocks_used += count;
1076 +
1077 +       assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1078 +
1079 +       spin_unlock_reiser4_super(sbinfo);
1080 +}
1081 +
1082 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1083 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1084 +                               reiser4_ba_flags_t flags)
1085 +{
1086 +       spin_lock_reiser4_super(sbinfo);
1087 +
1088 +       sub_from_sb_fake_allocated(sbinfo, count, flags);
1089 +       sbinfo->blocks_used += count;
1090 +
1091 +       assert("nikita-2680",
1092 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1093 +
1094 +       spin_unlock_reiser4_super(sbinfo);
1095 +}
1096 +
1097 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1098 +{
1099 +       reiser4_super_info_data *sbinfo;
1100 +
1101 +       assert("zam-787", atom != NULL);
1102 +       assert_spin_locked(&(atom->alock));
1103 +
1104 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1105 +
1106 +       sbinfo = get_current_super_private();
1107 +       spin_lock_reiser4_super(sbinfo);
1108 +
1109 +       sub_from_sb_flush_reserved(sbinfo, count);
1110 +       sbinfo->blocks_used += count;
1111 +
1112 +       assert("zam-789",
1113 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1114 +
1115 +       spin_unlock_reiser4_super(sbinfo);
1116 +}
1117 +
1118 +/* update the per fs  blocknr hint default value. */
1119 +void
1120 +update_blocknr_hint_default(const struct super_block *s,
1121 +                           const reiser4_block_nr * block)
1122 +{
1123 +       reiser4_super_info_data *sbinfo = get_super_private(s);
1124 +
1125 +       assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1126 +
1127 +       spin_lock_reiser4_super(sbinfo);
1128 +       if (*block < sbinfo->block_count) {
1129 +               sbinfo->blocknr_hint_default = *block;
1130 +       } else {
1131 +               warning("zam-676",
1132 +                       "block number %llu is too large to be used in a blocknr hint\n",
1133 +                       (unsigned long long)*block);
1134 +               dump_stack();
1135 +               DEBUGON(1);
1136 +       }
1137 +       spin_unlock_reiser4_super(sbinfo);
1138 +}
1139 +
1140 +/* get current value of the default blocknr hint. */
1141 +void get_blocknr_hint_default(reiser4_block_nr * result)
1142 +{
1143 +       reiser4_super_info_data *sbinfo = get_current_super_private();
1144 +
1145 +       spin_lock_reiser4_super(sbinfo);
1146 +       *result = sbinfo->blocknr_hint_default;
1147 +       assert("zam-677", *result < sbinfo->block_count);
1148 +       spin_unlock_reiser4_super(sbinfo);
1149 +}
1150 +
1151 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1152 + * method. Blocks are allocated in one contiguous disk region. The plugin
1153 + * independent part accounts blocks by subtracting allocated amount from grabbed
1154 + * or fake block counter and add the same amount to the counter of allocated
1155 + * blocks.
1156 + *
1157 + * @hint -- a reiser4 blocknr hint object which contains further block
1158 + *          allocation hints and parameters (search start, a stage of block
1159 + *          which will be mapped to disk, etc.),
1160 + * @blk  -- an out parameter for the beginning of the allocated region,
1161 + * @len  -- in/out parameter, it should contain the maximum number of allocated
1162 + *          blocks, after block allocation completes, it contains the length of
1163 + *          allocated disk region.
1164 + * @flags -- see reiser4_ba_flags_t description.
1165 + *
1166 + * @return -- 0 if success, error code otherwise.
1167 + */
1168 +int
1169 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1170 +                    reiser4_block_nr * len, reiser4_ba_flags_t flags)
1171 +{
1172 +       __u64 needed = *len;
1173 +       reiser4_context *ctx;
1174 +       reiser4_super_info_data *sbinfo;
1175 +       int ret;
1176 +
1177 +       assert("zam-986", hint != NULL);
1178 +
1179 +       ctx = get_current_context();
1180 +       sbinfo = get_super_private(ctx->super);
1181 +
1182 +       /* For write-optimized data we use default search start value, which is
1183 +        * close to last write location. */
1184 +       if (flags & BA_USE_DEFAULT_SEARCH_START) {
1185 +               get_blocknr_hint_default(&hint->blk);
1186 +       }
1187 +
1188 +       /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1189 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1190 +       if (hint->block_stage == BLOCK_NOT_COUNTED) {
1191 +               ret = reiser4_grab_space_force(*len, flags);
1192 +               if (ret != 0)
1193 +                       return ret;
1194 +       }
1195 +
1196 +       ret =
1197 +           sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1198 +                           hint, (int)needed, blk, len);
1199 +
1200 +       if (!ret) {
1201 +               assert("zam-680", *blk < reiser4_block_count(ctx->super));
1202 +               assert("zam-681",
1203 +                      *blk + *len <= reiser4_block_count(ctx->super));
1204 +
1205 +               if (flags & BA_PERMANENT) {
1206 +                       /* we assume that current atom exists at this moment */
1207 +                       txn_atom *atom = get_current_atom_locked();
1208 +                       atom->nr_blocks_allocated += *len;
1209 +                       spin_unlock_atom(atom);
1210 +               }
1211 +
1212 +               switch (hint->block_stage) {
1213 +               case BLOCK_NOT_COUNTED:
1214 +               case BLOCK_GRABBED:
1215 +                       grabbed2used(ctx, sbinfo, *len);
1216 +                       break;
1217 +               case BLOCK_UNALLOCATED:
1218 +                       fake_allocated2used(sbinfo, *len, flags);
1219 +                       break;
1220 +               case BLOCK_FLUSH_RESERVED:
1221 +                       {
1222 +                               txn_atom *atom = get_current_atom_locked();
1223 +                               flush_reserved2used(atom, *len);
1224 +                               spin_unlock_atom(atom);
1225 +                       }
1226 +                       break;
1227 +               default:
1228 +                       impossible("zam-531", "wrong block stage");
1229 +               }
1230 +       } else {
1231 +               assert("zam-821",
1232 +                      ergo(hint->max_dist == 0
1233 +                           && !hint->backward, ret != -ENOSPC));
1234 +               if (hint->block_stage == BLOCK_NOT_COUNTED)
1235 +                       grabbed2free(ctx, sbinfo, needed);
1236 +       }
1237 +
1238 +       return ret;
1239 +}
1240 +
1241 +/* used -> fake_allocated -> grabbed -> free */
1242 +
1243 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1244 +   disk */
1245 +static void
1246 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1247 +                   int formatted)
1248 +{
1249 +       spin_lock_reiser4_super(sbinfo);
1250 +
1251 +       if (formatted)
1252 +               sbinfo->blocks_fake_allocated += count;
1253 +       else
1254 +               sbinfo->blocks_fake_allocated_unformatted += count;
1255 +
1256 +       sub_from_sb_used(sbinfo, count);
1257 +
1258 +       assert("nikita-2681",
1259 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1260 +
1261 +       spin_unlock_reiser4_super(sbinfo);
1262 +}
1263 +
1264 +static void
1265 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1266 +                   __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1267 +{
1268 +       assert("nikita-2791", atom != NULL);
1269 +       assert_spin_locked(&(atom->alock));
1270 +
1271 +       add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1272 +
1273 +       spin_lock_reiser4_super(sbinfo);
1274 +
1275 +       sbinfo->blocks_flush_reserved += count;
1276 +       /*add_to_sb_flush_reserved(sbinfo, count); */
1277 +       sub_from_sb_used(sbinfo, count);
1278 +
1279 +       assert("nikita-2681",
1280 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1281 +
1282 +       spin_unlock_reiser4_super(sbinfo);
1283 +}
1284 +
1285 +/* disk space, virtually used by fake block numbers is counted as "grabbed" again. */
1286 +static void
1287 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1288 +                      __u64 count, reiser4_ba_flags_t flags)
1289 +{
1290 +       add_to_ctx_grabbed(ctx, count);
1291 +
1292 +       spin_lock_reiser4_super(sbinfo);
1293 +
1294 +       assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1295 +
1296 +       sbinfo->blocks_grabbed += count;
1297 +       sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1298 +
1299 +       assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1300 +
1301 +       spin_unlock_reiser4_super(sbinfo);
1302 +}
1303 +
1304 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1305 +{
1306 +       reiser4_context *ctx;
1307 +       reiser4_super_info_data *sbinfo;
1308 +
1309 +       ctx = get_current_context();
1310 +       sbinfo = get_super_private(ctx->super);
1311 +
1312 +       fake_allocated2grabbed(ctx, sbinfo, count, flags);
1313 +       grabbed2free(ctx, sbinfo, count);
1314 +}
1315 +
1316 +void grabbed2free_mark(__u64 mark)
1317 +{
1318 +       reiser4_context *ctx;
1319 +       reiser4_super_info_data *sbinfo;
1320 +
1321 +       ctx = get_current_context();
1322 +       sbinfo = get_super_private(ctx->super);
1323 +
1324 +       assert("nikita-3007", (__s64) mark >= 0);
1325 +       assert("nikita-3006", ctx->grabbed_blocks >= mark);
1326 +       grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1327 +}
1328 +
1329 +/**
1330 + * grabbed2free - adjust grabbed and free block counters
1331 + * @ctx: context to update grabbed block counter of
1332 + * @sbinfo: super block to update grabbed and free block counters of
1333 + * @count: number of blocks to adjust counters by
1334 + *
1335 + * Decreases context's and per filesystem's counters of grabbed
1336 + * blocks. Increases per filesystem's counter of free blocks.
1337 + */
1338 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1339 +                 __u64 count)
1340 +{
1341 +       sub_from_ctx_grabbed(ctx, count);
1342 +
1343 +       spin_lock_reiser4_super(sbinfo);
1344 +
1345 +       sub_from_sb_grabbed(sbinfo, count);
1346 +       sbinfo->blocks_free += count;
1347 +       assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1348 +
1349 +       spin_unlock_reiser4_super(sbinfo);
1350 +}
1351 +
1352 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1353 +{
1354 +       reiser4_context *ctx;
1355 +       reiser4_super_info_data *sbinfo;
1356 +
1357 +       assert("vs-1095", atom);
1358 +
1359 +       ctx = get_current_context();
1360 +       sbinfo = get_super_private(ctx->super);
1361 +
1362 +       sub_from_ctx_grabbed(ctx, count);
1363 +
1364 +       add_to_atom_flush_reserved_nolock(atom, count);
1365 +
1366 +       spin_lock_reiser4_super(sbinfo);
1367 +
1368 +       sbinfo->blocks_flush_reserved += count;
1369 +       sub_from_sb_grabbed(sbinfo, count);
1370 +
1371 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
1372 +
1373 +       spin_unlock_reiser4_super(sbinfo);
1374 +}
1375 +
1376 +void grabbed2flush_reserved(__u64 count)
1377 +{
1378 +       txn_atom *atom = get_current_atom_locked();
1379 +
1380 +       grabbed2flush_reserved_nolock(atom, count);
1381 +
1382 +       spin_unlock_atom(atom);
1383 +}
1384 +
1385 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1386 +{
1387 +       reiser4_context *ctx;
1388 +       reiser4_super_info_data *sbinfo;
1389 +
1390 +       assert("nikita-2788", atom != NULL);
1391 +       assert_spin_locked(&(atom->alock));
1392 +
1393 +       ctx = get_current_context();
1394 +       sbinfo = get_super_private(ctx->super);
1395 +
1396 +       add_to_ctx_grabbed(ctx, count);
1397 +
1398 +       sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1399 +
1400 +       spin_lock_reiser4_super(sbinfo);
1401 +
1402 +       sbinfo->blocks_grabbed += count;
1403 +       sub_from_sb_flush_reserved(sbinfo, count);
1404 +
1405 +       assert("vpf-292", reiser4_check_block_counters(ctx->super));
1406 +
1407 +       spin_unlock_reiser4_super(sbinfo);
1408 +}
1409 +
1410 +/**
1411 + * all_grabbed2free - releases all blocks grabbed in context
1412 + *
1413 + * Decreases context's and super block's grabbed block counters by number of
1414 + * blocks grabbed by current context and increases super block's free block
1415 + * counter correspondingly.
1416 + */
1417 +void all_grabbed2free(void)
1418 +{
1419 +       reiser4_context *ctx = get_current_context();
1420 +
1421 +       grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1422 +}
1423 +
1424 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1425 +   after freeing, @count blocks become "grabbed". */
1426 +static void
1427 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1428 +            __u64 count)
1429 +{
1430 +       add_to_ctx_grabbed(ctx, count);
1431 +
1432 +       spin_lock_reiser4_super(sbinfo);
1433 +
1434 +       sbinfo->blocks_grabbed += count;
1435 +       sub_from_sb_used(sbinfo, count);
1436 +
1437 +       assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1438 +
1439 +       spin_unlock_reiser4_super(sbinfo);
1440 +}
1441 +
1442 +/* this used to be done through used2grabbed and grabbed2free*/
1443 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1444 +{
1445 +       spin_lock_reiser4_super(sbinfo);
1446 +
1447 +       sbinfo->blocks_free += count;
1448 +       sub_from_sb_used(sbinfo, count);
1449 +
1450 +       assert("nikita-2685",
1451 +              reiser4_check_block_counters(reiser4_get_current_sb()));
1452 +
1453 +       spin_unlock_reiser4_super(sbinfo);
1454 +}
1455 +
1456 +#if REISER4_DEBUG
1457 +
1458 +/* check "allocated" state of given block range */
1459 +static void
1460 +reiser4_check_blocks(const reiser4_block_nr * start,
1461 +                    const reiser4_block_nr * len, int desired)
1462 +{
1463 +       sa_check_blocks(start, len, desired);
1464 +}
1465 +
1466 +/* check "allocated" state of given block */
1467 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1468 +{
1469 +       const reiser4_block_nr one = 1;
1470 +
1471 +       reiser4_check_blocks(block, &one, desired);
1472 +}
1473 +
1474 +#endif
1475 +
1476 +/* Blocks deallocation function may do an actual deallocation through space
1477 +   allocator plugin or store deleted block numbers in atom's delete_set data
1478 +   structure, depending on the BA_DEFER bit in @flags. */
1479 +
1480 +/* if BA_DEFER bit is not turned on, @target_stage means the stage of blocks which
1481 +   will be deleted from WORKING bitmap. They might be just unmapped from disk, or
1482 +   freed but disk space is still grabbed by current thread, or these blocks must
1483 +   not be counted in any reiser4 sb block counters, see block_stage_t comment */
1484 +
1485 +/* BA_FORMATTED bit is only used when BA_DEFER is not present: it is used to
1486 +   distinguish blocks allocated for unformatted and formatted nodes */
1487 +
1488 +int
1489 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
1490 +                      const reiser4_block_nr * len,
1491 +                      block_stage_t target_stage, reiser4_ba_flags_t flags)
1492 +{
1493 +       txn_atom *atom = NULL;
1494 +       int ret;
1495 +       reiser4_context *ctx;
1496 +       reiser4_super_info_data *sbinfo;
1497 +
1498 +       ctx = get_current_context();
1499 +       sbinfo = get_super_private(ctx->super);
1500 +
1501 +       if (REISER4_DEBUG) {
1502 +               assert("zam-431", *len != 0);
1503 +               assert("zam-432", *start != 0);
1504 +               assert("zam-558", !reiser4_blocknr_is_fake(start));
1505 +
1506 +               spin_lock_reiser4_super(sbinfo);
1507 +               assert("zam-562", *start < sbinfo->block_count);
1508 +               spin_unlock_reiser4_super(sbinfo);
1509 +       }
1510 +
1511 +       if (flags & BA_DEFER) {
1512 +               blocknr_set_entry *bsep = NULL;
1513 +
1514 +               /* storing deleted block numbers in a blocknr set
1515 +                  datastructure for further actual deletion */
1516 +               do {
1517 +                       atom = get_current_atom_locked();
1518 +                       assert("zam-430", atom != NULL);
1519 +
1520 +                       ret =
1521 +                           blocknr_set_add_extent(atom, &atom->delete_set,
1522 +                                                  &bsep, start, len);
1523 +
1524 +                       if (ret == -ENOMEM)
1525 +                               return ret;
1526 +
1527 +                       /* This loop might spin at most two times */
1528 +               } while (ret == -E_REPEAT);
1529 +
1530 +               assert("zam-477", ret == 0);
1531 +               assert("zam-433", atom != NULL);
1532 +
1533 +               spin_unlock_atom(atom);
1534 +
1535 +       } else {
1536 +               assert("zam-425", get_current_super_private() != NULL);
1537 +               sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
1538 +                                 *start, *len);
1539 +
1540 +               if (flags & BA_PERMANENT) {
1541 +                       /* These blocks were counted as allocated, we have to revert it
1542 +                        * back if allocation is discarded. */
1543 +                       txn_atom *atom = get_current_atom_locked();
1544 +                       atom->nr_blocks_allocated -= *len;
1545 +                       spin_unlock_atom(atom);
1546 +               }
1547 +
1548 +               switch (target_stage) {
1549 +               case BLOCK_NOT_COUNTED:
1550 +                       assert("vs-960", flags & BA_FORMATTED);
1551 +                       /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
1552 +                       used2free(sbinfo, *len);
1553 +                       break;
1554 +
1555 +               case BLOCK_GRABBED:
1556 +                       used2grabbed(ctx, sbinfo, *len);
1557 +                       break;
1558 +
1559 +               case BLOCK_UNALLOCATED:
1560 +                       used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
1561 +                       break;
1562 +
1563 +               case BLOCK_FLUSH_RESERVED:{
1564 +                               txn_atom *atom;
1565 +
1566 +                               atom = get_current_atom_locked();
1567 +                               used2flush_reserved(sbinfo, atom, *len,
1568 +                                                   flags & BA_FORMATTED);
1569 +                               spin_unlock_atom(atom);
1570 +                               break;
1571 +                       }
1572 +               default:
1573 +                       impossible("zam-532", "wrong block stage");
1574 +               }
1575 +       }
1576 +
1577 +       return 0;
1578 +}
1579 +
1580 +/* Wrappers for block allocator plugin methods.  Pre-commit: runs sa_pre_commit_hook(); always returns 0. */
1581 +int reiser4_pre_commit_hook(void)
1582 +{
1583 +       assert("zam-502", get_current_super_private() != NULL);
1584 +       sa_pre_commit_hook();
1585 +       return 0;
1586 +}
1587 +
1588 +/* An actor which applies one delete set element (block *@a, or extent at *@a of length *@b) to block allocator data. */
1589 +static int
1590 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
1591 +          const reiser4_block_nr * b, void *data UNUSED_ARG)
1592 +{
1593 +       reiser4_context *ctx;
1594 +       reiser4_super_info_data *sbinfo;
1595 +
1596 +       __u64 len = 1;  /* a single block unless @b supplies a length */
1597 +
1598 +       ctx = get_current_context();
1599 +       sbinfo = get_super_private(ctx->super);
1600 +
1601 +       assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
1602 +       assert("zam-552", sbinfo != NULL);
1603 +
1604 +       if (b != NULL)
1605 +               len = *b;
1606 +
1607 +       if (REISER4_DEBUG) {
1608 +               spin_lock_reiser4_super(sbinfo);
1609 +
1610 +               assert("zam-554", *a < reiser4_block_count(ctx->super));
1611 +               assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
1612 +
1613 +               spin_unlock_reiser4_super(sbinfo);
1614 +       }
1615 +
1616 +       sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
1617 +       /* adjust sb block counters */
1618 +       used2free(sbinfo, len);
1619 +       return 0;
1620 +}
1621 +
1622 +void reiser4_post_commit_hook(void)
1623 +{
1624 +       txn_atom *atom;
1625 +
1626 +       atom = get_current_atom_locked();
1627 +       assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
1628 +       spin_unlock_atom(atom);
1629 +
1630 +       /* do the block deallocation which was deferred until commit is
1631 +          done; the last argument (1) makes the iterator free each entry */
1632 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
1633 +
1634 +       assert("zam-504", get_current_super_private() != NULL);
1635 +       sa_post_commit_hook();
1636 +}
1637 +
1638 +void reiser4_post_write_back_hook(void)
1639 +{
1640 +       assert("zam-504", get_current_super_private() != NULL);
1641 +       /* NOTE(review): same label "zam-504" and same sa_post_commit_hook() call as reiser4_post_commit_hook() - confirm */
1642 +       sa_post_commit_hook();
1643 +}
1644 +
1645 +/*
1646 +   Local variables:
1647 +   c-indentation-style: "K&R"
1648 +   mode-name: "LC"
1649 +   c-basic-offset: 8
1650 +   tab-width: 8
1651 +   fill-column: 120
1652 +   scroll-step: 1
1653 +   End:
1654 +*/
1655 diff -urN linux-2.6.27.orig/fs/reiser4/block_alloc.h linux-2.6.27/fs/reiser4/block_alloc.h
1656 --- linux-2.6.27.orig/fs/reiser4/block_alloc.h  1970-01-01 03:00:00.000000000 +0300
1657 +++ linux-2.6.27/fs/reiser4/block_alloc.h       2008-10-12 18:20:00.000000000 +0400
1658 @@ -0,0 +1,175 @@
1659 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1660 +
1661 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
1662 +#define __FS_REISER4_BLOCK_ALLOC_H__
1663 +
1664 +#include "dformat.h"
1665 +#include "forward.h"
1666 +
1667 +#include <linux/types.h>       /* for __u??  */
1668 +#include <linux/fs.h>
1669 +
1670 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
1671 +#define REISER4_FAKE_BLOCKNR_BIT_MASK   0x8000000000000000ULL
1672 +/* Mask which isolates a type of object this fake block number was assigned to */
1673 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
1674 +
1675 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
1676 +   against these two values to tell whether the object is unallocated or is a
1677 +   bitmap shadow object (WORKING BITMAP block, see plugin/space/bitmap.c) */
1678 +#define REISER4_UNALLOCATED_STATUS_VALUE    0xC000000000000000ULL
1679 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE  0x8000000000000000ULL
1680 +
1681 +/* specification how block allocation was counted in sb block counters */
1682 +typedef enum {
1683 +       BLOCK_NOT_COUNTED = 0,  /* reiser4 has no info about this block yet */
1684 +       BLOCK_GRABBED = 1,      /* free space grabbed for further allocation
1685 +                                  of this block */
1686 +       BLOCK_FLUSH_RESERVED = 2,       /* block is reserved for flush needs. */
1687 +       BLOCK_UNALLOCATED = 3,  /* block is used for existing in-memory object
1688 +                                  ( unallocated formatted or unformatted
1689 +                                  node) */
1690 +       BLOCK_ALLOCATED = 4     /* block is mapped to disk, real on-disk block
1691 +                                  number assigned */
1692 +} block_stage_t;
1693 +
1694 +/* a hint for the block allocator */
1695 +struct reiser4_blocknr_hint {
1696 +       /* FIXME: I think we want to add a longterm lock on the bitmap block here.  This
1697 +          is to prevent jnode_flush() calls from interleaving allocations on the same
1698 +          bitmap, once a hint is established. */
1699 +
1700 +       /* search start hint */
1701 +       reiser4_block_nr blk;
1702 +       /* if not zero, it is the size of the region we search for free blocks in */
1703 +       reiser4_block_nr max_dist;
1704 +       /* level for allocation; it may be useful to have branch-level and
1705 +          higher write-optimized. */
1706 +       tree_level level;
1707 +       /* block allocator assumes that blocks, which will be mapped to disk,
1708 +          are in this specified block_stage */
1709 +       block_stage_t block_stage;
1710 +       /* If backward is 1, allocate blocks in backward direction, from the
1711 +        * end of disk to the beginning of disk.  */
1712 +       unsigned int backward:1;
1713 +
1714 +};
1715 +
1716 +/* These flags control block allocation/deallocation behavior */
1717 +enum reiser4_ba_flags {
1718 +       /* do allocations from reserved (5%) area */
1719 +       BA_RESERVED = (1 << 0),
1720 +
1721 +       /* block allocator can do commit trying to recover free space */
1722 +       BA_CAN_COMMIT = (1 << 1),
1723 +
1724 +       /* set if the operation will be applied to a formatted block */
1725 +       BA_FORMATTED = (1 << 2),
1726 +
1727 +       /* defer actual block freeing until transaction commit */
1728 +       BA_DEFER = (1 << 3),
1729 +
1730 +       /* allocate blocks for permanent fs objects (formatted or unformatted), not
1731 +          wandered or log blocks */
1732 +       BA_PERMANENT = (1 << 4),
1733 +
1734 +       /* grab space even if it was disabled */
1735 +       BA_FORCE = (1 << 5),
1736 +
1737 +       /* use default start value for free blocks search. */
1738 +       BA_USE_DEFAULT_SEARCH_START = (1 << 6)
1739 +};
1740 +
1741 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
1742 +
1743 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
1744 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
1745 +extern void update_blocknr_hint_default(const struct super_block *,
1746 +                                       const reiser4_block_nr *);
1747 +extern void get_blocknr_hint_default(reiser4_block_nr *);
1748 +
1749 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
1750 +
1751 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
1752 +reiser4_block_nr fake_blocknr_unformatted(int);
1753 +
1754 +/* free -> grabbed -> fake_allocated -> used */
1755 +
1756 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
1757 +void all_grabbed2free(void);
1758 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
1759 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
1760 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
1761 +void grabbed2flush_reserved(__u64 count);
1762 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
1763 +                        reiser4_block_nr * start,
1764 +                        reiser4_block_nr * len, reiser4_ba_flags_t flags);
1765 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
1766 +                          const reiser4_block_nr *,
1767 +                          block_stage_t, reiser4_ba_flags_t flags);
1768 +
1769 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
1770 +                                     reiser4_block_nr * start,
1771 +                                     reiser4_ba_flags_t flags)
1772 +{
1773 +       reiser4_block_nr one = 1;
1774 +       return reiser4_alloc_blocks(hint, start, &one, flags);
1775 +}
1776 +
1777 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
1778 +                                       block_stage_t stage,
1779 +                                       reiser4_ba_flags_t flags)
1780 +{
1781 +       const reiser4_block_nr one = 1;
1782 +       return reiser4_dealloc_blocks(block, &one, stage, flags);
1783 +}
1784 +
1785 +#define reiser4_grab_space_force(count, flags)         \
1786 +       reiser4_grab_space(count, flags | BA_FORCE)
1787 +
1788 +extern void grabbed2free_mark(__u64 mark);
1789 +extern int reiser4_grab_reserved(struct super_block *,
1790 +                                __u64, reiser4_ba_flags_t);
1791 +extern void reiser4_release_reserved(struct super_block *super);
1792 +
1793 +/* grabbed -> fake_allocated */
1794 +
1795 +/* fake_allocated -> used */
1796 +
1797 +/* used -> fake_allocated -> grabbed -> free */
1798 +
1799 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
1800 +
1801 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
1802 +
1803 +extern void grabbed2cluster_reserved(int count);
1804 +extern void cluster_reserved2grabbed(int count);
1805 +extern void cluster_reserved2free(int count);
1806 +
1807 +extern int reiser4_check_block_counters(const struct super_block *);
1808 +
1809 +#if REISER4_DEBUG
1810 +
1811 +extern void reiser4_check_block(const reiser4_block_nr *, int);
1812 +
1813 +#else
1814 +
1815 +#  define reiser4_check_block(beg, val)        noop
1816 +
1817 +#endif
1818 +
1819 +extern int reiser4_pre_commit_hook(void);
1820 +extern void reiser4_post_commit_hook(void);
1821 +extern void reiser4_post_write_back_hook(void);
1822 +
1823 +#endif                         /* __FS_REISER4_BLOCK_ALLOC_H__ */
1824 +
1825 +/* Make Linus happy.
1826 +   Local variables:
1827 +   c-indentation-style: "K&R"
1828 +   mode-name: "LC"
1829 +   c-basic-offset: 8
1830 +   tab-width: 8
1831 +   fill-column: 120
1832 +   End:
1833 +*/
1834 diff -urN linux-2.6.27.orig/fs/reiser4/blocknrset.c linux-2.6.27/fs/reiser4/blocknrset.c
1835 --- linux-2.6.27.orig/fs/reiser4/blocknrset.c   1970-01-01 03:00:00.000000000 +0300
1836 +++ linux-2.6.27/fs/reiser4/blocknrset.c        2008-10-12 18:20:00.000000000 +0400
1837 @@ -0,0 +1,368 @@
1838 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
1839 +
1840 +/* This file contains code for various block number sets used by the atom to
1841 +   track the deleted set and wandered block mappings. */
1842 +
1843 +#include "debug.h"
1844 +#include "dformat.h"
1845 +#include "txnmgr.h"
1846 +#include "context.h"
1847 +
1848 +#include <linux/slab.h>
1849 +
1850 +/* The data structure for storing unordered block number sets is a list of
1851 +   elements, each of which contains an array of single block numbers and/or
1852 +   an array of block number pairs. Such an element, a blocknr_set_entry,
1853 +   stores single block numbers from the beginning and pairs from the end of
1854 +   its ->entries array. The ->nr_singles and ->nr_pairs fields count the
1855 +   stored single blocks and pairs.
1856 +
1857 +   +------------------ blocknr_set_entry->entries ----------------+
1858 +   |block1|block2| ... <free space> ... |pair3|pair2|pair1|
1859 +   +------------------------------------------------------------+
1860 +
1861 +   When current blocknr_set_entry is full, allocate a new one. */
1862 +
1863 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
1864 + * set (single blocks and block extents), in that case blocknr pair represent an
1865 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
1866 + * there represent a (real block) -> (wandered block) mapping. */
1867 +
1868 +/* Protection: blocknr sets belong to reiser4 atom, and
1869 + * their modifications are performed with the atom lock held */
1870 +
1871 +/* The total size of a blocknr_set_entry. */
1872 +#define BLOCKNR_SET_ENTRY_SIZE 128
1873 +
1874 +/* The number of blocks that can fit the blocknr data area. */
1875 +#define BLOCKNR_SET_ENTRIES_NUMBER             \
1876 +       ((BLOCKNR_SET_ENTRY_SIZE -              \
1877 +         2 * sizeof (unsigned) -               \
1878 +         sizeof(struct list_head)) /           \
1879 +        sizeof(reiser4_block_nr))
1880 +
1881 +/* An entry of the blocknr_set */
1882 +struct blocknr_set_entry {
1883 +       unsigned nr_singles;
1884 +       unsigned nr_pairs;
1885 +       struct list_head link;
1886 +       reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
1887 +};
1888 +
1889 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
1890 +struct blocknr_pair {
1891 +       reiser4_block_nr a;
1892 +       reiser4_block_nr b;
1893 +};
1894 +
1895 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
1896 +/* Audited by: green(2002.06.11) */
1897 +static unsigned bse_avail(blocknr_set_entry * bse)
1898 +{
1899 +       unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
1900 +
1901 +       assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
1902 +       cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
1903 +
1904 +       return BLOCKNR_SET_ENTRIES_NUMBER - used;
1905 +}
1906 +
1907 +/* Initialize a blocknr_set_entry. */
1908 +static void bse_init(blocknr_set_entry *bse)
1909 +{
1910 +       bse->nr_singles = 0;
1911 +       bse->nr_pairs = 0;
1912 +       INIT_LIST_HEAD(&bse->link);
1913 +}
1914 +
1915 +/* Allocate and initialize a blocknr_set_entry. */
1916 +/* Audited by: green(2002.06.11) */
1917 +static blocknr_set_entry *bse_alloc(void)
1918 +{
1919 +       blocknr_set_entry *e;
1920 +
1921 +       if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
1922 +                                          reiser4_ctx_gfp_mask_get())) == NULL)
1923 +               return NULL;
1924 +
1925 +       bse_init(e);
1926 +
1927 +       return e;
1928 +}
1929 +
1930 +/* Free a blocknr_set_entry. */
1931 +/* Audited by: green(2002.06.11) */
1932 +static void bse_free(blocknr_set_entry * bse)
1933 +{
1934 +       kfree(bse);
1935 +}
1936 +
1937 +/* Add a block number to a blocknr_set_entry */
1938 +/* Audited by: green(2002.06.11) */
1939 +static void
1940 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
1941 +{
1942 +       assert("jmacd-5099", bse_avail(bse) >= 1);
1943 +
1944 +       bse->entries[bse->nr_singles++] = *block;
1945 +}
1946 +
1947 +/* Get a pair of block numbers */
1948 +/* Audited by: green(2002.06.11) */
1949 +static inline struct blocknr_pair *bse_get_pair(blocknr_set_entry * bse,
1950 +                                               unsigned pno)
1951 +{
1952 +       assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
1953 +
1954 +       return (struct blocknr_pair *) (bse->entries +
1955 +                                       BLOCKNR_SET_ENTRIES_NUMBER -
1956 +                                       2 * (pno + 1));
1957 +}
1958 +
1959 +/* Add a pair of block numbers to a blocknr_set_entry */
1960 +/* Audited by: green(2002.06.11) */
1961 +static void
1962 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
1963 +            const reiser4_block_nr * b)
1964 +{
1965 +       struct blocknr_pair *pair;
1966 +
1967 +       assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
1968 +
1969 +       pair = bse_get_pair(bse, bse->nr_pairs++);
1970 +
1971 +       pair->a = *a;
1972 +       pair->b = *b;
1973 +}
1974 +
1975 +/* Add either a block or pair of blocks to the block number set.  The first
1976 +   blocknr (@a) must be non-NULL.  If @b is NULL a single blocknr is added, if
1977 +   @b is non-NULL a pair is added.  The block number set belongs to atom, and
1978 +   the call is made with the atom lock held.  If the head blocknr_set_entry
1979 +   lacks room and *new_bsep is non-NULL, that entry is put at the head of the
1980 +   set and *new_bsep is set to NULL.  If *new_bsep is NULL instead, the atom
1981 +   lock is released and a new entry is allocated into *new_bsep: -E_REPEAT is
1982 +   then returned (atom unlocked) so the caller can retry, or -ENOMEM if the
1983 +   allocation failed.  On success 0 is returned without the atom lock being
1984 +   released; a preallocated *new_bsep that turned out not to be needed is
1985 +   freed and reset to NULL. */
1986 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
1987 +                          blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
1988 +                          const reiser4_block_nr *b)
1989 +{
1990 +       blocknr_set_entry *bse;
1991 +       unsigned entries_needed;
1992 +
1993 +       assert("jmacd-5101", a != NULL);
1994 +
1995 +       entries_needed = (b == NULL) ? 1 : 2;
1996 +       if (list_empty(bset) ||
1997 +           bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
1998 +               /* See if a bse was previously allocated. */
1999 +               if (*new_bsep == NULL) {
2000 +                       spin_unlock_atom(atom);
2001 +                       *new_bsep = bse_alloc();
2002 +                       return (*new_bsep != NULL) ? -E_REPEAT :
2003 +                               RETERR(-ENOMEM);
2004 +               }
2005 +
2006 +               /* Put it on the head of the list. */
2007 +               list_add(&((*new_bsep)->link), bset);
2008 +
2009 +               *new_bsep = NULL;
2010 +       }
2011 +
2012 +       /* Add the single or pair. */
2013 +       bse = list_entry(bset->next, blocknr_set_entry, link);
2014 +       if (b == NULL) {
2015 +               bse_put_single(bse, a);
2016 +       } else {
2017 +               bse_put_pair(bse, a, b);
2018 +       }
2019 +
2020 +       /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2021 +       if (*new_bsep != NULL) {
2022 +               bse_free(*new_bsep);
2023 +               *new_bsep = NULL;
2024 +       }
2025 +
2026 +       return 0;
2027 +}
2028 +
2029 +/* Add an extent to the block set.  If the length is 1, it is treated as a
2030 +   single block (e.g., reiser4_set_add_block). */
2031 +/* Audited by: green(2002.06.11) */
2032 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2033 +   kmalloc might schedule. The only exception is atom spinlock, which is
2034 +   properly freed. */
2035 +int
2036 +blocknr_set_add_extent(txn_atom * atom,
2037 +                      struct list_head * bset,
2038 +                      blocknr_set_entry ** new_bsep,
2039 +                      const reiser4_block_nr * start,
2040 +                      const reiser4_block_nr * len)
2041 +{
2042 +       assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2043 +       return blocknr_set_add(atom, bset, new_bsep, start,
2044 +                              *len == 1 ? NULL : len);
2045 +}
2046 +
2047 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2048 + * by an assertion that both arguments are not null.*/
2049 +/* Audited by: green(2002.06.11) */
2050 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2051 +   kmalloc might schedule. The only exception is atom spinlock, which is
2052 +   properly freed. */
2053 +int
2054 +blocknr_set_add_pair(txn_atom * atom,
2055 +                    struct list_head * bset,
2056 +                    blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2057 +                    const reiser4_block_nr * b)
2058 +{
2059 +       assert("jmacd-5103", a != NULL && b != NULL);
2060 +       return blocknr_set_add(atom, bset, new_bsep, a, b);
2061 +}
2062 +
2063 +/* Initialize a blocknr_set. */
2064 +void blocknr_set_init(struct list_head *bset)
2065 +{
2066 +       INIT_LIST_HEAD(bset);
2067 +}
2068 +
2069 +/* Release the entries of a blocknr_set. */
2070 +void blocknr_set_destroy(struct list_head *bset)
2071 +{
2072 +       blocknr_set_entry *bse;
2073 +
2074 +       while (!list_empty(bset)) {
2075 +               bse = list_entry(bset->next, blocknr_set_entry, link);
2076 +               list_del_init(&bse->link);
2077 +               bse_free(bse);
2078 +       }
2079 +}
2080 +
2081 +/* Merge blocknr_set entries out of @from into @into; @from is left empty. */
2082 +/* Audited by: green(2002.06.11) */
2083 +/* Auditor comments: This merge does not know if merged sets contain
2084 +   block pairs (as for wandered sets) or extents, so it cannot really merge
2085 +   overlapping ranges if there are any. So I believe it may lead to
2086 +   some blocks being present several times in one blocknr_set. To help
2087 +   debugging such problems it might help to check for duplicate entries on
2088 +   actual processing of this set. Testing this kind of stuff right here is
2089 +   also complicated by the fact that these sets are not sorted and going
2090 +   through whole set on each element addition is going to be a CPU-heavy task */
2091 +void blocknr_set_merge(struct list_head * from, struct list_head * into)
2092 +{
2093 +       blocknr_set_entry *bse_into = NULL;
2094 +
2095 +       /* If @from is empty, no work to perform. */
2096 +       if (list_empty(from))
2097 +               return;
2098 +       /* If @into is not empty, try merging partial-entries. */
2099 +       if (!list_empty(into)) {
2100 +
2101 +               /* Neither set is empty, pop the front members and try to combine them. */
2102 +               blocknr_set_entry *bse_from;
2103 +               unsigned into_avail;
2104 +
2105 +               bse_into = list_entry(into->next, blocknr_set_entry, link);
2106 +               list_del_init(&bse_into->link);
2107 +               bse_from = list_entry(from->next, blocknr_set_entry, link);
2108 +               list_del_init(&bse_from->link);
2109 +
2110 +               /* Combine singles. */
2111 +               for (into_avail = bse_avail(bse_into);
2112 +                    into_avail != 0 && bse_from->nr_singles != 0;
2113 +                    into_avail -= 1) {
2114 +                       bse_put_single(bse_into,
2115 +                                      &bse_from->entries[--bse_from->
2116 +                                                         nr_singles]);
2117 +               }
2118 +
2119 +               /* Combine pairs. */
2120 +               for (; into_avail > 1 && bse_from->nr_pairs != 0;
2121 +                    into_avail -= 2) {
2122 +                       struct blocknr_pair *pair =
2123 +                               bse_get_pair(bse_from, --bse_from->nr_pairs);
2124 +                       bse_put_pair(bse_into, &pair->a, &pair->b);
2125 +               }
2126 +
2127 +               /* If bse_from is empty, delete it now. */
2128 +               if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2129 +                       bse_free(bse_from);
2130 +               } else {
2131 +                       /* Otherwise, bse_into is full or nearly full (e.g.,
2132 +                          it could have one slot avail and bse_from has one
2133 +                          pair left).  Push it back onto the list.  bse_from
2134 +                          becomes bse_into, which will be the new partial. */
2135 +                       list_add(&bse_into->link, into);
2136 +                       bse_into = bse_from;
2137 +               }
2138 +       }
2139 +
2140 +       /* Splice lists together. */
2141 +       list_splice_init(from, into->prev);
2142 +
2143 +       /* Add the partial entry back to the head of the list. */
2144 +       if (bse_into != NULL)
2145 +               list_add(&bse_into->link, into);
2146 +}
2147 +
2148 +/* Iterate over all blocknr set elements, calling @actor on every single block and every pair. */
2149 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2150 +                        blocknr_set_actor_f actor, void *data, int delete)
2151 +{
2152 +
2153 +       blocknr_set_entry *entry;
2154 +
2155 +       assert("zam-429", atom != NULL);
2156 +       assert("zam-430", atom_is_protected(atom));
2157 +       assert("zam-431", bset != 0);
2158 +       assert("zam-432", actor != NULL);
2159 +
2160 +       entry = list_entry(bset->next, blocknr_set_entry, link);
2161 +       while (bset != &entry->link) {
2162 +               blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2163 +               unsigned int i;
2164 +               int ret;
2165 +
2166 +               for (i = 0; i < entry->nr_singles; i++) {
2167 +                       ret = actor(atom, &entry->entries[i], NULL, data);
2168 +
2169 +                       /* We can't break the loop if the delete flag is set: the actor's error is then ignored. */
2170 +                       if (ret != 0 && !delete)
2171 +                               return ret;
2172 +               }
2173 +
2174 +               for (i = 0; i < entry->nr_pairs; i++) {
2175 +                       struct blocknr_pair *ab;
2176 +
2177 +                       ab = bse_get_pair(entry, i);
2178 +
2179 +                       ret = actor(atom, &ab->a, &ab->b, data);
2180 +
2181 +                       if (ret != 0 && !delete)
2182 +                               return ret;
2183 +               }
2184 +
2185 +               if (delete) {
2186 +                       list_del(&entry->link);
2187 +                       bse_free(entry);
2188 +               }
2189 +
2190 +               entry = tmp;
2191 +       }
2192 +
2193 +       return 0;
2194 +}
2195 +
2196 +/*
2197 + * Local variables:
2198 + * c-indentation-style: "K&R"
2199 + * mode-name: "LC"
2200 + * c-basic-offset: 8
2201 + * tab-width: 8
2202 + * fill-column: 79
2203 + * scroll-step: 1
2204 + * End:
2205 + */
2206 diff -urN linux-2.6.27.orig/fs/reiser4/carry.c linux-2.6.27/fs/reiser4/carry.c
2207 --- linux-2.6.27.orig/fs/reiser4/carry.c        1970-01-01 03:00:00.000000000 +0300
2208 +++ linux-2.6.27/fs/reiser4/carry.c     2008-10-12 18:20:00.000000000 +0400
2209 @@ -0,0 +1,1391 @@
2210 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2211 +/* Functions to "carry" tree modification(s) upward. */
2212 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2213 +   set of changes that need to be propagated to the next level.  We manage
2214 +   node locking such that any searches that collide with carrying are
2215 +   restarted, from the root if necessary.
2216 +
2217 +   Insertion of a new item may result in items being moved among nodes and
2218 +   this requires the delimiting key to be updated at the least common parent
2219 +   of the nodes modified to preserve search tree invariants. Also, insertion
2220 +   may require allocation of a new node. A pointer to the new node has to be
2221 +   inserted into some node on the parent level, etc.
2222 +
2223 +   Tree carrying is meant to be analogous to arithmetic carrying.
2224 +
2225 +   A carry operation is always associated with some node (&carry_node).
2226 +
2227 +   Carry process starts with some initial set of operations to be performed
2228 +   and an initial set of already locked nodes.  Operations are performed one
2229 +   by one. Performing each single operation has following possible effects:
2230 +
2231 +    - content of carry node associated with operation is modified
2232 +    - new carry nodes are locked and involved into carry process on this level
2233 +    - new carry operations are posted to the next level
2234 +
2235 +   After all carry operations on this level are done, process is repeated for
2236 +   the accumulated sequence on carry operations for the next level. This
2237 +   starts by trying to lock (in left to right order) all carry nodes
2238 +   associated with carry operations on the parent level. After this, we decide
2239 +   whether more nodes are required on the left of already locked set. If so,
2240 +   all locks taken on the parent level are released, new carry nodes are
2241 +   added, and locking process repeats.
2242 +
2243 +   It may happen that balancing process fails owing to unrecoverable error on
2244 +   some of upper levels of a tree (possible causes are io error, failure to
2245 +   allocate new node, etc.). In this case we should unmount the filesystem,
2246 +   rebooting if it is the root, and possibly advise the use of fsck.
2247 +
2248 +   USAGE:
2249 +
2250 +    int some_tree_operation( znode *node, ... )
2251 +    {
2252 +       // Allocate on a stack pool of carry objects: operations and nodes.
2253 +       // Most carry processes will only take objects from here, without
2254 +       // dynamic allocation.
2255 +
2256 +I feel uneasy about this pool.  It adds to code complexity, I understand why it exists, but.... -Hans
2257 +
2258 +       carry_pool  pool;
2259 +       carry_level lowest_level;
2260 +       carry_op   *op;
2261 +
2262 +       init_carry_pool( &pool );
2263 +       init_carry_level( &lowest_level, &pool );
2264 +
2265 +       // operation may be one of:
2266 +       //   COP_INSERT    --- insert new item into node
2267 +       //   COP_CUT       --- remove part of or whole node
2268 +       //   COP_PASTE     --- increase size of item
2269 +       //   COP_DELETE    --- delete pointer from parent node
2270 +       //   COP_UPDATE    --- update delimiting key in least
2271 +       //                     common ancestor of two
2272 +
2273 +       op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2274 +       if( IS_ERR( op ) || ( op == NULL ) ) {
2275 +           handle error
2276 +       } else {
2277 +           // fill in remaining fields in @op, according to carry.h:carry_op
2278 +           result = carry( &lowest_level, NULL );
2279 +       }
2280 +       done_carry_pool( &pool );
2281 +    }
2282 +
2283 +   When you are implementing node plugin method that participates in carry
2284 +   (shifting, insertion, deletion, etc.), do the following:
2285 +
2286 +   int foo_node_method( znode *node, ..., carry_level *todo )
2287 +   {
2288 +       carry_op   *op;
2289 +
2290 +       ....
2291 +
2292 +       // note, that last argument to reiser4_post_carry() is non-null
2293 +       // here, because @op is to be applied to the parent of @node, rather
2294 +       // than to the @node itself as in the previous case.
2295 +
2296 +       op = node_post_carry( todo, operation, node, 1 );
2297 +       // fill in remaining fields in @op, according to carry.h:carry_op
2298 +
2299 +       ....
2300 +
2301 +   }
2302 +
2303 +   BATCHING:
2304 +
2305 +   One of the main advantages of the level-by-level balancing implemented here
2306 +   is the ability to batch updates on a parent level and to perform them more
2307 +   efficiently as a result.
2308 +
2309 +   Description To Be Done (TBD).
2310 +
2311 +   DIFFICULTIES AND SUBTLE POINTS:
2312 +
2313 +   1. complex plumbing is required, because:
2314 +
2315 +       a. effective allocation through pools is needed
2316 +
2317 +       b. target of operation is not exactly known when operation is
2318 +       posted. This is worked around through bitfields in &carry_node and
2319 +       logic in lock_carry_node()
2320 +
2321 +       c. of interaction with locking code: node should be added into sibling
2322 +       list when pointer to it is inserted into its parent, which is some time
2323 +       after node was created. Between these moments, node is somewhat in
2324 +       suspended state and is only registered in the carry lists
2325 +
2326 +    2. whole balancing logic is implemented here, in particular, insertion
2327 +    logic is coded in make_space().
2328 +
2329 +    3. special cases like insertion (reiser4_add_tree_root()) or deletion
2330 +    (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2331 +    (insert_paste()) have to be handled.
2332 +
2333 +    4. there is non-trivial interdependency between allocation of new nodes
2334 +    and almost everything else. This is mainly due to the (1.c) above. I shall
2335 +    write about this later.
2336 +
2337 +*/
2338 +
2339 +#include "forward.h"
2340 +#include "debug.h"
2341 +#include "key.h"
2342 +#include "coord.h"
2343 +#include "plugin/item/item.h"
2344 +#include "plugin/item/extent.h"
2345 +#include "plugin/node/node.h"
2346 +#include "jnode.h"
2347 +#include "znode.h"
2348 +#include "tree_mod.h"
2349 +#include "tree_walk.h"
2350 +#include "block_alloc.h"
2351 +#include "pool.h"
2352 +#include "tree.h"
2353 +#include "carry.h"
2354 +#include "carry_ops.h"
2355 +#include "super.h"
2356 +#include "reiser4.h"
2357 +
2358 +#include <linux/types.h>
2359 +
2360 +/* level locking/unlocking */
2361 +static int lock_carry_level(carry_level * level);
2362 +static void unlock_carry_level(carry_level * level, int failure);
2363 +static void done_carry_level(carry_level * level);
2364 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2365 +
2366 +int lock_carry_node(carry_level * level, carry_node * node);
2367 +int lock_carry_node_tail(carry_node * node);
2368 +
2369 +/* carry processing proper */
2370 +static int carry_on_level(carry_level * doing, carry_level * todo);
2371 +
2372 +static carry_op *add_op(carry_level * level, pool_ordering order,
2373 +                       carry_op * reference);
2374 +
2375 +/* handlers for carry operations. */
2376 +
2377 +static void fatal_carry_error(carry_level * doing, int ecode);
2378 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2379 +
2380 +static void print_level(const char *prefix, carry_level * level);
2381 +
2382 +#if REISER4_DEBUG
2383 +typedef enum {
2384 +       CARRY_TODO,
2385 +       CARRY_DOING
2386 +} carry_queue_state;
2387 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2388 +#endif
2389 +
2390 +/* main entry point for tree balancing.
2391 +
2392 +   Tree carry performs operations from @doing and while doing so accumulates
2393 +   information about operations to be performed on the next level ("carried"
2394 +   to the parent level). Carried operations are performed, causing possibly
2395 +   more operations to be carried upward etc. carry() takes care of
2396 +   locking and pinning znodes while operating on them.
2397 +
2398 +   For usage, see comment at the top of fs/reiser4/carry.c
2399 +
2400 +*/
2401 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2402 +                                      * performed */ ,
2403 +                 carry_level * done  /* set of nodes, already performed
2404 +                                      *  at the previous level.
2405 +                                      * NULL in most cases */)
2406 +{
2407 +       int result = 0;
2408 +       /* queue of new requests */
2409 +       carry_level *todo;
2410 +       ON_DEBUG(STORE_COUNTERS);
2411 +
2412 +       assert("nikita-888", doing != NULL);
2413 +       BUG_ON(done != NULL);
2414 +
2415 +       todo = doing + 1;
2416 +       init_carry_level(todo, doing->pool);
2417 +
2418 +       /* queue of requests performed on the previous level */
2419 +       done = todo + 1;
2420 +       init_carry_level(done, doing->pool);
2421 +
2422 +       /* iterate until there is nothing more to do */
2423 +       while (result == 0 && doing->ops_num > 0) {
2424 +               carry_level *tmp;
2425 +
2426 +               /* at this point @done is locked. */
2427 +               /* repeat lock/do/unlock while
2428 +
2429 +                  (1) lock_carry_level() fails due to deadlock avoidance, or
2430 +
2431 +                  (2) carry_on_level() decides that more nodes have to
2432 +                  be involved.
2433 +
2434 +                  (3) some unexpected error occurred while balancing on the
2435 +                  upper levels. In this case all changes are rolled back.
2436 +
2437 +                */
2438 +               while (1) {
2439 +                       result = lock_carry_level(doing);
2440 +                       if (result == 0) {
2441 +                               /* perform operations from @doing and
2442 +                                  accumulate new requests in @todo */
2443 +                               result = carry_on_level(doing, todo);
2444 +                               if (result == 0)
2445 +                                       break;
2446 +                               else if (result != -E_REPEAT ||
2447 +                                        !doing->restartable) {
2448 +                                       warning("nikita-1043",
2449 +                                               "Fatal error during carry: %i",
2450 +                                               result);
2451 +                                       print_level("done", done);
2452 +                                       print_level("doing", doing);
2453 +                                       print_level("todo", todo);
2454 +                                       /* do some rough stuff like aborting
2455 +                                          all pending transcrashes and thus
2456 +                                          pushing tree back to the consistent
2457 +                                          state. Alternatively, just panic.
2458 +                                        */
2459 +                                       fatal_carry_error(doing, result);
2460 +                                       return result;
2461 +                               }
2462 +                       } else if (result != -E_REPEAT) {
2463 +                               fatal_carry_error(doing, result);
2464 +                               return result;
2465 +                       }
2466 +                       unlock_carry_level(doing, 1);
2467 +               }
2468 +               /* at this point @done can be safely unlocked */
2469 +               done_carry_level(done);
2470 +
2471 +               /* cyclically shift queues */
2472 +               tmp = done;
2473 +               done = doing;
2474 +               doing = todo;
2475 +               todo = tmp;
2476 +               init_carry_level(todo, doing->pool);
2477 +
2478 +               /* give other threads chance to run */
2479 +               reiser4_preempt_point();
2480 +       }
2481 +       done_carry_level(done);
2482 +
2483 +       /* all counters, but x_refs should remain the same. x_refs can change
2484 +          owing to transaction manager */
2485 +       ON_DEBUG(CHECK_COUNTERS);
2486 +       return result;
2487 +}
2488 +
2489 +/* perform carry operations on given level.
2490 +
2491 +   Optimizations proposed by pooh:
2492 +
2493 +   (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
2494 +   required;
2495 +
2496 +   (2) unlock node if there are no more operations to be performed upon it and
2497 +   node didn't add any operation to @todo. This can be implemented by
2498 +   attaching to each node two counters: counter of operations working on this
2499 +   node and counter of operations carried upward from this node.
2500 +
2501 +*/
2502 +static int carry_on_level(carry_level * doing  /* queue of carry operations to
2503 +                                                * do on this level */ ,
2504 +                         carry_level * todo    /* queue where new carry
2505 +                                                * operations to be performed on
2506 +                                                * the parent level are
2507 +                                                * accumulated during @doing
2508 +                                                * processing. */ )
2509 +{
2510 +       int result;
2511 +       int (*f) (carry_op *, carry_level *, carry_level *);
2512 +       carry_op *op;
2513 +       carry_op *tmp_op;
2514 +
2515 +       assert("nikita-1034", doing != NULL);
2516 +       assert("nikita-1035", todo != NULL);
2517 +
2518 +       /* @doing->nodes are locked. */
2519 +
2520 +       /* This function can be split into two phases: analysis and modification.
2521 +
2522 +          Analysis calculates precisely what items should be moved between
2523 +          nodes. This information is gathered in some structures attached to
2524 +          each carry_node in a @doing queue. Analysis also determines whether
2525 +          new nodes are to be allocated etc.
2526 +
2527 +          After analysis is completed, actual modification is performed. Here
2528 +          we can take advantage of "batch modification": if there are several
2529 +          operations acting on the same node, modifications can be performed
2530 +          more efficiently when batched together.
2531 +
2532 +          Above is an optimization left for the future.
2533 +        */
2534 +       /* Important, but delayed optimization: it's possible to batch
2535 +          operations together and perform them more efficiently as a
2536 +          result. For example, deletion of several neighboring items from a
2537 +          node can be converted to a single ->cut() operation.
2538 +
2539 +          Before processing queue, it should be scanned and "mergeable"
2540 +          operations merged.
2541 +        */
2542 +       result = 0;
2543 +       for_all_ops(doing, op, tmp_op) {
2544 +               carry_opcode opcode;
2545 +
2546 +               assert("nikita-1041", op != NULL);
2547 +               opcode = op->op;
2548 +               assert("nikita-1042", op->op < COP_LAST_OP);
2549 +               f = op_dispatch_table[op->op].handler;
2550 +               result = f(op, doing, todo);
2551 +               /* locking can fail with -E_REPEAT. Any different error is fatal
2552 +                  and will be handled by fatal_carry_error() sledgehammer.
2553 +                */
2554 +               if (result != 0)
2555 +                       break;
2556 +       }
2557 +       if (result == 0) {
2558 +               carry_plugin_info info;
2559 +               carry_node *scan;
2560 +               carry_node *tmp_scan;
2561 +
2562 +               info.doing = doing;
2563 +               info.todo = todo;
2564 +
2565 +               assert("nikita-3002",
2566 +                      carry_level_invariant(doing, CARRY_DOING));
2567 +               for_all_nodes(doing, scan, tmp_scan) {
2568 +                       znode *node;
2569 +
2570 +                       node = reiser4_carry_real(scan);
2571 +                       assert("nikita-2547", node != NULL);
2572 +                       if (node_is_empty(node)) {
2573 +                               result =
2574 +                                   node_plugin_by_node(node)->
2575 +                                   prepare_removal(node, &info);
2576 +                               if (result != 0)
2577 +                                       break;
2578 +                       }
2579 +               }
2580 +       }
2581 +       return result;
2582 +}
2583 +
2584 +/* post carry operation
2585 +
2586 +   This is main function used by external carry clients: node layout plugins
2587 +   and tree operations to create new carry operation to be performed on some
2588 +   level.
2589 +
2590 +   New operation will be included in the @level queue. To actually perform it,
2591 +   call reiser4_carry( level, ... ). This function takes write lock on @node.
2592 +   Carry manages all its locks by itself, don't worry about this.
2593 +
2594 +   This function adds operation and node at the end of the queue. It is up to
2595 +   caller to guarantee proper ordering of node queue.
2596 +
2597 +*/
2598 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
2599 +                                                  * is to be posted at */ ,
2600 +                             carry_opcode op /* opcode of operation */ ,
2601 +                             znode * node      /* node on which this operation
2602 +                                                * will operate */ ,
2603 +                             int apply_to_parent_p /* whether operation will
2604 +                                                    * operate directly on @node
2605 +                                                    * or on its parent. */)
2606 +{
2607 +       carry_op *result;
2608 +       carry_node *child;
2609 +
2610 +       assert("nikita-1046", level != NULL);
2611 +       assert("nikita-1788", znode_is_write_locked(node));
2612 +
2613 +       result = add_op(level, POOLO_LAST, NULL);
2614 +       if (IS_ERR(result))
2615 +               return result;
2616 +       child = reiser4_add_carry(level, POOLO_LAST, NULL);
2617 +       if (IS_ERR(child)) {
2618 +               reiser4_pool_free(&level->pool->op_pool, &result->header);
2619 +               return (carry_op *) child;
2620 +       }
2621 +       result->node = child;
2622 +       result->op = op;
2623 +       child->parent = apply_to_parent_p;
2624 +       if (ZF_ISSET(node, JNODE_ORPHAN))
2625 +               child->left_before = 1;
2626 +       child->node = node;
2627 +       return result;
2628 +}
2629 +
2630 +/* initialize carry queue */
2631 +void init_carry_level(carry_level * level /* level to initialize */ ,
2632 +                     carry_pool * pool /* pool @level will allocate objects
2633 +                                        * from */ )
2634 +{
2635 +       assert("nikita-1045", level != NULL);
2636 +       assert("nikita-967", pool != NULL);
2637 +
2638 +       memset(level, 0, sizeof *level);
2639 +       level->pool = pool;
2640 +
2641 +       INIT_LIST_HEAD(&level->nodes);
2642 +       INIT_LIST_HEAD(&level->ops);
2643 +}
2644 +
2645 +/* allocate carry pool and initialize pools within queue */
2646 +carry_pool *init_carry_pool(int size)
2647 +{
2648 +       carry_pool *pool;
2649 +
2650 +       assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
2651 +       pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
2652 +       if (pool == NULL)
2653 +               return ERR_PTR(RETERR(-ENOMEM));
2654 +
2655 +       reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
2656 +                         (char *)pool->op);
2657 +       reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
2658 +                         NODES_LOCKED_POOL_SIZE, (char *)pool->node);
2659 +       return pool;
2660 +}
2661 +
2662 +/* finish with queue pools */
2663 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
2664 +{
2665 +       reiser4_done_pool(&pool->op_pool);
2666 +       reiser4_done_pool(&pool->node_pool);
2667 +       kfree(pool);
2668 +}
2669 +
2670 +/* add new carry node to the @level.
2671 +
2672 +   Returns pointer to the new carry node allocated from pool.  It's up to
2673 +   callers to maintain proper order in the @level. Assumption is that if carry
2674 +   nodes on one level are already sorted and modifications are performed from
2675 +   left to right, carry nodes added on the parent level will be ordered
2676 +   automatically. To control ordering use @order and @reference parameters.
2677 +
2678 +*/
2679 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
2680 +                                                        * node to */ ,
2681 +                                  pool_ordering order  /* where to insert:
2682 +                                                        * at the beginning of
2683 +                                                        * @level,
2684 +                                                        * before @reference,
2685 +                                                        * after @reference,
2686 +                                                        * at the end of @level
2687 +                                                        */ ,
2688 +                                  carry_node * reference/* reference node for
2689 +                                                         * insertion */)
2690 +{
2691 +       ON_DEBUG(carry_node * orig_ref = reference);
2692 +
2693 +       if (order == POOLO_BEFORE) {
2694 +               reference = find_left_carry(reference, level);
2695 +               if (reference == NULL)
2696 +                       reference = list_entry(level->nodes.next, carry_node,
2697 +                                              header.level_linkage);
2698 +               else
2699 +                       reference = list_entry(reference->header.level_linkage.next,
2700 +                                              carry_node, header.level_linkage);
2701 +       } else if (order == POOLO_AFTER) {
2702 +               reference = find_right_carry(reference, level);
2703 +               if (reference == NULL)
2704 +                       reference = list_entry(level->nodes.prev, carry_node,
2705 +                                              header.level_linkage);
2706 +               else
2707 +                       reference = list_entry(reference->header.level_linkage.prev,
2708 +                                              carry_node, header.level_linkage);
2709 +       }
2710 +       assert("nikita-2209",
2711 +              ergo(orig_ref != NULL,
2712 +                   reiser4_carry_real(reference) ==
2713 +                   reiser4_carry_real(orig_ref)));
2714 +       return reiser4_add_carry(level, order, reference);
2715 +}
2716 +
2717 +carry_node *reiser4_add_carry(carry_level * level      /* &carry_level to add node
2718 +                                                * to */ ,
2719 +                     pool_ordering order       /* where to insert: at the
2720 +                                                * beginning of @level, before
2721 +                                                * @reference, after @reference,
2722 +                                                * at the end of @level */ ,
2723 +                     carry_node * reference    /* reference node for
2724 +                                                * insertion */ )
2725 +{
2726 +       carry_node *result;
2727 +
2728 +       result =
2729 +           (carry_node *) reiser4_add_obj(&level->pool->node_pool,
2730 +                                          &level->nodes,
2731 +                                          order, &reference->header);
2732 +       if (!IS_ERR(result) && (result != NULL))
2733 +               ++level->nodes_num;
2734 +       return result;
2735 +}
2736 +
2737 +/* add new carry operation to the @level.
2738 +
2739 +   Returns pointer to the new carry operations allocated from pool. It's up to
2740 +   callers to maintain proper order in the @level. To control ordering use
2741 +   @order and @reference parameters.
2742 +
2743 +*/
2744 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
2745 +                       pool_ordering order     /* where to insert: at the beginning of
2746 +                                                * @level, before @reference, after
2747 +                                                * @reference, at the end of @level */ ,
2748 +                       carry_op *
2749 +                       reference /* reference node for insertion */ )
2750 +{
2751 +       carry_op *result;
2752 +
2753 +       result =
2754 +           (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
2755 +                                        order, &reference->header);
2756 +       if (!IS_ERR(result) && (result != NULL))
2757 +               ++level->ops_num;
2758 +       return result;
2759 +}
2760 +
2761 +/* Return node on the right of which @node was created.
2762 +
2763 +   Each node is created on the right of some existing node (or it is new root,
2764 +   which is special case not handled here).
2765 +
2766 +   @node is new node created on some level, but not yet inserted into its
2767 +   parent, it has corresponding bit (JNODE_ORPHAN) set in zstate.
2768 +
2769 +*/
2770 +static carry_node *find_begetting_brother(carry_node * node    /* node to start search
2771 +                                                                * from */ ,
2772 +                                         carry_level * kin UNUSED_ARG  /* level to
2773 +                                                                        * scan */ )
2774 +{
2775 +       carry_node *scan;
2776 +
2777 +       assert("nikita-1614", node != NULL);
2778 +       assert("nikita-1615", kin != NULL);
2779 +       assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
2780 +       assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
2781 +                                  ZF_ISSET(reiser4_carry_real(node),
2782 +                                           JNODE_ORPHAN)));
2783 +       for (scan = node;;
2784 +            scan = list_entry(scan->header.level_linkage.prev, carry_node,
2785 +                              header.level_linkage)) {
2786 +               assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
2787 +               if ((scan->node != node->node) &&
2788 +                   !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
2789 +                       assert("nikita-1618", reiser4_carry_real(scan) != NULL);
2790 +                       break;
2791 +               }
2792 +       }
2793 +       return scan;
2794 +}
2795 +
2796 +static cmp_t
2797 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
2798 +{
2799 +       assert("nikita-2199", n1 != NULL);
2800 +       assert("nikita-2200", n2 != NULL);
2801 +
2802 +       if (n1 == n2)
2803 +               return EQUAL_TO;
2804 +       while (1) {
2805 +               n1 = carry_node_next(n1);
2806 +               if (carry_node_end(level, n1))
2807 +                       return GREATER_THAN;
2808 +               if (n1 == n2)
2809 +                       return LESS_THAN;
2810 +       }
2811 +       impossible("nikita-2201", "End of level reached");
2812 +}
2813 +
2814 +carry_node *find_carry_node(carry_level * level, const znode * node)
2815 +{
2816 +       carry_node *scan;
2817 +       carry_node *tmp_scan;
2818 +
2819 +       assert("nikita-2202", level != NULL);
2820 +       assert("nikita-2203", node != NULL);
2821 +
2822 +       for_all_nodes(level, scan, tmp_scan) {
2823 +               if (reiser4_carry_real(scan) == node)
2824 +                       return scan;
2825 +       }
2826 +       return NULL;
2827 +}
2828 +
2829 +znode *reiser4_carry_real(const carry_node * node)
2830 +{
2831 +       assert("nikita-3061", node != NULL);
2832 +
2833 +       return node->lock_handle.node;
2834 +}
2835 +
2836 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
2837 +                             const znode * node)
2838 +{
2839 +       carry_node *base;
2840 +       carry_node *scan;
2841 +       carry_node *tmp_scan;
2842 +       carry_node *proj;
2843 +
2844 +       base = find_carry_node(doing, node);
2845 +       assert("nikita-2204", base != NULL);
2846 +
2847 +       for_all_nodes(todo, scan, tmp_scan) {
2848 +               proj = find_carry_node(doing, scan->node);
2849 +               assert("nikita-2205", proj != NULL);
2850 +               if (carry_node_cmp(doing, proj, base) != LESS_THAN)
2851 +                       break;
2852 +       }
2853 +       return scan;
2854 +}
2855 +
2856 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
2857 +                                    znode * node)
2858 +{
2859 +       carry_node *reference;
2860 +
2861 +       assert("nikita-2994", doing != NULL);
2862 +       assert("nikita-2995", todo != NULL);
2863 +       assert("nikita-2996", node != NULL);
2864 +
2865 +       reference = insert_carry_node(doing, todo, node);
2866 +       assert("nikita-2997", reference != NULL);
2867 +
2868 +       return reiser4_add_carry(todo, POOLO_BEFORE, reference);
2869 +}
2870 +
2871 +/* like reiser4_post_carry(), but designed to be called from node plugin methods.
2872 +   This function is different from reiser4_post_carry() in that it finds proper
2873 +   place to insert node in the queue. */
2874 +carry_op *node_post_carry(carry_plugin_info * info     /* carry parameters
2875 +                                                        * passed down to node
2876 +                                                        * plugin */ ,
2877 +                         carry_opcode op /* opcode of operation */ ,
2878 +                         znode * node  /* node on which this
2879 +                                        * operation will operate */ ,
2880 +                         int apply_to_parent_p /* whether operation will
2881 +                                                * operate directly on @node
2882 +                                                * or on its parent. */ )
2883 +{
2884 +       carry_op *result;
2885 +       carry_node *child;
2886 +
2887 +       assert("nikita-2207", info != NULL);
2888 +       assert("nikita-2208", info->todo != NULL);
2889 +
2890 +       if (info->doing == NULL)
2891 +               return reiser4_post_carry(info->todo, op, node,
2892 +                                         apply_to_parent_p);
2893 +
2894 +       result = add_op(info->todo, POOLO_LAST, NULL);
2895 +       if (IS_ERR(result))
2896 +               return result;
2897 +       child = add_carry_atplace(info->doing, info->todo, node);
2898 +       if (IS_ERR(child)) {
2899 +               reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
2900 +               return (carry_op *) child;
2901 +       }
2902 +       result->node = child;
2903 +       result->op = op;
2904 +       child->parent = apply_to_parent_p;
2905 +       if (ZF_ISSET(node, JNODE_ORPHAN))
2906 +               child->left_before = 1;
2907 +       child->node = node;
2908 +       return result;
2909 +}
2910 +
2911 +/* lock all carry nodes in @level */
2912 +static int lock_carry_level(carry_level * level /* level to lock */ )
2913 +{
2914 +       int result;
2915 +       carry_node *node;
2916 +       carry_node *tmp_node;
2917 +
2918 +       assert("nikita-881", level != NULL);
2919 +       assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
2920 +
2921 +       /* lock nodes from left to right */
2922 +       result = 0;
2923 +       for_all_nodes(level, node, tmp_node) {
2924 +               result = lock_carry_node(level, node);
2925 +               if (result != 0)
2926 +                       break;
2927 +       }
2928 +       return result;
2929 +}
2930 +
2931 +/* Synchronize delimiting keys between @node and its left neighbor.
2932 +
2933 +   To reduce contention on dk key and simplify carry code, we synchronize
2934 +   delimiting keys only when carry ultimately leaves tree level (carrying
2935 +   changes upward) and unlocks nodes at this level.
2936 +
2937 +   This function first finds left neighbor of @node and then updates left
2938 +   neighbor's right delimiting key to coincide with least key in @node.
2939 +
2940 +*/
2941 +
2942 +ON_DEBUG(extern atomic_t delim_key_version;
2943 +    )
2944 +
2945 +static void sync_dkeys(znode * spot /* node to update */ )
2946 +{
2947 +       reiser4_key pivot;
2948 +       reiser4_tree *tree;
2949 +
2950 +       assert("nikita-1610", spot != NULL);
2951 +       assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
2952 +
2953 +       tree = znode_get_tree(spot);
2954 +       read_lock_tree(tree);
2955 +       write_lock_dk(tree);
2956 +
2957 +       assert("nikita-2192", znode_is_loaded(spot));
2958 +
2959 +       /* sync left delimiting key of @spot with key in its leftmost item */
2960 +       if (node_is_empty(spot))
2961 +               pivot = *znode_get_rd_key(spot);
2962 +       else
2963 +               leftmost_key_in_node(spot, &pivot);
2964 +
2965 +       znode_set_ld_key(spot, &pivot);
2966 +
2967 +       /* there can be sequence of empty nodes pending removal on the left of
2968 +          @spot. Scan them and update their left and right delimiting keys to
2969 +          match left delimiting key of @spot. Also, update right delimiting
2970 +          key of first non-empty left neighbor.
2971 +        */
2972 +       while (1) {
2973 +               if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
2974 +                       break;
2975 +
2976 +               spot = spot->left;
2977 +               if (spot == NULL)
2978 +                       break;
2979 +
2980 +               znode_set_rd_key(spot, &pivot);
2981 +               /* don't sink into the domain of another balancing */
2982 +               if (!znode_is_write_locked(spot))
2983 +                       break;
2984 +               if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
2985 +                       znode_set_ld_key(spot, &pivot);
2986 +               else
2987 +                       break;
2988 +       }
2989 +
2990 +       write_unlock_dk(tree);
2991 +       read_unlock_tree(tree);
2992 +}
2993 +
2994 +/* unlock all carry nodes in @level */
2995 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
2996 +                              int failure      /* true if unlocking owing to
2997 +                                                * failure */ )
2998 +{
2999 +       carry_node *node;
3000 +       carry_node *tmp_node;
3001 +
3002 +       assert("nikita-889", level != NULL);
3003 +
3004 +       if (!failure) {
3005 +               znode *spot;
3006 +
3007 +               spot = NULL;
3008 +               /* update delimiting keys */
3009 +               for_all_nodes(level, node, tmp_node) {
3010 +                       if (reiser4_carry_real(node) != spot) {
3011 +                               spot = reiser4_carry_real(node);
3012 +                               sync_dkeys(spot);
3013 +                       }
3014 +               }
3015 +       }
3016 +
3017 +       /* nodes can be unlocked in arbitrary order.  In preemptible
3018 +          environment it's better to unlock in reverse order of locking,
3019 +          though.
3020 +        */
3021 +       for_all_nodes_back(level, node, tmp_node) {
3022 +               /* all allocated nodes should be already linked to their
3023 +                  parents at this moment. */
3024 +               assert("nikita-1631",
3025 +                      ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3026 +                                               JNODE_ORPHAN)));
3027 +               ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3028 +               unlock_carry_node(level, node, failure);
3029 +       }
3030 +       level->new_root = NULL;
3031 +}
3032 +
3033 +/* finish with @level
3034 +
3035 +   Unlock nodes and release all allocated resources */
3036 +static void done_carry_level(carry_level * level /* level to finish */ )
3037 +{
3038 +       carry_node *node;
3039 +       carry_node *tmp_node;
3040 +       carry_op *op;
3041 +       carry_op *tmp_op;
3042 +
3043 +       assert("nikita-1076", level != NULL);
3044 +
3045 +       unlock_carry_level(level, 0);
3046 +       for_all_nodes(level, node, tmp_node) {
3047 +               assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3048 +               assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3049 +               reiser4_pool_free(&level->pool->node_pool, &node->header);
3050 +       }
3051 +       for_all_ops(level, op, tmp_op)
3052 +           reiser4_pool_free(&level->pool->op_pool, &op->header);
3053 +}
3054 +
3055 +/* helper function to complete locking of carry node
3056 +
3057 +   Finish locking of carry node. There are several ways in which new carry
3058 +   node can be added into carry level and locked. Normal is through
3059 +   lock_carry_node(), but also from find_{left|right}_neighbor(). This
3060 +   function factors out common final part of all locking scenarios. It
3061 +   supposes that @node -> lock_handle is lock handle for lock just taken and
3062 +   fills ->real_node from this lock handle.
3063 +
3064 +*/
3065 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3066 +{
3067 +       assert("nikita-1052", node != NULL);
3068 +       assert("nikita-1187", reiser4_carry_real(node) != NULL);
3069 +       assert("nikita-1188", !node->unlock);
3070 +
3071 +       node->unlock = 1;
3072 +       /* Load node content into memory and install node plugin by
3073 +          looking at the node header.
3074 +
3075 +          Most of the time this call is cheap because the node is
3076 +          already in memory.
3077 +
3078 +          Corresponding zrelse() is in unlock_carry_node()
3079 +        */
3080 +       return zload(reiser4_carry_real(node));
3081 +}
3082 +
3083 +/* lock carry node
3084 +
3085 +   "Resolve" node to real znode, lock it and mark as locked.
3086 +   This requires recursive locking of znodes.
3087 +
3088 +   When operation is posted to the parent level, node it will be applied to is
3089 +   not yet known. For example, when shifting data between two nodes,
3090 +   delimiting key has to be updated in parent or parents of nodes involved. But
3091 +   their parents are not yet locked and, moreover, said nodes can be reparented
3092 +   by concurrent balancing.
3093 +
3094 +   To work around this, carry operation is applied to special "carry node"
3095 +   rather than to the znode itself. Carry node consists of some "base" or
3096 +   "reference" znode and flags indicating how to get to the target of carry
3097 +   operation (->real_node field of carry_node) from base.
3098 +
3099 +*/
3100 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3101 +                   carry_node * node /* node to lock */ )
3102 +{
3103 +       int result;
3104 +       znode *reference_point;
3105 +       lock_handle lh;
3106 +       lock_handle tmp_lh;
3107 +       reiser4_tree *tree;
3108 +
3109 +       assert("nikita-887", level != NULL);
3110 +       assert("nikita-882", node != NULL);
3111 +
3112 +       result = 0;
3113 +       reference_point = node->node;
3114 +       init_lh(&lh);
3115 +       init_lh(&tmp_lh);
3116 +       if (node->left_before) {
3117 +               /* handling of new nodes, allocated on the previous level:
3118 +
3119 +                  some carry ops were probably posted from the new node, but
3120 +                  this node neither has parent pointer set, nor is
3121 +                  connected. This will be done in ->create_hook() for
3122 +                  internal item.
3123 +
3124 +                  Nonetheless, parent of new node has to be locked. To do
3125 +                  this, first go to the "left" in the carry order. This
3126 +                  depends on the decision to always allocate new node on the
3127 +                  right of existing one.
3128 +
3129 +                  Loop handles case when multiple nodes, all orphans, were
3130 +                  inserted.
3131 +
3132 +                  Strictly speaking, taking tree lock is not necessary here,
3133 +                  because all nodes scanned by loop in
3134 +                  find_begetting_brother() are write-locked by this thread,
3135 +                  and thus, their sibling linkage cannot change.
3136 +
3137 +                */
3138 +               tree = znode_get_tree(reference_point);
3139 +               read_lock_tree(tree);
3140 +               reference_point = find_begetting_brother(node, level)->node;
3141 +               read_unlock_tree(tree);
3142 +               assert("nikita-1186", reference_point != NULL);
3143 +       }
3144 +       if (node->parent && (result == 0)) {
3145 +               result =
3146 +                   reiser4_get_parent(&tmp_lh, reference_point,
3147 +                                      ZNODE_WRITE_LOCK);
3148 +               if (result != 0) {
3149 +                       ;       /* nothing */
3150 +               } else if (znode_get_level(tmp_lh.node) == 0) {
3151 +                       assert("nikita-1347", znode_above_root(tmp_lh.node));
3152 +                       result = add_new_root(level, node, tmp_lh.node);
3153 +                       if (result == 0) {
3154 +                               reference_point = level->new_root;
3155 +                               move_lh(&lh, &node->lock_handle);
3156 +                       }
3157 +               } else if ((level->new_root != NULL)
3158 +                          && (level->new_root !=
3159 +                              znode_parent_nolock(reference_point))) {
3160 +                       /* parent of node exists, but this level already
3161 +                          created different new root, so */
3162 +                       warning("nikita-1109",
3163 +                               /* it should be "radicis", but tradition is
3164 +                                  tradition.  do banshees read latin? */
3165 +                               "hodie natus est radici frater");
3166 +                       result = -EIO;
3167 +               } else {
3168 +                       move_lh(&lh, &tmp_lh);
3169 +                       reference_point = lh.node;
3170 +               }
3171 +       }
3172 +       if (node->left && (result == 0)) {
3173 +               assert("nikita-1183", node->parent);
3174 +               assert("nikita-883", reference_point != NULL);
3175 +               result =
3176 +                   reiser4_get_left_neighbor(&tmp_lh, reference_point,
3177 +                                             ZNODE_WRITE_LOCK,
3178 +                                             GN_CAN_USE_UPPER_LEVELS);
3179 +               if (result == 0) {
3180 +                       done_lh(&lh);
3181 +                       move_lh(&lh, &tmp_lh);
3182 +                       reference_point = lh.node;
3183 +               }
3184 +       }
3185 +       if (!node->parent && !node->left && !node->left_before) {
3186 +               result =
3187 +                   longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3188 +                                       ZNODE_LOCK_HIPRI);
3189 +       }
3190 +       if (result == 0) {
3191 +               move_lh(&node->lock_handle, &lh);
3192 +               result = lock_carry_node_tail(node);
3193 +       }
3194 +       done_lh(&tmp_lh);
3195 +       done_lh(&lh);
3196 +       return result;
3197 +}
3198 +
3199 +/* release a lock on &carry_node.
3200 +
3201 +   Release if necessary lock on @node. This operation is the pair of
3202 +   lock_carry_node() and is idempotent: you can call it more than once on the
3203 +   same node.
3204 +
3205 +*/
3206 +static void
3207 +unlock_carry_node(carry_level * level,
3208 +                 carry_node * node /* node to be released */ ,
3209 +                 int failure   /* non-0 if node is unlocked due
3210 +                                * to some error */ )
3211 +{
3212 +       znode *real_node;
3213 +
3214 +       assert("nikita-884", node != NULL);
3215 +
3216 +       real_node = reiser4_carry_real(node);
3217 +       /* pair to zload() in lock_carry_node_tail() */
3218 +       zrelse(real_node);
3219 +       if (node->unlock && (real_node != NULL)) {
3220 +               assert("nikita-899", real_node == node->lock_handle.node);
3221 +               longterm_unlock_znode(&node->lock_handle);
3222 +       }
3223 +       if (failure) {
3224 +               if (node->deallocate && (real_node != NULL)) {
3225 +                       /* free node in bitmap
3226 +
3227 +                          Prepare node for removal. Last zput() will finish
3228 +                          with it.
3229 +                        */
3230 +                       ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3231 +               }
3232 +               if (node->free) {
3233 +                       assert("nikita-2177",
3234 +                              list_empty_careful(&node->lock_handle.locks_link));
3235 +                       assert("nikita-2112",
3236 +                              list_empty_careful(&node->lock_handle.owners_link));
3237 +                       reiser4_pool_free(&level->pool->node_pool,
3238 +                                         &node->header);
3239 +               }
3240 +       }
3241 +}
3242 +
3243 +/* fatal_carry_error() - all-catching error handling function
3244 +
3245 +   It is possible that carry faces unrecoverable error, like inability to
3246 +   insert pointer at the internal level. Our simple solution is just to panic in
3247 +   this situation. More sophisticated things like attempt to remount
3248 +   file-system as read-only can be implemented without much difficulty.
3249 +
3250 +   It is believed, that:
3251 +
3252 +   1. instead of panicking, all current transactions can be aborted, rolling
3253 +   system back to the consistent state.
3254 +
3255 +Umm, if you simply panic without doing anything more at all, then all current
3256 +transactions are aborted and the system is rolled back to a consistent state,
3257 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3258 +precise.  If an internal node is corrupted on disk due to hardware failure,
3259 +then there may be no consistent state that can be rolled back to, so instead
3260 +we should say that it will rollback the transactions, which barring other
3261 +factors means rolling back to a consistent state.
3262 +
3263 +# Nikita: there is a subtle difference between panic and aborting
3264 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3265 +# not using reiser4 (not that we care about such processes), or using other
3266 +# reiser4 mounts (about them we do care) will simply continue to run. With
3267 +# some luck, even application using aborted file system can survive: it will
3268 +# get some error, like EBADF, from each file descriptor on failed file system,
3269 +# but applications that do care about tolerance will cope with this (squid
3270 +# will).
3271 +
3272 +It would be a nice feature though to support rollback without rebooting
3273 +followed by remount, but this can wait for later versions.
3274 +
3275 +   2. once isolated transactions will be implemented it will be possible to
3276 +   roll back offending transaction.
3277 +
3278 +2. is additional code complexity of inconsistent value (it implies that a broken tree should be kept in operation), so we must think about
3279 +it more before deciding if it should be done.  -Hans
3280 +
3281 +*/
3282 +static void fatal_carry_error(carry_level * doing UNUSED_ARG   /* carry level
3283 +                                                                * where
3284 +                                                                * unrecoverable
3285 +                                                                * error
3286 +                                                                * occurred */ ,
3287 +                             int ecode /* error code */ )
3288 +{
3289 +       assert("nikita-1230", doing != NULL);
3290 +       assert("nikita-1231", ecode < 0);
3291 +
3292 +       reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3293 +}
3294 +
3295 +/* add new root to the tree
3296 +
3297 +   This function itself only manages changes in carry structures and delegates
3298 +   all hard work (allocation of znode for new root, changes of parent and
3299 +   sibling pointers) to reiser4_add_tree_root().
3300 +
3301 +   Locking: old tree root is locked by carry at this point. Fake znode is also
3302 +   locked.
3303 +
3304 +*/
3305 +static int add_new_root(carry_level * level    /* carry level in context of which
3306 +                                                * operation is performed */ ,
3307 +                       carry_node * node /* carry node for existing root */ ,
3308 +                       znode * fake    /* "fake" znode already locked by
3309 +                                        * us */ )
3310 +{
3311 +       int result;
3312 +
3313 +       assert("nikita-1104", level != NULL);
3314 +       assert("nikita-1105", node != NULL);
3315 +
3316 +       assert("nikita-1403", znode_is_write_locked(node->node));
3317 +       assert("nikita-1404", znode_is_write_locked(fake));
3318 +
3319 +       /* trying to create new root. */
3320 +       /* @node is root and it's already locked by us. This
3321 +          means that nobody else can be trying to add/remove
3322 +          tree root right now.
3323 +        */
3324 +       if (level->new_root == NULL)
3325 +               level->new_root = reiser4_add_tree_root(node->node, fake);
3326 +       if (!IS_ERR(level->new_root)) {
3327 +               assert("nikita-1210", znode_is_root(level->new_root));
3328 +               node->deallocate = 1;
3329 +               result =
3330 +                   longterm_lock_znode(&node->lock_handle, level->new_root,
3331 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3332 +               if (result == 0)
3333 +                       zput(level->new_root);
3334 +       } else {
3335 +               result = PTR_ERR(level->new_root);
3336 +               level->new_root = NULL;
3337 +       }
3338 +       return result;
3339 +}
3340 +
3341 +/* allocate new znode and add the operation that inserts the
3342 +   pointer to it into the parent node into the todo level
3343 +
3344 +   Allocate new znode, add it into carry queue and post into @todo queue
3345 +   request to add pointer to new node into its parent.
3346 +
3347 +   This is carry related routine that calls reiser4_new_node() to allocate new
3348 +   node.
3349 +*/
3350 +carry_node *add_new_znode(znode * brother      /* existing left neighbor of new
3351 +                                                * node */ ,
3352 +                         carry_node * ref      /* carry node after which new
3353 +                                                * carry node is to be inserted
3354 +                                                * into queue. This affects
3355 +                                                * locking. */ ,
3356 +                         carry_level * doing   /* carry queue where new node is
3357 +                                                * to be added */ ,
3358 +                         carry_level * todo    /* carry queue where COP_INSERT
3359 +                                                * operation to add pointer to
3360 +                                                * new node will ne added */ )
3361 +{
3362 +       carry_node *fresh;
3363 +       znode *new_znode;
3364 +       carry_op *add_pointer;
3365 +       carry_plugin_info info;
3366 +
3367 +       assert("nikita-1048", brother != NULL);
3368 +       assert("nikita-1049", todo != NULL);
3369 +
3370 +       /* There is a lot of possible variations here: to what parent
3371 +          new node will be attached and where. For simplicity, always
3372 +          do the following:
3373 +
3374 +          (1) new node and @brother will have the same parent.
3375 +
3376 +          (2) new node is added on the right of @brother
3377 +
3378 +        */
3379 +
3380 +       fresh = reiser4_add_carry_skip(doing,
3381 +                                      ref ? POOLO_AFTER : POOLO_LAST, ref);
3382 +       if (IS_ERR(fresh))
3383 +               return fresh;
3384 +
3385 +       fresh->deallocate = 1;
3386 +       fresh->free = 1;
3387 +
3388 +       new_znode = reiser4_new_node(brother, znode_get_level(brother));
3389 +       if (IS_ERR(new_znode))
3390 +               /* @fresh will be deallocated automatically by error
3391 +                  handling code in the caller. */
3392 +               return (carry_node *) new_znode;
3393 +
3394 +       /* new_znode returned znode with x_count 1. Caller has to decrease
3395 +          it. make_space() does. */
3396 +
3397 +       ZF_SET(new_znode, JNODE_ORPHAN);
3398 +       fresh->node = new_znode;
3399 +
3400 +       while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3401 +               ref = carry_node_prev(ref);
3402 +               assert("nikita-1606", !carry_node_end(doing, ref));
3403 +       }
3404 +
3405 +       info.todo = todo;
3406 +       info.doing = doing;
3407 +       add_pointer = node_post_carry(&info, COP_INSERT,
3408 +                                     reiser4_carry_real(ref), 1);
3409 +       if (IS_ERR(add_pointer)) {
3410 +               /* no need to deallocate @new_znode here: it will be
3411 +                  deallocated during carry error handling. */
3412 +               return (carry_node *) add_pointer;
3413 +       }
3414 +
3415 +       add_pointer->u.insert.type = COPT_CHILD;
3416 +       add_pointer->u.insert.child = fresh;
3417 +       add_pointer->u.insert.brother = brother;
3418 +       /* initially new node spans an empty key range */
3419 +       write_lock_dk(znode_get_tree(brother));
3420 +       znode_set_ld_key(new_znode,
3421 +                        znode_set_rd_key(new_znode,
3422 +                                         znode_get_rd_key(brother)));
3423 +       write_unlock_dk(znode_get_tree(brother));
3424 +       return fresh;
3425 +}
3426 +
3427 +/* DEBUGGING FUNCTIONS.
3428 +
3429 +   Probably we also should leave them on even when
3430 +   debugging is turned off to print dumps at errors.
3431 +*/
3432 +#if REISER4_DEBUG
3433 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3434 +{
3435 +       carry_node *node;
3436 +       carry_node *tmp_node;
3437 +
3438 +       if (level == NULL)
3439 +               return 0;
3440 +
3441 +       if (level->track_type != 0 &&
3442 +           level->track_type != CARRY_TRACK_NODE &&
3443 +           level->track_type != CARRY_TRACK_CHANGE)
3444 +               return 0;
3445 +
3446 +       /* check that nodes are in ascending order */
3447 +       for_all_nodes(level, node, tmp_node) {
3448 +               znode *left;
3449 +               znode *right;
3450 +
3451 +               reiser4_key lkey;
3452 +               reiser4_key rkey;
3453 +
3454 +               if (node != carry_node_front(level)) {
3455 +                       if (state == CARRY_TODO) {
3456 +                               right = node->node;
3457 +                               left = carry_node_prev(node)->node;
3458 +                       } else {
3459 +                               right = reiser4_carry_real(node);
3460 +                               left = reiser4_carry_real(carry_node_prev(node));
3461 +                       }
3462 +                       if (right == NULL || left == NULL)
3463 +                               continue;
3464 +                       if (node_is_empty(right) || node_is_empty(left))
3465 +                               continue;
3466 +                       if (!keyle(leftmost_key_in_node(left, &lkey),
3467 +                                  leftmost_key_in_node(right, &rkey))) {
3468 +                               warning("", "wrong key order");
3469 +                               return 0;
3470 +                       }
3471 +               }
3472 +       }
3473 +       return 1;
3474 +}
3475 +#endif
3476 +
3477 +/* get symbolic name for boolean */
3478 +static const char *tf(int boolean /* truth value */ )
3479 +{
3480 +       return boolean ? "t" : "f";
3481 +}
3482 +
3483 +/* symbolic name for carry operation; unknown opcodes are formatted as hex */
3484 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
3485 +{
3486 +       switch (op) {
3487 +       case COP_INSERT:
3488 +               return "COP_INSERT";
3489 +       case COP_DELETE:
3490 +               return "COP_DELETE";
3491 +       case COP_CUT:
3492 +               return "COP_CUT";
3493 +       case COP_PASTE:
3494 +               return "COP_PASTE";
3495 +       case COP_UPDATE:
3496 +               return "COP_UPDATE";
3497 +       case COP_EXTENT:
3498 +               return "COP_EXTENT";
3499 +       case COP_INSERT_FLOW:
3500 +               return "COP_INSERT_FLOW";
3501 +       default:{
3502 +                       /* not mt safe; "unknown op: " + 8 hex digits + NUL needs 21 bytes */
3503 +                       static char buf[32];
3504 +
3505 +                       snprintf(buf, sizeof(buf), "unknown op: %x", op);
3506 +                       return buf;
3507 +               }
3508 +       }
3509 +}
3510 +
3511 +/* dump information about carry node */
3512 +static void print_carry(const char *prefix /* prefix to print */ ,
3513 +                       carry_node * node /* node to print */ )
3514 +{
3515 +       if (node == NULL) {
3516 +               printk("%s: null\n", prefix);
3517 +               return;
3518 +       }
3519 +       printk
3520 +           ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
3521 +            prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
3522 +            tf(node->free), tf(node->deallocate));
3523 +}
3524 +
3525 +/* dump information about carry operation */
3526 +static void print_op(const char *prefix /* prefix to print */ ,
3527 +                    carry_op * op /* operation to print */ )
3528 +{
3529 +       if (op == NULL) {
3530 +               printk("%s: null\n", prefix);
3531 +               return;
3532 +       }
3533 +       printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
3534 +       print_carry("\tnode", op->node);
3535 +       switch (op->op) {
3536 +       case COP_INSERT:
3537 +       case COP_PASTE:
3538 +               print_coord("\tcoord",
3539 +                           op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
3540 +               reiser4_print_key("\tkey",
3541 +                                 op->u.insert.d ? op->u.insert.d->key : NULL);
3542 +               print_carry("\tchild", op->u.insert.child);
3543 +               break;
3544 +       case COP_DELETE:
3545 +               print_carry("\tchild", op->u.delete.child);
3546 +               break;
3547 +       case COP_CUT:
3548 +               if (op->u.cut_or_kill.is_cut) {	/* is_cut selects u.cut */
3549 +                       print_coord("\tfrom",
3550 +                                   op->u.cut_or_kill.u.cut->params.from, 0);
3551 +                       print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
3552 +                                   0);
3553 +               } else {
3554 +                       print_coord("\tfrom",
3555 +                                   op->u.cut_or_kill.u.kill->params.from, 0);
3556 +                       print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
3557 +                                   0);
3558 +               }
3559 +               break;
3560 +       case COP_UPDATE:
3561 +               print_carry("\tleft", op->u.update.left);
3562 +               break;
3563 +       default:
3564 +               /* do nothing */
3565 +               break;
3566 +       }
3567 +}
3568 +
3569 +/* dump information about all nodes and operations in a @level */
3570 +static void print_level(const char *prefix /* prefix to print */ ,
3571 +                       carry_level * level /* level to print */ )
3572 +{
3573 +       carry_node *node;
3574 +       carry_node *tmp_node;
3575 +       carry_op *op;
3576 +       carry_op *tmp_op;
3577 +
3578 +       if (level == NULL) {
3579 +               printk("%s: null\n", prefix);
3580 +               return;
3581 +       }
3582 +       printk("%s: %p, restartable: %s\n",
3583 +              prefix, level, tf(level->restartable));
3584 +
3585 +       for_all_nodes(level, node, tmp_node)
3586 +           print_carry("\tcarry node", node);
3587 +       for_all_ops(level, op, tmp_op)
3588 +           print_op("\tcarry op", op);
3589 +}
3590 +
3591 +/* Make Linus happy.
3592 +   Local variables:
3593 +   c-indentation-style: "K&R"
3594 +   mode-name: "LC"
3595 +   c-basic-offset: 8
3596 +   tab-width: 8
3597 +   fill-column: 120
3598 +   scroll-step: 1
3599 +   End:
3600 +*/
3601 diff -urN linux-2.6.27.orig/fs/reiser4/carry.h linux-2.6.27/fs/reiser4/carry.h
3602 --- linux-2.6.27.orig/fs/reiser4/carry.h        1970-01-01 03:00:00.000000000 +0300
3603 +++ linux-2.6.27/fs/reiser4/carry.h     2008-10-12 18:20:00.000000000 +0400
3604 @@ -0,0 +1,442 @@
3605 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
3606 +
3607 +/* Functions and data types to "carry" tree modification(s) upward.
3608 +   See fs/reiser4/carry.c for details. */
3609 +
3610 +#if !defined( __FS_REISER4_CARRY_H__ )
3611 +#define __FS_REISER4_CARRY_H__
3612 +
3613 +#include "forward.h"
3614 +#include "debug.h"
3615 +#include "pool.h"
3616 +#include "znode.h"
3617 +
3618 +#include <linux/types.h>
3619 +
3620 +/* &carry_node - "location" of carry node.
3621 +
3622 +   "location" of node that is involved or going to be involved into
3623 +   carry process. Node where operation will be carried to on the
3624 +   parent level cannot be recorded explicitly. Operation will be carried
3625 +   usually to the parent of some node (where changes are performed at
3626 +   the current level) or, to the left neighbor of its parent. But while
3627 +   modifications are performed at the current level, parent may
3628 +   change. So, we have to allow some indirection (or, positively,
3629 +   flexibility) in locating carry nodes.
3630 +
3631 +*/
3632 +typedef struct carry_node {
3633 +       /* pool linkage */
3634 +       struct reiser4_pool_header header;
3635 +
3636 +       /* base node from which real_node is calculated. See
3637 +          fs/reiser4/carry.c:lock_carry_node(). */
3638 +       znode *node;
3639 +
3640 +       /* how to get ->real_node */
3641 +       /* to get ->real_node obtain parent of ->node */
3642 +       __u32 parent:1;
3643 +       /* to get ->real_node obtain left neighbor of parent of
3644 +          ->node */
3645 +       __u32 left:1;
3646 +       __u32 left_before:1;
3647 +
3648 +       /* locking */
3649 +
3650 +       /* this node was locked by carry process and should be
3651 +          unlocked when carry leaves a level */
3652 +       __u32 unlock:1;
3653 +
3654 +       /* disk block for this node was allocated by carry process and
3655 +          should be deallocated when carry leaves a level */
3656 +       __u32 deallocate:1;
3657 +       /* this carry node was allocated by carry process and should be
3658 +          freed when carry leaves a level */
3659 +       __u32 free:1;
3660 +
3661 +       /* type of lock we want to take on this node */
3662 +       lock_handle lock_handle;
3663 +} carry_node;
3664 +
3665 +/* &carry_opcode - elementary operations that can be carried upward
3666 +
3667 +   Operations that carry() can handle. This list is supposed to be
3668 +   expanded.
3669 +
3670 +   Each carry operation (cop) is handled by appropriate function defined
3671 +   in fs/reiser4/carry.c. For example COP_INSERT is handled by
3672 +   fs/reiser4/carry.c:carry_insert() etc. These functions in turn
3673 +   call plugins of nodes affected by operation to modify nodes' content
3674 +   and to gather operations to be performed on the next level.
3675 +
3676 +*/
3677 +typedef enum {
3678 +       /* insert new item into node. */
3679 +       COP_INSERT,
3680 +       /* delete pointer from parent node */
3681 +       COP_DELETE,
3682 +       /* remove part of or whole node. */
3683 +       COP_CUT,
3684 +       /* increase size of item. */
3685 +       COP_PASTE,
3686 +       /* insert extent (that is sequence of unformatted nodes). */
3687 +       COP_EXTENT,
3688 +       /* update delimiting key in least common ancestor of two
3689 +          nodes. This is performed when items are moved between two
3690 +          nodes.
3691 +        */
3692 +       COP_UPDATE,
3693 +       /* insert flow */
3694 +       COP_INSERT_FLOW,
3695 +       COP_LAST_OP,
3696 +} carry_opcode;
3697 +
3698 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
3699 +
3700 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
3701 +   item is determined. */
3702 +typedef enum {
3703 +       /* target item is one containing pointer to the ->child node */
3704 +       COPT_CHILD,
3705 +       /* target item is given explicitly by @coord */
3706 +       COPT_ITEM_DATA,
3707 +       /* target item is given by key */
3708 +       COPT_KEY,
3709 +       /* see insert_paste_common() for more comments on this. */
3710 +       COPT_PASTE_RESTARTED,
3711 +} cop_insert_pos_type;
3712 +
3713 +/* flags to cut and delete */
3714 +typedef enum {
3715 +       /* don't kill node even if it became completely empty as results of
3716 +        * cut. This is needed for eottl handling. See carry_extent() for
3717 +        * details. */
3718 +       DELETE_RETAIN_EMPTY = (1 << 0)
3719 +} cop_delete_flag;
3720 +
3721 +/*
3722 + * carry() implements "lock handle tracking" feature.
3723 + *
3724 + * Callers supply carry with node where to perform initial operation and lock
3725 + * handle on this node. Trying to optimize node utilization carry may actually
3726 + * move insertion point to different node. Callers expect that lock handle
3727 + * will be transferred to the new node also.
3728 + *
3729 + */
3730 +typedef enum {
3731 +       /* transfer lock handle along with insertion point */
3732 +       CARRY_TRACK_CHANGE = 1,
3733 +       /* acquire new lock handle to the node where insertion point is. This
3734 +        * is used when carry() client doesn't initially possess lock handle
3735 +        * on the insertion point node, for example, by extent insertion
3736 +        * code. See carry_extent(). */
3737 +       CARRY_TRACK_NODE = 2
3738 +} carry_track_type;
3739 +
3740 +/* data supplied to COP_{INSERT|PASTE} by callers */
3741 +typedef struct carry_insert_data {
3742 +       /* position where new item is to be inserted */
3743 +       coord_t *coord;
3744 +       /* new item description */
3745 +       reiser4_item_data *data;
3746 +       /* key of new item */
3747 +       const reiser4_key *key;
3748 +} carry_insert_data;
3749 +
3750 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
3751 +struct cut_kill_params {
3752 +       /* coord where cut starts (inclusive) */
3753 +       coord_t *from;
3754 +       /* coord where cut stops (inclusive, this item/unit will also be
3755 +        * cut) */
3756 +       coord_t *to;
3757 +       /* starting key. This is necessary when item and unit pos don't
3758 +        * uniquely identify what portion of tree to remove. For example, this
3759 +        * indicates what portion of extent unit will be affected. */
3760 +       const reiser4_key *from_key;
3761 +       /* exclusive stop key */
3762 +       const reiser4_key *to_key;
3763 +       /* if this is not NULL, smallest actually removed key is stored
3764 +        * here. */
3765 +       reiser4_key *smallest_removed;
3766 +       /* kill_node_content()  is called for file truncate */
3767 +       int truncate;
3768 +};
3769 +
3770 +struct carry_cut_data {
3771 +       struct cut_kill_params params;
3772 +};
3773 +
3774 +struct carry_kill_data {
3775 +       struct cut_kill_params params;
3776 +       /* parameter to be passed to the ->kill_hook() method of item
3777 +        * plugin (the field itself is commented out below) */
3778 +       /*void *iplug_params; *//* FIXME: unused currently */
3779 +       /* if not NULL---inode whose items are being removed. This is needed
3780 +        * for ->kill_hook() of extent item to update VM structures when
3781 +        * removing pages. */
3782 +       struct inode *inode;
3783 +       /* sibling list maintenance is complicated by existence of eottl. When
3784 +        * eottl whose left and right neighbors are formatted leaves is
3785 +        * removed, one has to connect said leaves in the sibling list. This
3786 +        * cannot be done when extent removal is just started as locking rules
3787 +        * require sibling list update to happen atomically with removal of
3788 +        * extent item. Therefore: 1. pointers to left and right neighbors
3789 +        * have to be passed down to the ->kill_hook() of extent item, and
3790 +        * 2. said neighbors have to be locked. */
3791 +       lock_handle *left;
3792 +       lock_handle *right;
3793 +       /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
3794 +       unsigned flags;
3795 +       char *buf; /* NOTE(review): undocumented; purpose is not visible in this header */
3796 +};
3797 +
3798 +/* &carry_op - operation to "carry" upward.
3799 +
3800 +   Description of an operation we want to "carry" to the upper level of
3801 +   a tree: e.g, when we insert something and there is not enough space
3802 +   we allocate a new node and "carry" the operation of inserting a
3803 +   pointer to the new node to the upper level, on removal of empty node,
3804 +   we carry up operation of removing appropriate entry from parent.
3805 +
3806 +   There are two types of carry ops: when adding or deleting a node, the
3807 +   node at the parent level where appropriate modification has to be
3808 +   performed is known in advance. When shifting items between nodes
3809 +   (split, merge), delimiting key should be changed in the least common
3810 +   parent of the nodes involved, which is not known in advance.
3811 +
3812 +   For the operations of the first type we store in &carry_op pointer to
3813 +   the &carry_node at the parent level. For the operation of the second
3814 +   type we store &carry_node of parents of the left and right nodes
3815 +   modified and keep track of them upward until they coincide.
3816 +
3817 +*/
3818 +typedef struct carry_op {
3819 +       /* pool linkage */
3820 +       struct reiser4_pool_header header;
3821 +       carry_opcode op;
3822 +       /* node on which operation is to be performed:
3823 +
3824 +          for insert, paste: node where new item is to be inserted
3825 +
3826 +          for delete: node where pointer is to be deleted
3827 +
3828 +          for cut: node to cut from
3829 +
3830 +          for update: node where delimiting key is to be modified
3831 +
3832 +          for modify: parent of modified node
3833 +
3834 +        */
3835 +       carry_node *node;
3836 +       union {
3837 +               struct {
3838 +                       /* (sub-)type of insertion/paste. Taken from
3839 +                          cop_insert_pos_type. */
3840 +                       __u8 type;
3841 +                       /* various operation flags. Taken from
3842 +                          cop_insert_flag. */
3843 +                       __u8 flags;
3844 +                       carry_insert_data *d;
3845 +                       carry_node *child;
3846 +                       znode *brother;
3847 +               } insert, paste, extent;
3848 +
3849 +               struct {
3850 +                       int is_cut;
3851 +                       union {
3852 +                               carry_kill_data *kill;
3853 +                               carry_cut_data *cut;
3854 +                       } u;
3855 +               } cut_or_kill;
3856 +
3857 +               struct {
3858 +                       carry_node *left;
3859 +               } update;
3860 +               struct {
3861 +                       /* changed child */
3862 +                       carry_node *child;
3863 +                       /* bitmask of changes. See &cop_modify_flag */
3864 +                       __u32 flag;
3865 +               } modify;
3866 +               struct {
3867 +                       /* flags to deletion operation. Are taken from
3868 +                          cop_delete_flag */
3869 +                       __u32 flags;
3870 +                       /* child to delete from parent. If this is
3871 +                          NULL, delete op->node.  */
3872 +                       carry_node *child;
3873 +               } delete;
3874 +               struct {
3875 +                       /* various operation flags. Taken from
3876 +                          cop_insert_flag. */
3877 +                       __u32 flags;
3878 +                       flow_t *flow;
3879 +                       coord_t *insert_point;
3880 +                       reiser4_item_data *data;
3881 +                       /* flow insertion is limited by number of new blocks
3882 +                          added in that operation which do not get any data
3883 +                          but part of flow. This limit is set by macro
3884 +                          CARRY_FLOW_NEW_NODES_LIMIT. This field stores number
3885 +                          of nodes added already during one carry_flow */
3886 +                       int new_nodes;
3887 +               } insert_flow;
3888 +       } u;
3889 +} carry_op;
3890 +
3891 +/* &carry_pool - preallocated pool of carry operations and carry nodes */
3892 +typedef struct carry_pool {
3893 +       carry_op op[CARRIES_POOL_SIZE];
3894 +       struct reiser4_pool op_pool;
3895 +       carry_node node[NODES_LOCKED_POOL_SIZE];
3896 +       struct reiser4_pool node_pool;
3897 +} carry_pool;
3898 +
3899 +/* &carry_level - carry process on given level
3900 +
3901 +   Description of balancing process on the given level.
3902 +
3903 +   No need for locking here, as carry_level is essentially per
3904 +   thread thing (for now).
3905 +
3906 +*/
3907 +struct carry_level {
3908 +       /* this level may be restarted */
3909 +       __u32 restartable:1;
3910 +       /* list of carry nodes on this level, ordered by key order */
3911 +       struct list_head nodes;
3912 +       struct list_head ops; /* list of carry operations on this level */
3913 +       /* pool where new objects are allocated from */
3914 +       carry_pool *pool;
3915 +       int ops_num;
3916 +       int nodes_num;
3917 +       /* new root created on this level, if any */
3918 +       znode *new_root;
3919 +       /* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
3920 +          when they want ->tracked to automagically wander to the node where
3921 +          insertion point moved after insert or paste.
3922 +        */
3923 +       carry_track_type track_type;
3924 +       /* lock handle supplied by user that we are tracking. See
3925 +          above. */
3926 +       lock_handle *tracked;
3927 +};
3928 +
3929 +/* information carry passes to plugin methods that may add new operations to
3930 +   the @todo queue */
3931 +struct carry_plugin_info {
3932 +       carry_level *doing;
3933 +       carry_level *todo;
3934 +};
3935 +
3936 +int reiser4_carry(carry_level * doing, carry_level * done);
3937 +
3938 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
3939 +                             carry_node * reference);
3940 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
3941 +                                  carry_node * reference);
3942 +
3943 +extern carry_node *insert_carry_node(carry_level * doing,
3944 +                                    carry_level * todo, const znode * node);
3945 +
3946 +extern carry_pool *init_carry_pool(int);
3947 +extern void done_carry_pool(carry_pool * pool);
3948 +
3949 +extern void init_carry_level(carry_level * level, carry_pool * pool);
3950 +
3951 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
3952 +                                   znode * node, int apply_to_parent);
3953 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
3954 +                                znode * node, int apply_to_parent_p);
3955 +
3956 +carry_node *add_new_znode(znode * brother, carry_node * reference,
3957 +                         carry_level * doing, carry_level * todo);
3958 +
3959 +carry_node *find_carry_node(carry_level * level, const znode * node);
3960 +
3961 +extern znode *reiser4_carry_real(const carry_node * node);
3962 +
3963 +/* helper macros to iterate over carry queues */
3964 +/* carry node linked after @node in its level list */
3965 +#define carry_node_next( node )                                        \
3966 +       list_entry((node)->header.level_linkage.next, carry_node,       \
3967 +                  header.level_linkage)
3968 +/* carry node linked before @node in its level list */
3969 +#define carry_node_prev( node )                                        \
3970 +       list_entry((node)->header.level_linkage.prev, carry_node,       \
3971 +                  header.level_linkage)
3972 +/* entry at the front of the node list of @level */
3973 +#define carry_node_front( level )                                              \
3974 +       list_entry((level)->nodes.next, carry_node, header.level_linkage)
3975 +/* entry at the back of the node list of @level */
3976 +#define carry_node_back( level )                                               \
3977 +       list_entry((level)->nodes.prev, carry_node, header.level_linkage)
3978 +/* true when linkage of @node is the list head of @level: iteration is over */
3979 +#define carry_node_end( level, node )                          \
3980 +       (&(level)->nodes == &(node)->header.level_linkage)
3981 +
3982 +/* macro to iterate over all operations in a @level; @op may be removed from the level inside the loop body */
3983 +#define for_all_ops( level /* carry level (of type carry_level *) */,                  \
3984 +                    op    /* pointer to carry operation, modified by loop (of          \
3985 +                           * type carry_op *) */,                                      \
3986 +                    tmp   /* pointer to carry operation (of type carry_op *),          \
3987 +                           * used to make iterator stable in the face of               \
3988 +                           * deletions from the level */ )                             \
3989 +for (op = list_entry((level)->ops.next, carry_op, header.level_linkage),               \
3990 +     tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage);        \
3991 +     &(op)->header.level_linkage != &(level)->ops;                                     \
3992 +     op = (tmp),                                                                       \
3993 +     tmp = list_entry((op)->header.level_linkage.next, carry_op, header.level_linkage))
3994 +
3995 +#if 0
3996 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ),               \
3997 +     tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ;              \
3998 +     ! pool_level_list_end( &level -> ops, &op -> header ) ;                   \
3999 +     op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4000 +#endif
4001 +
4002 +/* macro to iterate over all nodes in a @level */                                              \
4003 +#define for_all_nodes( level /* carry level (of type carry_level *) */,                                \
4004 +                      node  /* pointer to carry node, modified by loop (of                     \
4005 +                             * type carry_node *) */,                                          \
4006 +                      tmp   /* pointer to carry node (of type carry_node *),                   \
4007 +                             * used to make iterator stable in the face of *                   \
4008 +                             * deletions from the level */ )                                   \
4009 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage),                   \
4010 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);      \
4011 +     &node->header.level_linkage != &level->nodes;                                             \
4012 +     node = tmp,                                                                               \
4013 +     tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4014 +
4015 +#if 0
4016 +for( node = carry_node_front( level ),                                         \
4017 +     tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ;         \
4018 +     node = tmp, tmp = carry_node_next( node ) )
4019 +#endif
4020 +
4021 +/* macro to iterate over all nodes in a @level in reverse order
4022 +
4023 +   This is used because nodes are unlocked in the reverse order of locking */
4024 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */,   \
4025 +                           node  /* pointer to carry node, modified by loop    \
4026 +                                  * (of type carry_node *) */,                 \
4027 +                           tmp   /* pointer to carry node (of type carry_node  \
4028 +                                  * *), used to make iterator stable in the    \
4029 +                                  * face of deletions from the level */ )      \
4030 +for( node = carry_node_back( level ),          \
4031 +     tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ;         \
4032 +     node = tmp, tmp = carry_node_prev( node ) )
4033 +
4034 +/* __FS_REISER4_CARRY_H__ */
4035 +#endif
4036 +
4037 +/* Make Linus happy.
4038 +   Local variables:
4039 +   c-indentation-style: "K&R"
4040 +   mode-name: "LC"
4041 +   c-basic-offset: 8
4042 +   tab-width: 8
4043 +   fill-column: 120
4044 +   scroll-step: 1
4045 +   End:
4046 +*/
4047 diff -urN linux-2.6.27.orig/fs/reiser4/carry_ops.c linux-2.6.27/fs/reiser4/carry_ops.c
4048 --- linux-2.6.27.orig/fs/reiser4/carry_ops.c    1970-01-01 03:00:00.000000000 +0300
4049 +++ linux-2.6.27/fs/reiser4/carry_ops.c 2008-10-12 18:20:00.000000000 +0400
4050 @@ -0,0 +1,2131 @@
4051 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4052 +
4053 +/* implementation of carry operations */
4054 +
4055 +#include "forward.h"
4056 +#include "debug.h"
4057 +#include "key.h"
4058 +#include "coord.h"
4059 +#include "plugin/item/item.h"
4060 +#include "plugin/node/node.h"
4061 +#include "jnode.h"
4062 +#include "znode.h"
4063 +#include "block_alloc.h"
4064 +#include "tree_walk.h"
4065 +#include "pool.h"
4066 +#include "tree_mod.h"
4067 +#include "carry.h"
4068 +#include "carry_ops.h"
4069 +#include "tree.h"
4070 +#include "super.h"
4071 +#include "reiser4.h"
4072 +
4073 +#include <linux/types.h>
4074 +#include <linux/err.h>
4075 +
4076 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4077 +                           carry_level * doing, carry_level * todo,
4078 +                           unsigned int including_insert_coord_p);
4079 +
4080 +extern int lock_carry_node(carry_level * level, carry_node * node);
4081 +extern int lock_carry_node_tail(carry_node * node);
4082 +
4083 +/* find left neighbor of a carry node
4084 +
4085 +   Look for left neighbor of op->node and add it to the @doing queue. See
4086 +   comments in the body.
4087 +   Returns carry node of the neighbor, NULL when the neighbor is absent or
4088 +   ignored, or ERR_PTR() (-E_REPEAT when restartable @doing must restart). */
4089 +static carry_node *find_left_neighbor(carry_op * op    /* node to find left
4090 +                                                        * neighbor of */ ,
4091 +                                     carry_level * doing /* level to scan */ )
4092 +{
4093 +       int result;
4094 +       carry_node *node;
4095 +       carry_node *left;
4096 +       int flags;
4097 +       reiser4_tree *tree;
4098 +
4099 +       node = op->node;
4100 +
4101 +       tree = current_tree;
4102 +       read_lock_tree(tree);
4103 +       /* first, check whether left neighbor is already in a @doing queue */
4104 +       if (reiser4_carry_real(node)->left != NULL) {
4105 +               /* NOTE: there is locking subtlety here. Look into
4106 +                * find_right_neighbor() for more info */
4107 +               if (find_carry_node(doing,
4108 +                                   reiser4_carry_real(node)->left) != NULL) {
4109 +                       read_unlock_tree(tree);
4110 +                       left = node;
4111 +                       do {
4112 +                               left = list_entry(left->header.level_linkage.prev,
4113 +                                                 carry_node, header.level_linkage);
4114 +                               assert("nikita-3408", !carry_node_end(doing,
4115 +                                                                     left));
4116 +                       } while (reiser4_carry_real(left) ==
4117 +                                reiser4_carry_real(node));
4118 +                       return left;
4119 +               }
4120 +       }
4121 +       read_unlock_tree(tree);
4122 +
4123 +       left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4124 +       if (IS_ERR(left))
4125 +               return left;
4126 +
4127 +       left->node = node->node;
4128 +       left->free = 1;
4129 +       /* add GN_NO_ALLOC unless COPI_LOAD_LEFT is set ("!" binds tighter than "&") */
4130 +       flags = GN_TRY_LOCK;
4131 +       if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4132 +               flags |= GN_NO_ALLOC;
4133 +
4134 +       /* then, feeling lucky, peek left neighbor in the cache. */
4135 +       result = reiser4_get_left_neighbor(&left->lock_handle,
4136 +                                          reiser4_carry_real(node),
4137 +                                          ZNODE_WRITE_LOCK, flags);
4138 +       if (result == 0) {
4139 +               /* ok, node found and locked. */
4140 +               result = lock_carry_node_tail(left);
4141 +               if (result != 0)
4142 +                       left = ERR_PTR(result);
4143 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4144 +               /* node is leftmost node in a tree, or neighbor wasn't in
4145 +                  cache, or there is an extent on the left. */
4146 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4147 +               left = NULL;
4148 +       } else if (doing->restartable) {
4149 +               /* if left neighbor is locked, and level is restartable, add
4150 +                  new node to @doing and restart. */
4151 +               assert("nikita-913", node->parent != 0);
4152 +               assert("nikita-914", node->node != NULL);
4153 +               left->left = 1;
4154 +               left->free = 0;
4155 +               left = ERR_PTR(-E_REPEAT);
4156 +       } else {
4157 +               /* left neighbor is locked, level cannot be restarted. Just
4158 +                  ignore left neighbor. */
4159 +               reiser4_pool_free(&doing->pool->node_pool, &left->header);
4160 +               left = NULL;
4161 +       }
4162 +       return left;
4163 +}
4164 +
4165 +/* find right neighbor of a carry node
4166 +
4167 +   Look for right neighbor of op->node and add it to the @doing queue. See
4168 +   comments in the body.
4169 +   Returns carry node of the neighbor, NULL when there is no usable
4170 +   neighbor (-E_NO_NEIGHBOR / -ENOENT), or ERR_PTR() on other failures. */
4171 +static carry_node *find_right_neighbor(carry_op * op   /* node to find right
4172 +                                                        * neighbor of */ ,
4173 +                                      carry_level * doing /* level to scan */ )
4174 +{
4175 +       int result;
4176 +       carry_node *node;
4177 +       carry_node *right;
4178 +       lock_handle lh;
4179 +       int flags;
4180 +       reiser4_tree *tree;
4181 +
4182 +       init_lh(&lh);
4183 +
4184 +       node = op->node;
4185 +
4186 +       tree = current_tree;
4187 +       read_lock_tree(tree);
4188 +       /* first, check whether right neighbor is already in a @doing queue */
4189 +       if (reiser4_carry_real(node)->right != NULL) {
4190 +               /*
4191 +                * Tree lock is taken here anyway, because, even if _outcome_
4192 +                * of (find_carry_node() != NULL) doesn't depend on
4193 +                * concurrent updates to ->right, find_carry_node() cannot
4194 +                * work with second argument NULL. Hence, following comment is
4195 +                * of historic importance only.
4196 +                *
4197 +                * Subtle:
4198 +                *
4199 +                * Q: why don't we need tree lock here, looking for the right
4200 +                * neighbor?
4201 +                *
4202 +                * A: even if value of node->real_node->right were changed
4203 +                * during find_carry_node() execution, outcome of execution
4204 +                * wouldn't change, because (in short) other thread cannot add
4205 +                * elements to the @doing, and if node->real_node->right
4206 +                * already was in @doing, value of node->real_node->right
4207 +                * couldn't change, because node cannot be inserted between
4208 +                * locked neighbors.
4209 +                */
4210 +               if (find_carry_node(doing,
4211 +                                   reiser4_carry_real(node)->right) != NULL) {
4212 +                       read_unlock_tree(tree);
4213 +                       /*
4214 +                        * What we are doing here (this is also applicable to
4215 +                        * the find_left_neighbor()).
4216 +                        *
4217 +                        * tree_walk.c code requires that insertion of a
4218 +                        * pointer to a child, modification of parent pointer
4219 +                        * in the child, and insertion of the child into
4220 +                        * sibling list are atomic (see
4221 +                        * plugin/item/internal.c:create_hook_internal()).
4222 +                        *
4223 +                        * carry allocates new node long before pointer to it
4224 +                        * is inserted into parent and, actually, long before
4225 +                        * parent is even known. Such allocated-but-orphaned
4226 +                        * nodes are only trackable through carry level lists.
4227 +                        *
4228 +                        * Situation that is handled here is following: @node
4229 +                        * has valid ->right pointer, but there is
4230 +                        * allocated-but-orphaned node in the carry queue that
4231 +                        * is logically between @node and @node->right. Here
4232 +                        * we are searching for it. Critical point is that
4233 +                        * this is only possible if @node->right is also in
4234 +                        * the carry queue (this is checked above), because
4235 +                        * this is the only way new orphaned node could be
4236 +                        * inserted between them (before inserting new node,
4237 +                        * make_space() first tries to shift to the right, so,
4238 +                        * right neighbor will be locked and queued).
4239 +                        *
4240 +                        */
4241 +                       right = node;
4242 +                       do {
4243 +                               right = list_entry(right->header.level_linkage.next,
4244 +                                                  carry_node, header.level_linkage);
4245 +                               assert("nikita-3408", !carry_node_end(doing,
4246 +                                                                     right));
4247 +                       } while (reiser4_carry_real(right) ==
4248 +                                reiser4_carry_real(node));
4249 +                       return right;
4250 +               }
4251 +       }
4252 +       read_unlock_tree(tree);
4253 +       /* NOTE(review): GN_NO_ALLOC replaces (is not ORed into) the default flags below -- confirm intended */
4254 +       flags = GN_CAN_USE_UPPER_LEVELS;
4255 +       if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4256 +               flags = GN_NO_ALLOC;
4257 +
4258 +       /* then, try to lock right neighbor */
4259 +       init_lh(&lh);
4260 +       result = reiser4_get_right_neighbor(&lh,
4261 +                                           reiser4_carry_real(node),
4262 +                                           ZNODE_WRITE_LOCK, flags);
4263 +       if (result == 0) {
4264 +               /* ok, node found and locked. */
4265 +               right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4266 +               if (!IS_ERR(right)) {
4267 +                       right->node = lh.node;
4268 +                       move_lh(&right->lock_handle, &lh);
4269 +                       right->free = 1;
4270 +                       result = lock_carry_node_tail(right);
4271 +                       if (result != 0)
4272 +                               right = ERR_PTR(result);
4273 +               }
4274 +       } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4275 +               /* node is rightmost node in a tree, or neighbor wasn't in
4276 +                  cache, or there is an extent on the right. */
4277 +               right = NULL;
4278 +       } else
4279 +               right = ERR_PTR(result);
4280 +       done_lh(&lh);
4281 +       return right;
4282 +}
4283 +
4284 +/* how much free space in a @node is needed for @op
4285 +
4286 +   How much space in @node is required for completion of @op, where @op is
4287 +   insert or paste operation. COP_INSERT is estimated as an insertion with no
4288 +   coord, COP_PASTE as a paste at op->u.insert.d->coord. */
4289 +static unsigned int space_needed_for_op(znode * node   /* znode data are
4290 +                                                        * inserted or
4291 +                                                        * pasted in */ ,
4292 +                                       carry_op * op   /* carry
4293 +                                                          operation */ )
4294 +{
4295 +       assert("nikita-919", op != NULL);
4296 +       /* NOTE(review): no break after impossible(); if it returns, COP_INSERT is used -- confirm */
4297 +       switch (op->op) {
4298 +       default:
4299 +               impossible("nikita-1701", "Wrong opcode");
4300 +       case COP_INSERT:
4301 +               return space_needed(node, NULL, op->u.insert.d->data, 1);
4302 +       case COP_PASTE:
4303 +               return space_needed(node, op->u.insert.d->coord,
4304 +                                   op->u.insert.d->data, 0);
4305 +       }
4306 +}
4307 +
4308 +/* how much space in @node is required to insert or paste @data at
4309 +   @coord (@coord must be NULL when @insertion is non-0). */
4310 +unsigned int space_needed(const znode * node   /* node data are inserted or
4311 +                                                * pasted in */ ,
4312 +                         const coord_t * coord /* coord where data are
4313 +                                                * inserted or pasted
4314 +                                                * at */ ,
4315 +                         const reiser4_item_data * data        /* data to insert or
4316 +                                                                * paste */ ,
4317 +                         int insertion /* non-0: inserting, 0: pasting */ )
4318 +{
4319 +       int result;
4320 +       item_plugin *iplug;
4321 +
4322 +       assert("nikita-917", node != NULL);
4323 +       assert("nikita-918", node_plugin_by_node(node) != NULL);
4324 +       assert("vs-230", !insertion || (coord == NULL));
4325 +
4326 +       result = 0;
4327 +       iplug = data->iplug;
4328 +       if (iplug->b.estimate != NULL) {
4329 +               /* ask item plugin how much space is needed to insert this
4330 +                  item */
4331 +               result += iplug->b.estimate(insertion ? NULL : coord, data);
4332 +       } else {
4333 +               /* reasonable default */
4334 +               result += data->length;
4335 +       }
4336 +       if (insertion) {
4337 +               node_plugin *nplug;
4338 +
4339 +               nplug = node->nplug;
4340 +               /* and add node overhead */
4341 +               if (nplug->item_overhead != NULL) {
4342 +                       result += nplug->item_overhead(node, NULL);
4343 +               }
4344 +       }
4345 +       return result;
4346 +}
4347 +
4348 +/* find &coord in parent where pointer to new child is to be stored. */
4349 +static int find_new_child_coord(carry_op * op  /* COP_INSERT carry operation to
4350 +                                                * insert pointer to new
4351 +                                                * child */ )
4352 +{
4353 +       int result;
4354 +       znode *node;
4355 +       znode *child;
4356 +
4357 +       assert("nikita-941", op != NULL);
4358 +       assert("nikita-942", op->op == COP_INSERT);
4359 +
4360 +       node = reiser4_carry_real(op->node);
4361 +       assert("nikita-943", node != NULL);
4362 +       assert("nikita-944", node_plugin_by_node(node) != NULL);
4363 +
4364 +       child = reiser4_carry_real(op->u.insert.child);
4365 +       result =
4366 +           find_new_child_ptr(node, child, op->u.insert.brother,
4367 +                              op->u.insert.d->coord);
4368 +       /* item data is built whatever @result is; @result is returned unchanged */
4369 +       build_child_ptr_data(child, op->u.insert.d->data);
4370 +       return result;
4371 +}
4372 +
4373 +/* additional amount of free space in @node required to complete @op */
4374 +static int free_space_shortage(znode * node /* node to check */ ,
4375 +                              carry_op * op /* operation being performed */ )
4376 +{
4377 +       assert("nikita-1061", node != NULL);
4378 +       assert("nikita-1062", op != NULL);
4379 +       /* result > 0: space is short; result <= 0: make_space() treats it as enough */
4380 +       switch (op->op) {
4381 +       default:
4382 +               impossible("nikita-1702", "Wrong opcode");
4383 +       case COP_INSERT:
4384 +       case COP_PASTE:
4385 +               return space_needed_for_op(node, op) - znode_free_space(node);
4386 +       case COP_EXTENT:
4387 +               /* when inserting extent shift data around until insertion
4388 +                  point is utmost in the node. */
4389 +               if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4390 +                       return +1;
4391 +               else
4392 +                       return -1;
4393 +       }
4394 +}
4395 +
4396 +/* helper function: point @op at @target if a shift moved the insertion
4397 +   coord out of op->node. Returns the znode of the insertion coord. */
4398 +static znode *sync_op(carry_op * op, carry_node * target)
4399 +{
4400 +       znode *insertion_node;
4401 +
4402 +       /* reget node from coord: shift might move insertion coord to
4403 +          the neighbor */
4404 +       insertion_node = op->u.insert.d->coord->node;
4405 +       /* if insertion point was actually moved into new node,
4406 +          update carry node pointer in operation. */
4407 +       if (insertion_node != reiser4_carry_real(op->node)) {
4408 +               op->node = target;
4409 +               assert("nikita-2540",
4410 +                      reiser4_carry_real(target) == insertion_node);
4411 +       }
4412 +       assert("nikita-2541",
4413 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4414 +       return insertion_node;
4415 +}
4416 +
4417 +/*
4418 + * complete make_space() call: re-lock doing->tracked on the insertion node if
4419 + * tracking asks for it (see fs/reiser4/carry.h:carry_track_type); returns 0 or lock result
4420 + */
4421 +static int
4422 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4423 +{
4424 +       int result;
4425 +       carry_track_type tracking;
4426 +       znode *node;
4427 +
4428 +       tracking = doing->track_type;
4429 +       node = op->u.insert.d->coord->node;
4430 +
4431 +       if (tracking == CARRY_TRACK_NODE ||
4432 +           (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4433 +               /* inserting or pasting into node different from
4434 +                  original. Update lock handle supplied by caller. */
4435 +               assert("nikita-1417", doing->tracked != NULL);
4436 +               done_lh(doing->tracked);
4437 +               init_lh(doing->tracked);
4438 +               result = longterm_lock_znode(doing->tracked, node,
4439 +                                            ZNODE_WRITE_LOCK,
4440 +                                            ZNODE_LOCK_HIPRI);
4441 +       } else
4442 +               result = 0;
4443 +       return result;
4444 +}
4445 +
4446 +/* This is insertion policy function. It shifts data to the left and right
4447 +   neighbors of insertion coord and allocates new nodes (at most two) until
4448 +   there is enough free space to complete @op.
4449 +
4450 +   See comments in the body. Returns 0 on success, -E_REPEAT when the left
4451 +   neighbor lookup requests a restart, -E_NODE_FULL when space is still short,
4452 +   or another error from the helpers it calls. Assumes that the node format
4453 +   favors insertions at the right end of the node as node40 does.
4454 +
4455 +   See carry_flow() for details about flow insertion.
4456 +*/
4457 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4458 +                     carry_level * doing /* current carry queue */ ,
4459 +                     carry_level * todo /* carry queue on the parent level */ )
4460 +{
4461 +       znode *node;
4462 +       int result;
4463 +       int not_enough_space;
4464 +       int blk_alloc;
4465 +       znode *orig_node;
4466 +       __u32 flags;
4467 +
4468 +       coord_t *coord;
4469 +
4470 +       assert("nikita-890", op != NULL);
4471 +       assert("nikita-891", todo != NULL);
4472 +       assert("nikita-892",
4473 +              op->op == COP_INSERT ||
4474 +              op->op == COP_PASTE || op->op == COP_EXTENT);
4475 +       assert("nikita-1607",
4476 +              reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4477 +
4478 +       flags = op->u.insert.flags;
4479 +
4480 +       /* NOTE check that new node can only be allocated after checking left
4481 +        * and right neighbors. This is necessary for proper work of
4482 +        * find_{left,right}_neighbor(). */
4483 +       assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
4484 +                                  flags & COPI_DONT_SHIFT_LEFT));
4485 +       assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
4486 +                                  flags & COPI_DONT_SHIFT_RIGHT));
4487 +
4488 +       coord = op->u.insert.d->coord;
4489 +       orig_node = node = coord->node;
4490 +
4491 +       assert("nikita-908", node != NULL);
4492 +       assert("nikita-909", node_plugin_by_node(node) != NULL);
4493 +
4494 +       result = 0;
4495 +       /* If there is not enough space in a node, try to shift something to
4496 +          the left neighbor. This is a bit tricky, as locking to the left is
4497 +          low priority. This is handled by restart logic in carry().
4498 +        */
4499 +       not_enough_space = free_space_shortage(node, op);
4500 +       if (not_enough_space <= 0)
4501 +               /* it is possible that carry was called when there actually
4502 +                  was enough space in the node. For example, when inserting
4503 +                  leftmost item so that delimiting keys have to be updated.
4504 +                */
4505 +               return make_space_tail(op, doing, orig_node);
4506 +       if (!(flags & COPI_DONT_SHIFT_LEFT)) {
4507 +               carry_node *left;
4508 +               /* obtain left neighbor as a carry node; the result may be
4509 +                  NULL or an ERR_PTR, both are handled below */
4510 +               left = find_left_neighbor(op, doing);
4511 +               if (unlikely(IS_ERR(left))) {
4512 +                       if (PTR_ERR(left) == -E_REPEAT)
4513 +                               return -E_REPEAT;
4514 +                       else {
4515 +                               /* some error other than restart request
4516 +                                  occurred. This shouldn't happen. Issue a
4517 +                                  warning and continue as if left neighbor
4518 +                                  didn't exist.
4519 +                                */
4520 +                               warning("nikita-924",
4521 +                                       "Error accessing left neighbor: %li",
4522 +                                       PTR_ERR(left));
4523 +                       }
4524 +               } else if (left != NULL) {
4525 +
4526 +                       /* shift everything possible on the left of and
4527 +                          including insertion coord into the left neighbor */
4528 +                       result = carry_shift_data(LEFT_SIDE, coord,
4529 +                                                 reiser4_carry_real(left),
4530 +                                                 doing, todo,
4531 +                                                 flags & COPI_GO_LEFT);
4532 +
4533 +                       /* reget node from coord: shift_left() might move
4534 +                          insertion coord to the left neighbor */
4535 +                       node = sync_op(op, left);
4536 +
4537 +                       not_enough_space = free_space_shortage(node, op);
4538 +                       /* There is not enough free space in @node, but
4539 +                          maybe there is enough free space in
4540 +                          @left. Various balancing decisions are valid here.
4541 +                          The same holds for shifting to the right.
4542 +                        */
4543 +               }
4544 +       }
4545 +       /* If there still is not enough space, shift to the right */
4546 +       if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
4547 +               carry_node *right;
4548 +
4549 +               right = find_right_neighbor(op, doing);
4550 +               if (IS_ERR(right)) {
4551 +                       warning("nikita-1065",
4552 +                               "Error accessing right neighbor: %li",
4553 +                               PTR_ERR(right));
4554 +               } else if (right != NULL) {
4555 +                       /* node containing insertion point, and its right
4556 +                          neighbor node are write locked by now.
4557 +
4558 +                          shift everything possible on the right of but
4559 +                          excluding insertion coord into the right neighbor
4560 +                        */
4561 +                       result = carry_shift_data(RIGHT_SIDE, coord,
4562 +                                                 reiser4_carry_real(right),
4563 +                                                 doing, todo,
4564 +                                                 flags & COPI_GO_RIGHT);
4565 +                       /* reget node from coord: shift_right() might move
4566 +                          insertion coord to the right neighbor */
4567 +                       node = sync_op(op, right);
4568 +                       not_enough_space = free_space_shortage(node, op);
4569 +               }
4570 +       }
4571 +       /* If there is still not enough space, allocate new node(s).
4572 +
4573 +          We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
4574 +          the carry operation flags (currently this is needed during flush
4575 +          only). The loop below allocates at most two nodes.
4576 +        */
4577 +       for (blk_alloc = 0;
4578 +            not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
4579 +            !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
4580 +               carry_node *fresh;      /* new node we are allocating */
4581 +               coord_t coord_shadow;   /* remembered insertion point before
4582 +                                        * shifting data into new node */
4583 +               carry_node *node_shadow;        /* remembered insertion node before
4584 +                                                * shifting */
4585 +               unsigned int gointo;    /* whether insertion point should move
4586 +                                        * into newly allocated node */
4587 +
4588 +               /* allocate new node on the right of @node. Znode and disk
4589 +                  fake block number for new node are allocated.
4590 +
4591 +                  add_new_znode() posts carry operation COP_INSERT with
4592 +                  COPT_CHILD option to the parent level to add
4593 +                  pointer to newly created node to its parent.
4594 +
4595 +                  Subtle point: if several new nodes are required to complete
4596 +                  insertion operation at this level, they will be inserted
4597 +                  into their parents in the order of creation, which means
4598 +                  that @node will be valid "cookie" at the time of insertion.
4599 +
4600 +                */
4601 +               fresh = add_new_znode(node, op->node, doing, todo);
4602 +               if (IS_ERR(fresh))
4603 +                       return PTR_ERR(fresh);
4604 +
4605 +               /* Try to shift into new node. */
4606 +               result = lock_carry_node(doing, fresh);
4607 +               zput(reiser4_carry_real(fresh));
4608 +               if (result != 0) {
4609 +                       warning("nikita-947",
4610 +                               "Cannot lock new node: %i", result);
4611 +                       return result;
4612 +               }
4613 +
4614 +               /* both nodes are write locked by now.
4615 +                  Remember insertion point (coord and carry node) first, then
4616 +                  shift everything possible on the right of and
4617 +                  including insertion coord into the right neighbor.
4618 +                */
4619 +               coord_dup(&coord_shadow, op->u.insert.d->coord);
4620 +               node_shadow = op->node;
4621 +               /* move insertion point into newly created node if:
4622 +
4623 +                  . insertion point is rightmost in the source node, or
4624 +                  . this is not the first node we are allocating in a row.
4625 +                */
4626 +               gointo =
4627 +                   (blk_alloc > 0) ||
4628 +                   coord_is_after_rightmost(op->u.insert.d->coord);
4629 +
4630 +               if (gointo &&
4631 +                   op->op == COP_PASTE &&
4632 +                   coord_is_existing_item(op->u.insert.d->coord) &&
4633 +                   is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
4634 +                       /* paste into solid (atomic) item, which can contain
4635 +                          only one unit, so we need to shift it right, where
4636 +                          insertion point is supposed to be */
4637 +
4638 +                       assert("edward-1444", op->u.insert.d->data->iplug ==
4639 +                              item_plugin_by_id(STATIC_STAT_DATA_ID));
4640 +                       assert("edward-1445",
4641 +                              op->u.insert.d->data->length >
4642 +                              node_plugin_by_node(coord->node)->free_space
4643 +                              (coord->node));
4644 +
4645 +                       op->u.insert.d->coord->between = BEFORE_UNIT;
4646 +               }
4647 +
4648 +               result = carry_shift_data(RIGHT_SIDE, coord,
4649 +                                         reiser4_carry_real(fresh),
4650 +                                         doing, todo, gointo);
4651 +               /* if insertion point was actually moved into new node,
4652 +                  update carry node pointer in operation. */
4653 +               node = sync_op(op, fresh);
4654 +               not_enough_space = free_space_shortage(node, op);
4655 +               if ((not_enough_space > 0) && (node != coord_shadow.node)) {
4656 +                       /* there is not enough free space in new node. Move
4657 +                          insertion point back to the @node_shadow so that
4658 +                          next new node would be inserted between
4659 +                          @node_shadow and @fresh.
4660 +                        */
4661 +                       coord_normalize(&coord_shadow);
4662 +                       coord_dup(coord, &coord_shadow);
4663 +                       node = coord->node;
4664 +                       op->node = node_shadow;
4665 +                       if (1 || (flags & COPI_STEP_BACK)) {
4666 +                               /* always taken (note the "1 ||"): maybe
4667 +                                  there is enough space now in the source
4668 +                                  node (i.e., node data were moved from).
4669 +                                */
4670 +                               not_enough_space =
4671 +                                   free_space_shortage(node, op);
4672 +                       }
4673 +               }
4674 +       }
4675 +       if (not_enough_space > 0) {
4676 +               if (!(flags & COPI_DONT_ALLOCATE))
4677 +                       warning("nikita-948", "Cannot insert new item");
4678 +               result = -E_NODE_FULL;
4679 +       }
4680 +       assert("nikita-1622", ergo(result == 0,
4681 +                                 reiser4_carry_real(op->node) == coord->node));
4682 +       assert("nikita-2616", coord == op->u.insert.d->coord);
4683 +       if (result == 0)
4684 +               result = make_space_tail(op, doing, orig_node);
4685 +       return result;
4686 +}
4687 +
4688 +/* insert_paste_common() - common part of insert and paste operations
4689 +
4690 +   This function performs common part of COP_INSERT and COP_PASTE.
4691 +
4692 +   There are three ways in which insertion/paste can be requested:
4693 +
4694 +    . by directly supplying reiser4_item_data. In this case, op ->
4695 +    u.insert.type is set to COPT_ITEM_DATA.
4696 +
4697 +    . by supplying child, pointer to which is to be inserted into parent. In
4698 +    this case op -> u.insert.type == COPT_CHILD.
4699 +
4700 +    . by supplying key of new item/unit (COPT_KEY). This is currently only
4701 +    used during extent insertion
4702 +
4703 +   This is required, because when new node is allocated we don't know at what
4704 +   position pointer to it is to be stored in the parent. Actually, we don't
4705 +   even know what its parent will be, because parent can be re-balanced
4706 +   concurrently and new node re-parented, and because parent can be full and
4707 +   pointer to the new node will go into some other node.
4708 +
4709 +   insert_paste_common() resolves pointer to child node into position in the
4710 +   parent by calling find_new_child_coord(), that fills
4711 +   reiser4_item_data. After this, insertion/paste proceeds uniformly.
4712 +
4713 +   Another complication is with finding free space during pasting. It may
4714 +   happen that while shifting items to the neighbors and newly allocated
4715 +   nodes, insertion coord can no longer be in the item we wanted to paste
4716 +   into. At this point, paste becomes (morphs) into insert. Moreover free
4717 +   space analysis has to be repeated, because amount of space required for
4718 +   insertion is different from that of paste (item header overhead, etc).
4719 +
4720 +   This function "unifies" different insertion modes (by resolving child
4721 +   pointer or key into insertion coord), and then calls make_space() to free
4722 +   enough space in the node by shifting data to the left and right and by
4723 +   allocating new nodes if necessary. Carry operation knows amount of space
4724 +   required for its completion. After enough free space is obtained, caller of
4725 +   this function (carry_{insert,paste,etc.}) performs actual insertion/paste
4726 +   by calling item plugin method.
4727 +   Returns result of make_space(), or an earlier coord resolution failure.
4728 +*/
4729 +static int insert_paste_common(carry_op * op   /* carry operation being
4730 +                                                * performed */ ,
4731 +                              carry_level * doing /* current carry level */ ,
4732 +                              carry_level * todo /* next carry level */ ,
4733 +                              carry_insert_data * cdata        /* pointer to
4734 +                                                                * cdata */ ,
4735 +                              coord_t * coord /* insertion/paste coord */ ,
4736 +                              reiser4_item_data * data /* data to be
4737 +                                                        * inserted/pasted */ )
4738 +{
4739 +       assert("nikita-981", op != NULL);
4740 +       assert("nikita-980", todo != NULL);
4741 +       assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
4742 +              || (op->op == COP_EXTENT));
4743 +
4744 +       if (op->u.insert.type == COPT_PASTE_RESTARTED) {
4745 +               /* nothing to do. Fall through to make_space(). */
4746 +               ;
4747 +       } else if (op->u.insert.type == COPT_KEY) {
4748 +               node_search_result intra_node;
4749 +               znode *node;
4750 +               /* Problem with doing batching at the lowest level, is that
4751 +                  operations here are given by coords where modification is
4752 +                  to be performed, and one modification can invalidate coords
4753 +                  of all following operations.
4754 +
4755 +                  So, we are implementing yet another type for operation that
4756 +                  will use (the only) "locator" stable across shifting of
4757 +                  data between nodes, etc.: key (COPT_KEY).
4758 +
4759 +                  This clause resolves key to the coord in the node.
4760 +
4761 +                  But node can change also. Probably some pieces have to be
4762 +                  added to the lock_carry_node(), to lock node by its key.
4763 +
4764 +                */
4765 +               /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
4766 +                  if you need something else. */
4767 +               op->u.insert.d->coord = coord;
4768 +               node = reiser4_carry_real(op->node);
4769 +               intra_node = node_plugin_by_node(node)->lookup
4770 +                   (node, op->u.insert.d->key, FIND_EXACT,
4771 +                    op->u.insert.d->coord);
4772 +               if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
4773 +                       warning("nikita-1715", "Intra node lookup failure: %i",
4774 +                               intra_node);
4775 +                       return intra_node;
4776 +               }
4777 +       } else if (op->u.insert.type == COPT_CHILD) {
4778 +               /* if we are asked to insert pointer to the child into
4779 +                  internal node, first convert pointer to the child into
4780 +                  coord within parent node.
4781 +                */
4782 +               znode *child;
4783 +               int result;
4784 +
4785 +               op->u.insert.d = cdata;
4786 +               op->u.insert.d->coord = coord;
4787 +               op->u.insert.d->data = data;
4788 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4789 +               result = find_new_child_coord(op);
4790 +               child = reiser4_carry_real(op->u.insert.child);
4791 +               if (result != NS_NOT_FOUND) {
4792 +                       warning("nikita-993",
4793 +                               "Cannot find a place for child pointer: %i",
4794 +                               result);
4795 +                       return result;
4796 +               }
4797 +               /* This only happens when we did multiple insertions at
4798 +                  the previous level, trying to insert single item and
4799 +                  it so happened, that insertion of pointers to all new
4800 +                  nodes before this one already caused parent node to
4801 +                  split (may be several times).
4802 +
4803 +                  I am going to come up with better solution.
4804 +
4805 +                  You are not expected to understand this.
4806 +                  -- v6root/usr/sys/ken/slp.c
4807 +
4808 +                  Basically, what happens here is the following: carry came
4809 +                  to the parent level and is about to insert internal item
4810 +                  pointing to the child node that it just inserted in the
4811 +                  level below. Position where internal item is to be inserted
4812 +                  was found by find_new_child_coord() above, but node of the
4813 +                  current carry operation (that is, parent node of child
4814 +                  inserted on the previous level), was determined earlier in
4815 +                  the lock_carry_level/lock_carry_node. It could so happen
4816 +                  that other carry operations already performed on the parent
4817 +                  level already split parent node, so that insertion point
4818 +                  moved into another node. Handle this by creating new carry
4819 +                  node for insertion point if necessary.
4820 +                */
4821 +               if (reiser4_carry_real(op->node) !=
4822 +                   op->u.insert.d->coord->node) {
4823 +                       pool_ordering direction;
4824 +                       znode *z1;
4825 +                       znode *z2;
4826 +                       reiser4_key k1;
4827 +                       reiser4_key k2;
4828 +
4829 +                       /*
4830 +                        * determine in what direction insertion point
4831 +                        * moved. Do this by comparing delimiting keys.
4832 +                        */
4833 +                       z1 = op->u.insert.d->coord->node;
4834 +                       z2 = reiser4_carry_real(op->node);
4835 +                       if (keyle(leftmost_key_in_node(z1, &k1),
4836 +                                 leftmost_key_in_node(z2, &k2)))
4837 +                               /* insertion point moved to the left */
4838 +                               direction = POOLO_BEFORE;
4839 +                       else
4840 +                               /* insertion point moved to the right */
4841 +                               direction = POOLO_AFTER;
4842 +
4843 +                       op->node = reiser4_add_carry_skip(doing,
4844 +                                                         direction, op->node);
4845 +                       if (IS_ERR(op->node))
4846 +                               return PTR_ERR(op->node);
4847 +                       op->node->node = op->u.insert.d->coord->node;
4848 +                       op->node->free = 1;
4849 +                       result = lock_carry_node(doing, op->node);
4850 +                       if (result != 0)
4851 +                               return result;
4852 +               }
4853 +
4854 +               /*
4855 +                * set up key of an item being inserted: we are inserting
4856 +                * internal item and its key (by the very definition of
4857 +                * search tree) is leftmost key in the child node.
4858 +                */
4859 +               write_lock_dk(znode_get_tree(child));
4860 +               op->u.insert.d->key = leftmost_key_in_node(child,
4861 +                                                          znode_get_ld_key(child));
4862 +               write_unlock_dk(znode_get_tree(child));
4863 +               op->u.insert.d->data->arg = op->u.insert.brother;
4864 +       } else {
4865 +               assert("vs-243", op->u.insert.d->coord != NULL);
4866 +               op->u.insert.d->coord->node = reiser4_carry_real(op->node);
4867 +       }
4868 +
4869 +       /* find free space. */
4870 +       return make_space(op, doing, todo);
4871 +}
4872 +
4873 +/* handle carry COP_INSERT operation.
4874 +
4875 +   Insert new item into node. New item can be given in one of two ways:
4876 +
4877 +   - by passing &tree_coord and &reiser4_item_data as part of @op. This is
4878 +   only applicable at the leaf/twig level.
4879 +
4880 +   - by passing a child node, pointer to which is to be inserted by this
4881 +   operation.
4882 +   Returns error of insert_paste_common(), else result of ->create_item().
4883 +*/
4884 +static int carry_insert(carry_op * op /* operation to perform */ ,
4885 +                       carry_level * doing     /* queue of operations @op
4886 +                                                * is part of */ ,
4887 +                       carry_level * todo      /* queue where new operations
4888 +                                                * are accumulated */ )
4889 +{
4890 +       znode *node;
4891 +       carry_insert_data cdata;
4892 +       coord_t coord;
4893 +       reiser4_item_data data;
4894 +       carry_plugin_info info;
4895 +       int result;
4896 +
4897 +       assert("nikita-1036", op != NULL);
4898 +       assert("nikita-1037", todo != NULL);
4899 +       assert("nikita-1038", op->op == COP_INSERT);
4900 +
4901 +       coord_init_zero(&coord);
4902 +
4903 +       /* perform common functionality of insert and paste. */
4904 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
4905 +       if (result != 0)
4906 +               return result;
4907 +
4908 +       node = op->u.insert.d->coord->node;
4909 +       assert("nikita-1039", node != NULL);
4910 +       assert("nikita-1040", node_plugin_by_node(node) != NULL);
4911 +
4912 +       assert("nikita-949",
4913 +              space_needed_for_op(node, op) <= znode_free_space(node));
4914 +
4915 +       /* ask node layout to create new item. */
4916 +       info.doing = doing;
4917 +       info.todo = todo;
4918 +       result = node_plugin_by_node(node)->create_item
4919 +           (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
4920 +            &info);
4921 +       doing->restartable = 0;
4922 +       znode_make_dirty(node);  /* done regardless of create_item result */
4923 +
4924 +       return result;
4925 +}
4926 +
4927 +/*
4928 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
4929 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
4930 + * tree by slicing it into multiple items. Accessors for u.insert_flow follow.
4931 + */
4932 +
4933 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
4934 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
4935 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
4936 +
4937 +static size_t item_data_overhead(carry_op * op)
4938 +{
4939 +       if (flow_insert_data(op)->iplug->b.estimate == NULL)
4940 +               return 0;       /* item plugin has no estimate method */
4941 +       return (flow_insert_data(op)->iplug->b.
4942 +               estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
4943 +               flow_insert_data(op)->length);  /* estimate minus data length */
4944 +}
4945 +
4946 +/* Extra bytes (node item overhead plus item data overhead) needed when flow
4947 +   data cannot be pasted at the insert point; 0 if it can be pasted or the node
4948 +   plugin has no item_overhead method. FIXME-VS: this is called several times
4949 +   during one make_space_for_flow_insertion and always returns the same result;
4950 +   computing it once and passing it around would cost some flexibility */
4951 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
4952 +static size_t flow_insertion_overhead(carry_op * op)
4953 +{
4954 +       znode *node;
4955 +       size_t insertion_overhead;
4956 +
4957 +       node = flow_insert_point(op)->node;
4958 +       insertion_overhead = 0;
4959 +       if (node->nplug->item_overhead &&
4960 +           !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
4961 +                      flow_insert_data(op)))
4962 +               insertion_overhead =
4963 +                   node->nplug->item_overhead(node, NULL) +
4964 +                       item_data_overhead(op);
4965 +       return insertion_overhead;
4966 +}
4967 +
4968 +/* how many bytes of flow fit into the insert point node (0 if none) */
4969 +static int what_can_fit_into_node(carry_op * op)
4970 +{
4971 +       size_t free, overhead;
4972 +
4973 +       overhead = flow_insertion_overhead(op);
4974 +       free = znode_free_space(flow_insert_point(op)->node);
4975 +       if (free <= overhead)
4976 +               return 0;
4977 +       free -= overhead;
4978 +       /* FIXME: flow->length is loff_t only to not get overflowed in case of expanding truncate */
4979 +       if (free < op->u.insert_flow.flow->length)
4980 +               return free;
4981 +       return (int)op->u.insert_flow.flow->length;
4982 +}
4983 +
4984 +/* in make_space_for_flow_insertion we need to check either whether whole flow
4985 +   fits into a node (this function) or whether minimal fraction of flow fits */
4986 +static int enough_space_for_whole_flow(carry_op * op)
4987 +{
4988 +       return (unsigned)what_can_fit_into_node(op) ==
4989 +           op->u.insert_flow.flow->length;
4990 +}
4991 +
4992 +#define MIN_FLOW_FRACTION 1    /* compared with what_can_fit_into_node() bytes */
4993 +static int enough_space_for_min_flow_fraction(carry_op * op)
4994 +{
4995 +       assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
4996 +
4997 +       return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
4998 +}
4999 +
5000 +/* this returns 0 if left neighbor was obtained successfully and everything
5001 +   up to and including insertion point was shifted and left neighbor still has
5002 +   room for a minimal fraction of flow; it returns 1 otherwise */
5003 +static int
5004 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5005 +{
5006 +       carry_node *left;
5007 +       znode *orig;
5008 +
5009 +       left = find_left_neighbor(op, doing);
5010 +       if (unlikely(IS_ERR(left))) {
5011 +               warning("vs-899",
5012 +                       "make_space_by_shift_left: "
5013 +                       "error accessing left neighbor: %li", PTR_ERR(left));
5014 +               return 1;
5015 +       }
5016 +       if (left == NULL)
5017 +               /* left neighbor either does not exist or is unformatted
5018 +                  node */
5019 +               return 1;
5020 +
5021 +       orig = flow_insert_point(op)->node;
5022 +       /* try to shift content of node @orig from its head up to and including
5023 +          insertion point into the left neighbor (return value is not checked) */
5024 +       carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5025 +                        reiser4_carry_real(left), doing, todo,
5026 +                        1 /* including insert point */);
5027 +       if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5028 +               /* insertion point did not move */
5029 +               return 1;
5030 +       }
5031 +
5032 +       /* insertion point is set after last item in the node */
5033 +       assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5034 +
5035 +       if (!enough_space_for_min_flow_fraction(op)) {
5036 +               /* insertion point node does not have enough free space to put
5037 +                  even minimal portion of flow into it, therefore, move
5038 +                  insertion point back to orig node (before first item) */
5039 +               coord_init_before_first_item(flow_insert_point(op), orig);
5040 +               return 1;
5041 +       }
5042 +
5043 +       /* part of flow is to be written to the end of node */
5044 +       op->node = left;
5045 +       return 0;
5046 +}
5047 +
5048 +/* this returns 0 if, after trying to shift everything to the right of
5049 +   insertion point into the right neighbor, insertion point is at the node end
5050 +   and node has space for minimal fraction of flow; it returns 1 otherwise */
5051 +static int
5052 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5053 +                         carry_level * todo)
5054 +{
5055 +       carry_node *right;
5056 +
5057 +       right = find_right_neighbor(op, doing);
5058 +       if (unlikely(IS_ERR(right))) {
5059 +               warning("nikita-1065", "shift_right_excluding_insert_point: "
5060 +                       "error accessing right neighbor: %li", PTR_ERR(right));
5061 +               return 1;
5062 +       }
5063 +       if (right) {
5064 +               /* shift everything possible on the right of but excluding
5065 +                  insertion coord into the right neighbor */
5066 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5067 +                                reiser4_carry_real(right), doing, todo,
5068 +                                0 /* not including insert point */);
5069 +       } else {
5070 +               /* right neighbor either does not exist or is unformatted
5071 +                  node */
5072 +               ;
5073 +       }
5074 +       if (coord_is_after_rightmost(flow_insert_point(op))) {
5075 +               if (enough_space_for_min_flow_fraction(op)) {
5076 +                       /* part of flow is to be written to the end of node */
5077 +                       return 0;
5078 +               }
5079 +       }
5080 +
5081 +       /* new node is to be added: insert point is not at the node end or node
5082 +          did not get enough space for even a minimal fraction of flow */
5083 +       return 1;
5084 +}
5085 +
5086 +/* add up to two new nodes after insert point node. Returns 0 when the insert
5087 +   point is at its node end or moved into a new node; error code otherwise */
5088 +static int
5089 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5090 +{
5091 +       int result;
5092 +       znode *node;
5093 +       carry_node *new;
5094 +
5095 +       node = flow_insert_point(op)->node;
5096 +
5097 +       if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5098 +               return RETERR(-E_NODE_FULL);
5099 +       /* add new node after insert point node */
5100 +       new = add_new_znode(node, op->node, doing, todo);
5101 +       if (unlikely(IS_ERR(new))) {
5102 +               return PTR_ERR(new);
5103 +       }
5104 +       result = lock_carry_node(doing, new);
5105 +       zput(reiser4_carry_real(new));
5106 +       if (unlikely(result)) {
5107 +               return result;
5108 +       }
5109 +       op->u.insert_flow.new_nodes++;
5110 +       if (!coord_is_after_rightmost(flow_insert_point(op))) {
5111 +               carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5112 +                                reiser4_carry_real(new), doing, todo,
5113 +                                0 /* not including insert point */);
5114 +               assert("vs-901",
5115 +                      coord_is_after_rightmost(flow_insert_point(op)));
5116 +
5117 +               if (enough_space_for_min_flow_fraction(op)) {
5118 +                       return 0;
5119 +               }
5120 +               if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5121 +                       return RETERR(-E_NODE_FULL);
5122 +
5123 +               /* still not enough room at node end: add one more new node */
5124 +               new = add_new_znode(node, op->node, doing, todo);
5125 +               if (unlikely(IS_ERR(new))) {
5126 +                       return PTR_ERR(new);
5127 +               }
5128 +               result = lock_carry_node(doing, new);
5129 +               zput(reiser4_carry_real(new));
5130 +               if (unlikely(result)) {
5131 +                       return result;
5132 +               }
5133 +               op->u.insert_flow.new_nodes++;
5134 +       }
5135 +
5136 +       /* move insertion point to the (last added) new node */
5137 +       coord_init_before_first_item(flow_insert_point(op),
5138 +                                    reiser4_carry_real(new));
5139 +       op->node = new;
5140 +       return 0;
5141 +}
5142 +
5143 +static int
5144 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5145 +                             carry_level * todo)
5146 +{
5147 +       __u32 flags = op->u.insert_flow.flags;
5148 +
5149 +       if (enough_space_for_whole_flow(op)) {
5150 +               /* whole flow fits into insert point node */
5151 +               return 0;
5152 +       }
5153 +
5154 +       if (!(flags & COPI_DONT_SHIFT_LEFT)
5155 +           && (make_space_by_shift_left(op, doing, todo) == 0)) {
5156 +               /* insert point is shifted to left neighbor of original insert
5157 +                  point node and is set after last unit in that node. It has
5158 +                  enough space to fit at least minimal fraction of flow. */
5159 +               return 0;
5160 +       }
5161 +
5162 +       if (enough_space_for_whole_flow(op)) {
5163 +               /* re-check: shifting left may have freed room for whole flow */
5164 +               return 0;
5165 +       }
5166 +
5167 +       if (!(flags & COPI_DONT_SHIFT_RIGHT)
5168 +           && (make_space_by_shift_right(op, doing, todo) == 0)) {
5169 +               /* insert point is still set to the same node, but there is
5170 +                  nothing to the right of insert point. */
5171 +               return 0;
5172 +       }
5173 +
5174 +       if (enough_space_for_whole_flow(op)) {
5175 +               /* re-check: shifting right may have freed room for whole flow */
5176 +               return 0;
5177 +       }
5178 +
5179 +       return make_space_by_new_nodes(op, doing, todo);
5180 +}
5181 +
5182 +/* implements COP_INSERT_FLOW operation: inserts @op's flow piece by piece */
5183 +static int
5184 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5185 +{
5186 +       int result;
5187 +       flow_t *f;
5188 +       coord_t *insert_point;
5189 +       node_plugin *nplug;
5190 +       carry_plugin_info info;
5191 +       znode *orig_node;
5192 +       lock_handle *orig_lh;
5193 +
5194 +       f = op->u.insert_flow.flow;
5195 +       result = 0;
5196 +
5197 +       /* item and node plugin methods get carry levels via @info */
5198 +       info.doing = doing;
5199 +       info.todo = todo;
5200 +
5201 +       orig_node = flow_insert_point(op)->node;
5202 +       orig_lh = doing->tracked;
5203 +
5204 +       while (f->length) {
5205 +               result = make_space_for_flow_insertion(op, doing, todo);
5206 +               if (result)
5207 +                       break;
5208 +
5209 +               insert_point = flow_insert_point(op);
5210 +               nplug = node_plugin_by_node(insert_point->node);
5211 +
5212 +               /* compose item data for insertion/pasting */
5213 +               flow_insert_data(op)->data = f->data;
5214 +               flow_insert_data(op)->length = what_can_fit_into_node(op);
5215 +
5216 +               if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5217 +                       /* append to the item of the file being written to */
5218 +                       assert("vs-903", insert_point->between == AFTER_UNIT);
5219 +                       nplug->change_item_size(insert_point,
5220 +                                               flow_insert_data(op)->length);
5221 +                       flow_insert_data(op)->iplug->b.paste(insert_point,
5222 +                                                            flow_insert_data
5223 +                                                            (op), &info);
5224 +               } else {
5225 +                       /* new item must be inserted */
5226 +                       pos_in_node_t new_pos;
5227 +                       flow_insert_data(op)->length += item_data_overhead(op);
5228 +
5229 +                       /* FIXME-VS: this is because node40_create_item changes
5230 +                          insert_point for obscure reasons */
5231 +                       switch (insert_point->between) {
5232 +                       case AFTER_ITEM:
5233 +                               new_pos = insert_point->item_pos + 1;
5234 +                               break;
5235 +                       case EMPTY_NODE:
5236 +                               new_pos = 0;
5237 +                               break;
5238 +                       case BEFORE_ITEM:
5239 +                               assert("vs-905", insert_point->item_pos == 0);
5240 +                               new_pos = 0;
5241 +                               break;
5242 +                       default:
5243 +                               impossible("vs-906",
5244 +                                          "carry_insert_flow: invalid coord");
5245 +                               new_pos = 0;
5246 +                               break;
5247 +                       }
5248 +
5249 +                       nplug->create_item(insert_point, &f->key,
5250 +                                          flow_insert_data(op), &info);
5251 +                       coord_set_item_pos(insert_point, new_pos);
5252 +               }
5253 +               coord_init_after_item_end(insert_point);
5254 +               doing->restartable = 0;
5255 +               znode_make_dirty(insert_point->node);
5256 +
5257 +               move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5258 +       }
5259 +
5260 +       if (orig_node != flow_insert_point(op)->node) {
5261 +               /* insert point left @orig_node: move tracked lock after it */
5262 +               done_lh(orig_lh);
5263 +               init_lh(orig_lh);
5264 +               result =
5265 +                   longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5266 +                                       ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5267 +       }
5268 +
5269 +       return result;
5270 +}
5271 +
5272 +/* implements COP_DELETE operation
5273 +
5274 +   Remove pointer to @op -> u.delete.child from its parent.
5275 +
5276 +   This function also handles killing of a tree root if last pointer from it
5277 +   was removed. This is complicated by our handling of "twig" level: root on
5278 +   twig level is never killed.
5279 +   Returns 0 on success; negative error codes are passed to the caller.
5280 +*/
5281 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5282 +                       carry_level * doing UNUSED_ARG  /* current carry
5283 +                                                        * level */ ,
5284 +                       carry_level * todo /* next carry level */ )
5285 +{
5286 +       int result;
5287 +       coord_t coord;
5288 +       coord_t coord2;
5289 +       znode *parent;
5290 +       znode *child;
5291 +       carry_plugin_info info;
5292 +       reiser4_tree *tree;
5293 +
5294 +       /*
5295 +        * This operation is called to delete internal item pointing to the
5296 +        * child node that was removed by carry from the tree on the previous
5297 +        * tree level.
5298 +        */
5299 +
5300 +       assert("nikita-893", op != NULL);
5301 +       assert("nikita-894", todo != NULL);
5302 +       assert("nikita-895", op->op == COP_DELETE);
5303 +
5304 +       coord_init_zero(&coord);
5305 +       coord_init_zero(&coord2);
5306 +
5307 +       parent = reiser4_carry_real(op->node);
5308 +       child = op->u.delete.child ?
5309 +               reiser4_carry_real(op->u.delete.child) : op->node->node;
5310 +       tree = znode_get_tree(child);
5311 +       read_lock_tree(tree);
5312 +
5313 +       /*
5314 +        * @parent was determined when carry entered parent level
5315 +        * (lock_carry_level/lock_carry_node). Since then, actual parent of
5316 +        * @child node could change due to other carry operations performed on
5317 +        * the parent level. Check for this.
5318 +        */
5319 +
5320 +       if (znode_parent(child) != parent) {
5321 +               /* NOTE-NIKITA add stat counter for this. */
5322 +               parent = znode_parent(child);
5323 +               assert("nikita-2581", find_carry_node(doing, parent));
5324 +       }
5325 +       read_unlock_tree(tree);
5326 +
5327 +       assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5328 +
5329 +       /* Twig level horrors: tree should be of height at least 2. So, last
5330 +          pointer from the root at twig level is preserved even if child is
5331 +          empty. This is ugly, but so it was architectured.
5332 +        */
5333 +
5334 +       if (znode_is_root(parent) &&
5335 +           znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5336 +           node_num_items(parent) == 1) {
5337 +               /* Delimiting key manipulations. */
5338 +               write_lock_dk(tree);
5339 +               znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5340 +               znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5341 +               ZF_SET(child, JNODE_DKSET);
5342 +               write_unlock_dk(tree);
5343 +
5344 +               /* @child escaped imminent death! */
5345 +               ZF_CLR(child, JNODE_HEARD_BANSHEE);
5346 +               return 0;
5347 +       }
5348 +
5349 +       /* convert child pointer to the coord_t */
5350 +       result = find_child_ptr(parent, child, &coord);
5351 +       if (result != NS_FOUND) {
5352 +               warning("nikita-994", "Cannot find child pointer: %i", result);
5353 +               print_coord_content("coord", &coord);
5354 +               return result;
5355 +       }
5356 +
5357 +       coord_dup(&coord2, &coord);
5358 +       info.doing = doing;
5359 +       info.todo = todo;
5360 +       {
5361 +               /*
5362 +                * Actually kill internal item: prepare structure with
5363 +                * arguments for ->cut_and_kill() method...
5364 +                */
5365 +
5366 +               struct carry_kill_data kdata;
5367 +               kdata.params.from = &coord;
5368 +               kdata.params.to = &coord2;
5369 +               kdata.params.from_key = NULL;
5370 +               kdata.params.to_key = NULL;
5371 +               kdata.params.smallest_removed = NULL;
5372 +               kdata.params.truncate = 1;
5373 +               kdata.flags = op->u.delete.flags;
5374 +               kdata.inode = NULL;
5375 +               kdata.left = NULL;
5376 +               kdata.right = NULL;
5377 +               kdata.buf = NULL;
5378 +               /* ... and call it. */
5379 +               result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5380 +                                                                  &info);
5381 +       }
5382 +       doing->restartable = 0;
5383 +
5384 +       /* check whether root should be killed violently */
5385 +       if (znode_is_root(parent) &&
5386 +           /* don't kill roots at and lower than twig level */
5387 +           znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5388 +           node_num_items(parent) == 1) {
5389 +               result = reiser4_kill_tree_root(coord.node);
5390 +       }
5391 +
5392 +       return result < 0 ? result : 0;
5393 +}
5394 +
5395 +/* implements COP_CUT operation
5396 +
5397 +   Cuts part or whole content of node.
5398 +   Returns 0 on success; negative error codes are passed to the caller.
5399 +*/
5400 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5401 +                    carry_level * doing /* current carry level */ ,
5402 +                    carry_level * todo /* next carry level */ )
5403 +{
5404 +       int result;
5405 +       carry_plugin_info info;
5406 +       node_plugin *nplug;
5407 +
5408 +       assert("nikita-896", op != NULL);
5409 +       assert("nikita-897", todo != NULL);
5410 +       assert("nikita-898", op->op == COP_CUT);
5411 +
5412 +       info.doing = doing;
5413 +       info.todo = todo;
5414 +
5415 +       nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5416 +       if (op->u.cut_or_kill.is_cut)
5417 +               result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5418 +       else
5419 +               result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5420 +
5421 +       doing->restartable = 0;
5422 +       return result < 0 ? result : 0;
5423 +}
5424 +
5425 +/* helper for carry_paste() and carry_insert_flow(): returns true if @data can
5426 +   be pasted into an item at or next to @icoord. @icoord may be adjusted */
5427 +static int
5428 +can_paste(coord_t * icoord, const reiser4_key * key,
5429 +         const reiser4_item_data * data)
5430 +{
5431 +       coord_t circa;
5432 +       item_plugin *new_iplug;
5433 +       item_plugin *old_iplug;
5434 +       int result = 0;         /* to keep gcc shut */
5435 +
5436 +       assert("", icoord->between != AT_UNIT);
5437 +
5438 +       /* obviously, one cannot paste when node is empty---there is nothing
5439 +          to paste into. */
5440 +       if (node_is_empty(icoord->node))
5441 +               return 0;
5442 +       /* if insertion point is at the middle of the item, then paste */
5443 +       if (!coord_is_between_items(icoord))
5444 +               return 1;
5445 +       coord_dup(&circa, icoord);
5446 +       circa.between = AT_UNIT;
5447 +
5448 +       old_iplug = item_plugin_by_coord(&circa);
5449 +       new_iplug = data->iplug;
5450 +
5451 +       /* check whether we can paste to the item @icoord is "at" when we
5452 +          ignore ->between field */
5453 +       if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5454 +               result = 1;
5455 +       } else if (icoord->between == BEFORE_UNIT
5456 +                  || icoord->between == BEFORE_ITEM) {
5457 +               /* otherwise, try to glue to the item at the left, if any */
5458 +               coord_dup(&circa, icoord);
5459 +               if (coord_set_to_left(&circa)) {
5460 +                       result = 0;
5461 +                       coord_init_before_item(icoord);
5462 +               } else {
5463 +                       old_iplug = item_plugin_by_coord(&circa);
5464 +                       result = (old_iplug == new_iplug)
5465 +                           && item_can_contain_key(icoord, key, data);
5466 +                       if (result) {
5467 +                               coord_dup(icoord, &circa);
5468 +                               icoord->between = AFTER_UNIT;
5469 +                       }
5470 +               }
5471 +       } else if (icoord->between == AFTER_UNIT
5472 +                  || icoord->between == AFTER_ITEM) {
5473 +               coord_dup(&circa, icoord);
5474 +               /* otherwise, try to glue to the item at the right, if any */
5475 +               if (coord_set_to_right(&circa)) {
5476 +                       result = 0;
5477 +                       coord_init_after_item(icoord);
5478 +               } else {
5479 +                       int (*cck) (const coord_t *, const reiser4_key *,
5480 +                                   const reiser4_item_data *);
5481 +
5482 +                       old_iplug = item_plugin_by_coord(&circa);
5483 +
5484 +                       cck = old_iplug->b.can_contain_key;
5485 +                       if (cck == NULL)
5486 +                               /* item doesn't define ->can_contain_key
5487 +                                  method? So it is not expandable. */
5488 +                               result = 0;
5489 +                       else {
5490 +                               result = (old_iplug == new_iplug)
5491 +                                   && cck(&circa /*icoord */ , key, data);
5492 +                               if (result) {
5493 +                                       coord_dup(icoord, &circa);
5494 +                                       icoord->between = BEFORE_UNIT;
5495 +                               }
5496 +                       }
5497 +               }
5498 +       } else
5499 +               impossible("nikita-2513", "Nothing works");
5500 +       if (result) {
5501 +               if (icoord->between == BEFORE_ITEM) {
5502 +                       assert("vs-912", icoord->unit_pos == 0);
5503 +                       icoord->between = BEFORE_UNIT;
5504 +               } else if (icoord->between == AFTER_ITEM) {
5505 +                       coord_init_after_item_end(icoord);
5506 +               }
5507 +       }
5508 +       return result;
5509 +}
5510 +
5511 +/* implements COP_PASTE operation
5512 +
5513 +   Paste data into existing item. This is complicated by the fact that after
5514 +   we shifted something to the left or right neighbors trying to free some
5515 +   space, item we were supposed to paste into can be in different node than
5516 +   insertion coord. If so, we are no longer doing paste, but insert. See
5517 +   comments in insert_paste_common().
5518 +
5519 +*/
5520 +static int carry_paste(carry_op * op /* operation to be performed */ ,
5521 +                      carry_level * doing UNUSED_ARG   /* current carry
5522 +                                                        * level */ ,
5523 +                      carry_level * todo /* next carry level */ )
5524 +{
5525 +       znode *node;
5526 +       carry_insert_data cdata;
5527 +       coord_t dcoord;
5528 +       reiser4_item_data data;
5529 +       int result;
5530 +       int real_size;
5531 +       item_plugin *iplug;
5532 +       carry_plugin_info info;
5533 +       coord_t *coord;
5534 +
5535 +       assert("nikita-982", op != NULL);
5536 +       assert("nikita-983", todo != NULL);
5537 +       assert("nikita-984", op->op == COP_PASTE);
5538 +
5539 +       coord_init_zero(&dcoord);
5540 +
5541 +       result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
5542 +       if (result != 0)
5543 +               return result;
5544 +
5545 +       coord = op->u.insert.d->coord;
5546 +
5547 +       /* handle case when op->u.insert.d->coord doesn't point to the item
5548 +          of required type: restart the operation as COP_INSERT. */
5549 +       if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
5550 +               op->op = COP_INSERT;
5551 +               op->u.insert.type = COPT_PASTE_RESTARTED;
5552 +               result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
5553 +
5554 +               return result;
5555 +       }
5556 +
5557 +       node = coord->node;
5558 +       iplug = item_plugin_by_coord(coord);
5559 +       assert("nikita-992", iplug != NULL);
5560 +
5561 +       assert("nikita-985", node != NULL);
5562 +       assert("nikita-986", node_plugin_by_node(node) != NULL);
5563 +
5564 +       assert("nikita-987",
5565 +              space_needed_for_op(node, op) <= znode_free_space(node));
5566 +
5567 +       assert("nikita-1286", coord_is_existing_item(coord));
5568 +
5569 +       /*
5570 +        * if item is expanded as a result of this operation, we should first
5571 +        * change item size, then call ->b.paste item method. If item is
5572 +        * shrunk, it should be done other way around: first call ->b.paste
5573 +        * method, then reduce item size.
5574 +        */
5575 +
5576 +       real_size = space_needed_for_op(node, op);
5577 +       if (real_size > 0)
5578 +               node->nplug->change_item_size(coord, real_size);
5579 +
5580 +       doing->restartable = 0;
5581 +       info.doing = doing;
5582 +       info.todo = todo;
5583 +
5584 +       result = iplug->b.paste(coord, op->u.insert.d->data, &info);
5585 +
5586 +       if (real_size < 0)
5587 +               node->nplug->change_item_size(coord, real_size);
5588 +
5589 +       /* if we pasted at the beginning of the item, update item's key. */
5590 +       if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
5591 +               node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
5592 +
5593 +       znode_make_dirty(node);
5594 +       return result;
5595 +}
5596 +
5597 +/* handle carry COP_EXTENT operation. */
5598 +static int carry_extent(carry_op * op /* operation to perform */ ,
5599 +                       carry_level * doing     /* queue of operations @op
5600 +                                                * is part of */ ,
5601 +                       carry_level * todo      /* queue where new operations
5602 +                                                * are accumulated */ )
5603 +{
5604 +       znode *node;
5605 +       carry_insert_data cdata;
5606 +       coord_t coord;
5607 +       reiser4_item_data data;
5608 +       carry_op *delete_dummy;
5609 +       carry_op *insert_extent;
5610 +       int result;
5611 +       carry_plugin_info info;
5612 +
5613 +       assert("nikita-1751", op != NULL);
5614 +       assert("nikita-1752", todo != NULL);
5615 +       assert("nikita-1753", op->op == COP_EXTENT);
5616 +
5617 +       /* extent insertion overview:
5618 +
5619 +          extents live on the TWIG LEVEL, which is level one above the leaf
5620 +          one. This complicates extent insertion logic somewhat: it may
5621 +          happen (and is going to happen all the time) that in logical key
5622 +          ordering extent has to be placed between items I1 and I2, located
5623 +          at the leaf level, but I1 and I2 are in the same formatted leaf
5624 +          node N1. To insert extent one has to
5625 +
5626 +          (1) reach node N1 and shift data between N1, its neighbors and
5627 +          possibly newly allocated nodes until I1 and I2 fall into different
5628 +          nodes. Since I1 and I2 are still neighboring items in logical key
5629 +          order, they will necessarily be outermost items in their respective
5630 +          nodes.
5631 +
5632 +          (2) After this, new extent item is inserted into node on the twig
5633 +          level.
5634 +
5635 +          Fortunately this process can reuse almost all code from standard
5636 +          insertion procedure (viz. make_space() and insert_paste_common()),
5637 +          due to the following observation: make_space() only shifts data up
5638 +          to and excluding or including insertion point. It never
5639 +          "over-moves" through insertion point. Thus, one can use
5640 +          make_space() to perform step (1). All required for this is just to
5641 +          instruct free_space_shortage() to keep make_space() shifting data
5642 +          until insertion point is at the node border.
5643 +
5644 +        */
5645 +
5646 +       /* perform common functionality of insert and paste. */
5647 +       result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5648 +       if (result != 0)
5649 +               return result;
5650 +
5651 +       node = op->u.extent.d->coord->node;
5652 +       assert("nikita-1754", node != NULL);
5653 +       assert("nikita-1755", node_plugin_by_node(node) != NULL);
5654 +       assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
5655 +
5656 +       /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
5657 +          extent fits between items. */
5658 +
5659 +       info.doing = doing;
5660 +       info.todo = todo;
5661 +
5662 +       /* there is another complication due to placement of extents on the
5663 +          twig level: extents are "rigid" in the sense that key-range
5664 +          occupied by extent cannot grow indefinitely to the right as it is
5665 +          for the formatted leaf nodes. Because of this when search finds two
5666 +          adjacent extents on the twig level, it has to "drill" to the leaf
5667 +          level, creating new node. Here we are removing this node.
5668 +        */
5669 +       if (node_is_empty(node)) {
5670 +               delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
5671 +               if (IS_ERR(delete_dummy))
5672 +                       return PTR_ERR(delete_dummy);
5673 +               delete_dummy->u.delete.child = NULL;
5674 +               delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
5675 +               ZF_SET(node, JNODE_HEARD_BANSHEE);
5676 +       }
5677 +
5678 +       /* proceed with inserting extent item into parent. We are definitely
5679 +          inserting rather than pasting if we get that far. */
5680 +       insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
5681 +       if (IS_ERR(insert_extent))
5682 +               /* @delete_dummy will be automatically destroyed on exiting
5683 +                  the level */
5684 +               return PTR_ERR(insert_extent);
5685 +       /* NOTE-NIKITA insertion by key is simplest option here. Another
5686 +          possibility is to insert on the left or right of already existing
5687 +          item.
5688 +        */
5689 +       insert_extent->u.insert.type = COPT_KEY;
5690 +       insert_extent->u.insert.d = op->u.extent.d;
5691 +       assert("nikita-1719", op->u.extent.d->key != NULL);
5692 +       insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
5693 +       insert_extent->u.insert.flags =
5694 +           znode_get_tree(node)->carry.new_extent_flags;
5695 +
5696 +       /*
5697 +        * if carry was asked to track lock handle we should actually track
5698 +        * lock handle on the twig node rather than on the leaf where
5699 +        * operation was started from. Transfer tracked lock handle.
5700 +        */
5701 +       if (doing->track_type) {
5702 +               assert("nikita-3242", doing->tracked != NULL);
5703 +               assert("nikita-3244", todo->tracked == NULL);
5704 +               todo->tracked = doing->tracked;
5705 +               todo->track_type = CARRY_TRACK_NODE;
5706 +               doing->tracked = NULL;
5707 +               doing->track_type = 0;
5708 +       }
5709 +
5710 +       return 0;
5711 +}
5712 +
5713 +/* update key in @parent between pointers to @left and @right.
5714 +
5715 +   Find coords of @left and @right and update delimiting key between them.
5716 +   This is helper function called by carry_update(). Finds position of
5717 +   internal item involved. Updates item key. Updates delimiting keys of child
5718 +   nodes involved.
5719 +*/
5720 +static int update_delimiting_key(znode * parent        /* node key is updated
5721 +                                                * in */ ,
5722 +                                znode * left /* child of @parent */ ,
5723 +                                znode * right /* child of @parent */ ,
5724 +                                carry_level * doing    /* current carry
5725 +                                                        * level */ ,
5726 +                                carry_level * todo     /* parent carry
5727 +                                                        * level */ ,
5728 +                                const char **error_msg /* place to
5729 +                                                        * store error
5730 +                                                        * message */ )
5731 +{
5732 +       coord_t left_pos;
5733 +       coord_t right_pos;
5734 +       int result;
5735 +       reiser4_key ldkey;
5736 +       carry_plugin_info info;
5737 +
5738 +       assert("nikita-1177", right != NULL);
5739 +       /* find position of the right child in a parent */
5740 +       result = find_child_ptr(parent, right, &right_pos);
5741 +       if (result != NS_FOUND) {
5742 +               *error_msg = "Cannot find position of right child";
5743 +               return result;
5744 +       }
5745 +
5746 +       if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
5747 +               /* find position of the left child in a parent */
5748 +               result = find_child_ptr(parent, left, &left_pos);
5749 +               if (result != NS_FOUND) {
5750 +                       *error_msg = "Cannot find position of left child";
5751 +                       return result;
5752 +               }
5753 +               assert("nikita-1355", left_pos.node != NULL);
5754 +       } else
5755 +               left_pos.node = NULL;
5756 +
5757 +       /* check that they are separated by exactly one key and are basically
5758 +          sane */
5759 +       if (REISER4_DEBUG) {
5760 +               if ((left_pos.node != NULL)
5761 +                   && !coord_is_existing_unit(&left_pos)) {
5762 +                       *error_msg = "Left child is bastard";
5763 +                       return RETERR(-EIO);
5764 +               }
5765 +               if (!coord_is_existing_unit(&right_pos)) {
5766 +                       *error_msg = "Right child is bastard";
5767 +                       return RETERR(-EIO);
5768 +               }
5769 +               if (left_pos.node != NULL &&
5770 +                   !coord_are_neighbors(&left_pos, &right_pos)) {
5771 +                       *error_msg = "Children are not direct siblings";
5772 +                       return RETERR(-EIO);
5773 +               }
5774 +       }
5775 +       *error_msg = NULL;
5776 +
5777 +       info.doing = doing;
5778 +       info.todo = todo;
5779 +
5780 +       /*
5781 +        * If child node is not empty, new key of internal item is a key of
5782 +        * leftmost item in the child node. If the child is empty, take its
5783 +        * right delimiting key as a new key of the internal item. Precise key
5784 +        * in the latter case is not important per se, because the child (and
5785 +        * the internal item) are going to be killed shortly anyway, but we
5786 +        * have to preserve correct order of keys in the parent node.
5787 +        */
5788 +
5789 +       if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
5790 +               leftmost_key_in_node(right, &ldkey);
5791 +       else {
5792 +               read_lock_dk(znode_get_tree(parent));
5793 +               ldkey = *znode_get_rd_key(right);
5794 +               read_unlock_dk(znode_get_tree(parent));
5795 +       }
5796 +       node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
5797 +       doing->restartable = 0;
5798 +       znode_make_dirty(parent);
5799 +       return 0;
5800 +}
5801 +
5802 +/* implements COP_UPDATE operation
5803 +
5804 +   Update delimiting keys.
5805 +
5806 +*/
5807 +static int carry_update(carry_op * op /* operation to be performed */ ,
5808 +                       carry_level * doing /* current carry level */ ,
5809 +                       carry_level * todo /* next carry level */ )
5810 +{
5811 +       int result;
5812 +       carry_node *missing UNUSED_ARG;
5813 +       znode *left;
5814 +       znode *right;
5815 +       carry_node *lchild;
5816 +       carry_node *rchild;
5817 +       const char *error_msg;
5818 +       reiser4_tree *tree;
5819 +
5820 +       /*
5821 +        * This operation is called to update key of internal item. This is
5822 +        * necessary when carry shifted or cut data on the child
5823 +        * level. Arguments of this operation are:
5824 +        *
5825 +        *     @right --- child node. Operation should update key of internal
5826 +        *     item pointing to @right.
5827 +        *
5828 +        *     @left --- left neighbor of @right. This parameter is optional.
5829 +        */
5830 +
5831 +       assert("nikita-902", op != NULL);
5832 +       assert("nikita-903", todo != NULL);
5833 +       assert("nikita-904", op->op == COP_UPDATE);
5834 +
5835 +       lchild = op->u.update.left;
5836 +       rchild = op->node;
5837 +
5838 +       if (lchild != NULL) {
5839 +               assert("nikita-1001", lchild->parent);
5840 +               assert("nikita-1003", !lchild->left);
5841 +               left = reiser4_carry_real(lchild);
5842 +       } else
5843 +               left = NULL;
5844 +
5845 +       tree = znode_get_tree(rchild->node);
5846 +       read_lock_tree(tree);
5847 +       right = znode_parent(rchild->node);
5848 +       read_unlock_tree(tree);
5849 +
5850 +       if (right != NULL) {
5851 +               result = update_delimiting_key(right,
5852 +                                              lchild ? lchild->node : NULL,
5853 +                                              rchild->node,
5854 +                                              doing, todo, &error_msg);
5855 +       } else {
5856 +               error_msg = "Cannot find node to update key in";
5857 +               result = RETERR(-EIO);
5858 +       }
5859 +       /* operation will be reposted to the next level by the
5860 +          ->update_item_key() method of node plugin, if necessary. */
5861 +
5862 +       if (result != 0) {
5863 +               warning("nikita-999", "Error updating delimiting key: %s (%i)",
5864 +                       error_msg ? : "", result);
5865 +       }
5866 +       return result;
5867 +}
5868 +
5869 +/* move items from @node during carry */
5870 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
5871 +                           coord_t * insert_coord      /* coord where new item
5872 +                                                        * is to be inserted */ ,
5873 +                           znode * node /* node which data are moved from */ ,
5874 +                           carry_level * doing /* active carry queue */ ,
5875 +                           carry_level * todo  /* carry queue where new
5876 +                                                * operations are to be put
5877 +                                                * in */ ,
5878 +                           unsigned int including_insert_coord_p       /* true if
5879 +                                                                        * @insertion_coord
5880 +                                                                        * can be moved */ )
5881 +{
5882 +       int result;
5883 +       znode *source;
5884 +       carry_plugin_info info;
5885 +       node_plugin *nplug;
5886 +
5887 +       source = insert_coord->node;
5888 +
5889 +       info.doing = doing;
5890 +       info.todo = todo;
5891 +
5892 +       nplug = node_plugin_by_node(node);
5893 +       result = nplug->shift(insert_coord, node,
5894 +                             (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
5895 +                             (int)including_insert_coord_p, &info);
5896 +       /* the only error ->shift() method of node plugin can return is
5897 +          -ENOMEM due to carry node/operation allocation. */
5898 +       assert("nikita-915", result >= 0 || result == -ENOMEM);
5899 +       if (result > 0) {
5900 +               /*
5901 +                * if some number of bytes was actually shifted, mark nodes
5902 +                * dirty, and carry level as non-restartable.
5903 +                */
5904 +               doing->restartable = 0;
5905 +               znode_make_dirty(source);
5906 +               znode_make_dirty(node);
5907 +       }
5908 +
5909 +       assert("nikita-2077", coord_check(insert_coord));
5910 +       return 0;
5911 +}
5912 +
5913 +typedef carry_node *(*carry_iterator) (carry_node * node);
5914 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
5915 +                                 carry_iterator iterator);
5916 +
5917 +static carry_node *pool_level_list_prev(carry_node *node)
5918 +{
5919 +       return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
5920 +}
5921 +
5922 +/* look for the left neighbor of given carry node in a carry queue.
5923 +
5924 +   This is used by find_left_neighbor(), but I am not sure that this
5925 +   really gives any advantage. More statistics required.
5926 +
5927 +*/
5928 +carry_node *find_left_carry(carry_node * node  /* node to find left neighbor
5929 +                                                * of */ ,
5930 +                           carry_level * level /* level to scan */ )
5931 +{
5932 +       return find_dir_carry(node, level,
5933 +                             (carry_iterator) pool_level_list_prev);
5934 +}
5935 +
5936 +static carry_node *pool_level_list_next(carry_node *node)
5937 +{
5938 +       return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
5939 +}
5940 +
5941 +/* look for the right neighbor of given carry node in a
5942 +   carry queue.
5943 +
5944 +   This is used by find_right_neighbor(), but I am not sure that this
5945 +   really gives any advantage. More statistics required.
5946 +
5947 +*/
5948 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
5949 +                                                * of */ ,
5950 +                            carry_level * level /* level to scan */ )
5951 +{
5952 +       return find_dir_carry(node, level,
5953 +                             (carry_iterator) pool_level_list_next);
5954 +}
5955 +
5956 +/* look for the left or right neighbor of given carry node in a carry
5957 +   queue.
5958 +
5959 +   Helper function used by find_{left|right}_carry().
5960 +*/
5961 +static carry_node *find_dir_carry(carry_node * node    /* node to start scanning
5962 +                                                        * from */ ,
5963 +                                 carry_level * level /* level to scan */ ,
5964 +                                 carry_iterator iterator       /* operation to
5965 +                                                                * move to the next
5966 +                                                                * node */ )
5967 +{
5968 +       carry_node *neighbor;
5969 +
5970 +       assert("nikita-1059", node != NULL);
5971 +       assert("nikita-1060", level != NULL);
5972 +
5973 +       /* scan list of carry nodes on this list dir-ward, skipping all
5974 +          carry nodes referencing the same znode. */
5975 +       neighbor = node;
5976 +       while (1) {
5977 +               neighbor = iterator(neighbor);
5978 +               if (carry_node_end(level, neighbor))
5979 +                       /* list head is reached */
5980 +                       return NULL;
5981 +               if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
5982 +                       return neighbor;
5983 +       }
5984 +}
5985 +
5986 +/*
5987 + * Memory reservation estimation.
5988 + *
5989 + * Carry process proceeds through tree levels upwards. Carry assumes that it
5990 + * takes tree in consistent state (e.g., that search tree invariants hold),
5991 + * and leaves tree consistent after it finishes. This means that when some
5992 + * error occurs carry cannot simply return if there are pending carry
5993 + * operations. Generic solution for this problem is carry-undo either as
5994 + * transaction manager feature (requiring checkpoints and isolation), or
5995 + * through some carry specific mechanism.
5996 + *
5997 + * Our current approach is to panic if carry hits an error while tree is
5998 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
5999 + * this "memory reservation" mechanism was added.
6000 + *
6001 + * Memory reservation is implemented by perthread-pages.diff patch from
6002 + * core-patches. Its API is defined in <linux/gfp.h>
6003 + *
6004 + *     int  perthread_pages_reserve(int nrpages, gfp_t gfp);
6005 + *     void perthread_pages_release(int nrpages);
6006 + *     int  perthread_pages_count(void);
6007 + *
6008 + * carry estimates its worst case memory requirements at the entry, reserves
6009 + * enough memory, and releases unused pages before returning.
6010 + *
6011 + * Code below estimates worst case memory requirements for a given carry
6012 + * queue. This is done by summing worst case memory requirements for each
6013 + * operation in the queue.
6014 + *
6015 + */
6016 +
6017 +/*
6018 + * Memory requirements of many operations depend on the tree
6019 + * height. For example, item insertion requires new node to be inserted at
6020 + * each tree level in the worst case. What tree height should be used for
6021 + * estimation? Current tree height is wrong, because tree height can change
6022 + * between the time when estimation was done and the time when operation is
6023 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6024 + * is also not desirable, because it would lead to the huge over-estimation
6025 + * all the time. Plausible solution is "capped tree height": if current tree
6026 + * height is less than some TREE_HEIGHT_CAP constant, capped tree height is
6027 + * TREE_HEIGHT_CAP, otherwise it's current tree height. Idea behind this is
6028 + * that if tree height is TREE_HEIGHT_CAP or larger, it's extremely unlikely
6029 + * to be increased even more during short interval of time.
6030 + */
6031 +#define TREE_HEIGHT_CAP (5)
6032 +
6033 +/* return capped tree height for the @tree. See comment above. */
6034 +static int cap_tree_height(reiser4_tree * tree)
6035 +{
6036 +       return max_t(int, tree->height, TREE_HEIGHT_CAP);
6037 +}
6038 +
6039 +/* return capped tree height for the current tree. */
6040 +static int capped_height(void)
6041 +{
6042 +       return cap_tree_height(current_tree);
6043 +}
6044 +
6045 +/* return number of pages required to store given number of bytes */
6046 +static int bytes_to_pages(int bytes)
6047 +{
6048 +       return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6049 +}
6050 +
6051 +/* how many pages are required to allocate znodes during item insertion. */
6052 +static int carry_estimate_znodes(void)
6053 +{
6054 +       /*
6055 +        * Note that we have a problem here: there is no way to
6056 +        * reserve pages specifically for the given slab. This means that
6057 +        * these pages can be hijacked for some other end.
6058 +        */
6059 +
6060 +       /* in the worst case we need 3 new znode on each tree level */
6061 +       return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6062 +}
6063 +
6064 +/*
6065 + * how many pages are required to load bitmaps. One bitmap per level.
6066 + */
6067 +static int carry_estimate_bitmaps(void)
6068 +{
6069 +       if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6070 +               int bytes;
6071 +
6072 +               bytes = capped_height() * (0 +  /* bnode should be added, but its is private to
6073 +                                                * bitmap.c, skip for now. */
6074 +                                          2 * sizeof(jnode));  /* working and commit jnodes */
6075 +               return bytes_to_pages(bytes) + 2;       /* and their contents */
6076 +       } else
6077 +               /* bitmaps were pre-loaded during mount */
6078 +               return 0;
6079 +}
6080 +
6081 +/* worst case item insertion memory requirements */
6082 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6083 +{
6084 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6085 +           capped_height() +   /* new block on each level */
6086 +           1 +                 /* and possibly extra new block at the leaf level */
6087 +           3;                  /* loading of leaves into memory */
6088 +}
6089 +
6090 +/* worst case item deletion memory requirements */
6091 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6092 +{
6093 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6094 +           3;                  /* loading of leaves into memory */
6095 +}
6096 +
6097 +/* worst case tree cut memory requirements */
6098 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6099 +{
6100 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6101 +           3;                  /* loading of leaves into memory */
6102 +}
6103 +
6104 +/* worst case memory requirements of pasting into item */
6105 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6106 +{
6107 +       return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6108 +           capped_height() +   /* new block on each level */
6109 +           1 +                 /* and possibly extra new block at the leaf level */
6110 +           3;                  /* loading of leaves into memory */
6111 +}
6112 +
6113 +/* worst case memory requirements of extent insertion */
6114 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6115 +{
6116 +       return carry_estimate_insert(op, level) +       /* insert extent */
6117 +           carry_estimate_delete(op, level);   /* kill leaf */
6118 +}
6119 +
6120 +/* worst case memory requirements of key update */
6121 +static int carry_estimate_update(carry_op * op, carry_level * level)
6122 +{
6123 +       return 0;
6124 +}
6125 +
6126 +/* worst case memory requirements of flow insertion */
6127 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6128 +{
6129 +       int newnodes;
6130 +
6131 +       newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6132 +                      CARRY_FLOW_NEW_NODES_LIMIT);
6133 +       /*
6134 +        * roughly estimate insert_flow as a sequence of insertions.
6135 +        */
6136 +       return newnodes * carry_estimate_insert(op, level);
6137 +}
6138 +
6139 +/* This is dispatch table for carry operations. It can be trivially
6140 +   abstracted into useful plugin: tunable balancing policy is a good
6141 +   thing. */
6142 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6143 +       [COP_INSERT] = {
6144 +                       .handler = carry_insert,
6145 +                       .estimate = carry_estimate_insert}
6146 +       ,
6147 +       [COP_DELETE] = {
6148 +                       .handler = carry_delete,
6149 +                       .estimate = carry_estimate_delete}
6150 +       ,
6151 +       [COP_CUT] = {
6152 +                    .handler = carry_cut,
6153 +                    .estimate = carry_estimate_cut}
6154 +       ,
6155 +       [COP_PASTE] = {
6156 +                      .handler = carry_paste,
6157 +                      .estimate = carry_estimate_paste}
6158 +       ,
6159 +       [COP_EXTENT] = {
6160 +                       .handler = carry_extent,
6161 +                       .estimate = carry_estimate_extent}
6162 +       ,
6163 +       [COP_UPDATE] = {
6164 +                       .handler = carry_update,
6165 +                       .estimate = carry_estimate_update}
6166 +       ,
6167 +       [COP_INSERT_FLOW] = {
6168 +                            .handler = carry_insert_flow,
6169 +                            .estimate = carry_estimate_insert_flow}
6170 +};
6171 +
6172 +/* Make Linus happy.
6173 +   Local variables:
6174 +   c-indentation-style: "K&R"
6175 +   mode-name: "LC"
6176 +   c-basic-offset: 8
6177 +   tab-width: 8
6178 +   fill-column: 120
6179 +   scroll-step: 1
6180 +   End:
6181 +*/
6182 diff -urN linux-2.6.27.orig/fs/reiser4/carry_ops.h linux-2.6.27/fs/reiser4/carry_ops.h
6183 --- linux-2.6.27.orig/fs/reiser4/carry_ops.h    1970-01-01 03:00:00.000000000 +0300
6184 +++ linux-2.6.27/fs/reiser4/carry_ops.h 2008-10-12 18:20:00.000000000 +0400
6185 @@ -0,0 +1,42 @@
6186 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6187 +
6188 +/* implementation of carry operations. See carry_ops.c for details. */
6189 +
6190 +#if !defined( __CARRY_OPS_H__ )
6191 +#define __CARRY_OPS_H__
6192 +
6193 +#include "forward.h"
6194 +#include "znode.h"
6195 +#include "carry.h"
6196 +
6197 +/* carry operation handlers */
6198 +typedef struct carry_op_handler {
6199 +       /* perform operation */
6200 +       int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6201 +       /* estimate memory requirements for @op */
6202 +       int (*estimate) (carry_op * op, carry_level * level);
6203 +} carry_op_handler;
6204 +
6205 +/* This is dispatch table for carry operations. It can be trivially
6206 +   abstracted into useful plugin: tunable balancing policy is a good
6207 +   thing. */
6208 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6209 +
6210 +unsigned int space_needed(const znode * node, const coord_t * coord,
6211 +                         const reiser4_item_data * data, int inserting);
6212 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6213 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6214 +
6215 +/* __CARRY_OPS_H__ */
6216 +#endif
6217 +
6218 +/* Make Linus happy.
6219 +   Local variables:
6220 +   c-indentation-style: "K&R"
6221 +   mode-name: "LC"
6222 +   c-basic-offset: 8
6223 +   tab-width: 8
6224 +   fill-column: 120
6225 +   scroll-step: 1
6226 +   End:
6227 +*/
6228 diff -urN linux-2.6.27.orig/fs/reiser4/context.c linux-2.6.27/fs/reiser4/context.c
6229 --- linux-2.6.27.orig/fs/reiser4/context.c      1970-01-01 03:00:00.000000000 +0300
6230 +++ linux-2.6.27/fs/reiser4/context.c   2008-10-12 18:20:00.000000000 +0400
6231 @@ -0,0 +1,288 @@
6232 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6233 +
6234 +/* Manipulation of reiser4_context */
6235 +
6236 +/*
6237 + * global context used during system call. Variable of this type is allocated
6238 + * on the stack at the beginning of the reiser4 part of the system call and
6239 + * pointer to it is stored in the current->fs_context. This allows us to avoid
6240 + * passing pointer to current transaction and current lockstack (both in
6241 + * one-to-one mapping with threads) all over the call chain.
6242 + *
6243 + * It's kind of like those global variables the prof used to tell you not to
6244 + * use in CS1, except thread specific.;-) Nikita, this was a good idea.
6245 + *
6246 + * In some situations it is desirable to have ability to enter reiser4_context
6247 + * more than once for the same thread (nested contexts). For example, there
6248 + * are some functions that can be called either directly from VFS/VM or from
6249 + * already active reiser4 context (->writepage, for example).
6250 + *
6251 + * In such situations "child" context acts like dummy: all activity is
6252 + * actually performed in the top level context, and get_current_context()
6253 + * always returns top level context.
6254 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6255 + * nested anyway.
6256 + *
6257 + * Note that there is an important difference between the way reiser4 uses
6258 + * ->fs_context and the way other file systems use it. Other file systems
6259 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6260 + * (this is why ->fs_context was initially called ->journal_info). This means,
6261 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6262 + * to the file system, they assume that some transaction is already underway,
6263 + * and usually bail out, because starting nested transaction would most likely
6264 + * lead to the deadlock. This gives false positives with reiser4, because we
6265 + * set ->fs_context before starting transaction.
6266 + */
6267 +
6268 +#include "debug.h"
6269 +#include "super.h"
6270 +#include "context.h"
6271 +
6272 +#include <linux/writeback.h>   /* balance_dirty_pages() */
6273 +#include <linux/hardirq.h>
6274 +
6275 +static void _reiser4_init_context(reiser4_context * context,
6276 +                                 struct super_block *super)
6277 +{
6278 +       memset(context, 0, sizeof(*context));
6279 +
6280 +       context->super = super;
6281 +       context->magic = context_magic;
6282 +       context->outer = current->journal_info;
6283 +       current->journal_info = (void *)context;
6284 +       context->nr_children = 0;
6285 +       context->gfp_mask = GFP_KERNEL;
6286 +
6287 +       init_lock_stack(&context->stack);
6288 +
6289 +       reiser4_txn_begin(context);
6290 +
6291 +       /* initialize head of tap list */
6292 +       INIT_LIST_HEAD(&context->taps);
6293 +#if REISER4_DEBUG
6294 +       context->task = current;
6295 +#endif
6296 +       grab_space_enable();
6297 +}
6298 +
6299 +/* initialize context and bind it to the current thread
6300 +
6301 +   This function should be called at the beginning of reiser4 part of
6302 +   syscall.
6303 +*/
6304 +reiser4_context * reiser4_init_context(struct super_block * super)
6305 +{
6306 +       reiser4_context *context;
6307 +
6308 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6309 +       assert("nikita-3357", super != NULL);
6310 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6311 +
6312 +       context = get_current_context_check();
6313 +       if (context && context->super == super) {
6314 +               context = (reiser4_context *) current->journal_info;
6315 +               context->nr_children++;
6316 +               return context;
6317 +       }
6318 +
6319 +       context = kmalloc(sizeof(*context), GFP_KERNEL);
6320 +       if (context == NULL)
6321 +               return ERR_PTR(RETERR(-ENOMEM));
6322 +
6323 +       _reiser4_init_context(context, super);
6324 +       return context;
6325 +}
6326 +
6327 +/* this is used in scan_mgr which is called with spinlock held and in
6328 +   reiser4_fill_super magic */
6329 +void init_stack_context(reiser4_context *context, struct super_block *super)
6330 +{
6331 +       assert("nikita-2662", !in_interrupt() && !in_irq());
6332 +       assert("nikita-3357", super != NULL);
6333 +       assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6334 +       assert("vs-12", !is_in_reiser4_context());
6335 +
6336 +       _reiser4_init_context(context, super);
6337 +       context->on_stack = 1;
6338 +       return;
6339 +}
6340 +
6341 +/* cast lock stack embedded into reiser4 context up to its container */
6342 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6343 +{
6344 +       return container_of(owner, reiser4_context, stack);
6345 +}
6346 +
6347 +/* true if there is already _any_ reiser4 context for the current thread */
6348 +int is_in_reiser4_context(void)
6349 +{
6350 +       reiser4_context *ctx;
6351 +
6352 +       ctx = current->journal_info;
6353 +       return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6354 +}
6355 +
6356 +/*
6357 + * call balance dirty pages for the current context.
6358 + *
6359 + * File system is expected to call balance_dirty_pages_ratelimited() whenever
6360 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6361 + * write---this covers vast majority of all dirty traffic), but we cannot do
6362 + * this immediately when formatted node is dirtied, because long term lock is
6363 + * usually held at that time. To work around this, dirtying of formatted node
6364 + * simply increases ->nr_marked_dirty counter in the current reiser4
6365 + * context. When we are about to leave this context,
6366 + * balance_dirty_pages_ratelimited() is called, if necessary.
6367 + *
6368 + * This introduces another problem: sometimes we do not want to run
6369 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6370 + * because some important lock (like ->i_mutex on the parent directory) is
6371 + * held. To achieve this, ->nobalance flag can be set in the current context.
6372 + */
6373 +static void balance_dirty_pages_at(reiser4_context *context)
6374 +{
6375 +       reiser4_super_info_data *sbinfo = get_super_private(context->super);
6376 +
6377 +       /*
6378 +        * call balance_dirty_pages_ratelimited() to process formatted nodes
6379 +        * dirtied during this system call. Do that only if we are not in mount
6380 +        * and there were nodes dirtied in this context and we are not in
6381 +        * writepage (to avoid deadlock) and not in pdflush
6382 +        */
6383 +       if (sbinfo != NULL && sbinfo->fake != NULL &&
6384 +           context->nr_marked_dirty != 0 &&
6385 +           !(current->flags & PF_MEMALLOC) &&
6386 +           !current_is_pdflush())
6387 +               balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6388 +}
6389 +
6390 +/* release resources associated with context.
6391 +
6392 +   This function should be called at the end of "session" with reiser4,
6393 +   typically just before leaving reiser4 driver back to VFS.
6394 +
6395 +   This is good place to put some debugging consistency checks, like that
6396 +   thread released all locks and closed transcrash etc.
6397 +
6398 +*/
6399 +static void reiser4_done_context(reiser4_context * context /* context being released */ )
6400 +{
6401 +       assert("nikita-860", context != NULL);
6402 +       assert("nikita-859", context->magic == context_magic);
6403 +       assert("vs-646", (reiser4_context *) current->journal_info == context);
6404 +       assert("zam-686", !in_interrupt() && !in_irq());
6405 +
6406 +       /* only do anything when leaving top-level reiser4 context. All nested
6407 +        * contexts are just dummies. */
6408 +       if (context->nr_children == 0) {
6409 +               assert("jmacd-673", context->trans == NULL);
6410 +               assert("jmacd-1002", lock_stack_isclean(&context->stack));
6411 +               assert("nikita-1936", reiser4_no_counters_are_held());
6412 +               assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6413 +               assert("zam-1004", ergo(get_super_private(context->super),
6414 +                                       get_super_private(context->super)->delete_mutex_owner !=
6415 +                                       current));
6416 +
6417 +               /* release all grabbed but as yet unused blocks */
6418 +               if (context->grabbed_blocks != 0)
6419 +                       all_grabbed2free();
6420 +
6421 +               /*
6422 +                * synchronize against longterm_unlock_znode():
6423 +                * wake_up_requestor() wakes up requestors without holding
6424 +                * zlock (otherwise they will immediately bump into that lock
6425 +                * after wake up on another CPU). To work around (rare)
6426 +                * situation where requestor has been woken up asynchronously
6427 +                * and managed to run until completion (and destroy its
6428 +                * context and lock stack) before wake_up_requestor() called
6429 +                * wake_up() on it, wake_up_requestor() synchronizes on lock
6430 +                * stack spin lock. It has actually been observed that spin
6431 +                * lock _was_ locked at this point, because
6432 +                * wake_up_requestor() took interrupt.
6433 +                */
6434 +               spin_lock_stack(&context->stack);
6435 +               spin_unlock_stack(&context->stack);
6436 +
6437 +               assert("zam-684", context->nr_children == 0);
6438 +               /* restore original ->fs_context value */
6439 +               current->journal_info = context->outer;
6440 +               if (context->on_stack == 0)
6441 +                       kfree(context);
6442 +       } else {
6443 +               context->nr_children--;
6444 +#if REISER4_DEBUG
6445 +               assert("zam-685", context->nr_children >= 0);
6446 +#endif
6447 +       }
6448 +}
6449 +
6450 +/*
6451 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6452 + * transaction. Call reiser4_done_context() to do context related book-keeping.
6453 + */
6454 +void reiser4_exit_context(reiser4_context * context)
6455 +{
6456 +       assert("nikita-3021", reiser4_schedulable());
6457 +
6458 +       if (context->nr_children == 0) {
6459 +               if (!context->nobalance) {
6460 +                       reiser4_txn_restart(context);
6461 +                       balance_dirty_pages_at(context);
6462 +               }
6463 +
6464 +               /* if filesystem is mounted with -o sync or -o dirsync - commit
6465 +                  transaction.  FIXME: TXNH_DONT_COMMIT is used to avoid
6466 +                  committing on exit_context when inode semaphore is held and
6467 +                  to have ktxnmgrd to do commit instead to get better
6468 +                  concurrent filesystem accesses. But, when one mounts with -o
6469 +                  sync, he cares more about reliability than about
6470 +                  performance. So, for now we have this simple mount -o sync
6471 +                  support. */
6472 +               if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
6473 +                       txn_atom *atom;
6474 +
6475 +                       atom = get_current_atom_locked_nocheck();
6476 +                       if (atom) {
6477 +                               atom->flags |= ATOM_FORCE_COMMIT;
6478 +                               context->trans->flags &= ~TXNH_DONT_COMMIT;
6479 +                               spin_unlock_atom(atom);
6480 +                       }
6481 +               }
6482 +               reiser4_txn_end(context);
6483 +       }
6484 +       reiser4_done_context(context);
6485 +}
6486 +
6487 +void reiser4_ctx_gfp_mask_set(void)
6488 +{
6489 +       reiser4_context *ctx;
6490 +
6491 +       ctx = get_current_context();
6492 +       if (ctx->entd == 0 &&
6493 +           list_empty(&ctx->stack.locks) &&
6494 +           ctx->trans->atom == NULL)
6495 +               ctx->gfp_mask = GFP_KERNEL;
6496 +       else
6497 +               ctx->gfp_mask = GFP_NOFS;
6498 +}
6499 +
6500 +void reiser4_ctx_gfp_mask_force (gfp_t mask)
6501 +{
6502 +       reiser4_context *ctx;
6503 +       ctx = get_current_context();
6504 +
6505 +       assert("edward-1454", ctx != NULL);
6506 +
6507 +       ctx->gfp_mask = mask;
6508 +}
6509 +
6510 +/*
6511 + * Local variables:
6512 + * c-indentation-style: "K&R"
6513 + * mode-name: "LC"
6514 + * c-basic-offset: 8
6515 + * tab-width: 8
6516 + * fill-column: 120
6517 + * scroll-step: 1
6518 + * End:
6519 + */
6520 diff -urN linux-2.6.27.orig/fs/reiser4/context.h linux-2.6.27/fs/reiser4/context.h
6521 --- linux-2.6.27.orig/fs/reiser4/context.h      1970-01-01 03:00:00.000000000 +0300
6522 +++ linux-2.6.27/fs/reiser4/context.h   2008-10-12 18:20:00.000000000 +0400
6523 @@ -0,0 +1,228 @@
6524 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
6525 + * reiser4/README */
6526 +
6527 +/* Reiser4 context. See context.c for details. */
6528 +
6529 +#if !defined( __REISER4_CONTEXT_H__ )
6530 +#define __REISER4_CONTEXT_H__
6531 +
6532 +#include "forward.h"
6533 +#include "debug.h"
6534 +#include "dformat.h"
6535 +#include "tap.h"
6536 +#include "lock.h"
6537 +
6538 +#include <linux/types.h>       /* for __u??  */
6539 +#include <linux/fs.h>          /* for struct super_block  */
6540 +#include <linux/spinlock.h>
6541 +#include <linux/sched.h>       /* for struct task_struct */
6542 +
6543 +/* reiser4 per-thread context; get_context() reaches it through task_struct->journal_info */
6544 +struct reiser4_context {
6545 +       /* magic constant (context_magic). For identification of reiser4 contexts. */
6546 +       __u32 magic;
6547 +
6548 +       /* current lock stack. See lock.[ch]. This is where list of all
6549 +          locks taken by current thread is kept. This is also used in
6550 +          deadlock detection. */
6551 +       lock_stack stack;
6552 +
6553 +       /* current transcrash (transaction handle). */
6554 +       txn_handle *trans;
6555 +       /* transaction handle embedded into reiser4_context. ->trans points
6556 +        * here by default. */
6557 +       txn_handle trans_in_ctx;
6558 +
6559 +       /* super block we are working with.  To get the current tree
6560 +          use &get_super_private (reiser4_get_current_sb ())->tree. */
6561 +       struct super_block *super;
6562 +
6563 +       /* parent fs activation */
6564 +       struct fs_activation *outer;
6565 +
6566 +       /* per-thread grabbed (for further allocation) blocks counter */
6567 +       reiser4_block_nr grabbed_blocks;
6568 +
6569 +       /* list of taps currently monitored. See tap.c */
6570 +       struct list_head taps;
6571 +
6572 +       /* grabbing space is enabled */
6573 +       unsigned int grab_enabled:1;
6574 +       /* should be set while we are writing dirty nodes to disk in jnode_flush or
6575 +        * reiser4_write_logs() */
6576 +       unsigned int writeout_mode:1;
6577 +       /* true, if current thread is an ent thread */
6578 +       unsigned int entd:1;
6579 +       /* true, if balance_dirty_pages() should not be run when leaving this
6580 +        * context. This is used to avoid lengthy balance_dirty_pages()
6581 +        * operation when holding some important resource, like directory
6582 +        * ->i_mutex */
6583 +       unsigned int nobalance:1;
6584 +
6585 +       /* this bit is used on reiser4_done_context to decide whether context is
6586 +          kmalloc-ed and has to be kfree-ed */
6587 +       unsigned int on_stack:1;
6588 +
6589 +       /* count non-trivial jnode_set_dirty() calls */
6590 +       unsigned long nr_marked_dirty;
6591 +
6592 +       /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
6593 +        * reiser4_writepages for each of dirty inodes. Reiser4_writepages
6594 +        * captures pages. When number of pages captured in one
6595 +        * reiser4_sync_inodes reaches some threshold - some atoms get
6596 +        * flushed */
6597 +       int nr_captured;
6598 +       int nr_children;        /* number of child contexts */
6599 +#if REISER4_DEBUG
6600 +       /* debugging information about reiser4 locks held by the current
6601 +        * thread */
6602 +       reiser4_lock_cnt_info locks;
6603 +       struct task_struct *task;       /* so we can easily find owner of the stack */
6604 +
6605 +       /*
6606 +        * disk space grabbing debugging support
6607 +        */
6608 +       /* how many disk blocks were grabbed by the first call to
6609 +        * reiser4_grab_space() in this context */
6610 +       reiser4_block_nr grabbed_initially;
6611 +
6612 +       /* list of all threads doing flush currently */
6613 +       struct list_head flushers_link;
6614 +       /* information about last error encountered by reiser4 */
6615 +       err_site err;
6616 +#endif
6617 +       void *vp;       /* NOTE(review): purpose is not documented in this header - TODO describe */
6618 +       gfp_t gfp_mask; /* allocation mask: read by reiser4_ctx_gfp_mask_get(), written by reiser4_ctx_gfp_mask_set()/_force() */
6619 +};
6620 +
6621 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
6622 +
6623 +/* Debugging helpers. */
6624 +#if REISER4_DEBUG
6625 +extern void print_contexts(void);
6626 +#endif
6627 +
6628 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))     /* tree of the current super block */
6629 +#define current_blocksize (reiser4_get_current_sb()->s_blocksize)       /* ->s_blocksize of the current super block */
6630 +#define current_blocksize_bits (reiser4_get_current_sb()->s_blocksize_bits)     /* ->s_blocksize_bits of the current super block */
6631 +
6632 +extern reiser4_context *reiser4_init_context(struct super_block *);
6633 +extern void init_stack_context(reiser4_context *, struct super_block *);
6634 +extern void reiser4_exit_context(reiser4_context *);
6635 +
6636 +/* magic constant we store in the ->magic field of reiser4_context. Used (see
6637 +   get_context()) to catch accesses to stale or uninitialized contexts. */
6638 +#define context_magic ((__u32) 0x4b1b5d0b)
6639 +
6640 +extern int is_in_reiser4_context(void);
6641 +
6642 +/*
6643 + * return reiser4_context for the thread @tsk, i.e. tsk->journal_info; asserts that its ->magic is context_magic
6644 + */
6645 +static inline reiser4_context *get_context(const struct task_struct *tsk)
6646 +{
6647 +       assert("vs-1682",
6648 +              ((reiser4_context *) tsk->journal_info)->magic == context_magic);
6649 +       return (reiser4_context *) tsk->journal_info;
6650 +}
6651 +
6652 +/*
6653 + * return reiser4 context of the current thread, or NULL if is_in_reiser4_context() reports there is none.
6654 + */
6655 +static inline reiser4_context *get_current_context_check(void)
6656 +{
6657 +       if (is_in_reiser4_context())
6658 +               return get_context(current);
6659 +       else
6660 +               return NULL;
6661 +}
6662 +
6663 +static inline reiser4_context *get_current_context(void);      /* __attribute__((const)); */
6664 +
6665 +/* return context associated with current thread; no NULL check, get_context() only asserts on the magic */
6666 +static inline reiser4_context *get_current_context(void)
6667 +{
6668 +       return get_context(current);
6669 +}
6670 +
6671 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
6672 +{
6673 +       reiser4_context *ctx;
6674 +       /* Allocation mask of the current context, or GFP_KERNEL when the thread has no reiser4 context. */
6675 +       ctx = get_current_context_check();
6676 +       return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
6677 +}
6678 +
6679 +void reiser4_ctx_gfp_mask_set(void);
6680 +void reiser4_ctx_gfp_mask_force (gfp_t mask);
6681 +
6682 +/*
6683 + * non-zero if current thread is in the write-out mode (->writeout_mode of its context is set).
6684 + * Thread enters write-out mode during jnode_flush and reiser4_write_logs().
6685 + */
6686 +static inline int is_writeout_mode(void)
6687 +{
6688 +       return get_current_context()->writeout_mode;
6689 +}
6690 +
6691 +/*
6692 + * enter write-out mode; asserts that the current context is not already in it
6693 + */
6694 +static inline void writeout_mode_enable(void)
6695 +{
6696 +       assert("zam-941", !get_current_context()->writeout_mode);
6697 +       get_current_context()->writeout_mode = 1;
6698 +}
6699 +
6700 +/*
6701 + * leave write-out mode; asserts that the current context is in it
6702 + */
6703 +static inline void writeout_mode_disable(void)
6704 +{
6705 +       assert("zam-942", get_current_context()->writeout_mode);
6706 +       get_current_context()->writeout_mode = 0;
6707 +}
6708 +
6709 +static inline void grab_space_enable(void)
6710 +{
6711 +       get_current_context()->grab_enabled = 1;        /* allow space grabbing in the current context */
6712 +}
6713 +
6714 +static inline void grab_space_disable(void)
6715 +{
6716 +       get_current_context()->grab_enabled = 0;        /* forbid space grabbing in the current context */
6717 +}
6718 +
6719 +static inline void grab_space_set_enabled(int enabled)  /* enable (@enabled != 0) or disable space grabbing */
6720 +{
6721 +       get_current_context()->grab_enabled = !!enabled;        /* 1-bit field: normalize so that any non-zero value enables */
6722 +}
6723 +
6724 +static inline int is_grab_enabled(reiser4_context * ctx)
6725 +{
6726 +       return ctx->grab_enabled;       /* 1 if space grabbing is enabled in @ctx, 0 otherwise */
6727 +}
6728 +
6729 +/* mark transaction handle in @context as TXNH_DONT_COMMIT, so that no commit or
6730 + * flush would be performed when it is closed, and set ->nobalance of @context.
6731 + * This is necessary when handle has to be closed under some coarse semaphore,
6732 + * like i_mutex of directory. Commit will be performed by ktxnmgrd. */
6733 +static inline void context_set_commit_async(reiser4_context * context)
6734 +{
6735 +       context->nobalance = 1;
6736 +       context->trans->flags |= TXNH_DONT_COMMIT;
6737 +}
6738 +
6739 +/* __REISER4_CONTEXT_H__ */
6740 +#endif
6741 +
6742 +/* Make Linus happy.
6743 +   Local variables:
6744 +   c-indentation-style: "K&R"
6745 +   mode-name: "LC"
6746 +   c-basic-offset: 8
6747 +   tab-width: 8
6748 +   fill-column: 120
6749 +   scroll-step: 1
6750 +   End:
6751 +*/
6752 diff -urN linux-2.6.27.orig/fs/reiser4/coord.c linux-2.6.27/fs/reiser4/coord.c
6753 --- linux-2.6.27.orig/fs/reiser4/coord.c        1970-01-01 03:00:00.000000000 +0300
6754 +++ linux-2.6.27/fs/reiser4/coord.c     2008-10-12 18:20:00.000000000 +0400
6755 @@ -0,0 +1,935 @@
6756 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6757 +
6758 +#include "forward.h"
6759 +#include "debug.h"
6760 +#include "dformat.h"
6761 +#include "tree.h"
6762 +#include "plugin/item/item.h"
6763 +#include "znode.h"
6764 +#include "coord.h"
6765 +
6766 +/* Internal constructor: stores @node and the given position in @coord; debug-only ->plug_v and ->body_v are zeroed. */
6767 +static inline void
6768 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
6769 +                 pos_in_node_t unit_pos, between_enum between)
6770 +{
6771 +       coord->node = (znode *) node;
6772 +       coord_set_item_pos(coord, item_pos);
6773 +       coord->unit_pos = unit_pos;
6774 +       coord->between = between;
6775 +       ON_DEBUG(coord->plug_v = 0);
6776 +       ON_DEBUG(coord->body_v = 0);
6777 +
6778 +       /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
6779 +}
6780 +
6781 +/* after shifting of node content, coord previously set properly may become
6782 +   invalid, try to "normalize" it. Positions past the last item are fixed up before any unit count is taken. */
6783 +void coord_normalize(coord_t * coord)
6784 +{
6785 +       znode *node;
6786 +
6787 +       node = coord->node;
6788 +       assert("vs-683", node);
6789 +
6790 +       coord_clear_iplug(coord);
6791 +
6792 +       if (node_is_empty(node)) {
6793 +               coord_init_first_unit(coord, node);
6794 +       } else if ((coord->between == AFTER_ITEM)
6795 +                  || (coord->between == AFTER_UNIT)) {
6796 +               return;
6797 +       } else if (coord->item_pos == coord_num_items(coord)
6798 +                  && coord->between == BEFORE_ITEM) {
6799 +               coord_dec_item_pos(coord);      /* "before item past the end" becomes "after the last item" */
6800 +               coord->between = AFTER_ITEM;
6801 +       } else if (coord->item_pos == coord_num_items(coord)
6802 +                  && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
6803 +               coord_dec_item_pos(coord);      /* same for "before first unit of the item past the end" */
6804 +               coord->unit_pos = 0;
6805 +               coord->between = AFTER_ITEM;
6806 +       } else if (coord->between == BEFORE_UNIT        /* test ->between first: coord_num_units() asserts an existing item */
6807 +                  && coord->unit_pos == coord_num_units(coord)) {
6808 +               coord->unit_pos--;      /* "before unit past the end" becomes "after the last unit" */
6809 +               coord->between = AFTER_UNIT;
6810 +       }
6811 +}
6812 +
6813 +/* Copy @old_coord into @coord. Asserts that @old_coord passes coord_check(). */
6814 +void coord_dup(coord_t * coord, const coord_t * old_coord)
6815 +{
6816 +       assert("jmacd-9800", coord_check(old_coord));
6817 +       coord_dup_nocheck(coord, old_coord);
6818 +}
6819 +
6820 +/* Copy a coordinate (including ->iplugid) without check. Useful when old_coord->node is not
6821 +   loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
6822 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
6823 +{
6824 +       coord->node = old_coord->node;
6825 +       coord_set_item_pos(coord, old_coord->item_pos);
6826 +       coord->unit_pos = old_coord->unit_pos;
6827 +       coord->between = old_coord->between;
6828 +       coord->iplugid = old_coord->iplugid;
6829 +       ON_DEBUG(coord->plug_v = old_coord->plug_v);
6830 +       ON_DEBUG(coord->body_v = old_coord->body_v);
6831 +}
6832 +
6833 +/* Initialize @coord as an invalid coordinate in @node: item_pos 0, unit_pos 0, between INVALID_COORD. */
6834 +void coord_init_invalid(coord_t * coord, const znode * node)
6835 +{
6836 +       coord_init_values(coord, node, 0, 0, INVALID_COORD);
6837 +}
6838 +
6839 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
6840 +{
6841 +       coord_init_values(coord, node, 0, 0, AT_UNIT);  /* as coord_init_first_unit(), minus the emptiness test and coord_check() */
6842 +}
6843 +
6844 +/* Initialize a coordinate to point at the first unit of the first item (item_pos 0, unit_pos 0).  If the node is
6845 +   empty, it is positioned at the EMPTY_NODE. */
6846 +void coord_init_first_unit(coord_t * coord, const znode * node)
6847 +{
6848 +       int is_empty = node_is_empty(node);
6849 +
6850 +       coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
6851 +
6852 +       assert("jmacd-9801", coord_check(coord));
6853 +}
6854 +
6855 +/* Initialize a coordinate to point at the last unit of the last item.  If the node is
6856 +   empty, it is positioned at the EMPTY_NODE (item_pos 0, unit_pos 0). */
6857 +void coord_init_last_unit(coord_t * coord, const znode * node)
6858 +{
6859 +       int is_empty = node_is_empty(node);
6860 +
6861 +       coord_init_values(coord, node,
6862 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
6863 +                         (is_empty ? EMPTY_NODE : AT_UNIT));
6864 +       if (!is_empty)  /* unit_pos is filled in after item_pos is set, since coord_last_unit_pos() takes the coord */
6865 +               coord->unit_pos = coord_last_unit_pos(coord);
6866 +       assert("jmacd-9802", coord_check(coord));
6867 +}
6868 +
6869 +/* Initialize a coordinate to before the first item (stored as BEFORE_UNIT at item 0, unit 0).  If the node is empty, it is
6870 +   positioned at the EMPTY_NODE. */
6871 +void coord_init_before_first_item(coord_t * coord, const znode * node)
6872 +{
6873 +       int is_empty = node_is_empty(node);
6874 +
6875 +       coord_init_values(coord, node, 0, 0,
6876 +                         (is_empty ? EMPTY_NODE : BEFORE_UNIT));
6877 +
6878 +       assert("jmacd-9803", coord_check(coord));
6879 +}
6880 +
6881 +/* Initialize a coordinate to after the last item (AFTER_ITEM at the last item_pos, unit_pos 0).  If the node is empty, it is positioned
6882 +   at the EMPTY_NODE. */
6883 +void coord_init_after_last_item(coord_t * coord, const znode * node)
6884 +{
6885 +       int is_empty = node_is_empty(node);
6886 +
6887 +       coord_init_values(coord, node,
6888 +                         (is_empty ? 0 : node_num_items(node) - 1), 0,
6889 +                         (is_empty ? EMPTY_NODE : AFTER_ITEM));
6890 +
6891 +       assert("jmacd-9804", coord_check(coord));
6892 +}
6893 +
6894 +/* Initialize a coordinate to after last unit in the item. Coord must be set
6895 +   already to existing item; only ->between and ->unit_pos are changed. */
6896 +void coord_init_after_item_end(coord_t * coord)
6897 +{
6898 +       coord->between = AFTER_UNIT;
6899 +       coord->unit_pos = coord_last_unit_pos(coord);
6900 +}
6901 +
6902 +/* Initialize a coordinate to before the item (unit_pos is reset to 0). Coord must be set already to existing item */
6903 +void coord_init_before_item(coord_t * coord)
6904 +{
6905 +       coord->unit_pos = 0;
6906 +       coord->between = BEFORE_ITEM;
6907 +}
6908 +
6909 +/* Initialize a coordinate to after the item (unit_pos is reset to 0). Coord must be set already to existing item */
6910 +void coord_init_after_item(coord_t * coord)
6911 +{
6912 +       coord->unit_pos = 0;
6913 +       coord->between = AFTER_ITEM;
6914 +}
6915 +
6916 +/* Fill a coordinate with zero bytes. Used in places where init_coord was used and
6917 +   it was not clear how exactly. */
6918 +void coord_init_zero(coord_t * coord)
6919 +{
6920 +       memset(coord, 0, sizeof(*coord));
6921 +}
6922 +
6923 +/* Return the number of units at the present item, as reported by the item plugin's ->b.nr_units().  Asserts coord_is_existing_item(). */
6924 +unsigned coord_num_units(const coord_t * coord)
6925 +{
6926 +       assert("jmacd-9806", coord_is_existing_item(coord));
6927 +
6928 +       return item_plugin_by_coord(coord)->b.nr_units(coord);
6929 +}
6930 +
6931 +/* Returns true if the coord was initialized by coord_init_invalid (), i.e. ->between is INVALID_COORD. */
6932 +/* Audited by: green(2002.06.15) */
6933 +int coord_is_invalid(const coord_t * coord)
6934 +{
6935 +       return coord->between == INVALID_COORD;
6936 +}
6937 +
6938 +/* Returns true if the coordinate is positioned at an existing item, not before or after
6939 +   an item.  It may be placed at, before, or after any unit within the item, whether
6940 +   existing or not: only item_pos is checked against the number of items. */
6941 +int coord_is_existing_item(const coord_t * coord)
6942 +{
6943 +       switch (coord->between) {
6944 +       case EMPTY_NODE:
6945 +       case BEFORE_ITEM:
6946 +       case AFTER_ITEM:
6947 +       case INVALID_COORD:
6948 +               return 0;
6949 +
6950 +       case BEFORE_UNIT:
6951 +       case AT_UNIT:
6952 +       case AFTER_UNIT:
6953 +               return coord->item_pos < coord_num_items(coord);
6954 +       }
6955 +
6956 +       impossible("jmacd-9900", "unreachable coord: %p", coord);
6957 +       return 0;
6958 +}
6959 +
6960 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
6961 +   unit: ->between must be AT_UNIT and both item_pos and unit_pos must be in range. */
6962 +/* Audited by: green(2002.06.15) */
6963 +int coord_is_existing_unit(const coord_t * coord)
6964 +{
6965 +       switch (coord->between) {
6966 +       case EMPTY_NODE:
6967 +       case BEFORE_UNIT:
6968 +       case AFTER_UNIT:
6969 +       case BEFORE_ITEM:
6970 +       case AFTER_ITEM:
6971 +       case INVALID_COORD:
6972 +               return 0;
6973 +
6974 +       case AT_UNIT:
6975 +               return (coord->item_pos < coord_num_items(coord)
6976 +                       && coord->unit_pos < coord_num_units(coord));
6977 +       }
6978 +
6979 +       impossible("jmacd-9902", "unreachable");
6980 +       return 0;
6981 +}
6982 +
6983 +/* Returns true if the coordinate is positioned at the first unit of the first item (AT_UNIT, item_pos 0,
6984 +   unit_pos 0).  Not true for empty nodes nor coordinates positioned before the first item. */
6985 +/* Audited by: green(2002.06.15) */
6986 +int coord_is_leftmost_unit(const coord_t * coord)
6987 +{
6988 +       return (coord->between == AT_UNIT && coord->item_pos == 0
6989 +               && coord->unit_pos == 0);
6990 +}
6991 +
6992 +#if REISER4_DEBUG
6993 +/* For assertions only, checks for a valid coordinate. Returns 1 if @coord is consistent with its node, 0 otherwise. */
6994 +int coord_check(const coord_t * coord)
6995 +{
6996 +       if (coord->node == NULL) {
6997 +               return 0;
6998 +       }
6999 +       if (znode_above_root(coord->node))      /* any position is accepted in the node above the root */
7000 +               return 1;
7001 +
7002 +       switch (coord->between) {
7003 +       default:
7004 +       case INVALID_COORD:
7005 +               return 0;
7006 +       case EMPTY_NODE:
7007 +               if (!node_is_empty(coord->node)) {
7008 +                       return 0;
7009 +               }
7010 +               return coord->item_pos == 0 && coord->unit_pos == 0;
7011 +
7012 +       case BEFORE_UNIT:
7013 +       case AFTER_UNIT:
7014 +               if (node_is_empty(coord->node) && (coord->item_pos == 0)
7015 +                   && (coord->unit_pos == 0))
7016 +                       return 1;
7017 +       case AT_UNIT:   /* BEFORE_UNIT/AFTER_UNIT fall through to here */
7018 +               break;
7019 +       case AFTER_ITEM:
7020 +       case BEFORE_ITEM:
7021 +               /* before/after item should not set unit_pos. */
7022 +               if (coord->unit_pos != 0) {
7023 +                       return 0;
7024 +               }
7025 +               break;
7026 +       }
7027 +
7028 +       if (coord->item_pos >= node_num_items(coord->node)) {
7029 +               return 0;
7030 +       }
7031 +
7032 +       /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7033 +          between is set either AFTER_ITEM or BEFORE_ITEM */
7034 +       if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7035 +               return 1;
7036 +
7037 +       if (coord_is_iplug_set(coord) &&        /* unit_pos is range-checked only when the item plugin is set in the coord */
7038 +           coord->unit_pos >
7039 +           item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7040 +               return 0;
7041 +       }
7042 +       return 1;
7043 +}
7044 +#endif
7045 +
7046 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7047 +   Returns 1 if the new position does not exist, 0 if the caller may go on moving @coord. */
7048 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7049 +{
7050 +       /* If the coord is invalid, leave it. */
7051 +       if (coord->between == INVALID_COORD) {
7052 +               return 1;
7053 +       }
7054 +
7055 +       /* If the node is empty, set it appropriately. */
7056 +       if (items == 0) {
7057 +               coord->between = EMPTY_NODE;
7058 +               coord_set_item_pos(coord, 0);
7059 +               coord->unit_pos = 0;
7060 +               return 1;
7061 +       }
7062 +
7063 +       /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7064 +       if (coord->between == EMPTY_NODE) {
7065 +               coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7066 +               coord_set_item_pos(coord, 0);
7067 +               coord->unit_pos = 0;
7068 +               return 0;
7069 +       }
7070 +
7071 +       /* If the item_pos is out-of-range, set it appropriately. */
7072 +       if (coord->item_pos >= items) {
7073 +               coord->between = AFTER_ITEM;
7074 +               coord_set_item_pos(coord, items - 1);
7075 +               coord->unit_pos = 0;
7076 +               /* If is_next, return 1 (can't go any further). */
7077 +               return is_next;
7078 +       }
7079 +
7080 +       return 0;
7081 +}
7082 +
7083 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
7084 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is an
7085 +   existing unit, 1 otherwise. */
7086 +int coord_next_unit(coord_t * coord)
7087 +{
7088 +       unsigned items = coord_num_items(coord);
7089 +
7090 +       if (coord_adjust_items(coord, items, 1) == 1) {
7091 +               return 1;
7092 +       }
7093 +
7094 +       switch (coord->between) {
7095 +       case BEFORE_UNIT:
7096 +               /* Now it is positioned at the same unit. */
7097 +               coord->between = AT_UNIT;
7098 +               return 0;
7099 +
7100 +       case AFTER_UNIT:
7101 +       case AT_UNIT:
7102 +               /* If it was at or after a unit and there are more units in this item,
7103 +                  advance to the next one. */
7104 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
7105 +                       coord->unit_pos += 1;
7106 +                       coord->between = AT_UNIT;
7107 +                       return 0;
7108 +               }
7109 +
7110 +               /* Otherwise, it is crossing an item boundary and treated as if it was
7111 +                  after the current item. */
7112 +               coord->between = AFTER_ITEM;
7113 +               coord->unit_pos = 0;
7114 +               /* FALLTHROUGH */
7115 +
7116 +       case AFTER_ITEM:
7117 +               /* Check for end-of-node. */
7118 +               if (coord->item_pos == items - 1) {
7119 +                       return 1;
7120 +               }
7121 +
7122 +               coord_inc_item_pos(coord);
7123 +               coord->unit_pos = 0;
7124 +               coord->between = AT_UNIT;
7125 +               return 0;
7126 +
7127 +       case BEFORE_ITEM:
7128 +               /* The adjust_items checks ensure that we are valid here. */
7129 +               coord->unit_pos = 0;
7130 +               coord->between = AT_UNIT;
7131 +               return 0;
7132 +
7133 +       case INVALID_COORD:
7134 +       case EMPTY_NODE:
7135 +               /* Handled in coord_adjust_items(). */
7136 +               break;
7137 +       }
7138 +
7139 +       impossible("jmacd-9902", "unreachable");        /* NOTE(review): label "jmacd-9902" is also used in coord_is_existing_unit() */
7140 +       return 0;
7141 +}
7142 +
7143 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
7144 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
7145 +   an existing item (coord is then AT_UNIT, unit 0), 1 otherwise. */
7146 +int coord_next_item(coord_t * coord)
7147 +{
7148 +       unsigned items = coord_num_items(coord);
7149 +
7150 +       if (coord_adjust_items(coord, items, 1) == 1) {
7151 +               return 1;
7152 +       }
7153 +
7154 +       switch (coord->between) {
7155 +       case AFTER_UNIT:
7156 +       case AT_UNIT:
7157 +       case BEFORE_UNIT:
7158 +       case AFTER_ITEM:
7159 +               /* Check for end-of-node. */
7160 +               if (coord->item_pos == items - 1) {
7161 +                       coord->between = AFTER_ITEM;
7162 +                       coord->unit_pos = 0;
7163 +                       coord_clear_iplug(coord);
7164 +                       return 1;
7165 +               }
7166 +
7167 +               /* Anywhere in an item, go to the next one. */
7168 +               coord->between = AT_UNIT;
7169 +               coord_inc_item_pos(coord);
7170 +               coord->unit_pos = 0;
7171 +               return 0;
7172 +
7173 +       case BEFORE_ITEM:
7174 +               /* The out-of-range check ensures that we are valid here. */
7175 +               coord->unit_pos = 0;
7176 +               coord->between = AT_UNIT;
7177 +               return 0;
7178 +       case INVALID_COORD:
7179 +       case EMPTY_NODE:
7180 +               /* Handled in coord_adjust_items(). */
7181 +               break;
7182 +       }
7183 +
7184 +       impossible("jmacd-9903", "unreachable");
7185 +       return 0;
7186 +}
7187 +
7188 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
7189 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7190 +   is an existing unit, 1 otherwise. */
7191 +int coord_prev_unit(coord_t * coord)
7192 +{
7193 +       unsigned items = coord_num_items(coord);
7194 +
7195 +       if (coord_adjust_items(coord, items, 0) == 1) {
7196 +               return 1;
7197 +       }
7198 +
7199 +       switch (coord->between) {
7200 +       case AT_UNIT:
7201 +       case BEFORE_UNIT:
7202 +               if (coord->unit_pos > 0) {
7203 +                       coord->unit_pos -= 1;
7204 +                       coord->between = AT_UNIT;
7205 +                       return 0;
7206 +               }
7207 +
7208 +               if (coord->item_pos == 0) {     /* already at the first unit of the first item */
7209 +                       coord->between = BEFORE_ITEM;
7210 +                       return 1;
7211 +               }
7212 +
7213 +               coord_dec_item_pos(coord);      /* cross the item boundary: last unit of the previous item */
7214 +               coord->unit_pos = coord_last_unit_pos(coord);
7215 +               coord->between = AT_UNIT;
7216 +               return 0;
7217 +
7218 +       case AFTER_UNIT:
7219 +               /* What if unit_pos is out-of-range? */
7220 +               assert("jmacd-5442",
7221 +                      coord->unit_pos <= coord_last_unit_pos(coord));
7222 +               coord->between = AT_UNIT;
7223 +               return 0;
7224 +
7225 +       case BEFORE_ITEM:
7226 +               if (coord->item_pos == 0) {
7227 +                       return 1;
7228 +               }
7229 +
7230 +               coord_dec_item_pos(coord);
7231 +               /* FALLTHROUGH */
7232 +
7233 +       case AFTER_ITEM:
7234 +               coord->between = AT_UNIT;
7235 +               coord->unit_pos = coord_last_unit_pos(coord);
7236 +               return 0;
7237 +
7238 +       case INVALID_COORD:
7239 +       case EMPTY_NODE:
7240 +               break;
7241 +       }
7242 +
7243 +       impossible("jmacd-9904", "unreachable");
7244 +       return 0;
7245 +}
7246 +
7247 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
7248 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7249 +   is an existing item (coord is then AT_UNIT, unit 0), 1 otherwise. */
7250 +int coord_prev_item(coord_t * coord)
7251 +{
7252 +       unsigned items = coord_num_items(coord);
7253 +
7254 +       if (coord_adjust_items(coord, items, 0) == 1) {
7255 +               return 1;
7256 +       }
7257 +
7258 +       switch (coord->between) {
7259 +       case AT_UNIT:
7260 +       case AFTER_UNIT:
7261 +       case BEFORE_UNIT:
7262 +       case BEFORE_ITEM:
7263 +
7264 +               if (coord->item_pos == 0) {
7265 +                       coord->between = BEFORE_ITEM;
7266 +                       coord->unit_pos = 0;
7267 +                       return 1;
7268 +               }
7269 +
7270 +               coord_dec_item_pos(coord);
7271 +               coord->unit_pos = 0;
7272 +               coord->between = AT_UNIT;
7273 +               return 0;
7274 +
7275 +       case AFTER_ITEM:        /* "after item N" moves onto item N itself */
7276 +               coord->between = AT_UNIT;
7277 +               coord->unit_pos = 0;
7278 +               return 0;
7279 +
7280 +       case INVALID_COORD:
7281 +       case EMPTY_NODE:
7282 +               break;
7283 +       }
7284 +
7285 +       impossible("jmacd-9905", "unreachable");
7286 +       return 0;
7287 +}
7288 +
7289 +/* Calls coord_init_first_unit for LEFT_SIDE or coord_init_last_unit for RIGHT_SIDE; @dir is asserted to be one of the two. */
7290 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7291 +{
7292 +       assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7293 +       if (dir == LEFT_SIDE) {
7294 +               coord_init_first_unit(coord, node);
7295 +       } else {
7296 +               coord_init_last_unit(coord, node);
7297 +       }
7298 +}
7299 +
7300 +/* Calls coord_is_before_leftmost for LEFT_SIDE or coord_is_after_rightmost for RIGHT_SIDE
7301 +   and returns its result. */
7302 +/* Audited by: green(2002.06.15) */
7303 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7304 +{
7305 +       assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7306 +       if (dir == LEFT_SIDE) {
7307 +               return coord_is_before_leftmost(coord);
7308 +       } else {
7309 +               return coord_is_after_rightmost(coord);
7310 +       }
7311 +}
7312 +
7313 +/* Moves @coord one unit towards @dir: coord_prev_unit for LEFT_SIDE, coord_next_unit for RIGHT_SIDE; returns its result. */
7314 +/* Audited by: green(2002.06.15) */
7315 +int coord_sideof_unit(coord_t * coord, sideof dir)
7316 +{
7317 +       assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7318 +       if (dir == LEFT_SIDE) {
7319 +               return coord_prev_unit(coord);
7320 +       } else {
7321 +               return coord_next_unit(coord);
7322 +       }
7323 +}
7324 +
7325 +#if REISER4_DEBUG
7326 +int coords_equal(const coord_t * c1, const coord_t * c2)
7327 +{
7328 +       assert("nikita-2840", c1 != NULL);
7329 +       assert("nikita-2841", c2 != NULL);
7330 +       /* Debug helper: true when both coords have the same node, item_pos, unit_pos and between. */
7331 +       return
7332 +           c1->node == c2->node &&
7333 +           c1->item_pos == c2->item_pos &&
7334 +           c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7335 +}
7336 +#endif  /*  REISER4_DEBUG  */
7337 +
7338 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT (this includes EMPTY_NODE), if coord_is_after_rightmost
7339 +   return COORD_ON_THE_RIGHT, otherwise return COORD_INSIDE. */
7340 +/* Audited by: green(2002.06.15) */
7341 +coord_wrt_node coord_wrt(const coord_t * coord)
7342 +{
7343 +       if (coord_is_before_leftmost(coord)) {
7344 +               return COORD_ON_THE_LEFT;
7345 +       }
7346 +
7347 +       if (coord_is_after_rightmost(coord)) {
7348 +               return COORD_ON_THE_RIGHT;
7349 +       }
7350 +
7351 +       return COORD_INSIDE;
7352 +}
7353 +
7354 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7355 +   of the last item or it is an empty node. Asserts coord_check(). */
7356 +/* Audited by: green(2002.06.15) */
7357 +int coord_is_after_rightmost(const coord_t * coord)
7358 +{
7359 +       assert("jmacd-7313", coord_check(coord));
7360 +
7361 +       switch (coord->between) {
7362 +       case INVALID_COORD:
7363 +       case AT_UNIT:
7364 +       case BEFORE_UNIT:
7365 +       case BEFORE_ITEM:
7366 +               return 0;
7367 +
7368 +       case EMPTY_NODE:
7369 +               return 1;
7370 +
7371 +       case AFTER_ITEM:
7372 +               return (coord->item_pos == node_num_items(coord->node) - 1);
7373 +
7374 +       case AFTER_UNIT:
7375 +               return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7376 +                       coord->unit_pos == coord_last_unit_pos(coord));
7377 +       }
7378 +
7379 +       impossible("jmacd-9908", "unreachable");
7380 +       return 0;
7381 +}
7382 +
7383 +/* Returns true if the coordinate is positioned before the first item (BEFORE_ITEM or BEFORE_UNIT
7384 +   at item 0, unit 0) or it is an empty node. */
7385 +int coord_is_before_leftmost(const coord_t * coord)
7386 +{
7387 +       /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7388 +          necessary to check if coord is set before leftmost
7389 +          assert ("jmacd-7313", coord_check (coord)); */
7390 +       switch (coord->between) {
7391 +       case INVALID_COORD:
7392 +       case AT_UNIT:
7393 +       case AFTER_ITEM:
7394 +       case AFTER_UNIT:
7395 +               return 0;
7396 +
7397 +       case EMPTY_NODE:
7398 +               return 1;
7399 +
7400 +       case BEFORE_ITEM:
7401 +       case BEFORE_UNIT:
7402 +               return (coord->item_pos == 0) && (coord->unit_pos == 0);
7403 +       }
7404 +
7405 +       impossible("jmacd-9908", "unreachable");        /* NOTE(review): label "jmacd-9908" is also used in coord_is_after_rightmost() */
7406 +       return 0;
7407 +}
7408 +
7409 +/* Predicate: @coord points at a gap between items: before or after an item, before
7410 +   the first or after the last unit of an item, or into an empty node. */
7411 +/* Audited by: green(2002.06.15) */
7412 +int coord_is_between_items(const coord_t * coord)
7413 +{
7414 +       assert("jmacd-7313", coord_check(coord));
7415 +
7416 +       switch (coord->between) {
7417 +       case BEFORE_UNIT:
7418 +               /* only the gap before the first unit is an item boundary */
7419 +               return coord->unit_pos == 0;
7420 +       case AFTER_UNIT:
7421 +               /* only the gap after the last unit is an item boundary */
7422 +               return coord->unit_pos == coord_last_unit_pos(coord);
7423 +       case EMPTY_NODE:
7424 +       case BEFORE_ITEM:
7425 +       case AFTER_ITEM:
7426 +               return 1;
7427 +       case AT_UNIT:
7428 +       case INVALID_COORD:
7429 +               return 0;
7430 +       }
7431 +
7432 +       /* ->between holds a value that is not a member of between_enum */
7433 +       impossible("jmacd-9908", "unreachable");
7434 +       return 0;
7435 +}
7436 +
7437 +#if REISER4_DEBUG
7438 +/* Debug-only predicate: true when @c1 and @c2 (asserted to be existing units of the
7439 +   same node) are adjacent units, in either order, possibly across an item boundary. */
7440 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7441 +{
7442 +       coord_t *left;
7443 +       coord_t *right;
7444 +
7445 +       assert("nikita-1241", c1 != NULL);
7446 +       assert("nikita-1242", c2 != NULL);
7447 +       assert("nikita-1243", c1->node == c2->node);
7448 +       assert("nikita-1244", coord_is_existing_unit(c1));
7449 +       assert("nikita-1245", coord_is_existing_unit(c2));
7450 +
7451 +       left = right = NULL;
7452 +       switch (coord_compare(c1, c2)) {
7453 +       case COORD_CMP_ON_LEFT:
7454 +               left = c1;
7455 +               right = c2;
7456 +               break;
7457 +       case COORD_CMP_ON_RIGHT:
7458 +               left = c2;
7459 +               right = c1;
7460 +               break;
7461 +       case COORD_CMP_SAME:
7462 +               return 0;
7463 +       default:
7464 +               wrong_return_value("nikita-1246", "coord_compare()");
7465 +       }
7466 +       assert("vs-731", left && right);
7467 +       /* within one item the unit positions must be consecutive */
7468 +       if (left->item_pos == right->item_pos)
7469 +               return left->unit_pos + 1 == right->unit_pos;
7470 +       /* otherwise only the last unit of an item and the first unit of
7471 +          the item right after it are adjacent */
7472 +       return (left->item_pos + 1 == right->item_pos)
7473 +           && (left->unit_pos == coord_last_unit_pos(left))
7474 +           && (right->unit_pos == 0);
7475 +}
7476 +#endif  /*  REISER4_DEBUG  */
7477 +
7478 +/* Order two coordinates of the same node: returns COORD_CMP_ON_LEFT, COORD_CMP_ON_RIGHT
7479 +   or COORD_CMP_SAME according to where c1 is relative to c2 (both asserted to be existing units). */
7480 +/* Audited by: green(2002.06.15) */
7481 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
7482 +{
7483 +       assert("vs-209", c1->node == c2->node);
7484 +       assert("vs-194", coord_is_existing_unit(c1)
7485 +              && coord_is_existing_unit(c2));
7486 +
7487 +       /* the item position is the major key */
7488 +       if (c1->item_pos != c2->item_pos)
7489 +               return (c1->item_pos < c2->item_pos) ?
7490 +                   COORD_CMP_ON_LEFT : COORD_CMP_ON_RIGHT;
7491 +       /* within one item the unit position decides */
7492 +       if (c1->unit_pos != c2->unit_pos)
7493 +               return (c1->unit_pos < c2->unit_pos) ?
7494 +                   COORD_CMP_ON_LEFT : COORD_CMP_ON_RIGHT;
7495 +       return COORD_CMP_SAME;
7496 +}
7497 +
7498 +/* Moves a coordinate that is before or after a unit or an item onto the nearest unit to
7499 +   its right (AT_UNIT is left as is).  Returns 0 on success and 1 if there is no position to the right. */
7500 +int coord_set_to_right(coord_t * coord)
7501 +{
7502 +       unsigned items = coord_num_items(coord);
7503 +       /* NOTE(review): coord_adjust_items() is defined elsewhere; a result of 1 is reported as failure */
7504 +       if (coord_adjust_items(coord, items, 1) == 1) {
7505 +               return 1;
7506 +       }
7507 +
7508 +       switch (coord->between) {
7509 +       case AT_UNIT:
7510 +               return 0;
7511 +
7512 +       case BEFORE_ITEM:
7513 +       case BEFORE_UNIT:
7514 +               coord->between = AT_UNIT;
7515 +               return 0;
7516 +
7517 +       case AFTER_UNIT:
7518 +               if (coord->unit_pos < coord_last_unit_pos(coord)) {
7519 +                       coord->unit_pos += 1;
7520 +                       coord->between = AT_UNIT;
7521 +                       return 0;
7522 +               } else {
7523 +                       /* after the last unit: ->unit_pos is reset even if the move fails below */
7524 +                       coord->unit_pos = 0;
7525 +                       /* last item of the node: nothing to the right */
7526 +                       if (coord->item_pos == items - 1) {
7527 +                               coord->between = AFTER_ITEM;
7528 +                               return 1;
7529 +                       }
7530 +
7531 +                       coord_inc_item_pos(coord);
7532 +                       coord->between = AT_UNIT;
7533 +                       return 0;
7534 +               }
7535 +
7536 +       case AFTER_ITEM:
7537 +               if (coord->item_pos == items - 1) {
7538 +                       return 1;
7539 +               }
7540 +               /* first unit of the next item */
7541 +               coord_inc_item_pos(coord);
7542 +               coord->unit_pos = 0;
7543 +               coord->between = AT_UNIT;
7544 +               return 0;
7545 +
7546 +       case EMPTY_NODE:
7547 +               return 1;
7548 +
7549 +       case INVALID_COORD:
7550 +               break;
7551 +       }
7552 +       /* reached for INVALID_COORD and for values outside of between_enum */
7553 +       impossible("jmacd-9920", "unreachable");
7554 +       return 0;
7555 +}
7556 +
7557 +/* Moves a coordinate that is before or after a unit or an item onto the nearest unit to
7558 +   its left (AT_UNIT is left as is).  Returns 0 on success and 1 if there is no position to the left. */
7559 +int coord_set_to_left(coord_t * coord)
7560 +{
7561 +       unsigned items = coord_num_items(coord);
7562 +       /* NOTE(review): coord_adjust_items() is defined elsewhere; a result of 1 is reported as failure */
7563 +       if (coord_adjust_items(coord, items, 0) == 1) {
7564 +               return 1;
7565 +       }
7566 +
7567 +       switch (coord->between) {
7568 +       case AT_UNIT:
7569 +               return 0;
7570 +
7571 +       case AFTER_UNIT:
7572 +               coord->between = AT_UNIT;
7573 +               return 0;
7574 +
7575 +       case AFTER_ITEM:
7576 +               coord->between = AT_UNIT;
7577 +               coord->unit_pos = coord_last_unit_pos(coord);
7578 +               return 0;
7579 +
7580 +       case BEFORE_UNIT:
7581 +               if (coord->unit_pos > 0) {
7582 +                       coord->unit_pos -= 1;
7583 +                       coord->between = AT_UNIT;
7584 +                       return 0;
7585 +               } else {
7586 +                       /* before the first unit: the previous item is needed */
7587 +                       if (coord->item_pos == 0) {
7588 +                               coord->between = BEFORE_ITEM;
7589 +                               return 1;
7590 +                       }
7591 +                       /* switch items first: the last unit must be that of the previous item */
7592 +                       coord_dec_item_pos(coord);
7593 +                       coord->unit_pos = coord_last_unit_pos(coord);
7594 +                       coord->between = AT_UNIT;
7595 +                       return 0;
7596 +               }
7597 +
7598 +       case BEFORE_ITEM:
7599 +               if (coord->item_pos == 0) {
7600 +                       return 1;
7601 +               }
7602 +               /* last unit of the previous item */
7603 +               coord_dec_item_pos(coord);
7604 +               coord->unit_pos = coord_last_unit_pos(coord);
7605 +               coord->between = AT_UNIT;
7606 +               return 0;
7607 +
7608 +       case EMPTY_NODE:
7609 +               return 1;
7610 +
7611 +       case INVALID_COORD:
7612 +               break;
7613 +       }
7614 +       /* reached for INVALID_COORD and for values outside of between_enum */
7615 +       impossible("jmacd-9920", "unreachable");
7616 +       return 0;
7617 +}
7618 +/* Human readable name of between_enum value @n, for print_coord(). */
7619 +static const char *coord_tween_tostring(between_enum n)
7620 +{
7621 +       switch (n) {
7622 +       case BEFORE_UNIT:
7623 +               return "before unit";
7624 +       case BEFORE_ITEM:
7625 +               return "before item";
7626 +       case AT_UNIT:
7627 +               return "at unit";
7628 +       case AFTER_UNIT:
7629 +               return "after unit";
7630 +       case AFTER_ITEM:
7631 +               return "after item";
7632 +       case EMPTY_NODE:
7633 +               return "empty node";
7634 +       case INVALID_COORD:
7635 +               return "invalid";
7636 +       default:
7637 +       {
7638 +               static char buf[30];
7639 +               /* shared static buffer: the result is overwritten by the next such call */
7640 +               snprintf(buf, sizeof(buf), "unknown: %i", n);
7641 +               return buf;
7642 +       }
7643 +       }
7644 +}
7645 +
7646 +void print_coord(const char *mes, const coord_t * coord, int node)
7647 +{
7648 +       /* @node is not used */
7649 +       if (coord != NULL)
7650 +               printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
7651 +                      mes, coord->item_pos, coord->unit_pos,
7652 +                      coord_tween_tostring(coord->between), coord->iplugid);
7653 +       else
7654 +               printk("%s: null\n", mes);
7655 +}
7656 +
7657 +int item_utmost_child_real_block(const coord_t * coord, sideof side,
7658 +                                reiser4_block_nr * blk)
7659 +{
7660 +       /* forward to the ->f.utmost_child_real_block() method of the plugin
7661 +          of the item at @coord */
7662 +       return item_plugin_by_coord(coord)->
7663 +           f.utmost_child_real_block(coord, side, blk);
7664 +}
7665 +/* Forward to the ->f.utmost_child() method of the plugin of the item at @coord. */
7666 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
7667 +{
7668 +       return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
7669 +}
7670 +
7671 +/* @count bytes of flow @f have been written: advance the key offset and ->data (if
7672 +   set) by @count and shrink ->length by the same amount */
7673 +void move_flow_forward(flow_t * f, unsigned count)
7674 +{
7675 +       set_key_offset(&f->key, get_key_offset(&f->key) + count);
7676 +       f->length -= count;
7677 +       if (f->data != NULL)
7678 +               f->data += count;
7679 +}
7680 +
7681 +/*
7682 +   Local variables:
7683 +   c-indentation-style: "K&R"
7684 +   mode-name: "LC"
7685 +   c-basic-offset: 8
7686 +   tab-width: 8
7687 +   fill-column: 120
7688 +   scroll-step: 1
7689 +   End:
7690 +*/
7691 diff -urN linux-2.6.27.orig/fs/reiser4/coord.h linux-2.6.27/fs/reiser4/coord.h
7692 --- linux-2.6.27.orig/fs/reiser4/coord.h        1970-01-01 03:00:00.000000000 +0300
7693 +++ linux-2.6.27/fs/reiser4/coord.h     2008-10-12 18:20:00.000000000 +0400
7694 @@ -0,0 +1,389 @@
7695 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7696 +
7697 +/* Coords */
7698 +
7699 +#if !defined( __REISER4_COORD_H__ )
7700 +#define __REISER4_COORD_H__
7701 +
7702 +#include "forward.h"
7703 +#include "debug.h"
7704 +#include "dformat.h"
7705 +#include "key.h"
7706 +
7707 +/* insertions happen between coords in the tree, so we need some means
7708 +   of specifying the sense of betweenness. Stored in coord->between. */
7709 +typedef enum {
7710 +       BEFORE_UNIT,            /* Note: we/init_coord depends on this value being zero. */
7711 +       AT_UNIT,                /* at a unit, as opposed to before or after it */
7712 +       AFTER_UNIT,
7713 +       BEFORE_ITEM,
7714 +       AFTER_ITEM,
7715 +       INVALID_COORD,  /* NOTE(review): presumably what coord_init_invalid() sets - confirm */
7716 +       EMPTY_NODE,             /* used when the node is empty (see the coord_init_*() comments below) */
7717 +} between_enum;
7718 +
7719 +/* location of coord w.r.t. its node, as returned by coord_wrt() */
7720 +typedef enum {
7721 +       COORD_ON_THE_LEFT = -1, /* coord_is_before_leftmost() holds */
7722 +       COORD_ON_THE_RIGHT = +1,        /* otherwise, coord_is_after_rightmost() holds */
7723 +       COORD_INSIDE = 0        /* neither holds */
7724 +} coord_wrt_node;
7725 +
7726 +typedef enum { /* result of coord_compare(c1, c2): where c1 is relative to c2 */
7727 +       COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
7728 +} coord_cmp;
7729 +
7730 +struct coord {
7731 +       /* node in a tree. NOTE(review): the byte offsets noted below assume 4-byte pointers */
7732 +       /*  0 */ znode *node;
7733 +
7734 +       /* position of item within node */
7735 +       /*  4 */ pos_in_node_t item_pos;
7736 +       /* position of unit within item */
7737 +       /*  6 */ pos_in_node_t unit_pos;
7738 +       /* optimization: plugin of item is stored in coord_t. Until this was
7739 +          implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
7740 +          is invalidated (set to INVALID_PLUGID, i.e. 0xff) on each modification
7741 +          of ->item_pos, and all such modifications are funneled through the
7742 +          coord_*_item_pos() functions below.
7743 +        */
7744 +       /*  8 */ char iplugid;
7745 +       /* position of coord w.r.t. neighboring items and/or units.
7746 +          Values are taken from &between_enum above.
7747 +        */
7748 +       /*  9 */ char between;
7749 +       /* padding. It will be added by the compiler anyway to conform to the
7750 +        * C language alignment requirements. We keep it here to be on the
7751 +        * safe side and to have a clear picture of the memory layout of this
7752 +        * structure. */
7753 +       /* 10 */ __u16 pad;
7754 +       /* 12 */ int offset;    /* reset to INVALID_OFFSET by coord_clear_iplug() */
7755 +#if REISER4_DEBUG
7756 +       unsigned long plug_v;   /* both are set from znode_times_locked() */
7757 +       unsigned long body_v;   /* by coord_update_v() */
7758 +#endif
7759 +};
7760 +
7761 +#define INVALID_PLUGID  ((char)((1 << 8) - 1)) /* 255 converted to char: "no plugin recorded" in ->iplugid */
7762 +#define INVALID_OFFSET -1      /* value coord_clear_iplug() stores in ->offset */
7763 +
7764 +static inline void coord_clear_iplug(coord_t * coord)
7765 +{
7766 +       assert("nikita-2835", coord != NULL);
7767 +       coord->offset = INVALID_OFFSET; /* ->offset is reset together with the plugin id */
7768 +       coord->iplugid = INVALID_PLUGID;
7769 +}
7770 +
7771 +static inline int coord_is_iplug_set(const coord_t * coord)
7772 +{
7773 +       assert("nikita-2836", coord != NULL);
7774 +       return (coord->iplugid == INVALID_PLUGID) ? 0 : 1;
7775 +}
7776 +
7777 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
7778 +{
7779 +       assert("nikita-2478", coord != NULL);
7780 +       coord_clear_iplug(coord);       /* recorded plugin id does not apply to the new position */
7781 +       coord->item_pos = pos;
7782 +}
7783 +
7784 +static inline void coord_dec_item_pos(coord_t * coord)
7785 +{
7786 +       assert("nikita-2480", coord != NULL);
7787 +       coord->item_pos -= 1;   /* step to the previous item ... */
7788 +       coord_clear_iplug(coord);       /* ... and drop the recorded plugin id */
7789 +}
7790 +
7791 +static inline void coord_inc_item_pos(coord_t * coord)
7792 +{
7793 +       assert("nikita-2481", coord != NULL);
7794 +       coord->item_pos += 1;   /* step to the next item ... */
7795 +       coord_clear_iplug(coord);       /* ... and drop the recorded plugin id */
7796 +}
7797 +
7798 +static inline void coord_add_item_pos(coord_t * coord, int delta)
7799 +{
7800 +       assert("nikita-2482", coord != NULL);
7801 +       coord->item_pos = coord->item_pos + delta;      /* @delta may be negative */
7802 +       coord_clear_iplug(coord);
7803 +}
7804 +
7805 +static inline void coord_invalid_item_pos(coord_t * coord)
7806 +{
7807 +       assert("nikita-2832", coord != NULL);
7808 +       /* all ones; the setter also drops the recorded plugin id */
7809 +       coord_set_item_pos(coord, (unsigned short)~0);
7810 +}
7811 +
7812 +/* Reverse a direction: LEFT_SIDE becomes RIGHT_SIDE, anything else becomes LEFT_SIDE. */
7813 +static inline sideof sideof_reverse(sideof side)
7814 +{
7815 +       return (side != LEFT_SIDE) ? LEFT_SIDE : RIGHT_SIDE;
7816 +}
7817 +
7818 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
7819 +
7820 +   "first" and "last"
7821 +   "next" and "prev"
7822 +   "before" and "after"
7823 +   "leftmost" and "rightmost"
7824 +
7825 +   But I think the chosen names are decent the way they are.
7826 +*/
7827 +
7828 +/* COORD INITIALIZERS */
7829 +
7830 +/* Initialize an invalid coordinate. */
7831 +extern void coord_init_invalid(coord_t * coord, const znode * node);
7832 +
7833 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
7834 +
7835 +/* Initialize a coordinate to point at the first unit of the first item.  If the node is
7836 +   empty, it is positioned at the EMPTY_NODE. */
7837 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
7838 +
7839 +/* Initialize a coordinate to point at the last unit of the last item.  If the node is
7840 +   empty, it is positioned at the EMPTY_NODE. */
7841 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
7842 +
7843 +/* Initialize a coordinate to before the first item.  If the node is empty, it is
7844 +   positioned at the EMPTY_NODE. */
7845 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
7846 +
7847 +/* Initialize a coordinate to after the last item.  If the node is empty, it is positioned
7848 +   at the EMPTY_NODE. */
7849 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
7850 +
7851 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7852 +   already to existing item */
7853 +void coord_init_after_item_end(coord_t * coord);
7854 +
7855 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7856 +void coord_init_before_item(coord_t *);
7857 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7858 +void coord_init_after_item(coord_t *);
7859 +
7860 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7861 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
7862 +                                  sideof dir);
7863 +
7864 +/* Initialize a coordinate with zeros. Used in places where init_coord was used and
7865 +   it was not clear how the coord should actually be initialized.
7866 +   FIXME-VS: added by vs (2002, june, 8) */
7867 +extern void coord_init_zero(coord_t * coord);
7868 +
7869 +/* COORD METHODS */
7870 +
7871 +/* after shifting of node content, coord previously set properly may become
7872 +   invalid, try to "normalize" it. */
7873 +void coord_normalize(coord_t * coord);
7874 +
7875 +/* Copy a coordinate. */
7876 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
7877 +
7878 +/* Copy a coordinate without check. */
7879 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
7880 +
7881 +unsigned coord_num_units(const coord_t * coord);
7882 +
7883 +/* Position of the last unit of the item at @coord, i.e. coord_num_units() - 1. */
7884 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
7885 +{
7886 +       unsigned nr_units = coord_num_units(coord);
7887 +       return nr_units - 1;
7888 +}
7889 +
7890 +#if REISER4_DEBUG
7891 +/* For assertions only, checks for a valid coordinate. */
7892 +extern int coord_check(const coord_t * coord);
7893 +
7894 +extern unsigned long znode_times_locked(const znode * z);
7895 +/* Debug only: store znode_times_locked(coord->node) in both ->plug_v and ->body_v. */
7896 +static inline void coord_update_v(coord_t * coord)
7897 +{
7898 +       coord->plug_v = coord->body_v = znode_times_locked(coord->node);
7899 +}
7900 +#endif
7901 +
7902 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
7903 +
7904 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
7905 +
7906 +/* If coord_is_before_leftmost return COORD_ON_THE_LEFT, if coord_is_after_rightmost
7907 +   return COORD_ON_THE_RIGHT, otherwise return COORD_INSIDE. */
7908 +extern coord_wrt_node coord_wrt(const coord_t * coord);
7909 +
7910 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7911 +   before-after or item boundaries. */
7912 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
7913 +
7914 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
7915 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2.  */
7916 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
7917 +
7918 +/* COORD PREDICATES */
7919 +
7920 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7921 +extern int coord_is_invalid(const coord_t * coord);
7922 +
7923 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7924 +   an item.  It may be placed at, before, or after any unit within the item, whether
7925 +   existing or not.  If this is true you can call methods of the item plugin.  */
7926 +extern int coord_is_existing_item(const coord_t * coord);
7927 +
7928 +/* Returns true if the coordinate is positioned after a item, before a item, after the
7929 +   last unit of an item, before the first unit of an item, or at an empty node. */
7930 +extern int coord_is_between_items(const coord_t * coord);
7931 +
7932 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7933 +   unit. */
7934 +extern int coord_is_existing_unit(const coord_t * coord);
7935 +
7936 +/* Returns true if the coordinate is positioned at an empty node. */
7937 +extern int coord_is_empty(const coord_t * coord);
7938 +
7939 +/* Returns true if the coordinate is positioned at the first unit of the first item.  Not
7940 +   true for empty nodes nor coordinates positioned before the first item. */
7941 +extern int coord_is_leftmost_unit(const coord_t * coord);
7942 +
7943 +/* Returns true if the coordinate is positioned after the last item or after the last unit
7944 +   of the last item or it is an empty node. */
7945 +extern int coord_is_after_rightmost(const coord_t * coord);
7946 +
7947 +/* Returns true if the coordinate is positioned before the first item or it is an empty
7948 +   node. */
7949 +extern int coord_is_before_leftmost(const coord_t * coord);
7950 +
7951 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7952 +   argument. */
7953 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
7954 +
7955 +/* COORD MODIFIERS */
7956 +
7957 +/* Advances the coordinate by one unit to the right.  If empty, no change.  If
7958 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
7959 +   an existing unit. */
7960 +extern int coord_next_unit(coord_t * coord);
7961 +
7962 +/* Advances the coordinate by one item to the right.  If empty, no change.  If
7963 +   coord_is_rightmost_unit, advances to AFTER THE LAST ITEM.  Returns 0 if new position is
7964 +   an existing item. */
7965 +extern int coord_next_item(coord_t * coord);
7966 +
7967 +/* Advances the coordinate by one unit to the left.  If empty, no change.  If
7968 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7969 +   is an existing unit. */
7970 +extern int coord_prev_unit(coord_t * coord);
7971 +
7972 +/* Advances the coordinate by one item to the left.  If empty, no change.  If
7973 +   coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM.  Returns 0 if new position
7974 +   is an existing item. */
7975 +extern int coord_prev_item(coord_t * coord);
7976 +
7977 +/* If the coordinate is between items, shifts it to the right.  Returns 0 on success and
7978 +   non-zero if there is no position to the right. */
7979 +extern int coord_set_to_right(coord_t * coord);
7980 +
7981 +/* If the coordinate is between items, shifts it to the left.  Returns 0 on success and
7982 +   non-zero if there is no position to the left. */
7983 +extern int coord_set_to_left(coord_t * coord);
7984 +
7985 +/* If the coordinate is at an existing unit, set to after that unit.  Returns 0 on success
7986 +   and non-zero if the unit did not exist. */
7987 +extern int coord_set_after_unit(coord_t * coord);
7988 +
7989 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7990 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
7991 +
7992 +/* iterate over all units in @node */
7993 +#define for_all_units( coord, node )                                   \
7994 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
7995 +            coord_next_unit( coord ) == 0 ; )
7996 +
7997 +/* iterate over all items in @node */
7998 +#define for_all_items( coord, node )                                   \
7999 +       for( coord_init_before_first_item( ( coord ), ( node ) ) ;      \
8000 +            coord_next_item( coord ) == 0 ; )
8001 +
8002 +/* COORD/ITEM METHODS */
8003 +
8004 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8005 +                                       reiser4_block_nr * blk);
8006 +extern int item_utmost_child(const coord_t * coord, sideof side,
8007 +                            jnode ** child);
8008 +
8009 +/* a flow is a sequence of bytes being written to or read from the tree.  The
8010 +   tree will slice the flow into items while storing it into nodes, but all of
8011 +   that is hidden from anything outside the tree.  */
8012 +
8013 +struct flow {
8014 +       reiser4_key key;        /* key of start of flow's sequence of bytes */
8015 +       loff_t length;          /* length of flow's sequence of bytes */
8016 +       char *data;             /* start of flow's sequence of bytes; may be NULL, see move_flow_forward() */
8017 +       int user;               /* 1 if ->data is in user space, 0 if in kernel space */
8018 +       rw_op op;               /* NIKITA-FIXME-HANS: comment is where? NOTE(review): presumably read vs. write - confirm */
8019 +};
8020 +
8021 +void move_flow_forward(flow_t * f, unsigned count);
8022 +
8023 +/* &reiser4_item_data - description of data to be inserted or pasted
8024 +
8025 +   Q: articulate the reasons for the difference between this and flow.
8026 +
8027 +   A: Besides flows, other things are inserted into the tree: stat data, directory
8028 +   entries, etc.  To insert them into the tree one has to provide this structure. If
8029 +   one is going to insert a flow, one can use insert_flow, where this structure
8030 +   does not have to be created.
8031 +*/
8032 +struct reiser4_item_data {
8033 +       /* actual data to be inserted. If NULL, ->create_item() will not
8034 +          do xmemcpy itself, leaving this up to the caller. This can
8035 +          save some amount of unnecessary memory copying, for example,
8036 +          during insertion of stat data.
8037 +
8038 +        */
8039 +       char *data;
8040 +       /* 1 if 'char * data' contains pointer to user space and 0 if it is
8041 +          kernel space */
8042 +       int user;
8043 +       /* amount of data we are going to insert or paste */
8044 +       int length;
8045 +       /* "Arg" is opaque data that is passed down to the
8046 +          ->create_item() method of node layout, which in turn
8047 +          hands it to the ->create_hook() of item being created. This
8048 +          arg is currently used by:
8049 +
8050 +          .  ->create_hook() of internal item
8051 +          (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8052 +          . ->paste() method of directory item.
8053 +          . ->create_hook() of extent item
8054 +
8055 +          For internal item, this is left "brother" of new node being
8056 +          inserted and it is used to add new node into sibling list
8057 +          after parent to it was just inserted into parent.
8058 +
8059 +          While ->arg does look like a somewhat unnecessary complication,
8060 +          it actually saves a lot of headache in many places, because
8061 +          all data necessary to insert or paste new data into tree are
8062 +          collected in one place, and this eliminates a lot of extra
8063 +          argument passing and storing everywhere.
8064 +
8065 +        */
8066 +       void *arg;
8067 +       /* plugin of item we are inserting */
8068 +       item_plugin *iplug;
8069 +};
8070 +
8071 +/* __REISER4_COORD_H__ */
8072 +#endif
8073 +
8074 +/* Make Linus happy.
8075 +   Local variables:
8076 +   c-indentation-style: "K&R"
8077 +   mode-name: "LC"
8078 +   c-basic-offset: 8
8079 +   tab-width: 8
8080 +   fill-column: 120
8081 +   scroll-step: 1
8082 +   End:
8083 +*/
8084 diff -urN linux-2.6.27.orig/fs/reiser4/debug.c linux-2.6.27/fs/reiser4/debug.c
8085 --- linux-2.6.27.orig/fs/reiser4/debug.c        1970-01-01 03:00:00.000000000 +0300
8086 +++ linux-2.6.27/fs/reiser4/debug.c     2008-10-12 18:20:00.000000000 +0400
8087 @@ -0,0 +1,308 @@
8088 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8089 + * reiser4/README */
8090 +
8091 +/* Debugging facilities. */
8092 +
8093 +/*
8094 + * This file contains generic debugging functions used by reiser4. Roughly
8095 + * following:
8096 + *
8097 + *     panicking: reiser4_do_panic(), reiser4_print_prefix().
8098 + *
8099 + *     locking:
8100 + *     reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8101 + *     reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8102 + *
8103 + *     error code monitoring (see comment before RETERR macro):
8104 + *     reiser4_return_err(), reiser4_report_err().
8105 + *
8106 + *     stack back-tracing: fill_backtrace()
8107 + *
8108 + *     miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8109 + *     reiser4_debugtrap().
8110 + *
8111 + */
8112 +
8113 +#include "reiser4.h"
8114 +#include "context.h"
8115 +#include "super.h"
8116 +#include "txnmgr.h"
8117 +#include "znode.h"
8118 +
8119 +#include <linux/sysfs.h>
8120 +#include <linux/slab.h>
8121 +#include <linux/types.h>
8122 +#include <linux/fs.h>
8123 +#include <linux/spinlock.h>
8124 +#include <linux/kallsyms.h>
8125 +#include <linux/vmalloc.h>
8126 +#include <linux/ctype.h>
8127 +#include <linux/sysctl.h>
8128 +#include <linux/hardirq.h>
8129 +
8130 +#if 0
8131 +#if REISER4_DEBUG
8132 +static void reiser4_report_err(void);
8133 +#else
8134 +#define reiser4_report_err() noop
8135 +#endif
8136 +#endif  /*  0  */
8137 +
8138 +/*
8139 + * global buffer where message given to reiser4_panic is formatted.
8140 + */
8141 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8142 +
8143 +/*
8144 + * lock protecting consistency of panic_buf under concurrent panics
8145 + */
8146 +static DEFINE_SPINLOCK(panic_guard);
8147 +
8148 +/* Formats the printf-style message into panic_buf, prints it at KERN_EMERG level and
8149 +    ends in panic().  This is called by fs/reiser4/debug.h:reiser4_panic(). */
8150 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8151 +{
8152 +       static int in_panic = 0;
8153 +       va_list args;
8154 +
8155 +       /*
8156 +        * check for recursive panic: on re-entry the message is not formatted again.
8157 +        */
8158 +       if (in_panic == 0) {
8159 +               in_panic = 1;
8160 +               /* NOTE(review): in_panic is tested and set outside of panic_guard */
8161 +               spin_lock(&panic_guard);
8162 +               va_start(args, format);
8163 +               vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8164 +               va_end(args);
8165 +               printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8166 +               spin_unlock(&panic_guard);
8167 +
8168 +               /*
8169 +                * if kernel debugger is configured---drop in. Early dropping
8170 +                * into kgdb is not always convenient, because panic message
8171 +                * is not yet printed most of the times. But:
8172 +                *
8173 +                *     (1) message can be extracted from printk_buf[]
8174 +                *     (declared static inside of printk()), and
8175 +                *
8176 +                *     (2) sometimes serial/kgdb combo dies while printing
8177 +                *     long panic message, so it's more prudent to break into
8178 +                *     debugger earlier.
8179 +                *
8180 +                */
8181 +               DEBUGON(1);
8182 +       }
8183 +       /* also reached on a recursive panic; to make gcc happy about noreturn attribute */
8184 +       panic("%s", panic_buf);
8185 +}
8186 +
8187 +#if 0 /* compiled out: prints a "reiser4[comm(pid)]: function (file:line)[mid]:" prefix line */
8188 +void
8189 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8190 +                    const char *function, const char *file, int lineno)
8191 +{
8192 +       const char *comm;
8193 +       int pid;
8194 +       /* in interrupt context report "interrupt" and pid 0 instead of the current task */
8195 +       if (unlikely(in_interrupt() || in_irq())) {
8196 +               comm = "interrupt";
8197 +               pid = 0;
8198 +       } else {
8199 +               comm = current->comm;
8200 +               pid = current->pid;
8201 +       }
8202 +       printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8203 +              level, comm, pid, function, file, lineno, mid);
8204 +       if (reperr)
8205 +               reiser4_report_err();
8206 +}
8207 +#endif  /*  0  */
8208 +
8209 +/* Preemption point: to be called periodically during long running operations (carry,
8210 +   allocate, and squeeze are best examples). Calls cond_resched(), returns signal_pending(current). */
8211 +int reiser4_preempt_point(void)
8212 +{
8213 +       assert("nikita-3008", reiser4_schedulable());
8214 +       cond_resched();
8215 +       return signal_pending(current);
8216 +}
8217 +
8218 +#if REISER4_DEBUG
8219 +/* Debugging aid: return struct where information about locks taken by current
8220 +   thread is accumulated. This can be used to formulate lock ordering
8221 +   constraints and various assertions.
8222 +
8223 +*/
8224 +reiser4_lock_cnt_info *reiser4_lock_counters(void)
8225 +{
8226 +       reiser4_context *ctx = get_current_context();
8227 +       assert("jmacd-1123", ctx != NULL);
8228 +       return &ctx->locks;
8229 +}
8230 +
8231 +/*
8232 + * print human readable information about locks held by the reiser4 context.
8233 + */
8234 +static void print_lock_counters(const char *prefix,
8235 +                               const reiser4_lock_cnt_info * info)
8236 +{
8237 +       printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8238 +              "jload: %i, "
8239 +              "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8240 +              "ktxnmgrd: %i, fq: %i\n"
8241 +              "inode: %i, "
8242 +              "cbk_cache: %i (r:%i,w:%i), "
8243 +              "eflush: %i, "
8244 +              "zlock: %i,\n"
8245 +              "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8246 +              "d: %i, x: %i, t: %i\n", prefix,
8247 +              info->spin_locked_jnode,
8248 +              info->rw_locked_tree, info->read_locked_tree,
8249 +              info->write_locked_tree,
8250 +              info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8251 +              info->spin_locked_jload,
8252 +              info->spin_locked_txnh,
8253 +              info->spin_locked_atom, info->spin_locked_stack,
8254 +              info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8255 +              info->spin_locked_fq,
8256 +              info->spin_locked_inode,
8257 +              info->rw_locked_cbk_cache,
8258 +              info->read_locked_cbk_cache,
8259 +              info->write_locked_cbk_cache,
8260 +              info->spin_locked_super_eflush,
8261 +              info->spin_locked_zlock,
8262 +              info->spin_locked,
8263 +              info->long_term_locked_znode,
8264 +              info->inode_sem_r, info->inode_sem_w,
8265 +              info->d_refs, info->x_refs, info->t_refs);
8266 +}
8267 +
8268 +/* check that no spinlocks are held */
8269 +int reiser4_schedulable(void)
8270 +{
8271 +       if (get_current_context_check() != NULL) {
8272 +               if (!LOCK_CNT_NIL(spin_locked)) {
8273 +                       print_lock_counters("in atomic", reiser4_lock_counters());
8274 +                       return 0;
8275 +               }
8276 +       }
8277 +       might_sleep();
8278 +       return 1;
8279 +}
8280 +/*
8281 + * return true, iff no locks are held.
8282 + */
8283 +int reiser4_no_counters_are_held(void)
8284 +{
8285 +       reiser4_lock_cnt_info *counters;
8286 +
8287 +       counters = reiser4_lock_counters();
8288 +       return
8289 +           (counters->spin_locked_zlock == 0) &&
8290 +           (counters->spin_locked_jnode == 0) &&
8291 +           (counters->rw_locked_tree == 0) &&
8292 +           (counters->read_locked_tree == 0) &&
8293 +           (counters->write_locked_tree == 0) &&
8294 +           (counters->rw_locked_dk == 0) &&
8295 +           (counters->read_locked_dk == 0) &&
8296 +           (counters->write_locked_dk == 0) &&
8297 +           (counters->spin_locked_txnh == 0) &&
8298 +           (counters->spin_locked_atom == 0) &&
8299 +           (counters->spin_locked_stack == 0) &&
8300 +           (counters->spin_locked_txnmgr == 0) &&
8301 +           (counters->spin_locked_inode == 0) &&
8302 +           (counters->spin_locked == 0) &&
8303 +           (counters->long_term_locked_znode == 0) &&
8304 +           (counters->inode_sem_r == 0) &&
8305 +           (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8306 +}
8307 +
8308 +/*
8309 + * return true, iff transaction commit can be done under locks held by the
8310 + * current thread.
8311 + */
8312 +int reiser4_commit_check_locks(void)
8313 +{
8314 +       reiser4_lock_cnt_info *counters;
8315 +       int inode_sem_r;
8316 +       int inode_sem_w;
8317 +       int result;
8318 +
8319 +       /*
8320 +        * inode's read/write semaphore is the only reiser4 lock that can be
8321 +        * held during commit.
8322 +        */
8323 +
8324 +       counters = reiser4_lock_counters();
8325 +       inode_sem_r = counters->inode_sem_r;
8326 +       inode_sem_w = counters->inode_sem_w;
8327 +
8328 +       counters->inode_sem_r = counters->inode_sem_w = 0;
8329 +       result = reiser4_no_counters_are_held();
8330 +       counters->inode_sem_r = inode_sem_r;
8331 +       counters->inode_sem_w = inode_sem_w;
8332 +       return result;
8333 +}
8334 +
8335 +/*
8336 + * fill "error site" in the current reiser4 context. See comment before RETERR
8337 + * macro for more details.
8338 + */
8339 +void reiser4_return_err(int code, const char *file, int line)
8340 +{
8341 +       if (code < 0 && is_in_reiser4_context()) {
8342 +               reiser4_context *ctx = get_current_context();
8343 +
8344 +               if (ctx != NULL) {
8345 +                       ctx->err.code = code;
8346 +                       ctx->err.file = file;
8347 +                       ctx->err.line = line;
8348 +               }
8349 +       }
8350 +}
8351 +
8352 +#if 0
8353 +/*
8354 + * report error information recorded by reiser4_return_err().
8355 + */
8356 +static void reiser4_report_err(void)
8357 +{
8358 +       reiser4_context *ctx = get_current_context_check();
8359 +
8360 +       if (ctx != NULL) {
8361 +               if (ctx->err.code != 0) {
8362 +                       printk("code: %i at %s:%i\n",
8363 +                              ctx->err.code, ctx->err.file, ctx->err.line);
8364 +               }
8365 +       }
8366 +}
8367 +#endif  /*  0  */
8368 +
8369 +#endif                         /* REISER4_DEBUG */
8370 +
8371 +#if KERNEL_DEBUGGER
8372 +
8373 +/*
8374 + * this function just drops into kernel debugger. It is a convenient place to
8375 + * put breakpoint in.
8376 + */
8377 +void reiser4_debugtrap(void)
8378 +{
8379 +       /* do nothing. Put break point here. */
8380 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8381 +       extern void breakpoint(void);
8382 +       breakpoint();
8383 +#endif
8384 +}
8385 +#endif
8386 +
8387 +/* Make Linus happy.
8388 +   Local variables:
8389 +   c-indentation-style: "K&R"
8390 +   mode-name: "LC"
8391 +   c-basic-offset: 8
8392 +   tab-width: 8
8393 +   fill-column: 120
8394 +   End:
8395 +*/
8396 diff -urN linux-2.6.27.orig/fs/reiser4/debug.h linux-2.6.27/fs/reiser4/debug.h
8397 --- linux-2.6.27.orig/fs/reiser4/debug.h        1970-01-01 03:00:00.000000000 +0300
8398 +++ linux-2.6.27/fs/reiser4/debug.h     2008-10-12 18:20:00.000000000 +0400
8399 @@ -0,0 +1,350 @@
8400 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8401 +
8402 +/* Declarations of debug macros. */
8403 +
8404 +#if !defined( __FS_REISER4_DEBUG_H__ )
8405 +#define __FS_REISER4_DEBUG_H__
8406 +
8407 +#include "forward.h"
8408 +#include "reiser4.h"
8409 +
8410 +/* generic function to produce formatted output, decorating it with
8411 +   whatever standard prefixes/postfixes we want. "Fun" is a function
8412 +   that will be actually called, can be printk, panic etc.
8413 +   This is for use by other debugging macros, not by users. */
8414 +#define DCALL(lev, fun, reperr, label, format, ...)                    \
8415 +({                                                                     \
8416 +       fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" ,   \
8417 +           current->comm, current->pid, __FUNCTION__,                  \
8418 +           __FILE__, __LINE__, label, ## __VA_ARGS__);                 \
8419 +})
8420 +
8421 +/*
8422 + * cause kernel to crash
8423 + */
8424 +#define reiser4_panic(mid, format, ...)                                \
8425 +       DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8426 +
8427 +/* print message with indication of current process, file, line and
8428 +   function */
8429 +#define reiser4_log(label, format, ...)                                \
8430 +       DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8431 +
8432 +/* Assertion checked during compilation.
8433 +    If "cond" is false (0) we get duplicate case label in switch.
8434 +    Use this to check something like famous
8435 +       cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8436 +    in 3.x journal.c. If cassertion fails you get compiler error,
8437 +    so no "maintainer-id".
8438 +*/
8439 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
8440 +
8441 +#define noop   do {;} while(0)
8442 +
8443 +#if REISER4_DEBUG
8444 +/* version of info that only actually prints anything when _d_ebugging
8445 +    is on */
8446 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8447 +/* macro to catch logical errors. Put it into `default' clause of
8448 +    switch() statement. */
8449 +#define impossible(label, format, ...)                         \
8450 +         reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8451 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8452 +   called. Use this for checking logical consistency and _never_ call
8453 +   this to check correctness of external data: disk blocks and user-input . */
8454 +#define assert(label, cond)                                                    \
8455 +({                                                                             \
8456 +       /* call_on_each_assert(); */                                            \
8457 +       if (cond) {                                                             \
8458 +               /* put negated check to avoid using !(cond) that would lose     \
8459 +                * warnings for things like assert(a = b); */                   \
8460 +               ;                                                               \
8461 +       } else {                                                                \
8462 +               DEBUGON(1);                                                     \
8463 +               reiser4_panic(label, "assertion failed: %s", #cond);            \
8464 +       }                                                                       \
8465 +})
8466 +
8467 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
8468 +#define check_me( label, expr )        assert( label, ( expr ) )
8469 +
8470 +#define ON_DEBUG( exp ) exp
8471 +
8472 +extern int reiser4_schedulable(void);
8473 +extern void call_on_each_assert(void);
8474 +
8475 +#else
8476 +
8477 +#define dinfo( format, args... ) noop
8478 +#define impossible( label, format, args... ) noop
8479 +#define assert( label, cond ) noop
8480 +#define check_me( label, expr )        ( ( void ) ( expr ) )
8481 +#define ON_DEBUG( exp )
8482 +#define reiser4_schedulable() might_sleep()
8483 +
8484 +/* REISER4_DEBUG */
8485 +#endif
8486 +
8487 +#if REISER4_DEBUG
8488 +/* per-thread information about locks acquired by this thread. Used by lock
8489 + * ordering checking in spin_macros.h */
8490 +typedef struct reiser4_lock_cnt_info {
8491 +       int rw_locked_tree;
8492 +       int read_locked_tree;
8493 +       int write_locked_tree;
8494 +
8495 +       int rw_locked_dk;
8496 +       int read_locked_dk;
8497 +       int write_locked_dk;
8498 +
8499 +       int rw_locked_cbk_cache;
8500 +       int read_locked_cbk_cache;
8501 +       int write_locked_cbk_cache;
8502 +
8503 +       int spin_locked_zlock;
8504 +       int spin_locked_jnode;
8505 +       int spin_locked_jload;
8506 +       int spin_locked_txnh;
8507 +       int spin_locked_atom;
8508 +       int spin_locked_stack;
8509 +       int spin_locked_txnmgr;
8510 +       int spin_locked_ktxnmgrd;
8511 +       int spin_locked_fq;
8512 +       int spin_locked_inode;
8513 +       int spin_locked_super_eflush;
8514 +       int spin_locked;
8515 +       int long_term_locked_znode;
8516 +
8517 +       int inode_sem_r;
8518 +       int inode_sem_w;
8519 +
8520 +       int d_refs;
8521 +       int x_refs;
8522 +       int t_refs;
8523 +} reiser4_lock_cnt_info;
8524 +
8525 +extern struct reiser4_lock_cnt_info *reiser4_lock_counters(void);
8526 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
8527 +
8528 +/* increment lock-counter @counter, if present */
8529 +#define LOCK_CNT_INC(counter)                                  \
8530 +       IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
8531 +
8532 +/* decrement lock-counter @counter, if present */
8533 +#define LOCK_CNT_DEC(counter)                                  \
8534 +       IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
8535 +
8536 +/* check that lock-counter is zero. This is for use in assertions */
8537 +#define LOCK_CNT_NIL(counter)                                  \
8538 +       IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
8539 +
8540 +/* check that lock-counter is greater than zero. This is for use in
8541 + * assertions */
8542 +#define LOCK_CNT_GTZ(counter)                                  \
8543 +       IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
8544 +#define LOCK_CNT_LT(counter,n)                                 \
8545 +       IN_CONTEXT(reiser4_lock_counters()->counter < (n), 1)
8546 +
8547 +#else                          /* REISER4_DEBUG */
8548 +
8549 +/* no-op versions of the above */
8550 +
8551 +typedef struct reiser4_lock_cnt_info {
8552 +} reiser4_lock_cnt_info;
8553 +
8554 +#define reiser4_lock_counters() ((reiser4_lock_cnt_info *)NULL)
8555 +#define LOCK_CNT_INC(counter) noop
8556 +#define LOCK_CNT_DEC(counter) noop
8557 +#define LOCK_CNT_NIL(counter) (1)
8558 +#define LOCK_CNT_GTZ(counter) (1)
8559 +#define LOCK_CNT_LT(counter,n) (1)
8560 +
8561 +#endif                         /* REISER4_DEBUG */
8562 +
8563 +#define assert_spin_not_locked(lock) BUG_ON(0)
8564 +#define assert_rw_write_locked(lock) BUG_ON(0)
8565 +#define assert_rw_read_locked(lock) BUG_ON(0)
8566 +#define assert_rw_locked(lock) BUG_ON(0)
8567 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
8568 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
8569 +#define assert_rw_not_locked(lock) BUG_ON(0)
8570 +
8571 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
8572 +   option. */
8573 +typedef enum {
8574 +       /* print a lot of information during panic. When this is on all jnodes
8575 +        * are listed. This can be *very* large output. Usually you don't want
8576 +        * this. Especially over serial line. */
8577 +       REISER4_VERBOSE_PANIC = 0x00000001,
8578 +       /* print a lot of information during umount */
8579 +       REISER4_VERBOSE_UMOUNT = 0x00000002,
8580 +       /* print gathered statistics on umount */
8581 +       REISER4_STATS_ON_UMOUNT = 0x00000004,
8582 +       /* check node consistency */
8583 +       REISER4_CHECK_NODE = 0x00000008
8584 +} reiser4_debug_flags;
8585 +
8586 +extern int is_in_reiser4_context(void);
8587 +
8588 +/*
8589 + * evaluate expression @e only if within reiser4 context
8590 + */
8591 +#define ON_CONTEXT(e)  do {                    \
8592 +       if(is_in_reiser4_context()) {           \
8593 +               e;                              \
8594 +       } } while(0)
8595 +
8596 +/*
8597 + * evaluate expression @e only when within reiser4_context and debugging is
8598 + * on.
8599 + */
8600 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
8601 +
8602 +/*
8603 + * complain about unexpected function result and crash. Used in "default"
8604 + * branches of switch statements and alike to assert that invalid results are
8605 + * not silently ignored.
8606 + */
8607 +#define wrong_return_value( label, function )                          \
8608 +       impossible( label, "wrong return value from " function )
8609 +
8610 +/* Issue different types of reiser4 messages to the console */
8611 +#define warning( label, format, ... )                                  \
8612 +       DCALL( KERN_WARNING,                                            \
8613 +              printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
8614 +#define notice( label, format, ... )                                   \
8615 +       DCALL( KERN_NOTICE,                                             \
8616 +              printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
8617 +
8618 +/* mark not yet implemented functionality */
8619 +#define not_yet( label, format, ... )                          \
8620 +       reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
8621 +
8622 +extern void reiser4_do_panic(const char *format, ...)
8623 +    __attribute__ ((noreturn, format(printf, 1, 2)));
8624 +
8625 +extern int reiser4_preempt_point(void);
8626 +extern void reiser4_print_stats(void);
8627 +
8628 +#if REISER4_DEBUG
8629 +extern int reiser4_no_counters_are_held(void);
8630 +extern int reiser4_commit_check_locks(void);
8631 +#else
8632 +#define reiser4_no_counters_are_held() (1)
8633 +#define reiser4_commit_check_locks() (1)
8634 +#endif
8635 +
8636 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
8637 +#define IS_POW(i)                              \
8638 +({                                             \
8639 +       typeof(i) __i;                          \
8640 +                                               \
8641 +       __i = (i);                              \
8642 +       !(__i & (__i - 1));                     \
8643 +})
8644 +
8645 +#define KERNEL_DEBUGGER (1)
8646 +
8647 +#if KERNEL_DEBUGGER
8648 +
8649 +extern void reiser4_debugtrap(void);
8650 +
8651 +/*
8652 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
8653 + * kgdb is not compiled in, do nothing.
8654 + */
8655 +#define DEBUGON(cond)                                  \
8656 +({                                                     \
8657 +       if (unlikely(cond))                             \
8658 +               reiser4_debugtrap();                    \
8659 +})
8660 +#else
8661 +#define DEBUGON(cond) noop
8662 +#endif
8663 +
8664 +/*
8665 + * Error code tracing facility. (Idea is borrowed from XFS code.)
8666 + *
8667 + * Suppose some strange and/or unexpected code is returned from some function
8668 + * (for example, write(2) returns -EEXIST). It is possible to place a
8669 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
8670 + * in what particular place -EEXIST was generated first?
8671 + *
8672 + * In reiser4 all places where actual error codes are produced (that is,
8673 + * statements of the form
8674 + *
8675 + *     return -EFOO;        // (1), or
8676 + *
8677 + *     result = -EFOO;      // (2)
8678 + *
8679 + * are replaced with
8680 + *
8681 + *     return RETERR(-EFOO);        // (1a), and
8682 + *
8683 + *     result = RETERR(-EFOO);      // (2a) respectively
8684 + *
8685 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
8686 + * printed in error and warning messages. Moreover, it's possible to put a
8687 + * conditional breakpoint in reiser4_return_err (low-level function called
8688 + * by RETERR() to do the actual work) to break into debugger immediately
8689 + * when particular error happens.
8690 + *
8691 + */
8692 +
8693 +#if REISER4_DEBUG
8694 +
8695 +/*
8696 + * data-type to store information about where error happened ("error site").
8697 + */
8698 +typedef struct err_site {
8699 +       int code;               /* error code */
8700 +       const char *file;       /* source file, filled by __FILE__ */
8701 +       int line;               /* source file line, filled by __LINE__ */
8702 +} err_site;
8703 +
8704 +extern void reiser4_return_err(int code, const char *file, int line);
8705 +
8706 +/*
8707 + * fill &get_current_context()->err with error information.
8708 + */
8709 +#define RETERR(code)                                   \
8710 +({                                                     \
8711 +       typeof(code) __code;                            \
8712 +                                                       \
8713 +       __code = (code);                                \
8714 +       reiser4_return_err(__code, __FILE__, __LINE__); \
8715 +       __code;                                         \
8716 +})
8717 +
8718 +#else
8719 +
8720 +/*
8721 + * no-op versions of the above
8722 + */
8723 +
8724 +typedef struct err_site {
8725 +} err_site;
8726 +#define RETERR(code) (code)
8727 +#endif
8728 +
8729 +#if REISER4_LARGE_KEY
8730 +/*
8731 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
8732 + */
8733 +#define ON_LARGE_KEY(...) __VA_ARGS__
8734 +#else
8735 +#define ON_LARGE_KEY(...)
8736 +#endif
8737 +
8738 +/* __FS_REISER4_DEBUG_H__ */
8739 +#endif
8740 +
8741 +/* Make Linus happy.
8742 +   Local variables:
8743 +   c-indentation-style: "K&R"
8744 +   mode-name: "LC"
8745 +   c-basic-offset: 8
8746 +   tab-width: 8
8747 +   fill-column: 120
8748 +   End:
8749 +*/
8750 diff -urN linux-2.6.27.orig/fs/reiser4/dformat.h linux-2.6.27/fs/reiser4/dformat.h
8751 --- linux-2.6.27.orig/fs/reiser4/dformat.h      1970-01-01 03:00:00.000000000 +0300
8752 +++ linux-2.6.27/fs/reiser4/dformat.h   2008-10-12 18:20:00.000000000 +0400
8753 @@ -0,0 +1,70 @@
8754 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8755 +
8756 +/* Formats of on-disk data and conversion functions. */
8757 +
8758 +/* put all item formats in the files describing the particular items,
8759 +   our model is, everything you need to do to add an item to reiser4,
8760 +   (excepting the changes to the plugin that uses the item which go
8761 +   into the file defining that plugin), you put into one file. */
8762 +/* Data on disk are stored in little-endian format.
8763 +   To declare fields of on-disk structures, use d8, d16, d32 and d64.
8764 +   d??tocpu() and cputod??() to convert. */
8765 +
8766 +#if !defined( __FS_REISER4_DFORMAT_H__ )
8767 +#define __FS_REISER4_DFORMAT_H__
8768 +
8769 +#include <asm/byteorder.h>
8770 +#include <asm/unaligned.h>
8771 +#include <linux/types.h>
8772 +
8773 +typedef __u8 d8;
8774 +typedef __le16 d16;
8775 +typedef __le32 d32;
8776 +typedef __le64 d64;
8777 +
8778 +#define PACKED __attribute__((packed))
8779 +
8780 +/* data-type for block number */
8781 +typedef __u64 reiser4_block_nr;
8782 +
8783 +/* data-type for block number on disk, disk format */
8784 +typedef __le64 reiser4_dblock_nr;
8785 +
8786 +/**
8787 + * disk_addr_eq - compare disk addresses
8788 + * @b1: pointer to block number to compare
8789 + * @b2: pointer to block number to compare
8790 + *
8791 + * Returns true if disk addresses are the same
8792 + */
8793 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
8794 +                              const reiser4_block_nr * b2)
8795 +{
8796 +       assert("nikita-1033", b1 != NULL);
8797 +       assert("nikita-1266", b2 != NULL);
8798 +
8799 +       return !memcmp(b1, b2, sizeof *b1);
8800 +}
8801 +
8802 +/* structure of master reiser4 super block */
8803 +typedef struct reiser4_master_sb {
8804 +       char magic[16];         /* "ReIsEr4" */
8805 +       __le16 disk_plugin_id;  /* id of disk layout plugin */
8806 +       __le16 blocksize;
8807 +       char uuid[16];          /* unique id */
8808 +       char label[16];         /* filesystem label */
8809 +       __le64 diskmap;         /* location of the diskmap. 0 if not present */
8810 +} reiser4_master_sb;
8811 +
8812 +/* __FS_REISER4_DFORMAT_H__ */
8813 +#endif
8814 +
8815 +/*
8816 + * Local variables:
8817 + * c-indentation-style: "K&R"
8818 + * mode-name: "LC"
8819 + * c-basic-offset: 8
8820 + * tab-width: 8
8821 + * fill-column: 79
8822 + * End:
8823 + */
8824 diff -urN linux-2.6.27.orig/fs/reiser4/dscale.c linux-2.6.27/fs/reiser4/dscale.c
8825 --- linux-2.6.27.orig/fs/reiser4/dscale.c       1970-01-01 03:00:00.000000000 +0300
8826 +++ linux-2.6.27/fs/reiser4/dscale.c    2008-10-12 18:20:00.000000000 +0400
8827 @@ -0,0 +1,192 @@
8828 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8829 + * reiser4/README */
8830 +
8831 +/* Scalable on-disk integers */
8832 +
8833 +/*
8834 + * Various on-disk structures contain integer-like structures. Stat-data
8835 + * contain [yes, "data" is plural, check the dictionary] file size, link
8836 + * count; extent unit contains extent width etc. To accommodate for general
8837 + * case enough space is reserved to keep largest possible value. 64 bits in
8838 + * all cases above. But in overwhelming majority of cases numbers actually
8839 + * stored in these fields will be comparatively small and reserving 8 bytes is
8840 + * a waste of precious disk bandwidth.
8841 + *
8842 + * Scalable integers are one way to solve this problem. dscale_write()
8843 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
8844 + * depending on the magnitude of the value supplied. dscale_read() reads value
8845 + * previously stored by dscale_write().
8846 + *
8847 + * dscale_write() produces format not completely unlike of UTF: two highest
8848 + * bits of the first byte are used to store "tag". One of 4 possible tag
8849 + * values is chosen depending on the number being encoded:
8850 + *
8851 + *           0 ... 0x3f               => 0           [table 1]
8852 + *        0x40 ... 0x3fff             => 1
8853 + *      0x4000 ... 0x3fffffff         => 2
8854 + *  0x40000000 ... 0xffffffffffffffff => 3
8855 + *
8856 + * (see dscale_range() function)
8857 + *
8858 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
8859 + * to be stored, so in this case there is no place in the first byte to store
8860 + * tag. For such values tag is stored in an extra 9th byte.
8861 + *
8862 + * As _highest_ bits are used for the test (which is natural) scaled integers
8863 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
8864 + * uses LITTLE-ENDIAN.
8865 + *
8866 + */
8867 +
8868 +#include "debug.h"
8869 +#include "dscale.h"
8870 +
8871 +/* return tag of scaled integer stored at @address */
8872 +static int gettag(const unsigned char *address)
8873 +{
8874 +       /* tag is stored in two highest bits */
8875 +       return (*address) >> 6;
8876 +}
8877 +
8878 +/* clear tag from value. Clear tag embedded into @value. */
8879 +static void cleartag(__u64 * value, int tag)
8880 +{
8881 +       /*
8882 +        * W-w-what ?!
8883 +        *
8884 +        * Actually, this is rather simple: @value passed here was read by
8885 +        * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
8886 +        * zeroes. Tag is still stored in the highest (arithmetically)
8887 +        * non-zero bits of @value, but relative position of tag within __u64
8888 +        * depends on @tag.
8889 +        *
8890 +        * For example if @tag is 0, it's stored in 2 highest bits of lowest
8891 +        * byte, and its offset (counting from lowest bit) is 8 - 2 == 6 bits.
8892 +        *
8893 +        * If tag is 1, it's stored in two highest bits of 2nd lowest byte,
8894 +        * and its offset is (2 * 8) - 2 == 14 bits.
8895 +        *
8896 +        * See table 1 above for details. All these cases are captured by
8897 +        * the formula below; the mask is built as __u64 because for @tag == 2
8898 +        * the shift is 30 and an int-typed 3 << 30 would overflow.
8899 +        */
8900 +       *value &= ~((__u64)3 << (((1 << tag) << 3) - 2));
8901 +       /*
8902 +        * That is, clear two (3 == binary 11) bits at the offset
8903 +        *
8904 +        *                  8 * (2 ^ tag) - 2,
8905 +        *
8906 +        * that is, two highest bits of (2 ^ tag)-th byte of @value.
8907 +        */
8908 +}
8909 +
8910 +/* return tag for @value. See table 1 above for details. */
8911 +static int dscale_range(__u64 value)
8912 +{
8913 +       if (value > 0x3fffffff)
8914 +               return 3;
8915 +       if (value > 0x3fff)
8916 +               return 2;
8917 +       if (value > 0x3f)
8918 +               return 1;
8919 +       return 0;
8920 +}
8921 +
8922 +/* restore value stored at @address by dscale_write() and return number of
8923 + * bytes consumed */
8924 +int dscale_read(unsigned char *address, __u64 * value)
8925 +{
8926 +       int tag;
8927 +
8928 +       /* read tag */
8929 +       tag = gettag(address);
8930 +       switch (tag) {
8931 +       case 3:
8932 +               /* In this case tag is stored in an extra byte, skip this byte
8933 +                * and decode value stored in the next 8 bytes.*/
8934 +               *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
8935 +               /* worst case: 8 bytes for value itself plus one byte for
8936 +                * tag. */
8937 +               return 9;
8938 +       case 0:
8939 +               *value = get_unaligned(address);
8940 +               break;
8941 +       case 1:
8942 +               *value = __be16_to_cpu(get_unaligned((__be16 *)address));
8943 +               break;
8944 +       case 2:
8945 +               *value = __be32_to_cpu(get_unaligned((__be32 *)address));
8946 +               break;
8947 +       default:
8948 +               return RETERR(-EIO);
8949 +       }
8950 +       /* clear tag embedded into @value */
8951 +       cleartag(value, tag);
8952 +       /* number of bytes consumed is (2 ^ tag)---see table 1. */
8953 +       return 1 << tag;
8954 +}
8955 +
8956 +/* number of bytes consumed */
8957 +int dscale_bytes_to_read(unsigned char *address)
8958 +{
8959 +       int tag;
8960 +
8961 +       tag = gettag(address);
8962 +       switch (tag) {
8963 +       case 0:
8964 +       case 1:
8965 +       case 2:
8966 +               return 1 << tag;
8967 +       case 3:
8968 +               return 9;
8969 +       default:
8970 +               return RETERR(-EIO);
8971 +       }
8972 +}
8973 +
8974 +/* store @value at @address and return number of bytes consumed */
8975 +int dscale_write(unsigned char *address, __u64 value)
8976 +{
8977 +       int tag;
8978 +       int shift;
8979 +       __be64 v;
8980 +       unsigned char *valarr;
8981 +
8982 +       tag = dscale_range(value);
8983 +       v = __cpu_to_be64(value);
8984 +       valarr = (unsigned char *)&v;
8985 +       shift = (tag == 3) ? 1 : 0;
8986 +       memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
8987 +       *address |= (tag << 6);
8988 +       return shift + (1 << tag);
8989 +}
8990 +
8991 +/* number of bytes required to store @value */
8992 +int dscale_bytes_to_write(__u64 value)
8993 +{
8994 +       int bytes;
8995 +
8996 +       bytes = 1 << dscale_range(value);
8997 +       if (bytes == 8)
8998 +               ++bytes;
8999 +       return bytes;
9000 +}
9001 +
9002 +/* returns true if @value and @other require the same number of bytes to be
9003 + * stored. Used to detect when data structure (like stat-data) has to be
9004 + * expanded or contracted. */
9005 +int dscale_fit(__u64 value, __u64 other)
9006 +{
9007 +       return dscale_range(value) == dscale_range(other);
9008 +}
9009 +
9010 +/* Make Linus happy.
9011 +   Local variables:
9012 +   c-indentation-style: "K&R"
9013 +   mode-name: "LC"
9014 +   c-basic-offset: 8
9015 +   tab-width: 8
9016 +   fill-column: 120
9017 +   scroll-step: 1
9018 +   End:
9019 +*/
9020 diff -urN linux-2.6.27.orig/fs/reiser4/dscale.h linux-2.6.27/fs/reiser4/dscale.h
9021 --- linux-2.6.27.orig/fs/reiser4/dscale.h       1970-01-01 03:00:00.000000000 +0300
9022 +++ linux-2.6.27/fs/reiser4/dscale.h    2008-10-12 18:20:00.000000000 +0400
9023 @@ -0,0 +1,28 @@
9024 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9025 + * reiser4/README */
9026 +
9027 +/* Scalable on-disk integers. See dscale.c for details. */
9028 +
9029 +#if !defined( __FS_REISER4_DSCALE_H__ )
9030 +#define __FS_REISER4_DSCALE_H__
9031 +
9032 +#include "dformat.h"
9033 +
9034 +extern int dscale_read(unsigned char *address, __u64 * value);
9035 +extern int dscale_write(unsigned char *address, __u64 value);
9036 +extern int dscale_bytes_to_read(unsigned char *address);
9037 +extern int dscale_bytes_to_write(__u64 value);
9038 +extern int dscale_fit(__u64 value, __u64 other);
9039 +
9040 +/* __FS_REISER4_DSCALE_H__ */
9041 +#endif
9042 +
9043 +/* Make Linus happy.
9044 +   Local variables:
9045 +   c-indentation-style: "K&R"
9046 +   mode-name: "LC"
9047 +   c-basic-offset: 8
9048 +   tab-width: 8
9049 +   fill-column: 120
9050 +   End:
9051 +*/
9052 diff -urN linux-2.6.27.orig/fs/reiser4/entd.c linux-2.6.27/fs/reiser4/entd.c
9053 --- linux-2.6.27.orig/fs/reiser4/entd.c 1970-01-01 03:00:00.000000000 +0300
9054 +++ linux-2.6.27/fs/reiser4/entd.c      2008-10-12 18:20:00.000000000 +0400
9055 @@ -0,0 +1,335 @@
9056 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9057 + * reiser4/README */
9058 +
9059 +/* Ent daemon. */
9060 +
9061 +#include "debug.h"
9062 +#include "txnmgr.h"
9063 +#include "tree.h"
9064 +#include "entd.h"
9065 +#include "super.h"
9066 +#include "context.h"
9067 +#include "reiser4.h"
9068 +#include "vfs_ops.h"
9069 +#include "page_cache.h"
9070 +#include "inode.h"
9071 +
9072 +#include <linux/sched.h>       /* struct task_struct */
9073 +#include <linux/suspend.h>
9074 +#include <linux/kernel.h>
9075 +#include <linux/writeback.h>
9076 +#include <linux/time.h>                /* INITIAL_JIFFIES */
9077 +#include <linux/backing-dev.h> /* bdi_write_congested */
9078 +#include <linux/wait.h>
9079 +#include <linux/kthread.h>
9080 +#include <linux/freezer.h>
9081 +
9082 +#define DEF_PRIORITY 12
9083 +#define MAX_ENTD_ITERS 10
9084 +
9085 +static void entd_flush(struct super_block *, struct wbq *);
9086 +static int entd(void *arg);
9087 +
9088 +/*
9089 + * set ->comm field of ent thread to make its state visible to the user level;
9090 + * relies on a `super` variable being in scope at the point of use */
9091 +#define entd_set_comm(state)                                   \
9092 +       snprintf(current->comm, sizeof(current->comm),  \
9093 +                "ent:%s%s", super->s_id, (state))
9094 +
9095 +/**
9096 + * reiser4_init_entd - initialize entd context and start kernel daemon
9097 + * @super: super block to start ent thread for
9098 + *
9099 + * Resets the entd context of @super and starts the ent kernel thread. Returns
9100 + * 0 on success or the error code encoded in kthread_run() result.
9101 + */
9102 +int reiser4_init_entd(struct super_block *super)
9103 +{
9104 +       entd_context *ctx;
9105 +
9106 +       assert("nikita-3104", super != NULL);
9107 +
9108 +       ctx = get_entd_context(super);
9109 +
9110 +       memset(ctx, 0, sizeof *ctx);    /* zeroes all counters and ->done */
9111 +       spin_lock_init(&ctx->guard);
9112 +       init_waitqueue_head(&ctx->wait);
9113 +#if REISER4_DEBUG
9114 +       INIT_LIST_HEAD(&ctx->flushers_list);
9115 +#endif
9116 +       /* lists of writepage requests */
9117 +       INIT_LIST_HEAD(&ctx->todo_list);
9118 +       INIT_LIST_HEAD(&ctx->done_list);
9119 +       /* start entd; the thread is named after the super block id */
9120 +       ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9121 +       if (IS_ERR(ctx->tsk))
9122 +               return PTR_ERR(ctx->tsk);       /* NOTE(review): ctx->tsk keeps the ERR_PTR value here */
9123 +       return 0;
9124 +}
9125 +
9126 +static void put_wbq(struct wbq *rq)
9127 +{
9128 +       iput(rq->mapping->host);        /* drop the inode reference; done before complete() */
9129 +       complete(&rq->completion);      /* wake whoever waits on rq->completion */
9130 +}
9131 +
9132 +/* dequeue first request of ent->todo_list, NULL if list is empty. ent should be locked */
9133 +static struct wbq *__get_wbq(entd_context * ent)
9134 +{
9135 +       struct wbq *wbq;
9136 +
9137 +       if (list_empty(&ent->todo_list))
9138 +               return NULL;
9139 +
9140 +       ent->nr_todo_reqs --;
9141 +       wbq = list_entry(ent->todo_list.next, struct wbq, link);
9142 +       list_del_init(&wbq->link);      /* leaves wbq->link as an empty list head */
9143 +       return wbq;
9144 +}
9145 +
9146 +/* ent thread function: serves requests from ent->todo_list until kthread_should_stop() is true */
9147 +static int entd(void *arg)
9148 +{
9149 +       struct super_block *super;
9150 +       entd_context *ent;
9151 +       int done = 0;
9152 +
9153 +       super = arg;
9154 +       /* do_fork() just copies task_struct into the new
9155 +          thread. ->fs_context shouldn't be copied of course. This shouldn't
9156 +          be a problem for the rest of the code though.
9157 +        */
9158 +       current->journal_info = NULL;
9159 +
9160 +       ent = get_entd_context(super);
9161 +
9162 +       while (!done) {
9163 +               try_to_freeze();
9164 +
9165 +               spin_lock(&ent->guard);
9166 +               while (ent->nr_todo_reqs != 0) {
9167 +                       struct wbq *rq;
9168 +
9169 +                       assert("", list_empty(&ent->done_list));
9170 +
9171 +                       /* take request from the queue head */
9172 +                       rq = __get_wbq(ent);
9173 +                       assert("", rq != NULL);
9174 +                       ent->cur_request = rq;
9175 +                       spin_unlock(&ent->guard);       /* flushing is done without the spinlock */
9176 +
9177 +                       entd_set_comm("!");
9178 +                       entd_flush(super, rq);
9179 +
9180 +                       put_wbq(rq);    /* NOTE(review): ent->cur_request still points at rq after this */
9181 +
9182 +                       /*
9183 +                        * complete every request on done_list via put_wbq()
9184 +                        */
9185 +                       spin_lock(&ent->guard);
9186 +                       while (!list_empty(&ent->done_list)) {
9187 +                               rq = list_entry(ent->done_list.next, struct wbq, link);
9188 +                               list_del_init(&rq->link);
9189 +                               ent->nr_done_reqs --;
9190 +                               spin_unlock(&ent->guard);       /* put_wbq() is called unlocked */
9191 +                               assert("", rq->written == 1);
9192 +                               put_wbq(rq);
9193 +                               spin_lock(&ent->guard);
9194 +                       }
9195 +               }
9196 +               spin_unlock(&ent->guard);
9197 +
9198 +               entd_set_comm(".");
9199 +
9200 +               {
9201 +                       DEFINE_WAIT(__wait);
9202 +
9203 +                       do {    /* runs once; do/while (0) only provides a break target */
9204 +                               prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9205 +                               if (kthread_should_stop()) {
9206 +                                       done = 1;
9207 +                                       break;
9208 +                               }
9209 +                               if (ent->nr_todo_reqs != 0)     /* checked without ent->guard */
9210 +                                       break;
9211 +                               schedule();
9212 +                       } while (0);
9213 +                       finish_wait(&ent->wait, &__wait);
9214 +               }
9215 +       }
9216 +       BUG_ON(ent->nr_todo_reqs != 0);
9217 +       return 0;
9218 +}
9219 +
9220 +/**
9221 + * reiser4_done_entd - stop entd kernel thread
9222 + * @super: super block to stop ent thread for
9223 + *
9224 + * It is called on umount. Asks the ent thread of @super to stop by calling
9225 + * kthread_stop() on it.
9226 + */
9227 +void reiser4_done_entd(struct super_block *super)
9228 +{
9229 +       entd_context *ent;
9230 +
9231 +       assert("nikita-3103", super != NULL);
9232 +
9233 +       ent = get_entd_context(super);
9234 +       assert("zam-1055", ent->tsk != NULL);
9235 +       kthread_stop(ent->tsk);
9236 +}
9237 +
9238 +/* called at the beginning of jnode_flush to register flusher thread with ent
9239 + * daemon: increments ent->flushers under ent->guard */
9240 +void reiser4_enter_flush(struct super_block *super)
9241 +{
9242 +       entd_context *ent;
9243 +
9244 +       assert("zam-1029", super != NULL);
9245 +       ent = get_entd_context(super);
9246 +
9247 +       assert("zam-1030", ent != NULL);
9248 +
9249 +       spin_lock(&ent->guard);
9250 +       ent->flushers++;
9251 +#if REISER4_DEBUG
9252 +       list_add(&get_current_context()->flushers_link, &ent->flushers_list);   /* debug-only bookkeeping */
9253 +#endif
9254 +       spin_unlock(&ent->guard);
9255 +}
9256 +
9257 +/* called at the end of jnode_flush: unregisters the flusher and wakes entd if requests are pending */
9258 +void reiser4_leave_flush(struct super_block *super)
9259 +{
9260 +       entd_context *ent;
9261 +       int wake_up_ent;
9262 +
9263 +       assert("zam-1027", super != NULL);
9264 +       ent = get_entd_context(super);
9265 +
9266 +       assert("zam-1028", ent != NULL);
9267 +
9268 +       spin_lock(&ent->guard);
9269 +       ent->flushers--;
9270 +       wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);   /* last flusher left, work queued */
9271 +#if REISER4_DEBUG
9272 +       list_del_init(&get_current_context()->flushers_link);
9273 +#endif
9274 +       spin_unlock(&ent->guard);
9275 +       if (wake_up_ent)
9276 +               wake_up(&ent->wait);    /* issued after ent->guard is released */
9277 +}
9278 +
9279 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9280 +
9281 +static void entd_flush(struct super_block *super, struct wbq *rq)      /* serve one request in ent thread */
9282 +{
9283 +       reiser4_context ctx;
9284 +       int tmp;
9285 +
9286 +       init_stack_context(&ctx, super);
9287 +       ctx.entd = 1;   /* mark this context as belonging to the ent thread */
9288 +       ctx.gfp_mask = GFP_NOFS;
9289 +
9290 +       rq->wbc->range_start = page_offset(rq->page);   /* first pass: a window starting at the requested page */
9291 +       rq->wbc->range_end = rq->wbc->range_start +
9292 +               (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9293 +       tmp = rq->wbc->nr_to_write;     /* NOTE(review): tmp is never read afterwards */
9294 +       rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9295 +
9296 +       if (rq->wbc->nr_to_write > 0) { /* budget left: widen to the whole range, all inodes of @super */
9297 +               rq->wbc->range_start = 0;
9298 +               rq->wbc->range_end = LLONG_MAX;
9299 +               generic_sync_sb_inodes(super, rq->wbc);
9300 +       }
9301 +       rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9302 +       reiser4_writeout(super, rq->wbc);
9303 +
9304 +       context_set_commit_async(&ctx);
9305 +       reiser4_exit_context(&ctx);
9306 +}
9307 +
9308 +/**
9309 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9310 + * @page: page to be written
9311 + * @wbc: writeback control passed to reiser4_writepage
9312 + *
9313 + * Creates a request, queues it on the entd todo list, wakes entd up if the
9314 + * list was empty and waits until entd completes the request. Returns 0.
9315 + */
9316 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9317 +{
9318 +       struct super_block *sb;
9319 +       struct inode *inode;
9320 +       entd_context *ent;
9321 +       struct wbq rq;  /* request lives on this stack frame until completion */
9322 +
9323 +       assert("", PageLocked(page));
9324 +       assert("", page->mapping != NULL);
9325 +
9326 +       sb = page->mapping->host->i_sb;
9327 +       ent = get_entd_context(sb);
9328 +       assert("", ent && ent->done == 0);
9329 +
9330 +       /*
9331 +        * we are going to unlock page and ask ent thread to write the
9332 +        * page. Re-dirty page before unlocking so that if ent thread fails to
9333 +        * write it - it will remain dirty
9334 +        */
9335 +       reiser4_set_page_dirty_internal(page);
9336 +
9337 +       /*
9338 +        * pin inode in memory, unlock page, ent thread will iput (put_wbq()). We
9339 +        * can not iput here because we can not allow delete_inode to be called
9340 +        */
9341 +       inode = igrab(page->mapping->host);
9342 +       unlock_page(page);
9343 +       if (inode == NULL)
9344 +               /* inode is getting freed */
9345 +               return 0;
9346 +
9347 +       /* init wbq */
9348 +       INIT_LIST_HEAD(&rq.link);
9349 +       rq.magic = WBQ_MAGIC;
9350 +       rq.wbc = wbc;
9351 +       rq.page = page;
9352 +       rq.mapping = inode->i_mapping;
9353 +       rq.node = NULL;
9354 +       rq.written = 0;
9355 +       init_completion(&rq.completion);
9356 +
9357 +       /* add request to entd's list of writepage requests */
9358 +       spin_lock(&ent->guard);
9359 +       ent->nr_todo_reqs++;
9360 +       list_add_tail(&rq.link, &ent->todo_list);
9361 +       if (ent->nr_todo_reqs == 1)     /* list was empty before this request */
9362 +               wake_up(&ent->wait);
9363 +
9364 +       spin_unlock(&ent->guard);
9365 +
9366 +       /* wait until entd finishes */
9367 +       wait_for_completion(&rq.completion);
9368 +
9369 +       if (rq.written)
9370 +               /* Eventually ENTD has written the page to disk. */
9371 +               return 0;
9372 +       return 0;       /* same result when the page was not written */
9373 +}
9374 +
9375 +int wbq_available(void)        /* number of requests queued for the ent thread of the current super block */
9376 +{
9377 +       struct super_block *sb = reiser4_get_current_sb();
9378 +       entd_context *ent = get_entd_context(sb);
9379 +       return ent->nr_todo_reqs;       /* read without taking ent->guard */
9380 +}
9381 +
9382 +/*
9383 + * Local variables:
9384 + * c-indentation-style: "K&R"
9385 + * mode-name: "LC"
9386 + * c-basic-offset: 8
9387 + * tab-width: 8
9388 + * fill-column: 79
9389 + * End:
9390 + */
9391 diff -urN linux-2.6.27.orig/fs/reiser4/entd.h linux-2.6.27/fs/reiser4/entd.h
9392 --- linux-2.6.27.orig/fs/reiser4/entd.h 1970-01-01 03:00:00.000000000 +0300
9393 +++ linux-2.6.27/fs/reiser4/entd.h      2008-10-12 18:20:00.000000000 +0400
9394 @@ -0,0 +1,90 @@
9395 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9396 +
9397 +/* Ent daemon. */
9398 +
9399 +#ifndef __ENTD_H__
9400 +#define __ENTD_H__
9401 +
9402 +#include "context.h"
9403 +
9404 +#include <linux/fs.h>
9405 +#include <linux/completion.h>
9406 +#include <linux/wait.h>
9407 +#include <linux/spinlock.h>
9408 +#include <linux/sched.h>       /* for struct task_struct */
9409 +
9410 +#define WBQ_MAGIC 0x7876dc76
9411 +
9412 +/* write-back request: one page handed to the ent thread by write_page_by_ent(). */
9413 +struct wbq {
9414 +       int magic; /* WBQ_MAGIC */
9415 +       struct list_head link; /* list head of this list is in entd context */
9416 +       struct writeback_control *wbc; /* writeback control of the requestor */
9417 +       struct page *page; /* page the requestor wants written */
9418 +       struct address_space *mapping; /* mapping of the pinned inode */
9419 +       struct completion completion; /* requestor sleeps on this until put_wbq() */
9420 +       jnode *node; /* set if ent thread captured requested page */
9421 +       int written; /* set if ent thread wrote requested page */
9422 +};
9423 +
9424 +/* ent-thread context. This is used to synchronize starting/stopping ent
9425 + * threads and to pass write-back requests to the ent thread. */
9426 +typedef struct entd_context {
9427 +        /* wait queue that ent thread waits on for more work. It's
9428 +         * signaled by write_page_by_ent() and reiser4_leave_flush(). */
9429 +       wait_queue_head_t wait;
9430 +       /* spinlock protecting other fields */
9431 +       spinlock_t guard;
9432 +       /* ent thread */
9433 +       struct task_struct *tsk;
9434 +       /* set to indicate that ent thread should leave. NOTE(review): entd() itself polls kthread_should_stop() */
9435 +       int done;
9436 +       /* counter of active flushers */
9437 +       int flushers;
9438 +       /*
9439 +        * when reiser4_writepage asks entd to write a page - it adds struct
9440 +        * wbq to this list
9441 +        */
9442 +       struct list_head todo_list;
9443 +       /* number of elements on the above list */
9444 +       int nr_todo_reqs;
9445 +
9446 +       struct wbq *cur_request; /* request most recently dequeued by entd() */
9447 +       /*
9448 +        * when entd writes a page it moves write-back request from todo_list
9449 +        * to done_list. This list is used at the end of entd iteration to
9450 +        * wakeup requestors and iput inodes.
9451 +        */
9452 +       struct list_head done_list;
9453 +       /* number of elements on the above list */
9454 +       int nr_done_reqs;
9455 +
9456 +#if REISER4_DEBUG
9457 +       /* list of all active flushers */
9458 +       struct list_head flushers_list;
9459 +#endif
9460 +} entd_context;
9461 +
9462 +extern int  reiser4_init_entd(struct super_block *);
9463 +extern void reiser4_done_entd(struct super_block *);
9464 +
9465 +extern void reiser4_enter_flush(struct super_block *);
9466 +extern void reiser4_leave_flush(struct super_block *);
9467 +
9468 +extern int write_page_by_ent(struct page *, struct writeback_control *);
9469 +extern int wbq_available(void);
9470 +extern void ent_writes_page(struct super_block *, struct page *);
9471 +
9472 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
9473 +/* __ENTD_H__ */
9474 +#endif
9475 +
9476 +/* Make Linus happy.
9477 +   Local variables:
9478 +   c-indentation-style: "K&R"
9479 +   mode-name: "LC"
9480 +   c-basic-offset: 8
9481 +   tab-width: 8
9482 +   fill-column: 120
9483 +   End:
9484 +*/
9485 diff -urN linux-2.6.27.orig/fs/reiser4/eottl.c linux-2.6.27/fs/reiser4/eottl.c
9486 --- linux-2.6.27.orig/fs/reiser4/eottl.c        1970-01-01 03:00:00.000000000 +0300
9487 +++ linux-2.6.27/fs/reiser4/eottl.c     2008-10-12 18:20:00.000000000 +0400
9488 @@ -0,0 +1,509 @@
9489 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9490 +
9491 +#include "forward.h"
9492 +#include "debug.h"
9493 +#include "key.h"
9494 +#include "coord.h"
9495 +#include "plugin/item/item.h"
9496 +#include "plugin/node/node.h"
9497 +#include "znode.h"
9498 +#include "block_alloc.h"
9499 +#include "tree_walk.h"
9500 +#include "tree_mod.h"
9501 +#include "carry.h"
9502 +#include "tree.h"
9503 +#include "super.h"
9504 +
9505 +#include <linux/types.h>       /* for __u??  */
9506 +
9507 +/*
9508 + * Extents on the twig level (EOTTL) handling.
9509 + *
9510 + * EOTTL poses some problems to the tree traversal, that are better explained
9511 + * by example.
9512 + *
9513 + * Suppose we have block B1 on the twig level with the following items:
9514 + *
9515 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
9516 + * offset)
9517 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
9518 + * 2. internal item I2 with key (10:0:0:0)
9519 + *
9520 + * We are trying to insert item with key (5:0:0:0). Lookup finds node B1, and
9521 + * then intra-node lookup is done. This lookup finished on the E1, because the
9522 + * key we are looking for is larger than the key of E1 and is smaller than the
9523 + * key of I2.
9524 + *
9525 + * Here search is stuck.
9526 + *
9527 + * After some thought it is clear what is wrong here: extents on the twig level
9528 + * break some basic property of the *search* tree (on the pretext, that they
9529 + * restore property of balanced tree).
9530 + *
9531 + * Said property is the following: if in the internal node of the search tree
9532 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
9533 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
9534 + * through the Pointer.
9535 + *
9536 + * This is not true, when Pointer is Extent-Pointer, simply because extent
9537 + * cannot expand indefinitely to the right to include any item with
9538 + *
9539 + *   Key1 <= Key <= Key2.
9540 + *
9541 + * For example, our E1 extent is only responsible for the data with keys
9542 + *
9543 + *   (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
9544 + *
9545 + * so, key range
9546 + *
9547 + *   ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
9548 + *
9549 + * is orphaned: there is no way to get there from the tree root.
9550 + *
9551 + * In other words, extent pointers are different than normal child pointers as
9552 + * far as search tree is concerned, and this creates such problems.
9553 + *
9554 + * Possible solution for this problem is to insert our item into node pointed
9555 + * to by I2. There are some problems though:
9556 + *
9557 + * (1) I2 can be in a different node.
9558 + * (2) E1 can be immediately followed by another extent E2.
9559 + *
9560 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
9561 + * for locks/coords as necessary.
9562 + *
9563 + * (2) is more complex. Solution here is to insert new empty leaf node and
9564 + * insert internal item between E1 and E2 pointing to said leaf node. This is
9565 + * further complicated by possibility that E2 is in a different node, etc.
9566 + *
9567 + * Problems:
9568 + *
9569 + * (1) if there was internal item I2 immediately on the right of an extent E1
9570 + * and we decided to insert new item S1 into node N2 pointed to by I2, then
9571 + * key of S1 will be less than smallest key in the N2. Normally, search key
9572 + * checks that key we are looking for is in the range of keys covered by the
9573 + * node key is being looked in. To work around this situation, while
9574 + * preserving useful consistency check, new flag CBK_TRUST_DK was added to the
9575 + * cbk flags bitmask. This flag is automatically set on entrance to the
9576 + * coord_by_key() and is only cleared when we are about to enter situation
9577 + * described above.
9578 + *
9579 + * (2) If extent E1 is immediately followed by another extent E2 and we are
9580 + * searching for the key that is between E1 and E2 we only have to insert new
9581 + * empty leaf node when coord_by_key was called for insertion, rather than just
9582 + * for lookup. To distinguish these cases, new flag CBK_FOR_INSERT was added to
9583 + * the cbk flags bitmask. This flag is automatically set by coord_by_key calls
9584 + * performed by insert_by_key() and friends.
9585 + *
9586 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
9587 + * case it requires modification of node content which is only possible under
9588 + * write lock. It may well happen that we only have read lock on the node where
9589 + * new internal pointer is to be inserted (common case: lookup of non-existent
9590 + * stat-data that falls between two extents). If only read lock is held, tree
9591 + * traversal is restarted with lock_level modified so that next time we hit
9592 + * this problem, write lock will be held. Once we have write lock, balancing
9593 + * will be performed.
9594 + */
9595 +
9596 +/**
9597 + * is_next_item_internal - check whether next item is internal
9598 + * @coord: coordinate of extent item in twig node
9599 + * @key: search key
9600 + * @lh: twig node lock handle
9601 + *
9602 + * Looks at the unit next to @coord. Returns 1 if it is internal: @coord is set
9603 + * to that unit and, if the unit is in right neighbor, @lh is moved to that
9604 + * node. Returns 0 if next item is not internal or does not exist: @coord and
9605 + * @lh are left unchanged. Returns 2 if search has to be restarted, negative
9606 + * error code if right neighbor could not be locked or loaded.
9607 + */
9608 +static int
9609 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
9610 +                     lock_handle *lh)
9611 +{
9612 +       coord_t next;
9613 +       lock_handle rn;
9614 +       int result;
9615 +
9616 +       coord_dup(&next, coord);        /* probe with a copy so @coord survives a miss */
9617 +       if (coord_next_unit(&next) == 0) {
9618 +               /* next unit is in this node */
9619 +               if (item_is_internal(&next)) {
9620 +                       coord_dup(coord, &next);
9621 +                       return 1;
9622 +               }
9623 +               assert("vs-3", item_is_extent(&next));
9624 +               return 0;
9625 +       }
9626 +
9627 +       /*
9628 +        * next unit either does not exist or is in right neighbor. If it is in
9629 +        * right neighbor we have to check right delimiting key because
9630 +        * concurrent thread could get there first and insert item with a key
9631 +        * smaller than @key
9632 +        */
9633 +       read_lock_dk(current_tree);
9634 +       result = keycmp(key, znode_get_rd_key(coord->node));
9635 +       read_unlock_dk(current_tree);
9636 +       assert("vs-6", result != EQUAL_TO);
9637 +       if (result == GREATER_THAN)
9638 +               return 2;       /* @key is beyond this node's right delimiting key: restart */
9639 +
9640 +       /* lock right neighbor in the same mode as coord->node is locked */
9641 +       init_lh(&rn);
9642 +       result = reiser4_get_right_neighbor(&rn, coord->node,
9643 +                                           znode_is_wlocked(coord->node) ?
9644 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
9645 +                                           GN_CAN_USE_UPPER_LEVELS);
9646 +       if (result == -E_NO_NEIGHBOR) {
9647 +               /* we are on the rightmost edge of the tree */
9648 +               done_lh(&rn);
9649 +               return 0;
9650 +       }
9651 +
9652 +       if (result) {
9653 +               assert("vs-4", result < 0);
9654 +               done_lh(&rn);
9655 +               return result;
9656 +       }
9657 +
9658 +       /*
9659 +        * check whether concurrent thread managed to insert item with a key
9660 +        * smaller than @key
9661 +        */
9662 +       read_lock_dk(current_tree);
9663 +       result = keycmp(key, znode_get_ld_key(rn.node));
9664 +       read_unlock_dk(current_tree);
9665 +       assert("vs-6", result != EQUAL_TO);
9666 +       if (result == GREATER_THAN) {
9667 +               done_lh(&rn);
9668 +               return 2;
9669 +       }
9670 +
9671 +       result = zload(rn.node);        /* load neighbor so its first unit can be inspected */
9672 +       if (result) {
9673 +               assert("vs-5", result < 0);
9674 +               done_lh(&rn);
9675 +               return result;
9676 +       }
9677 +
9678 +       coord_init_first_unit(&next, rn.node);
9679 +       if (item_is_internal(&next)) {
9680 +               /*
9681 +                * next unit is in right neighbor and it is a unit of internal
9682 +                * item. Unlock coord->node. Move @lh to right neighbor. @coord
9683 +                * is set to the first unit of right neighbor.
9684 +                */
9685 +               coord_dup(coord, &next);
9686 +               zrelse(rn.node);
9687 +               done_lh(lh);
9688 +               move_lh(lh, &rn);
9689 +               return 1;
9690 +       }
9691 +
9692 +       /*
9693 +        * next unit is unit of extent item. Return without changing @lh and
9694 +        * @coord.
9695 +        */
9696 +       assert("vs-6", item_is_extent(&next));
9697 +       zrelse(rn.node);
9698 +       done_lh(&rn);
9699 +       return 0;
9700 +}
9701 +
9702 +/**
9703 + * rd_key - calculate key of an item next to the given one
9704 + * @coord: position in a node
9705 + * @key: storage for result key
9706 + *
9707 + * @coord is set between items or after the last item in a node. Calculate key
9708 + * of item to the right of @coord, store it in @key and return @key.
9709 + */
9710 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
9711 +{
9712 +       coord_t dup;
9713 +
9714 +       assert("nikita-2281", coord_is_between_items(coord));
9715 +       coord_dup(&dup, coord); /* work on a copy, @coord is const */
9716 +
9717 +       if (coord_set_to_right(&dup) == 0)
9718 +               /* next item is in this node. Return its key. */
9719 +               unit_key_by_coord(&dup, key);
9720 +       else {
9721 +               /*
9722 +                * next item either does not exist or is in right
9723 +                * neighbor. Return znode's right delimiting key.
9724 +                */
9725 +               read_lock_dk(current_tree);
9726 +               *key = *znode_get_rd_key(coord->node);  /* copied under the dk read lock */
9727 +               read_unlock_dk(current_tree);
9728 +       }
9729 +       return key;
9730 +}
9731 +
9732 +/**
9733 + * add_empty_leaf - insert empty leaf between two extents
9734 + * @insert_coord: position in twig node between two extents
9735 + * @lh: twig node lock handle
9736 + * @key: left delimiting key of new node
9737 + * @rdkey: right delimiting key of new node
9738 + *
9739 + * Inserts empty leaf node between two extent items (items on the twig level)
9740 + * so that an item can be inserted on the leaf level between them. Returns 0
9741 + * with @lh and @insert_coord moved to the new leaf, or an error code.
9742 + */
9743 +static int
9744 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
9745 +              const reiser4_key *key, const reiser4_key *rdkey)
9746 +{
9747 +       int result;
9748 +       carry_pool *pool;
9749 +       carry_level *todo;
9750 +       reiser4_item_data *item;
9751 +       carry_insert_data *cdata;
9752 +       carry_op *op;
9753 +       znode *node;
9754 +       reiser4_tree *tree;
9755 +
9756 +       assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
9757 +       tree = znode_get_tree(insert_coord->node);
9758 +       node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
9759 +       if (IS_ERR(node))
9760 +               return PTR_ERR(node);
9761 +
9762 +       /* setup delimiting keys for node being inserted */
9763 +       write_lock_dk(tree);
9764 +       znode_set_ld_key(node, key);
9765 +       znode_set_rd_key(node, rdkey);
9766 +       ON_DEBUG(node->creator = current);
9767 +       ON_DEBUG(node->first_key = *key);
9768 +       write_unlock_dk(tree);
9769 +
9770 +       ZF_SET(node, JNODE_ORPHAN);
9771 +
9772 +       /* allocate carry_pool, 3 carry_level-s, reiser4_item_data and
9773 +          carry_insert_data */
9774 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
9775 +                              sizeof(*item) + sizeof(*cdata));
9776 +       if (IS_ERR(pool)) {
9777 +               zput(node);     /* drop reference from reiser4_new_node() */
9778 +               return PTR_ERR(pool);
9779 +       }
9780 +       todo = (carry_level *) (pool + 1);
9781 +       init_carry_level(todo, pool);
9782 +
9783 +       item = (reiser4_item_data *) (todo + 3);
9784 +       cdata = (carry_insert_data *) (item + 1);
9785 +
9786 +       op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
9787 +       if (!IS_ERR(op)) {
9788 +               cdata->coord = insert_coord;
9789 +               cdata->key = key;
9790 +               cdata->data = item;
9791 +               op->u.insert.d = cdata;
9792 +               op->u.insert.type = COPT_ITEM_DATA;
9793 +               build_child_ptr_data(node, item);
9794 +               item->arg = NULL;
9795 +               /* have @insert_coord to be set at inserted item after
9796 +                  insertion is done */
9797 +               todo->track_type = CARRY_TRACK_CHANGE;
9798 +               todo->tracked = lh;
9799 +
9800 +               result = reiser4_carry(todo, NULL);
9801 +               if (result == 0) {
9802 +                       /*
9803 +                        * pin node in memory. This is necessary for
9804 +                        * znode_make_dirty() below.
9805 +                        */
9806 +                       result = zload(node);
9807 +                       if (result == 0) {
9808 +                               lock_handle local_lh;
9809 +
9810 +                               /*
9811 +                                * if we inserted new child into tree we have
9812 +                                * to mark it dirty so that flush will be able
9813 +                                * to process it.
9814 +                                */
9815 +                               init_lh(&local_lh);
9816 +                               result = longterm_lock_znode(&local_lh, node,
9817 +                                                            ZNODE_WRITE_LOCK,
9818 +                                                            ZNODE_LOCK_LOPRI);
9819 +                               if (result == 0) {
9820 +                                       znode_make_dirty(node);
9821 +
9822 +                                       /*
9823 +                                        * when internal item pointing to @node
9824 +                                        * was inserted into twig node
9825 +                                        * create_hook_internal did not connect
9826 +                                        * it properly because its right
9827 +                                        * neighbor was not known. Do it
9828 +                                        * here
9829 +                                        */
9830 +                                       write_lock_tree(tree);
9831 +                                       assert("nikita-3312",
9832 +                                              znode_is_right_connected(node));
9833 +                                       assert("nikita-2984",
9834 +                                              node->right == NULL);
9835 +                                       ZF_CLR(node, JNODE_RIGHT_CONNECTED);
9836 +                                       write_unlock_tree(tree);
9837 +                                       result =
9838 +                                           connect_znode(insert_coord, node);
9839 +                                       ON_DEBUG(if (result == 0) check_dkeys(node););
9840 +
9841 +                                       done_lh(lh);
9842 +                                       move_lh(lh, &local_lh);
9843 +                                       assert("vs-1676", node_is_empty(node));
9844 +                                       coord_init_first_unit(insert_coord,
9845 +                                                             node);
9846 +                               } else {
9847 +                                       warning("nikita-3136",
9848 +                                               "Cannot lock child");
9849 +                               }
9850 +                               done_lh(&local_lh);
9851 +                               zrelse(node);
9852 +                       }
9853 +               }
9854 +       } else
9855 +               result = PTR_ERR(op);
9856 +       zput(node);     /* every path past pool allocation drops the node reference here */
9857 +       done_carry_pool(pool);
9858 +       return result;
9859 +}
9860 +
9861 +/**
9862 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
9863 + * @h: search handle
9864 + * @outcome: in: compared against NS_FOUND; out: LOOKUP_DONE or LOOKUP_REST
9865 + *
9866 + * Handles search on twig level. Returns 0 if the search has to go one level
9867 + * down (@outcome is left untouched). Returns 1 if this function finished the
9868 + * step itself: @outcome is then LOOKUP_DONE (result or error code is saved in
9869 + * @h->result) or LOOKUP_REST (the search has to be restarted).
9870 + */
9871 +int handle_eottl(cbk_handle *h, int *outcome)
9872 +{
9873 +       int result;
9874 +       reiser4_key key;
9875 +       coord_t *coord;
9876 +
9877 +       coord = h->coord;
9878 +
9879 +       if (h->level != TWIG_LEVEL ||
9880 +           (coord_is_existing_item(coord) && item_is_internal(coord))) {
9881 +               /* Continue to traverse tree downward. */
9882 +               return 0;
9883 +       }
9884 +
9885 +       /*
9886 +        * make sure that @h->coord is set to twig node and that it is either
9887 +        * set to extent item or after extent item
9888 +        */
9889 +       assert("vs-356", h->level == TWIG_LEVEL);
9890 +       assert("vs-357", ( {
9891 +                         coord_t lcoord;
9892 +                         coord_dup(&lcoord, coord);
9893 +                         check_me("vs-733", coord_set_to_left(&lcoord) == 0);
9894 +                         item_is_extent(&lcoord);
9895 +                         }
9896 +              ));
9897 +
9898 +       if (*outcome == NS_FOUND) {
9899 +               /* we have found desired key on twig level in extent item */
9900 +               h->result = CBK_COORD_FOUND;
9901 +               *outcome = LOOKUP_DONE;
9902 +               return 1;
9903 +       }
9904 +
9905 +       if (!(h->flags & CBK_FOR_INSERT)) {
9906 +               /* tree traversal is not for insertion. Just return
9907 +                  CBK_COORD_NOTFOUND. */
9908 +               h->result = CBK_COORD_NOTFOUND;
9909 +               *outcome = LOOKUP_DONE;
9910 +               return 1;
9911 +       }
9912 +
9913 +       /* take a look at the item to the right of h -> coord */
9914 +       result = is_next_item_internal(coord, h->key, h->active_lh);
9915 +       if (unlikely(result < 0)) {
9916 +               h->error = "get_right_neighbor failed";
9917 +               h->result = result;
9918 +               *outcome = LOOKUP_DONE;
9919 +               return 1;
9920 +       }
9921 +       if (result == 0) {
9922 +               /*
9923 +                * item to the right is also an extent one. Allocate a new node
9924 +                * and insert pointer to it after item h -> coord.
9925 +                *
9926 +                * This is a result of extents being located at the twig
9927 +                * level. For explanation, see comment just above
9928 +                * is_next_item_internal().
9929 +                */
9930 +               znode *loaded;
9931 +
9932 +               if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
9933 +                       /*
9934 +                        * we got node read locked, restart coord_by_key to
9935 +                        * have write lock on twig level
9936 +                        */
9937 +                       h->lock_level = TWIG_LEVEL;
9938 +                       h->lock_mode = ZNODE_WRITE_LOCK;
9939 +                       *outcome = LOOKUP_REST;
9940 +                       return 1;
9941 +               }
9942 +
9943 +               loaded = coord->node;   /* NOTE(review): @loaded is never read in this function */
9944 +               result =
9945 +                   add_empty_leaf(coord, h->active_lh, h->key,
9946 +                                  rd_key(coord, &key));
9947 +               if (result) {
9948 +                       h->error = "could not add empty leaf";
9949 +                       h->result = result;
9950 +                       *outcome = LOOKUP_DONE;
9951 +                       return 1;
9952 +               }
9953 +               /* added empty leaf is locked (h->active_lh), its parent node
9954 +                  is unlocked, h->coord is set as EMPTY */
9955 +               assert("vs-13", coord->between == EMPTY_NODE);
9956 +               assert("vs-14", znode_is_write_locked(coord->node));
9957 +               assert("vs-15",
9958 +                      WITH_DATA(coord->node, node_is_empty(coord->node)));
9959 +               assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
9960 +               assert("vs-17", coord->node == h->active_lh->node);
9961 +               *outcome = LOOKUP_DONE;
9962 +               h->result = CBK_COORD_NOTFOUND;
9963 +               return 1;
9964 +       } else if (result == 1) {
9965 +               /*
9966 +                * this is special case mentioned in the comment on
9967 +                * tree.h:cbk_flags. We have found internal item immediately on
9968 +                * the right of extent, and we are going to insert new item
9969 +                * there. Key of item we are going to insert is smaller than
9970 +                * leftmost key in the node pointed to by said internal item
9971 +                * (otherwise search wouldn't come to the extent in the first
9972 +                * place).
9973 +                *
9974 +                * This is a result of extents being located at the twig
9975 +                * level. For explanation, see comment just above
9976 +                * is_next_item_internal().
9977 +                */
9978 +               h->flags &= ~CBK_TRUST_DK;
9979 +       } else {
9980 +               assert("vs-8", result == 2);
9981 +               *outcome = LOOKUP_REST;
9982 +               return 1;
9983 +       }
9984 +       assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
9985 +       return 0;
9986 +}
9987 +
9988 +/*
9989 + * Local variables:
9990 + * c-indentation-style: "K&R"
9991 + * mode-name: "LC"
9992 + * c-basic-offset: 8
9993 + * tab-width: 8
9994 + * fill-column: 120
9995 + * scroll-step: 1
9996 + * End:
9997 + */
9998 diff -urN linux-2.6.27.orig/fs/reiser4/estimate.c linux-2.6.27/fs/reiser4/estimate.c
9999 --- linux-2.6.27.orig/fs/reiser4/estimate.c     1970-01-01 03:00:00.000000000 +0300
10000 +++ linux-2.6.27/fs/reiser4/estimate.c  2008-10-12 18:20:00.000000000 +0400
10001 @@ -0,0 +1,120 @@
10002 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10003 +
10004 +#include "debug.h"
10005 +#include "dformat.h"
10006 +#include "tree.h"
10007 +#include "carry.h"
10008 +#include "inode.h"
10009 +#include "plugin/cluster.h"
10010 +#include "plugin/item/ctail.h"
10011 +
10012 +/* this returns how many nodes might get dirty or be added if @childen child nodes are dirtied
10013 +
10014 +   Amount of internals which will get dirty or get allocated we estimate as ~10% of the children (103/1024 in the code
10015 +   below) + 1 balancing. 1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour
10016 +   nodes + the current (or 1 neighbour and 1 new and the current) on twig level, 2 neighbour nodes on upper levels and
10017 +   1 for a new root. So 5 for leaf level, 3 for twig level, 2 on upper + 1 for root.
10018 +
10019 +   Do not calculate the current node of the lowest level here - this is overhead only.
10020 +
10021 +   @childen is almost always 1 here. Exception is flow insertion
10022 +*/
10023 +static reiser4_block_nr
10024 +max_balance_overhead(reiser4_block_nr childen, tree_level tree_height)
10025 +{
10026 +       reiser4_block_nr ten_percent;
10027 +
10028 +       ten_percent = ((103 * childen) >> 10);  /* 103/1024 of @childen, rounded down */
10029 +
10030 +       /* If we have too many balancings at the time, tree height can rise by more
10031 +          than 1. Assume that if tree_height is 5, it can rise by 1 only. */
10032 +       return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10033 +}
10034 +
10035 +/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which can be
10036 +   required to perform insertion of one item into a tree of height @height */
10037 +/* it is only called when tree height changes, or gets initialized */
10038 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10039 +{
10040 +       return 1 + max_balance_overhead(1, height);     /* balancing overhead for one child, plus 1 */
10041 +}
10042 +
10043 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10044 +{
10045 +       return tree->estimate_one_insert;       /* estimate stored in @tree */
10046 +}
10047 +
10048 +/* this returns the maximal possible number of nodes which can be modified plus the number of new nodes which can be
10049 +   required to perform insertion of one unit into an item in the tree */
10050 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10051 +{
10052 +       /* insertion into an item is estimated exactly like insertion of an item */
10053 +       return tree->estimate_one_insert;
10054 +}
10055 +
10056 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10057 +{
10058 +       /* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on leaf
10059 +          level */
10060 +       return tree->estimate_one_insert;       /* NOTE(review): the insert estimate is what is returned here */
10061 +}
10062 +
10063 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10064 +   both its neighbors). max_balance_overhead() estimates the number of blocks which may change/get added on internal
10065 +   levels for that many children */
10066 +reiser4_block_nr estimate_insert_flow(tree_level height)
10067 +{
10068 +       return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10069 +                                                                    CARRY_FLOW_NEW_NODES_LIMIT,
10070 +                                                                    height);
10071 +}
10072 +
10073 +/* returns max number of nodes that can be occupied by a disk cluster; @unprepped != 0 counts one node per cluster */
10074 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10075 +{
10076 +       int per_cluster;
10077 +       per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10078 +       return 3 + per_cluster +
10079 +               max_balance_overhead(3 + per_cluster,
10080 +                                    REISER4_MAX_ZTREE_HEIGHT);
10081 +}
10082 +
10083 +/* how many nodes might get dirty and added
10084 +   during insertion of a disk cluster (estimated as for an unprepped one) */
10085 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10086 +{
10087 +       return estimate_cluster(inode, 1); /* 24 */
10088 +}
10089 +
10090 +/* how many nodes might get dirty and added during update of a (prepped
10091 +   or unprepped) disk cluster; counts cluster_nrpages(inode) nodes per cluster */
10092 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10093 +{
10094 +       return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10095 +}
10096 +
10097 +/* How many nodes occupied by a disk cluster might get dirty.
10098 +   Note that this estimation is not precise (i.e. disk cluster
10099 +   can occupy more nodes).
10100 +   Q: Why don't we use a precise estimation?
10101 +   A: 1.Because precise estimation is fairly bad: 65536 nodes
10102 +        for 64K logical cluster, it means 256M of dead space on
10103 +       a partition
10104 +      2.It is a very rare case when disk cluster occupies more
10105 +        nodes than this estimation returns.
10106 +*/
10107 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10108 +{
10109 +       return cluster_nrpages(inode) + 4;
10110 +}
10111 +
10112 +/* Make Linus happy.
10113 +   Local variables:
10114 +   c-indentation-style: "K&R"
10115 +   mode-name: "LC"
10116 +   c-basic-offset: 8
10117 +   tab-width: 8
10118 +   fill-column: 120
10119 +   scroll-step: 1
10120 +   End:
10121 +*/
10122 diff -urN linux-2.6.27.orig/fs/reiser4/export_ops.c linux-2.6.27/fs/reiser4/export_ops.c
10123 --- linux-2.6.27.orig/fs/reiser4/export_ops.c   1970-01-01 03:00:00.000000000 +0300
10124 +++ linux-2.6.27/fs/reiser4/export_ops.c        2008-10-12 18:20:00.000000000 +0400
10125 @@ -0,0 +1,328 @@
10126 +/* Copyright 2005 by Hans Reiser, licensing governed by
10127 + * reiser4/README */
10128 +
10129 +#include "inode.h"
10130 +#include "plugin/plugin.h"
10131 +
10132 +/*
10133 + * Supported file-handle types; reiser4_encode_fh() returns one of these
10134 + */
10135 +typedef enum {
10136 +       FH_WITH_PARENT = 0x10,  /* file handle with parent */
10137 +       FH_WITHOUT_PARENT = 0x11        /* file handle without parent */
10138 +} reiser4_fhtype;
10139 +
10140 +#define NFSERROR (255) /* returned by reiser4_encode_fh() when no handle can be built */
10141 +
10142 +/* initialize place-holder for object: no plugin is attached yet */
10143 +static void object_on_wire_init(reiser4_object_on_wire *o)
10144 +{
10145 +       o->plugin = NULL;       /* object_on_wire_done() tests this */
10146 +}
10147 +
10148 +/* finish with @o: call wire.done of its plugin, if a plugin was set */
10149 +static void object_on_wire_done(reiser4_object_on_wire *o)
10150 +{
10151 +       if (o->plugin != NULL)
10152 +               o->plugin->wire.done(o);
10153 +}
10154 +
10155 +/*
10156 + * read serialized object identity from @addr and store information about
10157 + * object in @obj (dual to encode_inode()). Returns ERR_PTR(-EINVAL) on bad id.
10158 + */
10159 +static char *decode_inode(struct super_block *s, char *addr,
10160 +                         reiser4_object_on_wire * obj)
10161 +{
10162 +       file_plugin *fplug;
10163 +
10164 +       /* identifier of object plugin is stored in the first two bytes,
10165 +        * followed by the plugin specific part (see below) */
10166 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10167 +       if (fplug != NULL) {
10168 +               addr += sizeof(d16);
10169 +               obj->plugin = fplug;
10170 +               assert("nikita-3520", fplug->wire.read != NULL);
10171 +               /* plugin specific encoding of object identity. */
10172 +               addr = fplug->wire.read(addr, obj);
10173 +       } else
10174 +               addr = ERR_PTR(RETERR(-EINVAL));        /* no file plugin with this id */
10175 +       return addr;
10176 +}
10177 +
10178 +static struct dentry *reiser4_get_dentry(struct super_block *super,
10179 +                                        void *data);
10180 +/**
10181 + * reiser4_decode_fh: decode on-wire object - helper function
10182 + * for fh_to_dentry, fh_to_parent export operations;
10183 + * @super: super block;
10184 + * @addr: onwire object to be decoded;
10185 + *
10186 + * Returns what reiser4_get_dentry() returned, or the decode_inode() error.
10187 + */
10188 +static struct dentry *reiser4_decode_fh(struct super_block * super,
10189 +                                       char * addr)
10190 +{
10191 +       reiser4_object_on_wire object;
10192 +
10193 +       object_on_wire_init(&object);
10194 +
10195 +       addr = decode_inode(super, addr, &object);
10196 +       if (!IS_ERR(addr)) {
10197 +               struct dentry *d;
10198 +               d = reiser4_get_dentry(super, &object);
10199 +               if (d != NULL && !IS_ERR(d))
10200 +                       /* FIXME: result below is dereferenced unchecked (-ENOMEM?) */
10201 +                       reiser4_get_dentry_fsdata(d)->stateless = 1;
10202 +               addr = (char *)d;       /* @addr doubles as the return value */
10203 +       }
10204 +       object_on_wire_done(&object);
10205 +       return (void *)addr;
10206 +}
10207 +
10208 +static struct dentry *reiser4_fh_to_dentry(struct super_block *sb,
10209 +                                          struct fid *fid,
10210 +                                          int fh_len, int fh_type) /* NOTE(review): fh_len is not used */
10211 +{
10212 +       reiser4_context *ctx;
10213 +       struct dentry *d;
10214 +
10215 +       assert("edward-1536",
10216 +              fh_type == FH_WITH_PARENT || fh_type == FH_WITHOUT_PARENT);
10217 +
10218 +       ctx = reiser4_init_context(sb);
10219 +       if (IS_ERR(ctx))
10220 +               return (struct dentry *)ctx;    /* error pointer is passed on as is */
10221 +
10222 +       d = reiser4_decode_fh(sb, (char *)fid->raw);    /* object itself is encoded first */
10223 +
10224 +       reiser4_exit_context(ctx);
10225 +       return d;
10226 +}
10227 +
10228 +static struct dentry *reiser4_fh_to_parent(struct super_block *sb,
10229 +                                          struct fid *fid,
10230 +                                          int fh_len, int fh_type) /* NOTE(review): fh_len is not used */
10231 +{
10232 +       char * addr;
10233 +       struct dentry * d;
10234 +       reiser4_context *ctx;
10235 +       file_plugin *fplug;
10236 +
10237 +       if (fh_type == FH_WITHOUT_PARENT)
10238 +               return NULL;    /* handle carries no parent identity */
10239 +       assert("edward-1537", fh_type == FH_WITH_PARENT);
10240 +
10241 +       ctx = reiser4_init_context(sb);
10242 +       if (IS_ERR(ctx))
10243 +               return (struct dentry *)ctx;
10244 +       addr = (char *)fid->raw;
10245 +       /* extract 2-byte file plugin id of the object itself */
10246 +       fplug = file_plugin_by_disk_id(reiser4_get_tree(sb), (d16 *)addr);
10247 +       if (fplug == NULL) {
10248 +               d = ERR_PTR(RETERR(-EINVAL));
10249 +               goto exit;
10250 +       }
10251 +       addr += sizeof(d16);
10252 +       /* skip the encoded object; the parent is encoded right after it */
10253 +       addr = fplug->wire.read(addr, NULL /* skip */);
10254 +       if (IS_ERR(addr)) {
10255 +               d = (struct dentry *)addr;
10256 +               goto exit;
10257 +       }
10258 +       /* extract and decode parent object */
10259 +       d = reiser4_decode_fh(sb, addr);
10260 + exit:
10261 +       reiser4_exit_context(ctx);
10262 +       return d;
10263 +}
10264 +
10265 +/*
10266 + * Object serialization support.
10267 + *
10268 + * To support knfsd, the file system provides export_operations that are used
10269 + * to construct and interpret NFS file handles. As a generalization of this,
10270 + * reiser4 object plugins have serialization support: they provide methods to
10271 + * create an on-wire representation of the identity of a reiser4 object, and
10272 + * to re-create/locate an object given its on-wire identity.
10273 + *
10274 + */
10275 +
10276 +/*
10277 + * return number of bytes that on-wire representation of @inode's identity
10278 + * consumes: the plugin specific part plus the two-byte plugin id.
10279 + */
10280 +static int encode_inode_size(struct inode *inode)
10281 +{
10282 +       assert("nikita-3514", inode != NULL);
10283 +       assert("nikita-3515", inode_file_plugin(inode) != NULL);
10284 +       assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10285 +
10286 +       return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10287 +}
10288 +
10289 +/*
10290 + * store on-wire representation of @inode's identity at the area beginning at
10291 + * @start. Returns whatever the plugin's wire.write method returns.
10292 + */
10293 +static char *encode_inode(struct inode *inode, char *start)
10294 +{
10295 +       assert("nikita-3517", inode != NULL);
10296 +       assert("nikita-3518", inode_file_plugin(inode) != NULL);
10297 +       assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10298 +
10299 +       /*
10300 +        * first, store two-byte identifier of object plugin, then
10301 +        */
10302 +       save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10303 +                      (d16 *) start);
10304 +       start += sizeof(d16);
10305 +       /*
10306 +        * call plugin to serialize object's identity right after the id
10307 +        */
10308 +       return inode_file_plugin(inode)->wire.write(inode, start);
10309 +}
10310 +
10311 +/* this returns the file handle type (reiser4_fhtype) and stores the number
10312 + * of 32 bit words used in @lenp. 255 is returned if file handle can not be stored */
10313 +/**
10314 + * reiser4_encode_fh - encode_fh of export operations
10315 + * @dentry: dentry of the object to serialize
10316 + * @fh: buffer the file handle is written to
10317 + * @lenp: in: size of @fh in 32 bit words; out: number of words used
10318 + * @need_parent: if non-zero, identity of the parent is appended as well
10319 + *
10320 + */
10321 +static int
10322 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10323 +                 int need_parent)
10324 +{
10325 +       struct inode *inode;
10326 +       struct inode *parent;
10327 +       char *addr;
10328 +       int need;
10329 +       int delta;
10330 +       int result;
10331 +       reiser4_context *ctx;
10332 +
10333 +       /*
10334 +        * knfsd asks us to serialize object in @dentry, and, optionally its
10335 +        * parent (if need_parent != 0).
10336 +        *
10337 +        * encode_inode() and encode_inode_size() are used to build
10338 +        * representation of object and its parent. All hard work is done by
10339 +        * object plugins.
10340 +        */
10341 +       inode = dentry->d_inode;
10342 +       parent = dentry->d_parent->d_inode;
10343 +
10344 +       addr = (char *)fh;
10345 +
10346 +       need = encode_inode_size(inode);
10347 +       if (need < 0)
10348 +               return NFSERROR;
10349 +       if (need_parent) {
10350 +               delta = encode_inode_size(parent);
10351 +               if (delta < 0)
10352 +                       return NFSERROR;
10353 +               need += delta;
10354 +       }
10355 +
10356 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
10357 +       if (IS_ERR(ctx))
10358 +               return NFSERROR;        /* 255 as for any other failure, not -errno */
10359 +
10360 +       if (need <= sizeof(__u32) * (*lenp)) {
10361 +               addr = encode_inode(inode, addr);
10362 +               if (need_parent)
10363 +                       addr = encode_inode(parent, addr);
10364 +
10365 +               /* store in lenp number of 32bit words required for file
10366 +                * handle (byte count rounded up to whole words). */
10367 +               *lenp = (need + sizeof(__u32) - 1) >> 2;
10368 +               result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10369 +       } else
10370 +               /* not enough space in file handle */
10371 +               result = NFSERROR;
10372 +       reiser4_exit_context(ctx);
10373 +       return result;
10374 +}
10375 +
10376 +/**
10377 + * reiser4_get_dentry_parent - get_parent of export operations
10378 + * @child: dentry whose inode's dir plugin is asked for the parent
10379 + * Returns dplug->get_parent() result; ERR_PTR(-ENOTDIR) without a dir plugin.
10380 + */
10381 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10382 +{
10383 +       struct inode *dir;
10384 +       dir_plugin *dplug;
10385 +       struct dentry *result;
10386 +       reiser4_context *ctx;
10387 +
10388 +       assert("nikita-3527", child != NULL);
10389 +
10390 +       dir = child->d_inode;
10391 +       assert("nikita-3529", dir != NULL);
10392 +
10393 +       ctx = reiser4_init_context(dir->i_sb);
10394 +       if (IS_ERR(ctx))
10395 +               return (void *)ctx;
10396 +
10397 +       dplug = inode_dir_plugin(dir);
10398 +       assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10399 +
10400 +       if (unlikely(dplug == NULL)) {
10401 +               reiser4_exit_context(ctx);
10402 +               return ERR_PTR(RETERR(-ENOTDIR));
10403 +       }
10404 +       result = dplug->get_parent(dir);
10405 +       reiser4_exit_context(ctx);
10406 +       return result;
10407 +}
10408 +
10409 +/**
10410 + * reiser4_get_dentry - helper of reiser4_decode_fh()
10411 + * @super: super block, passed through to the plugin's wire.get method
10412 + * @data: reiser4_object_on_wire to resolve; its plugin must be set
10413 + *
10414 + * Returns the result of @data's plugin wire.get method.
10415 + */
10416 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10417 +{
10418 +       reiser4_object_on_wire *o;
10419 +
10420 +       assert("nikita-3522", super != NULL);
10421 +       assert("nikita-3523", data != NULL);
10422 +       /*
10423 +        * this is only supposed to be called by
10424 +        *
10425 +        *     reiser4_decode_fh()
10426 +        *
10427 +        * so, reiser4_context should be here already.
10428 +        */
10429 +       assert("nikita-3526", is_in_reiser4_context());
10430 +
10431 +       o = (reiser4_object_on_wire *)data;
10432 +       assert("nikita-3524", o->plugin != NULL);
10433 +       assert("nikita-3525", o->plugin->wire.get != NULL);
10434 +
10435 +       return o->plugin->wire.get(super, o);
10436 +}
10437 +
10438 +struct export_operations reiser4_export_operations = { /* file handle support for knfsd */
10439 +       .encode_fh = reiser4_encode_fh,
10440 +       .fh_to_dentry = reiser4_fh_to_dentry,
10441 +       .fh_to_parent = reiser4_fh_to_parent,
10442 +       .get_parent = reiser4_get_dentry_parent,
10443 +};
10444 +
10445 +/*
10446 + * Local variables:
10447 + * c-indentation-style: "K&R"
10448 + * mode-name: "LC"
10449 + * c-basic-offset: 8
10450 + * tab-width: 8
10451 + * fill-column: 79
10452 + * End:
10453 + */
10454 diff -urN linux-2.6.27.orig/fs/reiser4/flush.c linux-2.6.27/fs/reiser4/flush.c
10455 --- linux-2.6.27.orig/fs/reiser4/flush.c        1970-01-01 03:00:00.000000000 +0300
10456 +++ linux-2.6.27/fs/reiser4/flush.c     2008-10-12 18:20:00.000000000 +0400
10457 @@ -0,0 +1,3625 @@
10458 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10459 +
10460 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10461 +
10462 +#include "forward.h"
10463 +#include "debug.h"
10464 +#include "dformat.h"
10465 +#include "key.h"
10466 +#include "coord.h"
10467 +#include "plugin/item/item.h"
10468 +#include "plugin/plugin.h"
10469 +#include "plugin/object.h"
10470 +#include "txnmgr.h"
10471 +#include "jnode.h"
10472 +#include "znode.h"
10473 +#include "block_alloc.h"
10474 +#include "tree_walk.h"
10475 +#include "carry.h"
10476 +#include "tree.h"
10477 +#include "vfs_ops.h"
10478 +#include "inode.h"
10479 +#include "page_cache.h"
10480 +#include "wander.h"
10481 +#include "super.h"
10482 +#include "entd.h"
10483 +#include "reiser4.h"
10484 +#include "flush.h"
10485 +#include "writeout.h"
10486 +
10487 +#include <asm/atomic.h>
10488 +#include <linux/fs.h>          /* for struct super_block  */
10489 +#include <linux/mm.h>          /* for struct page */
10490 +#include <linux/bio.h>         /* for struct bio */
10491 +#include <linux/pagemap.h>
10492 +#include <linux/blkdev.h>
10493 +
10494 +/* IMPLEMENTATION NOTES */
10495 +
10496 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
10497 +   order to the nodes of the tree in which the parent is placed before its children, which
10498 +   are ordered (recursively) in left-to-right order.  When we speak of a "parent-first preceder", it
10499 +   describes the node that "came before in forward parent-first order".  When we speak of a
10500 +   "parent-first follower", it describes the node that "comes next in parent-first
10501 +   order" (alternatively the node that "came before in reverse parent-first order").
10502 +
10503 +   The following pseudo-code prints the nodes of a tree in forward parent-first order:
10504 +
10505 +   void parent_first (node)
10506 +   {
10507 +     print_node (node);
10508 +     if (node->level > leaf) {
10509 +       for (i = 0; i < num_children; i += 1) {
10510 +         parent_first (node->child[i]);
10511 +       }
10512 +     }
10513 +   }
10514 +*/
10515 +
10516 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE?  The idea is to optimize block allocation so
10517 +   that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
10518 +   can be accomplished with sequential reads, which results in reading nodes in their
10519 +   parent-first order.  This is a read-optimization aspect of the flush algorithm, and
10520 +   there is also a write-optimization aspect, which is that we wish to make large
10521 +   sequential writes to the disk by allocating or reallocating blocks so that they can be
10522 +   written in sequence.  Sometimes the read-optimization and write-optimization goals
10523 +   conflict with each other, as we discuss in more detail below.
10524 +*/
10525 +
10526 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers.  Here are
10527 +   the relevant jnode->state bits and their relevance to flush:
10528 +
10529 +     JNODE_DIRTY: If a node is dirty, it must be flushed.  But in order to be written it
10530 +     must be allocated first.  In order to be considered allocated, the jnode must have
10531 +     exactly one of { JNODE_OVRWR, JNODE_RELOC } set.  These two bits are exclusive, and
10532 +     all dirtied jnodes eventually have one of these bits set during each transaction.
10533 +
10534 +     JNODE_CREATED: The node was freshly created in its transaction and has no previous
10535 +     block address, so it is unconditionally assigned to be relocated, although this is
10536 +     mainly for code-convenience.  It is not being 'relocated' from anything, but in
10537 +     almost every regard it is treated as part of the relocate set.  The JNODE_CREATED bit
10538 +     remains set even after JNODE_RELOC is set, so the actual relocate can be
10539 +     distinguished from the created-and-allocated set easily: relocate-set members
10540 +     (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
10541 +     have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
10542 +
10543 +     JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
10544 +     decision to maintain the pre-existing location for this node and it will be written
10545 +     to the wandered-log.
10546 +
10547 +     JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
10548 +     not created, see note above).  A block with JNODE_RELOC set is eligible for
10549 +     early-flushing and may be submitted during flush_empty_queues.  When the JNODE_RELOC
10550 +     bit is set on a znode, the parent node's internal item is modified and the znode is
10551 +     rehashed.
10552 +
10553 +     JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
10554 +     and calls plugin->f.squeeze() method for its items. By this technology we update disk
10555 +     clusters of cryptcompress objects. Also if leftmost point that was found by flush scan
10556 +     has this flag (races with write(), rare case) the flush algorithm makes the decision
10557 +     to pass it to squalloc() in spite of its flushprepped status for squeezing, not for
10558 +     repeated allocation.
10559 +
10560 +     JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
10561 +     flush queue.  This means the jnode is not on any clean or dirty list, instead it is
10562 +     moved to one of the flush queue (see flush_queue.h) object private list. This
10563 +     prevents multiple concurrent flushes from attempting to start flushing from the
10564 +     same node.
10565 +
10566 +     (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
10567 +     squeeze-and-allocate on a node while its children are actively being squeezed and
10568 +     allocated.  This flag was created to avoid submitting a write request for a node
10569 +     while its children are still being allocated and squeezed. Then flush queue was
10570 +     re-implemented to allow unlimited number of nodes be queued. This flag support was
10571 +     commented out in source code because we decided that there was no reason to submit
10572 +     queued nodes before jnode_flush() finishes.  However, current code calls fq_write()
10573 +     during a slum traversal and may submit "busy nodes" to disk. Probably we can
10574 +     re-enable the JNODE_FLUSH_BUSY bit support in future.
10575 +
10576 +   With these state bits, we describe a test used frequently in the code below,
10577 +   jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()).  The
10578 +   test for "flushprepped" returns true if any of the following are true:
10579 +
10580 +     - The node is not dirty
10581 +     - The node has JNODE_RELOC set
10582 +     - The node has JNODE_OVRWR set
10583 +
10584 +   If either the node is not dirty or it has already been processed by flush (and assigned
10585 +   JNODE_OVRWR or JNODE_RELOC), then it is prepped.  If jnode_is_flushprepped() returns
10586 +   false then flush has work to do on that node.
10587 +*/
10588 +
10589 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
10590 +   flushprepped twice (unless an explicit call to flush_unprep is made as described in
10591 +   detail below).  For example a node is dirtied, allocated, and then early-flushed to
10592 +   disk and set clean.  Before the transaction commits, the page is dirtied again and, due
10593 +   to memory pressure, the node is flushed again.  The flush algorithm will not relocate
10594 +   the node to a new disk location, it will simply write it to the same, previously
10595 +   relocated position again.
10596 +*/
10597 +
10598 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
10599 +   start at a leaf node and allocate in parent-first order by iterating to the right.  At
10600 +   each step of the iteration, we check for the right neighbor.  Before advancing to the
10601 +   right neighbor, we check if the current position and the right neighbor share the same
10602 +   parent.  If they do not share the same parent, the parent is allocated before the right
10603 +   neighbor.
10604 +
10605 +   This process goes recursively up the tree and squeezes nodes level by level as long as
10606 +   the right neighbor and the current position have different parents, then it allocates
10607 +   the right-neighbors-with-different-parents on the way back down.  This process is
10608 +   described in more detail in flush_squalloc_changed_ancestor and the recursive function
10609 +   squalloc_one_changed_ancestor.  But the purpose here is not so much to discuss the
10610 +   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
10611 +   approaches.
10612 +
10613 +   The top-down algorithm was implemented earlier (April-May 2002).  In the top-down
10614 +   approach, we find a starting point by scanning left along each level past dirty nodes,
10615 +   then going up and repeating the process until the left node and the parent node are
10616 +   clean.  We then perform a parent-first traversal from the starting point, which makes
10617 +   allocating in parent-first order trivial.  After one subtree has been allocated in this
10618 +   manner, we move to the right, try moving upward, then repeat the parent-first
10619 +   traversal.
10620 +
10621 +   Both approaches have problems that need to be addressed.  Both are approximately the
10622 +   same amount of code, but the bottom-up approach has advantages in the order it acquires
10623 +   locks which, at the very least, make it the better approach.  At first glance each one
10624 +   makes the other one look simpler, so it is important to remember a few of the problems
10625 +   with each one.
10626 +
10627 +   Main problem with the top-down approach: When you encounter a clean child during the
10628 +   parent-first traversal, what do you do?  You would like to avoid searching through a
10629 +   large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
10630 +   obvious solution.  One of the advantages of the top-down approach is that during the
10631 +   parent-first traversal you check every child of a parent to see if it is dirty.  In
10632 +   this way, the top-down approach easily handles the main problem of the bottom-up
10633 +   approach: unallocated children.
10634 +
10635 +   The unallocated children problem is that before writing a node to disk we must make
10636 +   sure that all of its children are allocated.  Otherwise, writing the node means
10637 +   extra I/O because the node will have to be written again when the child is finally
10638 +   allocated.
10639 +
10640 +   WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM.  Except for bugs, this
10641 +   should not cause any file system corruption, it only degrades I/O performance because a
10642 +   node may be written when it is sure to be written at least one more time in the same
10643 +   transaction when the remaining children are allocated.  What follows is a description
10644 +   of how we will solve the problem.
10645 +*/
10646 +
10647 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
10648 +   proceeding in parent first order, allocate some of its left-children, then encounter a
10649 +   clean child in the middle of the parent.  We do not allocate the clean child, but there
10650 +   may remain unallocated (dirty) children to the right of the clean child.  If we were to
10651 +   stop flushing at this moment and write everything to disk, the parent might still
10652 +   contain unallocated children.
10653 +
10654 +   We could try to allocate all the descendents of every node that we allocate, but this
10655 +   is not necessary.  Doing so could result in allocating the entire tree: if the root
10656 +   node is allocated then every unallocated node would have to be allocated before
10657 +   flushing.  Actually, we do not have to write a node just because we allocate it.  It is
10658 +   possible to allocate but not write a node during flush, when it still has unallocated
10659 +   children.  However, this approach is probably not optimal for the following reason.
10660 +
10661 +   The flush algorithm is designed to allocate nodes in parent-first order in an attempt
10662 +   to optimize reads that occur in the same order.  Thus we are read-optimizing for a
10663 +   left-to-right scan through all the leaves in the system, and we are hoping to
10664 +   write-optimize at the same time because those nodes will be written together in batch.
10665 +   What happens, however, if we assign a block number to a node in its read-optimized
10666 +   order but then avoid writing it because it has unallocated children?  In that
10667 +   situation, we lose out on the write-optimization aspect because a node will have to be
10668 +   written again to its location on the device, later, which likely means seeking back
10669 +   to that location.
10670 +
10671 +   So there are tradeoffs. We can choose either:
10672 +
10673 +   A. Allocate all unallocated children to preserve both write-optimization and
10674 +   read-optimization, but this is not always desirable because it may mean having to
10675 +   allocate and flush very many nodes at once.
10676 +
10677 +   B. Defer writing nodes with unallocated children, keep their read-optimized locations,
10678 +   but sacrifice write-optimization because those nodes will be written again.
10679 +
10680 +   C. Defer writing nodes with unallocated children, but do not keep their read-optimized
10681 +   locations.  Instead, choose to write-optimize them later, when they are written.  To
10682 +   facilitate this, we "undo" the read-optimized allocation that was given to the node so
10683 +   that later it can be write-optimized, thus "unpreparing" the flush decision.  This is a
10684 +   case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above.  By a
10685 +   call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
10686 +   if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
10687 +   location, and set the JNODE_CREATED bit, effectively setting the node back to an
10688 +   unallocated state.
10689 +
10690 +   We will take the following approach in v4.0: for twig nodes we will always finish
10691 +   allocating unallocated children (A).  For nodes with (level > TWIG) we will defer
10692 +   writing and choose write-optimization (C).
10693 +
10694 +   To summarize, there are several parts to a solution that avoids the problem with
10695 +   unallocated children:
10696 +
10697 +   FIXME-ZAM: Still none of these approaches is implemented to eliminate the "UNALLOCATED CHILDREN"
10698 +   problem, because an experiment which was done showed that we have only 1-2 nodes
10699 +   with unallocated children for thousands of written nodes.  The experiment was simple,
10700 +   like copying / deletion of linux kernel sources.  However the problem can arise in more
10701 +   complex tests.  I think we can use jnode_io_hook to insert a check for unallocated
10702 +   children and see what kind of problem we have.
10703 +
10704 +   1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
10705 +   squeeze-and-allocate on any remaining unallocated children.  FIXME: Difficulty to
10706 +   implement: should be simple -- amounts to adding a while loop to jnode_flush, see
10707 +   comments in that function.
10708 +
10709 +   2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
10710 +   have unallocated children.  If the twig level has unallocated children it is an
10711 +   assertion failure.  If a higher-level node has unallocated children, then it should be
10712 +   explicitly de-allocated by a call to flush_unprep().  FIXME: Difficulty to implement:
10713 +   should be simple.
10714 +
10715 +   3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
10716 +   CPU cycles than we would like, and it is possible (but medium complexity) to optimize
10717 +   this somewhat in the case where large sub-trees are flushed.  The following observation
10718 +   helps: if both the left- and right-neighbor of a node are processed by the flush
10719 +   algorithm then the node itself is guaranteed to have all of its children allocated.
10720 +   However, the cost of this check may not be so expensive after all: it is not needed for
10721 +   leaves and flush can guarantee this property for twigs.  That leaves only (level >
10722 +   TWIG) nodes that have to be checked, so this optimization only helps if at least three
10723 +   (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
10724 +   there are many more (level > TWIG) nodes.  But if there are many (level > TWIG) nodes
10725 +   then the number of blocks being written will be very large, so the savings may be
10726 +   insignificant.  That said, the idea is to maintain both the left and right edges of
10727 +   nodes that are processed in flush.  When flush_empty_queue() is called, a relatively
10728 +   simple test will tell whether the (level > TWIG) node is on the edge.  If it is on the
10729 +   edge, the slow check is necessary, but if it is in the interior then it can be assumed
10730 +   to have all of its children allocated.  FIXME: medium complexity to implement, but
10731 +   simple to verify given that we must have a slow check anyway.
10732 +
10733 +   4. (Optional) This part is optional, not for v4.0--flush should work independently of
10734 +   whether this option is used or not.  Called RAPID_SCAN, the idea is to amend the
10735 +   left-scan operation to take unallocated children into account.  Normally, the left-scan
10736 +   operation goes left as long as adjacent nodes are dirty up until some large maximum
10737 +   value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing.  But scan-left
10738 +   may stop at a position where there are unallocated children to the left with the same
10739 +   parent.  When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
10740 +   FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
10741 +   with a rapid scan.  The rapid scan skips all the interior children of a node--if the
10742 +   leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
10743 +   twig to the left).  If the left neighbor of the leftmost child is also dirty, then
10744 +   continue the scan at the left twig and repeat.  This option will cause flush to
10745 +   allocate more twigs in a single pass, but it also has the potential to write many more
10746 +   nodes than would otherwise be written without the RAPID_SCAN option.  RAPID_SCAN
10747 +   was partially implemented, code removed August 12, 2002 by JMACD.
10748 +*/
10749 +
10750 +/* FLUSH CALLED ON NON-LEAF LEVEL.  Most of our design considerations assume that the
10751 +   starting point for flush is a leaf node, but actually the flush code cares very little
10752 +   about whether or not this is true.  It is possible that all the leaf nodes are flushed
10753 +   and dirty parent nodes still remain, in which case jnode_flush() is called on a
10754 +   non-leaf argument.  Flush doesn't care--it treats the argument node as if it were a
10755 +   leaf, even when it is not.  This is a simple approach, and there may be a more optimal
10756 +   policy but until a problem with this approach is discovered, simplest is probably best.
10757 +
10758 +   NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
10759 +   the leaves.  This is done as a matter of simplicity and there is only one (shaky)
10760 +   justification.  When an atom commits, it flushes all leaf level nodes first, followed
10761 +   by twigs, and so on.  With flushing done in this order, if flush is eventually called
10762 +   on a non-leaf node it means that (somehow) we reached a point where all leaves are
10763 +   clean and only internal nodes need to be flushed.  If that is the case, then it means
10764 +   there were no leaves that were the parent-first preceder/follower of the parent.  This
10765 +   is expected to be a rare case, which is why we do nothing special about it.  However,
10766 +   memory pressure may pass an internal node to flush when there are still dirty leaf
10767 +   nodes that need to be flushed, which could prove our original assumptions
10768 +   "inoperative".  If this needs to be fixed, then scan_left/right should have
10769 +   special checks for the non-leaf levels.  For example, instead of passing from a node to
10770 +   the left neighbor, it should pass from the node to the left neighbor's rightmost
10771 +   descendent (if dirty).
10772 +
10773 +*/
10774 +
10775 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING.  We walk the tree in 4MB-16MB chunks, dirtying everything and putting
10776 +   it into a transaction.  We tell the allocator to allocate the blocks as far as possible towards one end of the
10777 +   logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
10778 +   device if we are walking from right to left.  We then make passes in alternating directions, and as we do this the
10779 +   device becomes sorted such that tree order and block number order fully correlate.
10780 +
10781 +   Resizing is done by shifting everything either all the way to the left or all the way
10782 +   to the right, and then reporting the last block.
10783 +*/
10784 +
10785 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places.  This
10786 +   describes the policy from the highest level:
10787 +
10788 +   The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
10789 +   leaf level during flush-scan (right, left), then we unconditionally decide to relocate
10790 +   leaf nodes.
10791 +
10792 +   Otherwise, there are two contexts in which we make a decision to relocate:
10793 +
10794 +   1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
10795 +   During the initial stages of flush, after scan-right completes, we want to ask the
10796 +   question: should we relocate this leaf node and thus dirty the parent node.  Then if
10797 +   the node is a leftmost child its parent is its own parent-first preceder, thus we repeat
10798 +   the question at the next level up, and so on.  In these cases we are moving in the
10799 +   reverse-parent first direction.
10800 +
10801 +   There is another case which is considered the reverse direction, which comes at the end
10802 +   of a twig in reverse_relocate_end_of_twig().  As we finish processing a twig we may
10803 +   reach a point where there is a clean twig to the right with a dirty leftmost child.  In
10804 +   this case, we may wish to relocate the child by testing if it should be relocated
10805 +   relative to its parent.
10806 +
10807 +   2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
10808 +   allocate_znode.  What distinguishes the forward parent-first case from the
10809 +   reverse-parent first case is that the preceder has already been allocated in the
10810 +   forward case, whereas in the reverse case we don't know what the preceder is until we
10811 +   finish "going in reverse".  That simplifies the forward case considerably, and there we
10812 +   actually use the block allocator to determine whether, e.g., a block closer to the
10813 +   preceder is available.
10814 +*/
10815 +
10816 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration.  The idea is, once we
10817 +   finish scan-left and find a starting point, if the parent's left neighbor is dirty then
10818 +   squeeze the parent's left neighbor and the parent.  This may change the
10819 +   flush-starting-node's parent.  Repeat until the child's parent is stable.  If the child
10820 +   is a leftmost child, repeat this left-edge squeezing operation at the next level up.
10821 +   Note that we cannot allocate extents during this or they will be out of parent-first
10822 +   order.  There are also some difficult coordinate maintenance issues.  We can't do a tree
10823 +   search to find coordinates again (because we hold locks), we have to determine them
10824 +   from the two nodes being squeezed.  Looks difficult, but has potential to increase
10825 +   space utilization. */
10826 +
10827 +/* Flush-scan helper functions. */
10828 +static void scan_init(flush_scan * scan);
10829 +static void scan_done(flush_scan * scan);
10830 +
10831 +/* Flush-scan algorithm. */
10832 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
10833 +                    unsigned limit);
10834 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
10835 +static int scan_common(flush_scan * scan, flush_scan * other);
10836 +static int scan_formatted(flush_scan * scan);
10837 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
10838 +static int scan_by_coord(flush_scan * scan);
10839 +
10840 +/* Initial flush-point ancestor allocation. */
10841 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
10842 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
10843 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
10844 +
10845 +/* Main flush algorithm.  Note on abbreviation: "squeeze and allocate" == "squalloc". */
10846 +static int squalloc(flush_pos_t * pos);
10847 +
10848 +/* Flush squeeze implementation. */
10849 +static int squeeze_right_non_twig(znode * left, znode * right);
10850 +static int shift_one_internal_unit(znode * left, znode * right);
10851 +
10852 +/* Flush reverse parent-first relocation routines. */
10853 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
10854 +                                           const reiser4_block_nr * nblk);
10855 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
10856 +                                flush_pos_t * pos);
10857 +static int reverse_relocate_check_dirty_parent(jnode * node,
10858 +                                              const coord_t * parent_coord,
10859 +                                              flush_pos_t * pos);
10860 +
10861 +/* Flush allocate write-queueing functions: */
10862 +static int allocate_znode(znode * node, const coord_t * parent_coord,
10863 +                         flush_pos_t * pos);
10864 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
10865 +                                flush_pos_t * pos);
10866 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
10867 +
10868 +/* Flush helper functions: */
10869 +static int jnode_lock_parent_coord(jnode * node,
10870 +                                  coord_t * coord,
10871 +                                  lock_handle * parent_lh,
10872 +                                  load_count * parent_zh,
10873 +                                  znode_lock_mode mode, int try);
10874 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
10875 +                           znode_lock_mode mode, int check_dirty, int expected);
10876 +static int znode_same_parents(znode * a, znode * b);
10877 +
10878 +static int znode_check_flushprepped(znode * node)
10879 +{
10880 +       return jnode_check_flushprepped(ZJNODE(node));  /* znode wrapper: forwards to the jnode check */
10881 +}
10882 +
10883 +/* Flush position functions */
10884 +static void pos_init(flush_pos_t * pos);
10885 +static int pos_valid(flush_pos_t * pos);
10886 +static void pos_done(flush_pos_t * pos);
10887 +static int pos_stop(flush_pos_t * pos);
10888 +
10889 +/* Debug check: on a left scan at the twig level, an unformatted scan->node under an unallocated
10890 + * extent must be the first jnode of that unit: all jnodes of such an extent are dirty, same atom. */
10891 +#define checkchild(scan)                                               \
10892 +assert("nikita-3435",                                                  \
10893 +       ergo(scan->direction == LEFT_SIDE &&                            \
10894 +            (scan->parent_coord.node->level == TWIG_LEVEL) &&           \
10895 +           jnode_is_unformatted(scan->node) &&                         \
10896 +           extent_is_unallocated(&scan->parent_coord),                 \
10897 +           extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
10898 +
10899 +/* Debug-only counter of concurrently running flush operations (jnode_flush() increments
10900 +   it on entry).  It is initialized in txnmgr.c out of laziness (because flush has
10901 +   no static initializer function...) */
10902 +ON_DEBUG(atomic_t flush_cnt;
10903 +    )
10904 +
10905 +/* Check the fs backing device for write congestion; returns the bdi_write_congested() result */
10906 +static int check_write_congestion(void)
10907 +{
10908 +       struct super_block *sb;
10909 +       struct backing_dev_info *bdi;
10910 +
10911 +       sb = reiser4_get_current_sb();
10912 +       bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info; /* bdi of the fake inode's mapping */
10913 +       return bdi_write_congested(bdi);
10914 +}
10915 +
10916 +/* Write the flush queue of @pos unless JNODE_FLUSH_WRITE_BLOCKS is unset or the device is congested */
10917 +static int write_prepped_nodes(flush_pos_t * pos)
10918 +{
10919 +       int ret;
10920 +
10921 +       assert("zam-831", pos);
10922 +       assert("zam-832", pos->fq);
10923 +
10924 +       if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
10925 +               return 0;       /* flag not set: skip the write, report success */
10926 +
10927 +       if (check_write_congestion())
10928 +               return 0;       /* congested: skip the write for now, report success */
10929 +
10930 +       ret = reiser4_write_fq(pos->fq, pos->nr_written,
10931 +                      WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
10932 +       return ret;
10933 +}
10934 +
10935 +/* Properly release all resources held by the flush position, then move the flush position
10936 +   to the new locked node; with a NULL @new_coord it is placed at the node's first unit */
10937 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
10938 +                          load_count * new_load, const coord_t * new_coord)
10939 +{
10940 +       assert("zam-857", new_lock->node == new_load->node);
10941 +
10942 +       if (new_coord) {
10943 +               assert("zam-858", new_coord->node == new_lock->node);
10944 +               coord_dup(&pos->coord, new_coord);
10945 +       } else {
10946 +               coord_init_first_unit(&pos->coord, new_lock->node);
10947 +       }
10948 +
10949 +       if (pos->child) {
10950 +               jput(pos->child);       /* the old child jnode is no longer tracked by @pos */
10951 +               pos->child = NULL;
10952 +       }
10953 +
10954 +       move_load_count(&pos->load, new_load);
10955 +       done_lh(&pos->lock);    /* finish with the old lock handle before taking over the new one */
10956 +       move_lh(&pos->lock, new_lock);
10957 +}
10958 +
10959 +/* Delete an empty, write-locked node whose link from the parent still exists. */
10960 +static int delete_empty_node(znode * node)
10961 +{
10962 +       reiser4_key smallest_removed;
10963 +
10964 +       assert("zam-1019", node != NULL);
10965 +       assert("zam-1020", node_is_empty(node));
10966 +       assert("zam-1023", znode_is_wlocked(node));
10967 +
10968 +       return reiser4_delete_node(node, &smallest_removed, NULL, 1);
10969 +}
10970 +
10971 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
10972 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
10973 +{
10974 +       int ret;
10975 +       load_count load;
10976 +       lock_handle lock;
10977 +
10978 +       init_lh(&lock);
10979 +       init_load_count(&load);
10980 +
10981 +       if (jnode_is_znode(org)) {
10982 +               ret = longterm_lock_znode(&lock, JZNODE(org),
10983 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
10984 +               if (ret)
10985 +                       return ret;
10986 +
10987 +               ret = incr_load_count_znode(&load, JZNODE(org));
10988 +               if (ret)
10989 +                       return ret;
10990 +
10991 +               pos->state =
10992 +                   (jnode_get_level(org) ==
10993 +                    LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
10994 +               move_flush_pos(pos, &lock, &load, NULL);
10995 +       } else {
10996 +               coord_t parent_coord;
10997 +               ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
10998 +                                             &load, ZNODE_WRITE_LOCK, 0);
10999 +               if (ret)
11000 +                       goto done;
11001 +               if (!item_is_extent(&parent_coord)) {
11002 +                       /* file was converted to tail, org became HB, we found internal
11003 +                          item */
11004 +                       ret = -EAGAIN;
11005 +                       goto done;
11006 +               }
11007 +
11008 +               pos->state = POS_ON_EPOINT;
11009 +               move_flush_pos(pos, &lock, &load, &parent_coord);
11010 +               pos->child = jref(org);
11011 +               if (extent_is_unallocated(&parent_coord)
11012 +                   && extent_unit_index(&parent_coord) != index_jnode(org)) {
11013 +                       /* @org is not first child of its parent unit. This may happen
11014 +                          because longerm lock of its parent node was released between
11015 +                          scan_left and scan_right. For now work around this having flush to repeat */
11016 +                       ret = -EAGAIN;
11017 +               }
11018 +       }
11019 +
11020 +      done:
11021 +       done_load_count(&load);
11022 +       done_lh(&lock);
11023 +       return ret;
11024 +}
11025 +
11026 +/* TODO LIST (no particular order): */
11027 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11028 +   indicate which issue they relate to.  There are a few miscellaneous FIXMEs with
11029 +   specific names mentioned instead that need to be inspected/resolved. */
11030 +/* B. There is an issue described in reverse_relocate_test having to do with an
11031 +   imprecise is_preceder? check having to do with partially-dirty extents.  The code that
11032 +   sets preceder hints and computes the preceder is basically untested.  Careful testing
11033 +   needs to be done that preceder calculations are done correctly, since if it doesn't
11034 +   affect correctness we will not catch this stuff during regular testing. */
11035 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling.  It is unclear which of these are
11036 +   considered expected but unlikely conditions.  Flush currently returns 0 (i.e., success
11037 +   but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11038 +   Many of the calls that may produce one of these return values (i.e.,
11039 +   longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11040 +   values themselves and, for instance, stop flushing instead of resulting in a restart.
11041 +   If any of these results are true error conditions then flush will go into a busy-loop,
11042 +   as we noticed during testing when a corrupt tree caused find_child_ptr to return
11043 +   ENOENT.  It needs careful thought and testing of corner conditions.
11044 +*/
11045 +/* D. Atomicity of flush_prep against deletion and flush concurrency.  Suppose a created
11046 +   block is assigned a block number then early-flushed to disk.  It is dirtied again and
11047 +   flush is called again.  Concurrently, that block is deleted, and the de-allocation of
11048 +   its block number does not need to be deferred, since it is not part of the preserve set
11049 +   (i.e., it didn't exist before the transaction).  I think there may be a race condition
11050 +   where flush writes the dirty, created block after the non-deferred deallocated block
11051 +   number is re-allocated, making it possible to write deleted data on top of non-deleted
11052 +   data.  It's just a theory, but it needs to be thought out. */
11053 +/* F. bio_alloc() failure is not handled gracefully. */
11054 +/* G. Unallocated children. */
11055 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11056 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11057 +
11058 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11059 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11060 +   neighborhood is named "slum").  Jnode_flush() is called if reiser4 has to write dirty
11061 +   blocks to disk, it happens when Linux VM decides to reduce number of dirty pages or as
11062 +   a part of transaction commit.
11063 +
11064 +   Our objective here is to prep and flush the slum the jnode belongs to. We want to
11065 +   squish the slum together, and allocate the nodes in it as we squish because allocation
11066 +   of children affects squishing of parents.
11067 +
11068 +   The "argument" @node tells flush where to start.  From there, flush finds the left edge
11069 +   of the slum, and calls squalloc (in which nodes are squeezed and allocated).  To find a
11070 +   "better place" to start squalloc first we perform a flush_scan.
11071 +
11072 +   Flush-scanning may be performed in both left and right directions, but for different
11073 +   purposes.  When scanning to the left, we are searching for a node that precedes a
11074 +   sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11075 +   During flush-scanning, we also take the opportunity to count the number of consecutive
11076 +   leaf nodes.  If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11077 +   make a decision to reallocate leaf nodes (thus favoring write-optimization).
11078 +
11079 +   Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11080 +   also be dirty nodes to the right of the argument.  If the scan-left operation does not
11081 +   count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11082 +   operation to see whether there are, in fact, enough nodes to meet the relocate
11083 +   threshold.  Each right- and left-scan operation uses a single flush_scan object.
11084 +
11085 +   After left-scan and possibly right-scan, we prepare a flush_position object with the
11086 +   starting flush point or parent coordinate, which was determined using scan-left.
11087 +
11088 +   Next we call the main flush routine, squalloc, which iterates along the
11089 +   leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11090 +
11091 +   After squalloc returns we take extra steps to ensure that all the children
11092 +   of the final twig node are allocated--this involves repeating squalloc
11093 +   until we finish at a twig with no unallocated children.
11094 +
11095 +   Finally, we call flush_empty_queue to submit write-requests to disk.  If we encounter
11096 +   any above-twig nodes during flush_empty_queue that still have unallocated children, we
11097 +   flush_unprep them.
11098 +
11099 +   Flush treats several "failure" cases as non-failures, essentially causing them to start
11100 +   over.  E_DEADLOCK is one example.  FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11101 +   probably be handled properly rather than restarting, but there are a bunch of cases to
11102 +   audit.
11103 +*/
11104 +
11105 +static int
11106 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11107 +           flush_queue_t * fq, int flags)
11108 +{
11109 +       long ret = 0;
11110 +       flush_scan *right_scan;
11111 +       flush_scan *left_scan;
11112 +       flush_pos_t *flush_pos;
11113 +       int todo;
11114 +       struct super_block *sb;
11115 +       reiser4_super_info_data *sbinfo;
11116 +       jnode *leftmost_in_slum = NULL;
11117 +
11118 +       assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11119 +       assert("nikita-3022", reiser4_schedulable());
11120 +
11121 +       assert("nikita-3185",
11122 +              get_current_super_private()->delete_mutex_owner != current);
11123 +
11124 +       /* allocate right_scan, left_scan and flush_pos as one contiguous chunk */
11125 +       right_scan =
11126 +           kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11127 +                   reiser4_ctx_gfp_mask_get());
11128 +       if (right_scan == NULL)
11129 +               return RETERR(-ENOMEM);
11130 +       left_scan = right_scan + 1;
11131 +       flush_pos = (flush_pos_t *) (left_scan + 1);
11132 +
11133 +       sb = reiser4_get_current_sb();
11134 +       sbinfo = get_super_private(sb);
11135 +
11136 +       /* Flush-concurrency debug code */
11137 +#if REISER4_DEBUG
11138 +       atomic_inc(&flush_cnt);
11139 +#endif
11140 +
11141 +       reiser4_enter_flush(sb);
11142 +
11143 +       /* Initialize a flush position. */
11144 +       pos_init(flush_pos);
11145 +
11146 +       flush_pos->nr_written = nr_written;
11147 +       flush_pos->fq = fq;
11148 +       flush_pos->flags = flags;
11149 +       flush_pos->nr_to_write = nr_to_write;
11150 +
11151 +       scan_init(right_scan);
11152 +       scan_init(left_scan);
11153 +
11154 +       /* First scan left and remember the leftmost scan position.  If the leftmost
11155 +          position is unformatted we remember its parent_coord.  We scan until counting
11156 +          FLUSH_SCAN_MAXNODES.
11157 +
11158 +          If starting @node is unformatted, at the beginning of left scan its
11159 +          parent (twig level node, containing extent item) will be long term
11160 +          locked and lock handle will be stored in the
11161 +          @right_scan->parent_lock. This lock is used to start the rightward
11162 +          scan without redoing the tree traversal (necessary to find parent)
11163 +          and, hence, is kept during leftward scan. As a result, we have to
11164 +          use try-lock when taking long term locks during the leftward scan.
11165 +        */
11166 +       ret = scan_left(left_scan, right_scan,
11167 +                       node, sbinfo->flush.scan_maxnodes);
11168 +       if (ret != 0)
11169 +               goto failed;
11170 +
11171 +       leftmost_in_slum = jref(left_scan->node);
11172 +       scan_done(left_scan);
11173 +
11174 +       /* Then possibly go right to decide if we will use a policy of relocating leaves.
11175 +          This is only done if we did not scan past (and count) enough nodes during the
11176 +          leftward scan.  If we do scan right, we only care to go far enough to establish
11177 +          that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed.  The
11178 +          scan limit is the difference between left_scan.count and the threshold. */
11179 +
11180 +       todo = sbinfo->flush.relocate_threshold - left_scan->count;
11181 +       /* scan right is inherently deadlock prone, because we are
11182 +        * (potentially) holding a lock on the twig node at this moment.
11183 +        * FIXME: this is incorrect comment: lock is not held */
11184 +       if (todo > 0) {
11185 +               ret = scan_right(right_scan, node, (unsigned)todo);
11186 +               if (ret != 0)
11187 +                       goto failed;
11188 +       }
11189 +
11190 +       /* Only the right-scan count is needed, release any rightward locks right away. */
11191 +       scan_done(right_scan);
11192 +
11193 +       /* ... and the answer is: we should relocate leaf nodes if at least
11194 +          FLUSH_RELOCATE_THRESHOLD nodes were found. */
11195 +       flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11196 +           (left_scan->count + right_scan->count >=
11197 +            sbinfo->flush.relocate_threshold);
11198 +
11199 +       /* Funny business here.  We set the 'point' in the flush_position prior to
11200 +          starting squalloc regardless of whether the first point is
11201 +          formatted or unformatted.  Without this there would be an invariant, in the
11202 +          rest of the code, that if the flush_position is unformatted then
11203 +          flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11204 +          and if the flush_position is formatted then flush_position->point is non-NULL
11205 +          and no parent info is set.
11206 +
11207 +          This seems lazy, but it makes the initial calls to reverse_relocate_test
11208 +          (which ask "is pos->point the leftmost child of its parent") much easier
11209 +          because we know the first child already.  Nothing is broken by this, but the
11210 +          reasoning is subtle.  Holding an extra reference on a jnode during flush can
11211 +          cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11212 +          removed from sibling lists until they have zero reference count.  Flush would
11213 +          never observe a HEARD_BANSHEE node on the left-edge of flush, nodes are only
11214 +          deleted to the right.  So if nothing is broken, why fix it?
11215 +
11216 +          NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11217 +          point and in any moment, because of the concurrent file system
11218 +          activity (for example, truncate). */
11219 +
11220 +       /* Check jnode state after flush_scan completed. Having a lock on this
11221 +          node or its parent (in case of unformatted) helps us in case of
11222 +          concurrent flushing. */
11223 +       if (jnode_check_flushprepped(leftmost_in_slum)
11224 +           && !jnode_convertible(leftmost_in_slum)) {
11225 +               ret = 0;
11226 +               goto failed;
11227 +       }
11228 +
11229 +       /* Now setup flush_pos using scan_left's endpoint. */
11230 +       ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11231 +       if (ret)
11232 +               goto failed;
11233 +
11234 +       if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11235 +           && node_is_empty(flush_pos->coord.node)) {
11236 +               znode *empty = flush_pos->coord.node;
11237 +
11238 +               assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11239 +               ret = delete_empty_node(empty);
11240 +               goto failed;
11241 +       }
11242 +
11243 +       if (jnode_check_flushprepped(leftmost_in_slum)
11244 +           && !jnode_convertible(leftmost_in_slum)) {
11245 +               ret = 0;
11246 +               goto failed;
11247 +       }
11248 +
11249 +       /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed  */
11250 +       ret = alloc_pos_and_ancestors(flush_pos);
11251 +       if (ret)
11252 +               goto failed;
11253 +
11254 +       /* Do the main rightward-bottom-up squeeze and allocate loop. */
11255 +       ret = squalloc(flush_pos);
11256 +       pos_stop(flush_pos);
11257 +       if (ret)
11258 +               goto failed;
11259 +
11260 +       /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11261 +          First, the pos_stop() and pos_valid() routines should be modified
11262 +          so that pos_stop() sets a flush_position->stop flag to 1 without
11263 +          releasing the current position immediately--instead release it in
11264 +          pos_done().  This is a better implementation than the current one anyway.
11265 +
11266 +          It is not clear that all fields of the flush_position should not be released,
11267 +          but at the very least the parent_lock, parent_coord, and parent_load should
11268 +          remain held because they hold the last twig when pos_stop() is
11269 +          called.
11270 +
11271 +          When we reach this point in the code, if the parent_coord is set to after the
11272 +          last item then we know that flush reached the end of a twig (and according to
11273 +          the new flush queueing design, we will return now).  If parent_coord is not
11274 +          past the last item, we should check if the current twig has any unallocated
11275 +          children to the right (we are not concerned with unallocated children to the
11276 +          left--in that case the twig itself should not have been allocated).  If the
11277 +          twig has unallocated children to the right, set the parent_coord to that
11278 +          position and then repeat the call to squalloc.
11279 +
11280 +          Testing for unallocated children may be defined in two ways: if any internal
11281 +          item has a fake block number, it is unallocated; if any extent item is
11282 +          unallocated then all of its children are unallocated.  But there is a more
11283 +          aggressive approach: if there are any dirty children of the twig to the right
11284 +          of the current position, we may wish to relocate those nodes now.  Checking for
11285 +          potential relocation is more expensive as it requires knowing whether there are
11286 +          any dirty children that are not unallocated.  The extent_needs_allocation
11287 +          should be used after setting the correct preceder.
11288 +
11289 +          When we reach the end of a twig at this point in the code, if the flush can
11290 +          continue (when the queue is ready) it will need some information on the future
11291 +          starting point.  That should be stored away in the flush_handle using a seal, I
11292 +          believe.  Holding a jref() on the future starting point may break other code
11293 +          that deletes that node.
11294 +        */
11295 +
11296 +       /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11297 +          above the twig level.  If the VM calls flush above the twig level, do nothing
11298 +          and return (but figure out why this happens).  The txnmgr should be modified to
11299 +          only flush its leaf-level dirty list.  This will do all the necessary squeeze
11300 +          and allocate steps but leave unallocated branches and possibly unallocated
11301 +          twigs (when the twig's leftmost child is not dirty).  After flushing the leaf
11302 +          level, the remaining unallocated nodes should be given write-optimized
11303 +          locations.  (Possibly, the remaining unallocated twigs should be allocated just
11304 +          before their leftmost child.)
11305 +        */
11306 +
11307 +       /* Any failure reaches this point; so does the success path (by fall-through). */
11308 +      failed:
11309 +
11310 +       switch (ret) {
11311 +       case -E_REPEAT:
11312 +       case -EINVAL:
11313 +       case -E_DEADLOCK:
11314 +       case -E_NO_NEIGHBOR:
11315 +       case -ENOENT:
11316 +               /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11317 +                  in each case.  They already are handled in many cases. */
11318 +               /* Something bad happened, but difficult to avoid...  Try again! */
11319 +               ret = 0;
11320 +       }
11321 +
11322 +       if (leftmost_in_slum)
11323 +               jput(leftmost_in_slum);
11324 +
11325 +       pos_done(flush_pos);
11326 +       scan_done(left_scan);
11327 +       scan_done(right_scan);
11328 +       kfree(right_scan); /* also releases left_scan and flush_pos (same chunk) */
11329 +
11330 +       ON_DEBUG(atomic_dec(&flush_cnt));
11331 +
11332 +       reiser4_leave_flush(sb);
11333 +
11334 +       return ret;
11335 +}
11336 +
11337 +/* The reiser4 flush subsystem can be switched into "rapid flush mode", meaning
11338 + * that the flusher should submit all prepped nodes immediately without keeping
11339 + * them in flush queues for a long time.  The reason for rapid flush mode is to
11340 + * free memory as fast as possible. */
11341 +
11342 +#if REISER4_USE_RAPID_FLUSH
11343 +
11344 +/**
11345 + * Submit prepped nodes through write_prepped_nodes() when wbq_available()
11346 + * reports true; otherwise do nothing and return 0.
11347 + */
11348 +
11349 +static int rapid_flush(flush_pos_t * pos)
11350 +{
11351 +       if (!wbq_available())
11352 +               return 0;
11353 +
11354 +       return write_prepped_nodes(pos);
11355 +}
11356 +
11357 +#else
11358 +
11359 +#define rapid_flush(pos) (0)
11360 +
11361 +#endif                         /* REISER4_USE_RAPID_FLUSH */
11362 +
11363 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11364 +                                    flush_queue_t *fq, int *nr_queued,
11365 +                                    int flags)
11366 +{
11367 +       jnode * node;
11368 +
11369 +       if (start != NULL) {
11370 +               spin_lock_jnode(start);
11371 +               if (!jnode_is_flushprepped(start)) {
11372 +                       assert("zam-1056", start->atom == atom);
11373 +                       node = start;
11374 +                       goto enter;
11375 +               }
11376 +               spin_unlock_jnode(start);
11377 +       }
11378 +       /*
11379 +        * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11380 +        * and then dirtied again. The atom spin lock is not released until all dirty
11381 +        * nodes are processed or a not-yet-prepped node is found in the atom dirty lists.
11382 +        */
11383 +       while ((node = find_first_dirty_jnode(atom, flags))) {
11384 +               spin_lock_jnode(node);
11385 +       enter:
11386 +               assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11387 +               assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11388 +
11389 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
11390 +                       /* move node to the end of atom's writeback list */
11391 +                       list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11392 +
11393 +                       /*
11394 +                        * jnode is not necessarily on dirty list: if it was dirtied when
11395 +                        * it was on flush queue - it does not get moved to dirty list
11396 +                        */
11397 +                       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11398 +                                            WB_LIST, 1));
11399 +
11400 +               } else if (jnode_is_znode(node)
11401 +                          && znode_above_root(JZNODE(node))) {
11402 +                       /*
11403 +                        * A special case for znode-above-root.  The above-root (fake)
11404 +                        * znode is captured and dirtied when the tree height changes or
11405 +                        * when the root node is relocated.  This causes atoms to fuse so
11406 +                        * that changes at the root are serialized.  However, this node is
11407 +                        * never flushed.  This special case used to be in lock.c to
11408 +                        * prevent the above-root node from ever being captured, but now
11409 +                        * that it is captured we simply prevent it from flushing.  The
11410 +                        * log-writer code relies on this to properly log superblock
11411 +                        * modifications of the tree height.
11412 +                        */
11413 +                       jnode_make_wander_nolock(node);
11414 +               } else if (JF_ISSET(node, JNODE_RELOC)) {
11415 +                       queue_jnode(fq, node);
11416 +                       ++(*nr_queued);
11417 +               } else
11418 +                       break; /* exit with @node still spin-locked */
11419 +
11420 +               spin_unlock_jnode(node);
11421 +       }
11422 +       return node;
11423 +}
11424 +
11425 +/* Flush some nodes of the current atom (usually a slum).  Returns -E_REPEAT if there may be
11426 + * more nodes to flush, or 0 if the atom's dirty lists are empty (in which case the current
11427 + * atom is kept locked); other errors are returned as they are. */
11428 +int
11429 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11430 +                  txn_atom ** atom, jnode *start)
11431 +{
11432 +       reiser4_super_info_data *sinfo = get_current_super_private();
11433 +       flush_queue_t *fq = NULL;
11434 +       jnode *node;
11435 +       int nr_queued;
11436 +       int ret;
11437 +
11438 +       assert("zam-889", atom != NULL && *atom != NULL);
11439 +       assert_spin_locked(&((*atom)->alock));
11440 +       assert("zam-892", get_current_context()->trans->atom == *atom);
11441 +
11442 +       nr_to_write = LONG_MAX; /* NOTE(review): the caller-supplied value is overridden here */
11443 +       while (1) {
11444 +               ret = reiser4_fq_by_atom(*atom, &fq);
11445 +               if (ret != -E_REPEAT)
11446 +                       break;
11447 +               *atom = get_current_atom_locked();
11448 +       }
11449 +       if (ret)
11450 +               return ret;
11451 +
11452 +       assert_spin_locked(&((*atom)->alock));
11453 +
11454 +       /* parallel flushers limit */
11455 +       if (sinfo->tmgr.atom_max_flushers != 0) {
11456 +               while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11457 +                       /* A reiser4_atom_send_event() call is inside
11458 +                          reiser4_fq_put_nolock() which is called when flush is
11459 +                          finished and nr_flushers is decremented. */
11460 +                       reiser4_atom_wait_event(*atom);
11461 +                       *atom = get_current_atom_locked();
11462 +               }
11463 +       }
11464 +
11465 +       /* count ourself as a flusher */
11466 +       (*atom)->nr_flushers++;
11467 +
11468 +       writeout_mode_enable();
11469 +
11470 +       nr_queued = 0;
11471 +       node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11472 +
11473 +       if (node == NULL) {
11474 +               if (nr_queued == 0) {
11475 +                       (*atom)->nr_flushers--;
11476 +                       reiser4_fq_put_nolock(fq);
11477 +                       reiser4_atom_send_event(*atom);
11478 +                       /* current atom remains locked */
11479 +                       writeout_mode_disable();
11480 +                       return 0;
11481 +               }
11482 +               spin_unlock_atom(*atom);
11483 +       } else {
11484 +               jref(node);
11485 +               BUG_ON((*atom)->super != node->tree->super);
11486 +               spin_unlock_atom(*atom);
11487 +               spin_unlock_jnode(node);
11488 +               BUG_ON(nr_to_write == 0);
11489 +               ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
11490 +               jput(node);
11491 +       }
11492 +
11493 +       ret = /* NOTE(review): replaces any jnode_flush() result stored above */
11494 +           reiser4_write_fq(fq, nr_submitted,
11495 +                    WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11496 +
11497 +       *atom = get_current_atom_locked();
11498 +       (*atom)->nr_flushers--;
11499 +       reiser4_fq_put_nolock(fq);
11500 +       reiser4_atom_send_event(*atom);
11501 +       spin_unlock_atom(*atom);
11502 +
11503 +       writeout_mode_disable();
11504 +
11505 +       if (ret == 0)
11506 +               ret = -E_REPEAT;
11507 +
11508 +       return ret;
11509 +}
11510 +
11511 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
11512 +
11513 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
11514 +   reverse parent-first relocate context.  Here all we know is the preceder and the block
11515 +   number.  Since we are going in reverse, the preceder may still be relocated as well, so
11516 +   we can't ask the block allocator "is there a closer block available to relocate?" here.
11517 +   In the _forward_ parent-first relocate context (not here) we actually call the block
11518 +   allocator to try and find a closer location.  Returns 1 to relocate, 0 otherwise. */
11519 +static int
11520 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11521 +                                const reiser4_block_nr * nblk)
11522 +{
11523 +       reiser4_block_nr dist;
11524 +
11525 +       assert("jmacd-7710", *pblk != 0 && *nblk != 0);
11526 +       assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
11527 +       assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
11528 +
11529 +       /* Distance is the absolute value. */
11530 +       dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
11531 +
11532 +       /* If the block is no more than flush.relocate_distance blocks away from its
11533 +          preceder block, do not relocate. */
11534 +       if (dist <= get_current_super_private()->flush.relocate_distance) {
11535 +               return 0;
11536 +       }
11537 +
11538 +       return 1;
11539 +}
11540 +
11541 +/* This function is a predicate that tests for relocation.  Always called in the
11542 +   reverse-parent-first context, when we are asking whether the current node should be
11543 +   relocated in order to expand the flush by dirtying the parent level (and thus
11544 +   proceeding to flush that level).  When traversing in the forward parent-first direction
11545 +   (not here), relocation decisions are handled in two places: allocate_znode() and
11546 +   extent_needs_allocation(). */
11547 +static int
11548 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11549 +                     flush_pos_t * pos)
11550 +{
11551 +       reiser4_block_nr pblk = 0;
11552 +       reiser4_block_nr nblk = 0;
11553 +
11554 +       assert("jmacd-8989", !jnode_is_root(node));
11555 +
11556 +       /*
11557 +        * This function is called only from the
11558 +        * reverse_relocate_check_dirty_parent() and only if the parent
11559 +        * node is clean. This implies that the parent has the real (i.e., not
11560 +        * fake) block number, and, so does the child, because otherwise the
11561 +        * parent would be dirty.
11562 +        */
11563 +
11564 +       /* New nodes (and leaf nodes when pos->leaf_relocate is set) are treated as relocated. */
11565 +       if (JF_ISSET (node, JNODE_CREATED) ||
11566 +           (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
11567 +               return 1;
11568 +       }
11569 +
11570 +       /* Find the preceder.  FIXME(B): When the child is an unformatted, previously
11571 +          existing node, the coord may be leftmost even though the child is not the
11572 +          parent-first preceder of the parent.  If the first dirty node appears somewhere
11573 +          in the middle of the first extent unit, this preceder calculation is wrong.
11574 +          Needs more logic in here. */
11575 +       if (coord_is_leftmost_unit(parent_coord)) {
11576 +               pblk = *znode_get_block(parent_coord->node);
11577 +       } else {
11578 +               pblk = pos->preceder.blk;
11579 +       }
11580 +       check_preceder(pblk);
11581 +
11582 +       /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
11583 +       if (pblk == 0) {
11584 +               return 1;
11585 +       }
11586 +
11587 +       nblk = *jnode_get_block(node);
11588 +
11589 +       if (reiser4_blocknr_is_fake(&nblk))
11590 +               /* child is unallocated, mark parent dirty */
11591 +               return 1;
11592 +
11593 +       return reverse_relocate_if_close_enough(&pblk, &nblk);
11594 +}
11595 +
11596 +/* If the parent is not already dirty, call reverse_relocate_test to make a
11597 +   reverse-parent-first relocation decision and, if yes, mark the parent dirty. */
11598 +static int
11599 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
11600 +                                   flush_pos_t * pos)
11601 +{
11602 +       int ret;
11603 +
11604 +       if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
11605 +
11606 +               ret = reverse_relocate_test(node, parent_coord, pos);
11607 +               if (ret < 0) {
11608 +                       return ret;
11609 +               }
11610 +
11611 +               /* FIXME-ZAM
11612 +                  if parent is already relocated - we do not want to grab space, right? */
11613 +               if (ret == 1) {
11614 +                       int grabbed;
11615 +
11616 +                       grabbed = get_current_context()->grabbed_blocks;
11617 +                       if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
11618 +                           0)
11619 +                               reiser4_panic("umka-1250",
11620 +                                             "No space left during flush.");
11621 +
11622 +                       assert("jmacd-18923",
11623 +                              znode_is_write_locked(parent_coord->node));
11624 +                       znode_make_dirty(parent_coord->node);
11625 +                       grabbed2free_mark(grabbed);
11626 +               }
11627 +       }
11628 +
11629 +       return 0;
11630 +}
11631 +
11632 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
11633 +   PARENT-FIRST LOOP BEGINS) */
11634 +
11635 +/* Get the leftmost child for given coord.  An IS_ERR() child is turned into an error return. */
11636 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
11637 +{
11638 +       int ret;
11639 +
11640 +       ret = item_utmost_child(coord, LEFT_SIDE, child);
11641 +
11642 +       if (ret)
11643 +               return ret;
11644 +
11645 +       if (IS_ERR(*child))
11646 +               return PTR_ERR(*child);
11647 +
11648 +       return 0;
11649 +}
11650 +
11651 +/* This step occurs after the left- and right-scans are completed, before starting the
11652 +   forward parent-first traversal.  Here we attempt to allocate ancestors of the starting
11653 +   flush point, which means continuing in the reverse parent-first direction to the
11654 +   parent, grandparent, and so on (as long as the child is a leftmost child).  This
11655 +   routine calls a recursive process, alloc_one_ancestor, which does the real work,
11656 +   except there is special-case handling here for the first ancestor, which may be a twig.
11657 +   At each level (here and alloc_one_ancestor), we check for relocation and then, if
11658 +   the child is a leftmost child, repeat at the next level.  On the way back down (the
11659 +   recursion), we allocate the ancestors in parent-first order. */
11660 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
11661 +{
11662 +       int ret = 0;
11663 +       lock_handle plock;
11664 +       load_count pload;
11665 +       coord_t pcoord;
11666 +
11667 +       if (znode_check_flushprepped(pos->lock.node))
11668 +               return 0;
11669 +
11670 +       coord_init_invalid(&pcoord, NULL); /* left invalid on the POS_ON_EPOINT and root paths */
11671 +       init_lh(&plock);
11672 +       init_load_count(&pload);
11673 +
11674 +       if (pos->state == POS_ON_EPOINT) {
11675 +               /* a special case for pos on twig level, where we already have
11676 +                  a lock on parent node. */
11677 +               /* The parent may not be dirty, in which case we should decide
11678 +                  whether to relocate the child now. If decision is made to
11679 +                  relocate the child, the parent is marked dirty. */
11680 +               ret =
11681 +                   reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
11682 +                                                       pos);
11683 +               if (ret)
11684 +                       goto exit;
11685 +
11686 +               /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
11687 +                  is leftmost) and the leaf/child, so recursion is not needed.
11688 +                  Levels above the twig will be allocated for
11689 +                  write-optimization before the transaction commits.  */
11690 +
11691 +               /* Do the recursive step, allocating zero or more of our
11692 +                * ancestors. */
11693 +               ret = alloc_one_ancestor(&pos->coord, pos);
11694 +
11695 +       } else {
11696 +               if (!znode_is_root(pos->lock.node)) {
11697 +                       /* all formatted nodes except tree root */
11698 +                       ret =
11699 +                           reiser4_get_parent(&plock, pos->lock.node,
11700 +                                              ZNODE_WRITE_LOCK);
11701 +                       if (ret)
11702 +                               goto exit;
11703 +
11704 +                       ret = incr_load_count_znode(&pload, plock.node);
11705 +                       if (ret)
11706 +                               goto exit;
11707 +
11708 +                       ret =
11709 +                           find_child_ptr(plock.node, pos->lock.node, &pcoord);
11710 +                       if (ret)
11711 +                               goto exit;
11712 +
11713 +                       ret =
11714 +                           reverse_relocate_check_dirty_parent(ZJNODE
11715 +                                                               (pos->lock.
11716 +                                                                node), &pcoord,
11717 +                                                               pos);
11718 +                       if (ret)
11719 +                               goto exit;
11720 +
11721 +                       ret = alloc_one_ancestor(&pcoord, pos);
11722 +                       if (ret)
11723 +                               goto exit;
11724 +               }
11725 +
11726 +               ret = allocate_znode(pos->lock.node, &pcoord, pos);
11727 +       }
11728 +      exit:
11729 +       done_load_count(&pload);
11730 +       done_lh(&plock);
11731 +       return ret;
11732 +}
11733 +
11734 +/* This is the recursive step described in alloc_pos_and_ancestors, above.  Ignoring the
11735 +   call to set_preceder, which is the next function described, this checks if the
11736 +   child is a leftmost child and returns if it is not.  If the child is a leftmost child
11737 +   it checks for relocation, possibly dirtying the parent.  Then it performs the recursive
11738 +   step. */
11739 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
11740 +{
11741 +       int ret = 0;
11742 +       lock_handle alock;
11743 +       load_count aload;
11744 +       coord_t acoord;
11745 +
11746 +       /* As we ascend at the left-edge of the region to flush, take this opportunity at
11747 +          the twig level to find our parent-first preceder unless we have already set
11748 +          it. */
11749 +       if (pos->preceder.blk == 0) {
11750 +               ret = set_preceder(coord, pos);
11751 +               if (ret != 0)
11752 +                       return ret;
11753 +       }
11754 +
11755 +       /* If the ancestor is clean or already allocated, or if the child is not a
11756 +          leftmost child, stop going up, even leaving coord->node not flushprepped. */
11757 +       if (znode_check_flushprepped(coord->node)
11758 +           || !coord_is_leftmost_unit(coord))
11759 +               return 0;
11760 +
11761 +       init_lh(&alock);
11762 +       init_load_count(&aload);
11763 +       coord_init_invalid(&acoord, NULL); /* left invalid when coord->node is the root */
11764 +
11765 +       /* Only ascend to the next level if it is a leftmost child, but write-lock the
11766 +          parent in case we will relocate the child. */
11767 +       if (!znode_is_root(coord->node)) {
11768 +
11769 +               ret =
11770 +                   jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
11771 +                                           &alock, &aload, ZNODE_WRITE_LOCK,
11772 +                                           0);
11773 +               if (ret != 0) {
11774 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
11775 +                       goto exit;
11776 +               }
11777 +
11778 +               ret =
11779 +                   reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
11780 +                                                       &acoord, pos);
11781 +               if (ret != 0) {
11782 +                       goto exit;
11783 +               }
11784 +
11785 +               /* Recursive call. */
11786 +               if (!znode_check_flushprepped(acoord.node)) {
11787 +                       ret = alloc_one_ancestor(&acoord, pos);
11788 +                       if (ret)
11789 +                               goto exit;
11790 +               }
11791 +       }
11792 +
11793 +       /* Note: we call allocate with the parent write-locked (except at the root) in
11794 +          case we relocate the child, in which case it will modify the parent during this
11795 +          call. */
11796 +       ret = allocate_znode(coord->node, &acoord, pos);
11797 +
11798 +      exit:
11799 +       done_load_count(&aload);
11800 +       done_lh(&alock);
11801 +       return ret;
11802 +}
11803 +
11804 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
11805 +   a call to this function at the twig level.  During alloc_pos_and_ancestors we may ask:
11806 +   should this node be relocated (in reverse parent-first context)?  We repeat this
11807 +   process as long as the child is the leftmost child, eventually reaching an ancestor of
11808 +   the flush point that is not a leftmost child.  The preceder of that ancestor, which is
11809 +   not a leftmost child, is actually on the leaf level.  The preceder of that block is the
11810 +   left-neighbor of the flush point.  The preceder of that block is the rightmost child of
11811 +   the twig on the left.  So, when alloc_pos_and_ancestors passes upward through the twig
11812 +   level, it stops momentarily to remember the block of the rightmost child of the twig on
11813 +   the left and sets it to the flush_position's preceder_hint.
11814 +
11815 +   There is one other place where we may set the flush_position's preceder hint, which is
11816 +   during scan-left.
11817 +*/
11818 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
11819 +{
11820 +       int ret;
11821 +       coord_t coord;
11822 +       lock_handle left_lock;
11823 +       load_count left_load;
11824 +
11825 +       coord_dup(&coord, coord_in);    /* work on a copy, @coord_in is const */
11826 +
11827 +       init_lh(&left_lock);
11828 +       init_load_count(&left_load);
11829 +
11830 +       /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
11831 +          coord_is_leftmost_unit is not the right test if the unformatted child is in the
11832 +          middle of the first extent unit. */
11833 +       if (!coord_is_leftmost_unit(&coord)) {
11834 +               coord_prev_unit(&coord);
11835 +       } else {
11836 +               ret =
11837 +                   reiser4_get_left_neighbor(&left_lock, coord.node,
11838 +                                             ZNODE_READ_LOCK, GN_SAME_ATOM);
11839 +               if (ret) {
11840 +                       /* If we fail for any reason it doesn't matter because the
11841 +                          preceder is only a hint.  We are low-priority at this point, so
11842 +                          this must be the case. */
11843 +                       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
11844 +                           ret == -ENOENT || ret == -EINVAL
11845 +                           || ret == -E_DEADLOCK) {
11846 +                               ret = 0;
11847 +                       }
11848 +                       goto exit;      /* any other error code is returned unchanged */
11849 +               }
11850 +
11851 +               ret = incr_load_count_znode(&left_load, left_lock.node);
11852 +               if (ret)
11853 +                       goto exit;
11854 +
11855 +               coord_init_last_unit(&coord, left_lock.node);
11856 +       }
11857 +
11858 +       ret =
11859 +           item_utmost_child_real_block(&coord, RIGHT_SIDE,
11860 +                                        &pos->preceder.blk);
11861 +      exit:
11862 +       check_preceder(pos->preceder.blk);
11863 +       done_load_count(&left_load);
11864 +       done_lh(&left_lock);
11865 +       return ret;
11866 +}
11867 +
11868 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
11869 +
11870 +/* This procedure implements the outer loop of the flush algorithm.  To put this in
11871 +   context, here is the general list of steps taken by the flush routine as a whole:
11872 +
11873 +   1. Scan-left
11874 +   2. Scan-right (maybe)
11875 +   3. Allocate initial flush position and its ancestors
11876 +   4. <handle extents>
11877 +   5. <squeeze and allocate next position and its ancestors to-the-right,
11878 +       then update position to-the-right>
11879 +   6. <repeat from #4 until flush is stopped>
11880 +
11881 +   This procedure implements the loop in steps 4 through 6 in the above listing.
11882 +
11883 +   Step 4: if the current flush position is an extent item (position on the twig level),
11884 +   it allocates the extent (allocate_extent_item_in_place) then shifts to the next
11885 +   coordinate.  If the next coordinate's leftmost child needs flushprep, we will continue.
11886 +   If the next coordinate is an internal item, we descend back to the leaf level,
11887 +   otherwise we repeat step #4 (labeled ALLOC_EXTENTS below).  If the "next coordinate"
11888 +   brings us past the end of the twig level, then we call
11889 +   reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
11890 +   step #5 which moves to the right.
11891 +
11892 +   Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
11893 +   tree to allocate any ancestors of the next-right flush position that are not also
11894 +   ancestors of the current position.  Those ancestors (in top-down order) are the next in
11895 +   parent-first order.  We squeeze adjacent nodes on the way up until the right node and
11896 +   current node share the same parent, then allocate on the way back down.  Finally, this
11897 +   step sets the flush position to the next-right node.  Then repeat steps 4 and 5.
11898 +*/
11899 +
11900 +/* SQUEEZE CODE */
11901 +
11902 +/* squeeze_right_twig helper: cut the content of to->node from its first unit up to
11903 +   coord @to (key range from the first item's key to @to_key).  @left is not used. */
11904 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
11905 +                                  znode * left)
11906 +{
11907 +       coord_t from;
11908 +       reiser4_key from_key;
11909 +
11910 +       coord_init_first_unit(&from, to->node);
11911 +       item_key_by_coord(&from, &from_key);
11912 +
11913 +       return cut_node_content(&from, to, &from_key, to_key, NULL);
11914 +}
11915 +
11916 +/* Copy as much of the leading extents from @right to @left, allocating
11917 +   unallocated extents as they are copied.  Returns SQUEEZE_TARGET_FULL or
11918 +   SQUEEZE_SOURCE_EMPTY when no more can be shifted, or a negative error.  If
11919 +   the next item is an internal item it calls shift_one_internal_unit and
11920 +   returns its result (asserted to be one of the above or SUBTREE_MOVED). */
11921 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
11922 +{
11923 +       int ret = SUBTREE_MOVED;
11924 +       coord_t coord;          /* used to iterate over items */
11925 +       reiser4_key stop_key;
11926 +
11927 +       assert("jmacd-2008", !node_is_empty(right));
11928 +       coord_init_first_unit(&coord, right);
11929 +
11930 +       /* FIXME: can be optimized to cut once */
11931 +       while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
11932 +               ON_DEBUG(void *vp);
11933 +
11934 +               assert("vs-1468", coord_is_leftmost_unit(&coord));
11935 +               ON_DEBUG(vp = shift_check_prepare(left, coord.node));
11936 +
11937 +               /* stop_key is used to find what was copied and what to cut */
11938 +               stop_key = *reiser4_min_key();
11939 +               ret = squalloc_extent(left, &coord, pos, &stop_key);
11940 +               if (ret != SQUEEZE_CONTINUE) {
11941 +                       ON_DEBUG(kfree(vp));
11942 +                       break;
11943 +               }
11944 +               assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
11945 +
11946 +               /* Cut what was copied: everything up to the key just before stop_key. */
11947 +               set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
11948 +               check_me("vs-1466",
11949 +                        squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
11950 +
11951 +               ON_DEBUG(shift_check(vp, left, coord.node));
11952 +       }
11953 +
11954 +       if (node_is_empty(coord.node))
11955 +               ret = SQUEEZE_SOURCE_EMPTY;
11956 +
11957 +       if (ret == SQUEEZE_TARGET_FULL) {
11958 +               goto out;
11959 +       }
11960 +
11961 +       if (node_is_empty(right)) {
11962 +               /* The whole right node was copied into @left. */
11963 +               assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
11964 +               goto out;
11965 +       }
11966 +
11967 +       coord_init_first_unit(&coord, right);
11968 +
11969 +       if (!item_is_internal(&coord)) {
11970 +               /* we do not want to squeeze anything else to left neighbor because "slum"
11971 +                  is over */
11972 +               ret = SQUEEZE_TARGET_FULL;
11973 +               goto out;
11974 +       }
11975 +       assert("jmacd-433", item_is_internal(&coord));
11976 +
11977 +       /* Shift an internal unit.  The child must be allocated before shifting any more
11978 +          extents, so we stop here. */
11979 +       ret = shift_one_internal_unit(left, right);
11980 +
11981 +      out:
11982 +       assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
11983 +              || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
11984 +
11985 +       if (ret == SQUEEZE_TARGET_FULL) {
11986 +               /* We submit prepped nodes here and expect that this @left twig
11987 +                * will not be modified again during this jnode_flush() call. */
11988 +               int ret1;
11989 +
11990 +               /* NOTE: seems like io is done under long term locks. */
11991 +               ret1 = write_prepped_nodes(pos);
11992 +               if (ret1 < 0)
11993 +                       return ret1;    /* write error replaces SQUEEZE_TARGET_FULL */
11994 +       }
11995 +
11996 +       return ret;
11997 +}
11998 +
11999 +#if REISER4_DEBUG      /* debug-only checks of pos->coord and attached convert data */
12000 +static void item_convert_invariant(flush_pos_t * pos)
12001 +{
12002 +       assert("edward-1225", coord_is_existing_item(&pos->coord));
12003 +       if (chaining_data_present(pos)) {
12004 +               item_plugin *iplug = item_convert_plug(pos);
12005 +
12006 +               assert("edward-1000",
12007 +                      iplug == item_plugin_by_coord(&pos->coord));
12008 +               assert("edward-1001", iplug->f.convert != NULL);
12009 +       } else
12010 +               assert("edward-1226", pos->child == NULL);
12011 +}
12012 +#else                  /* !REISER4_DEBUG: the check expands to noop */
12013 +
12014 +#define item_convert_invariant(pos) noop
12015 +
12016 +#endif                 /* REISER4_DEBUG */
12017 +
12018 +/* Scan node items starting from the first one and apply for each
12019 +   item its flush ->convert() method (if any). This method may
12020 +   resize/kill the item so the tree will be changed.  Nodes above the
12021 +   leaf level are left untouched and 0 is returned for them. */
12022 +static int convert_node(flush_pos_t * pos, znode * node)
12023 +{
12024 +       int ret = 0;
12025 +       item_plugin *iplug;
12026 +
12027 +       assert("edward-304", pos != NULL);
12028 +       assert("edward-305", pos->child == NULL);
12029 +       assert("edward-475", znode_convertible(node));
12030 +       assert("edward-669", znode_is_wlocked(node));
12031 +       assert("edward-1210", !node_is_empty(node));
12032 +
12033 +       if (znode_get_level(node) != LEAF_LEVEL)
12034 +               /* conversion is unsupported above the leaf level */
12035 +               goto exit;
12036 +
12037 +       coord_init_first_unit(&pos->coord, node);
12038 +
12039 +       while (1) {
12040 +               ret = 0;
12041 +               coord_set_to_left(&pos->coord);
12042 +               item_convert_invariant(pos);
12043 +
12044 +               iplug = item_plugin_by_coord(&pos->coord);
12045 +               assert("edward-844", iplug != NULL);
12046 +
12047 +               if (iplug->f.convert) {
12048 +                       ret = iplug->f.convert(pos);
12049 +                       if (ret)
12050 +                               goto exit;
12051 +               }
12052 +               assert("edward-307", pos->child == NULL);
12053 +
12054 +               if (coord_next_item(&pos->coord)) {
12055 +                       /* node is over */
12056 +
12057 +                       if (!chaining_data_present(pos))
12058 +                               /* finished this node */
12059 +                               break;
12060 +                       if (should_chain_next_node(pos)) {
12061 +                               /* go to next node */
12062 +                               move_chaining_data(pos, 0 /* to next node */ );
12063 +                               break;
12064 +                       }
12065 +                       /* repeat this node */
12066 +                       move_chaining_data(pos, 1 /* this node */ );
12067 +                       continue;
12068 +               }
12069 +               /* Node is not over.
12070 +                  Check if there is attached convert data.
12071 +                  If so roll one item position back and repeat
12072 +                  on this node
12073 +                */
12074 +               if (chaining_data_present(pos)) {
12075 +
12076 +                       if (iplug != item_plugin_by_coord(&pos->coord))
12077 +                               set_item_convert_count(pos, 0);
12078 +
12079 +                       ret = coord_prev_item(&pos->coord);
12080 +                       assert("edward-1003", !ret);
12081 +
12082 +                       move_chaining_data(pos, 1 /* this node */ );
12083 +               }
12084 +       }
12085 +       JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);        /* loop left via break: node is done */
12086 +       znode_make_dirty(node);
12087 +      exit:
12088 +       assert("edward-1004", !ret);
12089 +       return ret;
12090 +}
12091 +
12092 +/* Squeeze and allocate the right neighbor.  This is called after @left and
12093 +   its current children have been squeezed and allocated already.  This
12094 +   procedure's job is to squeeze items from @right to @left.
12095 +
12096 +   If at the leaf level, use the shift_everything_left memcpy-optimized
12097 +   version of shifting (squeeze_right_non_twig).
12098 +
12099 +   If at the twig level, extents are allocated as they are shifted from @right
12100 +   to @left (squeeze_right_twig).
12101 +
12102 +   At any other level, shift one internal item and return to the caller
12103 +   (squalloc_parent_first) so that the shifted-subtree can be processed in
12104 +   parent-first order (this also goes through squeeze_right_non_twig).
12105 +
12106 +   When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED
12107 +   is returned.  When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12108 +   returned.  If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12109 +   is returned.  A negative return value is an error.
12110 +*/
12111 +
12112 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12113 +                                 znode * right)
12114 +{
12115 +       int ret;
12116 +
12117 +       /* FIXME it is possible to see empty hasn't-heard-banshee node in a
12118 +        * tree owing to error (for example, ENOSPC) in write */
12119 +       /* assert("jmacd-9321", !node_is_empty(left)); */
12120 +       assert("jmacd-9322", !node_is_empty(right));
12121 +       assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12122 +
12123 +       switch (znode_get_level(left)) {
12124 +       case TWIG_LEVEL:
12125 +               /* Shift with extent allocating until either an internal item
12126 +                  is encountered or everything is shifted or no free space
12127 +                  left in @left */
12128 +               ret = squeeze_right_twig(left, right, pos);
12129 +               break;
12130 +
12131 +       default:
12132 +               /* All other levels can use shift_everything until we implement per-item
12133 +                  flush plugins. */
12134 +               ret = squeeze_right_non_twig(left, right);
12135 +               break;
12136 +       }
12137 +
12138 +       assert("jmacd-2011", (ret < 0 ||
12139 +                             ret == SQUEEZE_SOURCE_EMPTY
12140 +                             || ret == SQUEEZE_TARGET_FULL
12141 +                             || ret == SUBTREE_MOVED));
12142 +       return ret;
12143 +}
12144 +
12145 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12146 +                                               znode * right)
12147 +{
12148 +       int ret;
12149 +
12150 +       ret = squeeze_right_twig(pos->lock.node, right, pos);
12151 +       if (ret < 0)
12152 +               return ret;     /* error: pos->coord is left as it was */
12153 +       if (ret > 0) {          /* positive result: put coord past the last item */
12154 +               coord_init_after_last_item(&pos->coord, pos->lock.node);
12155 +               return ret;
12156 +       }
12157 +
12158 +       coord_init_last_unit(&pos->coord, pos->lock.node);      /* ret == 0 */
12159 +       return 0;
12160 +}
12161 +
12162 +/* forward declaration */
12163 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12164 +
12165 +/* Fast path: return 0 if znode_same_parents(@left, @right); otherwise return
12166 + * the result of squalloc_upper_levels(). */
12167 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12168 +                                                         znode * left,
12169 +                                                         znode * right)
12170 +{
12171 +       if (znode_same_parents(left, right))
12172 +               return 0;
12173 +
12174 +       return squalloc_upper_levels(pos, left, right);
12175 +}
12176 +
12177 +/* Check whether the parent of given @right node needs to be processed
12178 +   ((re)allocated) prior to processing of the child.  If @left and @right do not
12179 +   share a parent, the parent of the @right is after the @left but before the
12180 +   @right in parent-first order, so we have to (re)allocate it before the @right
12181 +   gets (re)allocated. */
12182 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12183 +{
12184 +       int ret;
12185 +
12186 +       lock_handle left_parent_lock;
12187 +       lock_handle right_parent_lock;
12188 +
12189 +       load_count left_parent_load;
12190 +       load_count right_parent_load;
12191 +
12192 +       init_lh(&left_parent_lock);
12193 +       init_lh(&right_parent_lock);
12194 +
12195 +       init_load_count(&left_parent_load);
12196 +       init_load_count(&right_parent_load);
12197 +
12198 +       ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12199 +       if (ret)
12200 +               goto out;
12201 +
12202 +       ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12203 +       if (ret)
12204 +               goto out;
12205 +
12206 +       /* Check for same parents */
12207 +       if (left_parent_lock.node == right_parent_lock.node)
12208 +               goto out;
12209 +
12210 +       if (znode_check_flushprepped(right_parent_lock.node)) {
12211 +               /* Keep parent-first order.  In the order, the right parent node stands
12212 +                  before the @right node.  If it is already allocated, we set the
12213 +                  preceder (next block search start point) to its block number, @right
12214 +                  node should be allocated after it.
12215 +
12216 +                  However, preceder is set only if the right parent is on twig level.
12217 +                  The explanation is the following: new branch nodes are allocated over
12218 +                  already allocated children while the tree grows, it is difficult to
12219 +                  keep tree ordered, we assume that only leaves and twigs are correctly
12220 +                  allocated.  So, only twigs are used as a preceder for allocating of the
12221 +                  rest of the slum. */
12222 +               if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12223 +                       pos->preceder.blk =
12224 +                           *znode_get_block(right_parent_lock.node);
12225 +                       check_preceder(pos->preceder.blk);
12226 +               }
12227 +               goto out;
12228 +       }
12229 +
12230 +       ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12231 +       if (ret)
12232 +               goto out;
12233 +
12234 +       ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12235 +       if (ret)
12236 +               goto out;
12237 +
12238 +       ret =
12239 +           squeeze_right_neighbor(pos, left_parent_lock.node,
12240 +                                  right_parent_lock.node);
12241 +       /* We stop if error. We stop if some items/units were shifted (ret == 0)
12242 +        * and thus @right changed its parent. It means we do not process the
12243 +        * right_parent node prior to processing of @right. Positive return
12244 +        * values say that shifting items did not happen because of "empty
12245 +        * source" or "target full" conditions. */
12246 +       if (ret <= 0)
12247 +               goto out;
12248 +
12249 +       /* parent(@left) and parent(@right) may have different parents also. We
12250 +        * do a recursive call for checking that. */
12251 +       ret =
12252 +           check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12253 +                                                   right_parent_lock.node);
12254 +       if (ret)
12255 +               goto out;
12256 +
12257 +       /* allocate znode when going down */
12258 +       ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12259 +
12260 +      out:
12261 +       done_load_count(&left_parent_load);
12262 +       done_load_count(&right_parent_load);
12263 +
12264 +       done_lh(&left_parent_lock);
12265 +       done_lh(&right_parent_lock);
12266 +
12267 +       return ret;
12268 +}
12269 +
12270 +/* Check the leftmost child "flushprepped" status; returns 1 if the child node
12271 + * was not found in cache.  A get_leftmost_child_of_unit() failure is returned. */
12272 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12273 +{
12274 +       int ret;
12275 +       int prepped;
12276 +
12277 +       jnode *child;
12278 +
12279 +       ret = get_leftmost_child_of_unit(coord, &child);
12280 +
12281 +       if (ret)
12282 +               return ret;     /* NOTE(review): non-zero, so callers testing truth see "prepped" */
12283 +
12284 +       if (child) {
12285 +               prepped = jnode_check_flushprepped(child);
12286 +               jput(child);
12287 +       } else {
12288 +               /* We consider not existing child as a node which slum
12289 +                  processing should not continue to.  Not cached node is clean,
12290 +                  so it is flushprepped. */
12291 +               prepped = 1;
12292 +       }
12293 +
12294 +       return prepped;
12295 +}
12296 +
12297 +/* (re)allocate znode: write-lock and load the parent, find @node's coord in it, allocate */
12298 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12299 +{
12300 +       int ret;
12301 +       lock_handle parent_lock;
12302 +       load_count parent_load;
12303 +       coord_t pcoord;
12304 +
12305 +       assert("zam-851", znode_is_write_locked(node));
12306 +
12307 +       init_lh(&parent_lock);
12308 +       init_load_count(&parent_load);
12309 +
12310 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12311 +       if (ret)
12312 +               goto out;
12313 +
12314 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
12315 +       if (ret)
12316 +               goto out;
12317 +
12318 +       ret = find_child_ptr(parent_lock.node, node, &pcoord);
12319 +       if (ret)
12320 +               goto out;
12321 +
12322 +       ret = allocate_znode(node, &pcoord, pos);
12323 +
12324 +      out:
12325 +       done_load_count(&parent_load);  /* parent load and lock are dropped on every path */
12326 +       done_lh(&parent_lock);
12327 +       return ret;
12328 +}
12329 +
12330 +/* Process formatted nodes to the right of pos->lock.node (shared by the leaf and
12331 + * internal level handlers) until the slum ends, an error occurs or squalloc stops. */
12332 +static int handle_pos_on_formatted(flush_pos_t * pos)
12333 +{
12334 +       int ret;
12335 +       lock_handle right_lock;
12336 +       load_count right_load;
12337 +
12338 +       init_lh(&right_lock);
12339 +       init_load_count(&right_load);
12340 +
12341 +       if (should_convert_node(pos, pos->lock.node)) {
12342 +               ret = convert_node(pos, pos->lock.node);
12343 +               if (ret)
12344 +                       return ret;     /* right_lock/right_load only initialized so far */
12345 +       }
12346 +
12347 +       while (1) {
12348 +               int expected;
12349 +               expected = should_convert_next_node(pos);
12350 +               ret = neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12351 +                                      ZNODE_WRITE_LOCK, !expected, expected);
12352 +               if (ret) {
12353 +                       if (expected)
12354 +                               warning("edward-1495",
12355 +                               "Expected neighbor not found (ret = %d). Fsck?",
12356 +                                       ret);
12357 +                       break;
12358 +               }
12359 +
12360 +               /* we don't prep(allocate) nodes for flushing twice.  This can be suboptimal, or it
12361 +                * can be optimal.  For now we choose to live with the risk that it will
12362 +                * be suboptimal because it would be quite complex to code it to be
12363 +                * smarter. */
12364 +               if (znode_check_flushprepped(right_lock.node)
12365 +                   && !znode_convertible(right_lock.node)) {
12366 +                       assert("edward-1005", !should_convert_next_node(pos));
12367 +                       pos_stop(pos);
12368 +                       break;
12369 +               }
12370 +
12371 +               ret = incr_load_count_znode(&right_load, right_lock.node);
12372 +               if (ret)
12373 +                       break;
12374 +               if (should_convert_node(pos, right_lock.node)) {
12375 +                       ret = convert_node(pos, right_lock.node);
12376 +                       if (ret)
12377 +                               break;
12378 +                       if (node_is_empty(right_lock.node)) {
12379 +                               /* node became empty after converting, repeat */
12380 +                               done_load_count(&right_load);
12381 +                               done_lh(&right_lock);
12382 +                               continue;
12383 +                       }
12384 +               }
12385 +
12386 +               /* squeeze _before_ going upward. */
12387 +               ret =
12388 +                   squeeze_right_neighbor(pos, pos->lock.node,
12389 +                                          right_lock.node);
12390 +               if (ret < 0)
12391 +                       break;
12392 +
12393 +               if (znode_check_flushprepped(right_lock.node)) {
12394 +                       if (should_convert_next_node(pos)) {
12395 +                               /* in spite of flushprepped status of the node,
12396 +                                  its right slum neighbor should be converted */
12397 +                               assert("edward-953", convert_data(pos));
12398 +                               assert("edward-954", item_convert_data(pos));
12399 +
12400 +                               if (node_is_empty(right_lock.node)) {
12401 +                                       done_load_count(&right_load);
12402 +                                       done_lh(&right_lock);
12403 +                               } else
12404 +                                       move_flush_pos(pos, &right_lock,
12405 +                                                      &right_load, NULL);
12406 +                               continue;
12407 +                       }
12408 +                       pos_stop(pos);
12409 +                       break;
12410 +               }
12411 +
12412 +               if (node_is_empty(right_lock.node)) {
12413 +                       /* repeat if right node was squeezed completely */
12414 +                       done_load_count(&right_load);
12415 +                       done_lh(&right_lock);
12416 +                       continue;
12417 +               }
12418 +
12419 +               /* parent(right_lock.node) has to be processed before
12420 +                * (right_lock.node) due to "parent-first" allocation order. */
12421 +               ret =
12422 +                   check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12423 +                                                           right_lock.node);
12424 +               if (ret)
12425 +                       break;
12426 +               /* (re)allocate _after_ going upward */
12427 +               ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12428 +               if (ret)
12429 +                       break;
12430 +               if (should_terminate_squalloc(pos)) {
12431 +                       set_item_convert_count(pos, 0);
12432 +                       break;
12433 +               }
12434 +
12435 +               /* advance the flush position to the right neighbor */
12436 +               move_flush_pos(pos, &right_lock, &right_load, NULL);
12437 +
12438 +               ret = rapid_flush(pos);
12439 +               if (ret)
12440 +                       break;
12441 +       }
12442 +       check_convert_info(pos);
12443 +       done_load_count(&right_load);
12444 +       done_lh(&right_lock);
12445 +
12446 +       /* This function indicates via pos whether to stop or go to twig or continue on current
12447 +        * level. */
12448 +       return ret;
12449 +
12450 +}
12451 +
12452 +/* Process nodes on leaf level until unformatted node or rightmost node in the
12453 + * slum reached; on -E_NO_NEIGHBOR set pos->state = POS_TO_TWIG and return 0. */
12454 +static int handle_pos_on_leaf(flush_pos_t * pos)
12455 +{
12456 +       int ret;
12457 +
12458 +       assert("zam-845", pos->state == POS_ON_LEAF);
12459 +
12460 +       ret = handle_pos_on_formatted(pos);
12461 +
12462 +       if (ret == -E_NO_NEIGHBOR) {
12463 +               /* cannot get right neighbor, go process extents. */
12464 +               pos->state = POS_TO_TWIG;
12465 +               return 0;
12466 +       }
12467 +
12468 +       return ret;
12469 +}
12470 +
12471 +/* Process slum on level > 1; returns the result of handle_pos_on_formatted() as is */
12472 +static int handle_pos_on_internal(flush_pos_t * pos)
12473 +{
12474 +       assert("zam-850", pos->state == POS_ON_INTERNAL);
12475 +       return handle_pos_on_formatted(pos);
12476 +}
12477 +
12478 +/* check whether squalloc should stop before processing given extent; sets pos->pos_in_unit */
12479 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12480 +{
12481 +       assert("zam-869", item_is_extent(&pos->coord));
12482 +
12483 +       /* pos->child is the jnode handle_pos_on_extent() should start with
12484 +        * instead of the first child of the first extent unit. */
12485 +       if (pos->child) {
12486 +               int prepped;
12487 +
12488 +               assert("vs-1383", jnode_is_unformatted(pos->child));
12489 +               prepped = jnode_check_flushprepped(pos->child);
12490 +               pos->pos_in_unit =
12491 +                   jnode_get_index(pos->child) -
12492 +                   extent_unit_index(&pos->coord);
12493 +               assert("vs-1470",
12494 +                      pos->pos_in_unit < extent_unit_width(&pos->coord));
12495 +               assert("nikita-3434",
12496 +                      ergo(extent_is_unallocated(&pos->coord),
12497 +                           pos->pos_in_unit == 0));
12498 +               jput(pos->child);       /* the reference is dropped and the hint cleared */
12499 +               pos->child = NULL;
12500 +
12501 +               return prepped;
12502 +       }
12503 +
12504 +       pos->pos_in_unit = 0;
12505 +       if (extent_is_unallocated(&pos->coord))
12506 +               return 0;       /* unallocated extent: do not stop */
12507 +
12508 +       return leftmost_child_of_unit_check_flushprepped(&pos->coord);
12509 +}
12510 +
12511 +/* Handle the case when the regular reiser4 tree (znodes connected to their
12512 + * neighbors by sibling pointers) is interrupted on the leaf level by one or more
12513 + * unformatted nodes.  By holding a lock on the twig level and using extent code
12514 + * routines to process unformatted nodes we swim around an irregular part of
12515 + * the reiser4 tree.  Returns the non-zero squalloc_extent_should_stop() result when stopping early, otherwise 0 with pos->state updated. */
12516 +static int handle_pos_on_twig(flush_pos_t * pos)
12517 +{
12518 +       int ret;
12519 +
12520 +       assert("zam-844", pos->state == POS_ON_EPOINT);
12521 +       assert("zam-843", item_is_extent(&pos->coord));
12522 +
12523 +       /* Decide whether to continue slum processing with the current extent
12524 +          unit: if the leftmost child of the current extent unit is flushprepped
12525 +          (i.e. clean or already processed by flush) we stop squalloc().  There
12526 +          is a fast check for unallocated extents, which we assume contain only
12527 +          nodes that are not flushprepped. */
12528 +       /* FIXME: Here we implement a simple check: we only look at the
12529 +          leftmost child. */
12530 +       ret = squalloc_extent_should_stop(pos);
12531 +       if (ret != 0) {
12532 +               pos_stop(pos);
12533 +               return ret;
12534 +       }
12535 +
12536 +       while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
12537 +              && item_is_extent(&pos->coord)) {
12538 +               ret = reiser4_alloc_extent(pos);
12539 +               if (ret) {
12540 +                       break; /* NOTE(review): this ret is not returned on any path below -- confirm that is intended */
12541 +               }
12542 +               coord_next_unit(&pos->coord);
12543 +       }
12544 +
12545 +       if (coord_is_after_rightmost(&pos->coord)) {
12546 +               pos->state = POS_END_OF_TWIG;
12547 +               return 0;
12548 +       }
12549 +       if (item_is_internal(&pos->coord)) {
12550 +               pos->state = POS_TO_LEAF;
12551 +               return 0;
12552 +       }
12553 +
12554 +       assert("zam-860", item_is_extent(&pos->coord));
12555 +
12556 +       /* the "slum" is over: the loop stopped while still on an extent item */
12557 +       pos->state = POS_INVALID;
12558 +       return 0;
12559 +}
12560 +
12561 +/* When we are about to return the flush position from twig to leaf level we can either process
12562 + * the right twig node or move the position to the leaf.  This processes the right twig
12563 + * if that is possible and jumps to the leaf level if not. */
12564 +static int handle_pos_end_of_twig(flush_pos_t * pos)
12565 +{
12566 +       int ret;
12567 +       lock_handle right_lock;
12568 +       load_count right_load;
12569 +       coord_t at_right;
12570 +       jnode *child = NULL;
12571 +
12572 +       assert("zam-848", pos->state == POS_END_OF_TWIG);
12573 +       assert("zam-849", coord_is_after_rightmost(&pos->coord));
12574 +
12575 +       init_lh(&right_lock);
12576 +       init_load_count(&right_load);
12577 +
12578 +       /* We get a lock on the right twig node even if it is not dirty because
12579 +        * the slum continues or discontinues on the leaf level, not on the next twig. This
12580 +        * lock on the right twig is needed for getting its leftmost child. */
12581 +       ret =
12582 +           reiser4_get_right_neighbor(&right_lock, pos->lock.node,
12583 +                                      ZNODE_WRITE_LOCK, GN_SAME_ATOM);
12584 +       if (ret)
12585 +               goto out;
12586 +
12587 +       ret = incr_load_count_znode(&right_load, right_lock.node);
12588 +       if (ret)
12589 +               goto out;
12590 +
12591 +       /* the right twig may or may not be dirty */
12592 +       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
12593 +               /* If the right twig node is dirty we always attempt to squeeze its
12594 +                * content to the left... */
12595 +             became_dirty:
12596 +               ret =
12597 +                   squeeze_right_twig_and_advance_coord(pos, right_lock.node);
12598 +               if (ret <= 0) {
12599 +                       /* pos->coord is on an internal item, go to the leaf level, or
12600 +                        * we have an error which will be caught in squalloc() */
12601 +                       pos->state = POS_TO_LEAF;
12602 +                       goto out;
12603 +               }
12604 +
12605 +               /* If the right twig was squeezed completely we have to re-lock
12606 +                * the right twig. Now it is done through the top-level squalloc
12607 +                * routine. */
12608 +               if (node_is_empty(right_lock.node))
12609 +                       goto out;
12610 +
12611 +               /* ... and prep it if it is not yet prepped */
12612 +               if (!znode_check_flushprepped(right_lock.node)) {
12613 +                       /* As usual, process the parent before ... */
12614 +                       ret =
12615 +                           check_parents_and_squalloc_upper_levels(pos,
12616 +                                                                   pos->lock.
12617 +                                                                   node,
12618 +                                                                   right_lock.
12619 +                                                                   node);
12620 +                       if (ret)
12621 +                               goto out;
12622 +
12623 +                       /* ... processing the child */
12624 +                       ret =
12625 +                           lock_parent_and_allocate_znode(right_lock.node,
12626 +                                                          pos);
12627 +                       if (ret)
12628 +                               goto out;
12629 +               }
12630 +       } else {
12631 +               coord_init_first_unit(&at_right, right_lock.node);
12632 +
12633 +               /* check the first child of the next twig: should we continue there? */
12634 +               ret = get_leftmost_child_of_unit(&at_right, &child);
12635 +               if (ret || child == NULL || jnode_check_flushprepped(child)) {
12636 +                       pos_stop(pos);
12637 +                       goto out;
12638 +               }
12639 +
12640 +               /* check the clean twig for possible relocation */
12641 +               if (!znode_check_flushprepped(right_lock.node)) {
12642 +                       ret =
12643 +                           reverse_relocate_check_dirty_parent(child,
12644 +                                                               &at_right, pos);
12645 +                       if (ret)
12646 +                               goto out;
12647 +                       if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
12648 +                               goto became_dirty;
12649 +               }
12650 +       }
12651 +
12652 +       assert("zam-875", znode_check_flushprepped(right_lock.node));
12653 +
12654 +       /* Update the preceder with the block number of the just processed right twig
12655 +        * node. The code above could miss the preceder update because
12656 +        * allocate_znode() might not have been called for this node. */
12657 +       pos->preceder.blk = *znode_get_block(right_lock.node);
12658 +       check_preceder(pos->preceder.blk);
12659 +
12660 +       coord_init_first_unit(&at_right, right_lock.node);
12661 +       assert("zam-868", coord_is_existing_unit(&at_right));
12662 +
12663 +       pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
12664 +       move_flush_pos(pos, &right_lock, &right_load, &at_right);
12665 +
12666 +      out:
12667 +       done_load_count(&right_load);
12668 +       done_lh(&right_lock);
12669 +
12670 +       if (child) /* child is only set on the clean-twig path above */
12671 +               jput(child);
12672 +
12673 +       return ret;
12674 +}
12675 +
12676 +/* Move pos->lock to the leaf node pointed to by pos->coord and check whether we should
12677 + * continue there.  Returns 0 or the error of the failing step; pos->state is POS_ON_LEAF when the move succeeded. */
12678 +static int handle_pos_to_leaf(flush_pos_t * pos)
12679 +{
12680 +       int ret;
12681 +       lock_handle child_lock;
12682 +       load_count child_load;
12683 +       jnode *child;
12684 +
12685 +       assert("zam-846", pos->state == POS_TO_LEAF);
12686 +       assert("zam-847", item_is_internal(&pos->coord));
12687 +
12688 +       init_lh(&child_lock);
12689 +       init_load_count(&child_load);
12690 +
12691 +       ret = get_leftmost_child_of_unit(&pos->coord, &child);
12692 +       if (ret)
12693 +               return ret;
12694 +       if (child == NULL) {
12695 +               pos_stop(pos);
12696 +               return 0;
12697 +       }
12698 +
12699 +       if (jnode_check_flushprepped(child)) {
12700 +               pos->state = POS_INVALID; /* child is already flushprepped: nothing to continue with */
12701 +               goto out;
12702 +       }
12703 +
12704 +       ret =
12705 +           longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
12706 +                               ZNODE_LOCK_LOPRI);
12707 +       if (ret)
12708 +               goto out;
12709 +
12710 +       ret = incr_load_count_znode(&child_load, JZNODE(child));
12711 +       if (ret)
12712 +               goto out;
12713 +
12714 +       ret = allocate_znode(JZNODE(child), &pos->coord, pos);
12715 +       if (ret)
12716 +               goto out;
12717 +
12718 +       /* move the flush position to the leaf level */
12719 +       pos->state = POS_ON_LEAF;
12720 +       move_flush_pos(pos, &child_lock, &child_load, NULL);
12721 +
12722 +       if (node_is_empty(JZNODE(child))) {
12723 +               ret = delete_empty_node(JZNODE(child));
12724 +               pos->state = POS_INVALID; /* set regardless of the delete_empty_node() result */
12725 +       }
12726 +      out:
12727 +       done_load_count(&child_load);
12728 +       done_lh(&child_lock);
12729 +       jput(child);
12730 +
12731 +       return ret;
12732 +}
12733 +
12734 +/* Move pos and its lock from the leaf level up to the parent (twig) node. */
12735 +/* The new state depends on the item that follows the child pointer in the parent: past the last item -> POS_END_OF_TWIG, extent -> POS_ON_EPOINT, anything else stops the position. */
12736 +static int handle_pos_to_twig(flush_pos_t * pos)
12737 +{
12738 +       int ret;
12739 +
12740 +       lock_handle parent_lock;
12741 +       load_count parent_load;
12742 +       coord_t pcoord;
12743 +
12744 +       assert("zam-852", pos->state == POS_TO_TWIG);
12745 +
12746 +       init_lh(&parent_lock);
12747 +       init_load_count(&parent_load);
12748 +
12749 +       ret =
12750 +           reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
12751 +       if (ret)
12752 +               goto out;
12753 +
12754 +       ret = incr_load_count_znode(&parent_load, parent_lock.node);
12755 +       if (ret)
12756 +               goto out;
12757 +
12758 +       ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
12759 +       if (ret)
12760 +               goto out;
12761 +
12762 +       assert("zam-870", item_is_internal(&pcoord));
12763 +       coord_next_item(&pcoord); /* look at the item to the right of our child pointer */
12764 +
12765 +       if (coord_is_after_rightmost(&pcoord))
12766 +               pos->state = POS_END_OF_TWIG;
12767 +       else if (item_is_extent(&pcoord))
12768 +               pos->state = POS_ON_EPOINT;
12769 +       else {
12770 +               /* Here we understand that getting -E_NO_NEIGHBOR in
12771 +                * handle_pos_on_leaf() was simply because the edge of the
12772 +                * slum was reached */
12773 +               pos_stop(pos);
12774 +               goto out;
12775 +       }
12776 +
12777 +       move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
12778 +
12779 +      out:
12780 +       done_load_count(&parent_load);
12781 +       done_lh(&parent_lock);
12782 +
12783 +       return ret;
12784 +}
12785 +
12786 +typedef int (*pos_state_handle_t) (flush_pos_t *); /* handler for one flush position state */
12787 +static pos_state_handle_t flush_pos_handlers[] = { /* indexed by pos->state in squalloc() */
12788 +       /* process formatted nodes on the leaf level, keep the lock on a leaf node */
12789 +       [POS_ON_LEAF] = handle_pos_on_leaf,
12790 +       /* process unformatted nodes, keep the lock on a twig node; pos->coord points to the extent currently
12791 +        * being processed */
12792 +       [POS_ON_EPOINT] = handle_pos_on_twig,
12793 +       /* move the lock from a leaf node to its parent for further processing of unformatted nodes */
12794 +       [POS_TO_TWIG] = handle_pos_to_twig,
12795 +       /* move the lock from twig to leaf level when processing of unformatted nodes finishes;
12796 +        * pos->coord points to the leaf node we jump to */
12797 +       [POS_TO_LEAF] = handle_pos_to_leaf,
12798 +       /* after processing the last extent in the twig node, attempt to shift items from the twig's
12799 +        * right neighbor and process them while shifting */
12800 +       [POS_END_OF_TWIG] = handle_pos_end_of_twig,
12801 +       /* process formatted nodes on an internal level, keep the lock on an internal node */
12802 +       [POS_ON_INTERNAL] = handle_pos_on_internal
12803 +};
12804 +
12805 +/* Advance the flush position horizontally, preparing for flushing ((re)allocate, squeeze,
12806 + * encrypt) nodes and their ancestors in "parent-first" order.  Returns 0 or a negative error other than -E_NO_NEIGHBOR. */
12807 +static int squalloc(flush_pos_t * pos)
12808 +{
12809 +       int ret = 0;
12810 +
12811 +       /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
12812 +        * greater CPU efficiency? Measure and see.... -Hans */
12813 +       while (pos_valid(pos)) {
12814 +               ret = flush_pos_handlers[pos->state] (pos); /* dispatch on the current state */
12815 +               if (ret < 0)
12816 +                       break;
12817 +
12818 +               ret = rapid_flush(pos);
12819 +               if (ret)
12820 +                       break;
12821 +       }
12822 +
12823 +       /* any positive value or -E_NO_NEIGHBOR are legal return codes for the handle_pos*
12824 +          routines; -E_NO_NEIGHBOR means that the slum edge was reached */
12825 +       if (ret > 0 || ret == -E_NO_NEIGHBOR)
12826 +               ret = 0;
12827 +
12828 +       return ret;
12829 +}
12830 +
12831 +static void update_ldkey(znode * node) /* set @node's left delimiting key from leftmost_key_in_node(); does nothing for an empty node */
12832 +{
12833 +       reiser4_key ldkey;
12834 +
12835 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock)); /* caller must hold the tree's dk_lock for writing */
12836 +       if (node_is_empty(node))
12837 +               return; /* empty node: leave its left delimiting key untouched */
12838 +
12839 +       znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
12840 +}
12841 +
12842 +/* This is to be called after calling the node's shift method to shift data from @right to
12843 +   @left. It sets the left delimiting keys of @left and @right to the keys of the first items of @left
12844 +   and @right respectively, and sets the right delimiting key of @left to the first key of @right (or to @right's right delimiting key if @right became empty). */
12845 +static void update_znode_dkeys(znode * left, znode * right)
12846 +{
12847 +       assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
12848 +       assert("vs-1629", (znode_is_write_locked(left) &&
12849 +                          znode_is_write_locked(right)));
12850 +
12851 +       /* we need to update the left delimiting key of @left if it was empty before the shift */
12852 +       update_ldkey(left);
12853 +       update_ldkey(right);
12854 +       if (node_is_empty(right))
12855 +               znode_set_rd_key(left, znode_get_rd_key(right));
12856 +       else
12857 +               znode_set_rd_key(left, znode_get_ld_key(right));
12858 +}
12859 +
12860 +/* Try to shift everything from @right to @left. If everything was shifted,
12861 +   @right is removed from the tree.  The result is whatever the node plugin's shift method returns (the number of bytes shifted); @todo receives the pending carry work. */
12862 +static int
12863 +shift_everything_left(znode * right, znode * left, carry_level * todo)
12864 +{
12865 +       coord_t from;
12866 +       node_plugin *nplug;
12867 +       carry_plugin_info info;
12868 +
12869 +       coord_init_after_last_item(&from, right);
12870 +
12871 +       nplug = node_plugin_by_node(right);
12872 +       info.doing = NULL;
12873 +       info.todo = todo;
12874 +       return nplug->shift(&from, left, SHIFT_LEFT,
12875 +                           1 /* delete @right if it becomes empty */ ,
12876 +                           1
12877 +                           /* move coord @from to node @left if everything is shifted */
12878 +                           ,
12879 +                           &info);
12880 +}
12881 +
12882 +/* Shift as much as possible from @right to @left using the memcpy-optimized
12883 +   shift_everything_left.  @left and @right are formatted neighboring nodes on
12884 +   the same level (asserted).  Returns SQUEEZE_TARGET_FULL if either node is not dirty, the reiser4_carry() result if something was shifted, otherwise SQUEEZE_SOURCE_EMPTY or SQUEEZE_TARGET_FULL; PTR_ERR() of the pool if it cannot be allocated. */
12885 +static int squeeze_right_non_twig(znode * left, znode * right)
12886 +{
12887 +       int ret;
12888 +       carry_pool *pool;
12889 +       carry_level *todo;
12890 +
12891 +       assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
12892 +
12893 +       if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
12894 +           !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
12895 +               return SQUEEZE_TARGET_FULL;
12896 +
12897 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
12898 +       if (IS_ERR(pool))
12899 +               return PTR_ERR(pool);
12900 +       todo = (carry_level *) (pool + 1); /* the carry levels live right after the pool header */
12901 +       init_carry_level(todo, pool);
12902 +
12903 +       ret = shift_everything_left(right, left, todo);
12904 +       if (ret > 0) {
12905 +               /* something was shifted */
12906 +               reiser4_tree *tree;
12907 +               __u64 grabbed;
12908 +
12909 +               znode_make_dirty(left);
12910 +               znode_make_dirty(right);
12911 +
12912 +               /* update the delimiting keys of the nodes which participated in the
12913 +                  shift. FIXME: it would be better to have this in the node's shift
12914 +                  operation. But it can not be done there. Nobody
12915 +                  remembers why, though */
12916 +               tree = znode_get_tree(left);
12917 +               write_lock_dk(tree);
12918 +               update_znode_dkeys(left, right);
12919 +               write_unlock_dk(tree);
12920 +
12921 +               /* Carry is called to update the delimiting key and, maybe, to remove the empty
12922 +                  node. */
12923 +               grabbed = get_current_context()->grabbed_blocks;
12924 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
12925 +               assert("nikita-3003", ret == 0);        /* reserved space is exhausted. Ask Hans. */
12926 +               ret = reiser4_carry(todo, NULL /* previous level */ );
12927 +               grabbed2free_mark(grabbed);
12928 +       } else {
12929 +               /* Nothing was shifted (ret <= 0): report why via a SQUEEZE_* code. NOTE(review): a negative ret from the shift is overwritten here -- confirm that is intended */
12930 +               ret =
12931 +                   node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
12932 +                   SQUEEZE_TARGET_FULL;
12933 +       }
12934 +
12935 +       done_carry_pool(pool);
12936 +
12937 +       return ret;
12938 +}
12939 +
12940 +#if REISER4_DEBUG /* debug-only helper, used by the "nikita-2434" assertion */
12941 +static int sibling_link_is_ok(const znode *left, const znode *right) /* non-zero iff @left and @right point at each other as siblings */
12942 +{
12943 +       int result;
12944 +
12945 +       read_lock_tree(znode_get_tree(left)); /* both sibling pointers are read under the tree read lock */
12946 +       result = (left->right == right && left == right->left);
12947 +       read_unlock_tree(znode_get_tree(left));
12948 +       return result;
12949 +}
12950 +#endif
12951 +
12952 +/* Shift first unit of first item if it is an internal one.  Return
12953 +   SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
12954 +   SUBTREE_MOVED. */
12955 +static int shift_one_internal_unit(znode * left, znode * right)
12956 +{
12957 +       int ret;
12958 +       carry_pool *pool;
12959 +       carry_level *todo;
12960 +       coord_t *coord;
12961 +       carry_plugin_info *info;
12962 +       int size, moved;
12963 +
12964 +       assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
12965 +       assert("nikita-2435", znode_is_write_locked(left));
12966 +       assert("nikita-2436", znode_is_write_locked(right));
12967 +       assert("nikita-2434", sibling_link_is_ok(left, right));
12968 +
12969 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
12970 +                              sizeof(*coord) + sizeof(*info)
12971 +#if REISER4_DEBUG
12972 +                              + sizeof(*coord) + 2 * sizeof(reiser4_key)
12973 +#endif
12974 +           );
12975 +       if (IS_ERR(pool))
12976 +               return PTR_ERR(pool);
12977 +       todo = (carry_level *) (pool + 1);
12978 +       init_carry_level(todo, pool);
12979 +
12980 +       coord = (coord_t *) (todo + 3);
12981 +       coord_init_first_unit(coord, right);
12982 +       info = (carry_plugin_info *) (coord + 1);
12983 +
12984 +#if REISER4_DEBUG
12985 +       if (!node_is_empty(left)) {
12986 +               coord_t *last;
12987 +               reiser4_key *right_key;
12988 +               reiser4_key *left_key;
12989 +
12990 +               last = (coord_t *) (info + 1);
12991 +               right_key = (reiser4_key *) (last + 1);
12992 +               left_key = right_key + 1;
12993 +               coord_init_last_unit(last, left);
12994 +
12995 +               assert("nikita-2463",
12996 +                      keyle(item_key_by_coord(last, left_key),
12997 +                            item_key_by_coord(coord, right_key)));
12998 +       }
12999 +#endif
13000 +
13001 +       assert("jmacd-2007", item_is_internal(coord));
13002 +
13003 +       size = item_length_by_coord(coord);
13004 +       info->todo = todo;
13005 +       info->doing = NULL;
13006 +
13007 +       ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13008 +                                              1
13009 +                                              /* delete @right if it becomes empty */
13010 +                                              ,
13011 +                                              0
13012 +                                              /* do not move coord @coord to node @left */
13013 +                                              ,
13014 +                                              info);
13015 +
13016 +       /* If shift returns positive, then we shifted the item. */
13017 +       assert("vs-423", ret <= 0 || size == ret);
13018 +       moved = (ret > 0);
13019 +
13020 +       if (moved) {
13021 +               /* something was moved */
13022 +               reiser4_tree *tree;
13023 +               int grabbed;
13024 +
13025 +               znode_make_dirty(left);
13026 +               znode_make_dirty(right);
13027 +               tree = znode_get_tree(left);
13028 +               write_lock_dk(tree);
13029 +               update_znode_dkeys(left, right);
13030 +               write_unlock_dk(tree);
13031 +
13032 +               /* reserve space for delimiting keys after shifting */
13033 +               grabbed = get_current_context()->grabbed_blocks;
13034 +               ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13035 +               assert("nikita-3003", ret == 0);        /* reserved space is exhausted. Ask Hans. */
13036 +
13037 +               ret = reiser4_carry(todo, NULL /* previous level */ );
13038 +               grabbed2free_mark(grabbed);
13039 +       }
13040 +
13041 +       done_carry_pool(pool);
13042 +
13043 +       if (ret != 0) {
13044 +               /* Shift or carry operation failed. */
13045 +               assert("jmacd-7325", ret < 0);
13046 +               return ret;
13047 +       }
13048 +
13049 +       return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13050 +}
13051 +
13052 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13053 +   znode.  For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation().  Returns 0 on success (pos->preceder.blk and pos->alloc_cnt are then updated) or the error from allocate_znode_update(). */
13054 +static int
13055 +allocate_znode_loaded(znode * node,
13056 +                     const coord_t * parent_coord, flush_pos_t * pos)
13057 +{
13058 +       int ret;
13059 +       reiser4_super_info_data *sbinfo = get_current_super_private();
13060 +       /* FIXME(D): We have the node write-locked and should have checked for !
13061 +          allocated() somewhere before reaching this point, but there can be a race, so
13062 +          this assertion is bogus. */
13063 +       assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13064 +       assert("jmacd-7988", znode_is_write_locked(node));
13065 +       assert("jmacd-7989", coord_is_invalid(parent_coord)
13066 +              || znode_is_write_locked(parent_coord->node));
13067 +
13068 +       if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13069 +           znode_is_root(node) ||
13070 +           /* We have enough nodes to relocate no matter what. */
13071 +           (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13072 +               /* No need to decide with new nodes, they are treated the same as
13073 +                  relocate. If the root node is dirty, relocate. */
13074 +               if (pos->preceder.blk == 0) {
13075 +                       /* the preceder is unknown and we have decided to relocate the node --
13076 +                          using the default value for the search start is better than searching
13077 +                          from block #0. */
13078 +                       get_blocknr_hint_default(&pos->preceder.blk);
13079 +                       check_preceder(pos->preceder.blk);
13080 +               }
13081 +
13082 +               goto best_reloc;
13083 +
13084 +       } else if (pos->preceder.blk == 0) {
13085 +               /* If we don't know the preceder, leave the node where it is. */
13086 +               jnode_make_wander(ZJNODE(node));
13087 +       } else {
13088 +               /* Make a decision based on block distance. */
13089 +               reiser4_block_nr dist;
13090 +               reiser4_block_nr nblk = *znode_get_block(node);
13091 +
13092 +               assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13093 +               assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13094 +               assert("jmacd-6174", pos->preceder.blk != 0);
13095 +
13096 +               if (pos->preceder.blk == nblk - 1) {
13097 +                       /* Ideal: the node already sits right after the preceder. */
13098 +                       jnode_make_wander(ZJNODE(node));
13099 +               } else {
13100 +
13101 +                       dist =
13102 +                           (nblk <
13103 +                            pos->preceder.blk) ? (pos->preceder.blk -
13104 +                                                  nblk) : (nblk -
13105 +                                                           pos->preceder.blk);
13106 +
13107 +                       /* See if we can find a closer block (forward direction only). */
13108 +                       pos->preceder.max_dist =
13109 +                           min((reiser4_block_nr) sbinfo->flush.
13110 +                               relocate_distance, dist);
13111 +                       pos->preceder.level = znode_get_level(node);
13112 +
13113 +                       ret = allocate_znode_update(node, parent_coord, pos);
13114 +
13115 +                       pos->preceder.max_dist = 0;
13116 +
13117 +                       if (ret && (ret != -ENOSPC))
13118 +                               return ret;
13119 +
13120 +                       if (ret == 0) {
13121 +                               /* Got a better allocation. */
13122 +                               znode_make_reloc(node, pos->fq);
13123 +                       } else if (dist < sbinfo->flush.relocate_distance) {
13124 +                               /* The present allocation is good enough. */
13125 +                               jnode_make_wander(ZJNODE(node));
13126 +                       } else {
13127 +                               /* Otherwise, try to relocate to the best position. */
13128 +                             best_reloc:
13129 +                               ret =
13130 +                                   allocate_znode_update(node, parent_coord,
13131 +                                                         pos);
13132 +                               if (ret != 0)
13133 +                                       return ret;
13134 +
13135 +                               /* set the JNODE_RELOC bit _after_ the node gets allocated */
13136 +                               znode_make_reloc(node, pos->fq);
13137 +                       }
13138 +               }
13139 +       }
13140 +
13141 +       /* This is the new preceder. */
13142 +       pos->preceder.blk = *znode_get_block(node);
13143 +       check_preceder(pos->preceder.blk);
13144 +       pos->alloc_cnt += 1;
13145 +
13146 +       assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13147 +
13148 +       return 0;
13149 +}
13150 +
13151 +static int
13152 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13153 +{
13154 +       /*
13155 +        * Perform znode allocation with the znode pinned in memory to avoid races
13156 +        * with asynchronous emergency flush (which plays with the
13157 +        * JNODE_FLUSH_RESERVED bit).  The work itself is done by allocate_znode_loaded(), run under WITH_DATA().
13158 +        */
13159 +       return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13160 +}
13161 +
13162 +/* A subroutine of allocate_znode, this is called first to see if there is a close
13163 +   position to relocate to.  It may return ENOSPC if there is no close position.  If there
13164 +   is no close position it may not relocate.  This takes care of updating the parent node
13165 +   with the relocated block address. */
13166 +static int
13167 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13168 +                     flush_pos_t * pos)
13169 +{
13170 +       int ret;
13171 +       reiser4_block_nr blk;
13172 +       lock_handle uber_lock;
13173 +       int flush_reserved_used = 0;
13174 +       int grabbed;
13175 +       reiser4_context *ctx;
13176 +       reiser4_super_info_data *sbinfo;
13177 +
13178 +       init_lh(&uber_lock);
13179 +
13180 +       ctx = get_current_context();
13181 +       sbinfo = get_super_private(ctx->super); /* NOTE(review): sbinfo is not read again in this function */
13182 +
13183 +       grabbed = ctx->grabbed_blocks;  /* handed to grabbed2free_mark() on the way out */
13184 +
13185 +       /* discard e-flush allocation; this zload() is paired with the zrelse() at exit */
13186 +       ret = zload(node);
13187 +       if (ret)
13188 +               return ret;
13189 +
13190 +       if (ZF_ISSET(node, JNODE_CREATED)) {
13191 +               assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13192 +               pos->preceder.block_stage = BLOCK_UNALLOCATED;
13193 +       } else {
13194 +               pos->preceder.block_stage = BLOCK_GRABBED;
13195 +
13196 +               /* The disk space for relocating the @node is already reserved in "flush reserved"
13197 +                * counter if @node is leaf, otherwise we grab space using BA_RESERVED (means grab
13198 +                * space from whole disk not from only 95%). */
13199 +               if (znode_get_level(node) == LEAF_LEVEL) {
13200 +                       /*
13201 +                        * earlier (during do_jnode_make_dirty()) we decided
13202 +                        * that @node can possibly go into overwrite set and
13203 +                        * reserved block for its wandering location.
13204 +                        */
13205 +                       txn_atom *atom = get_current_atom_locked();
13206 +                       assert("nikita-3449",
13207 +                              ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13208 +                       flush_reserved2grabbed(atom, (__u64) 1);
13209 +                       spin_unlock_atom(atom);
13210 +                       /*
13211 +                        * we are trying to move node into relocate
13212 +                        * set. Allocation of relocated position "uses"
13213 +                        * reserved block.
13214 +                        */
13215 +                       ZF_CLR(node, JNODE_FLUSH_RESERVED);
13216 +                       flush_reserved_used = 1;
13217 +               } else {
13218 +                       ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13219 +                       if (ret != 0)
13220 +                               goto exit;
13221 +               }
13222 +       }
13223 +
13224 +       /* We may end up not using the 5% of reserved disk space here, and flush will not pack tightly. */
13225 +       ret = reiser4_alloc_block(&pos->preceder, &blk,
13226 +                                 BA_FORMATTED | BA_PERMANENT);
13227 +       if (ret)
13228 +               goto exit;
13229 +
13230 +       if (!ZF_ISSET(node, JNODE_CREATED) &&
13231 +           (ret =
13232 +            reiser4_dealloc_block(znode_get_block(node), 0,
13233 +                                  BA_DEFER | BA_FORMATTED)))
13234 +               goto exit;
13235 +
13236 +       if (likely(!znode_is_root(node))) {
13237 +               item_plugin *iplug;
13238 +
13239 +               iplug = item_plugin_by_coord(parent_coord);
13240 +               assert("nikita-2954", iplug->f.update != NULL);
13241 +               iplug->f.update(parent_coord, &blk);
13242 +
13243 +               znode_make_dirty(parent_coord->node);
13244 +
13245 +       } else {
13246 +               reiser4_tree *tree = znode_get_tree(node);
13247 +               znode *uber;
13248 +
13249 +               /* We take a longterm lock on the fake node in order to change
13250 +                  the root block number.  This may cause atom fusion. */
13251 +               ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13252 +                                    &uber_lock);
13253 +               /* The fake node cannot be deleted, and we must have priority
13254 +                  here, and may not be confused with ENOSPC. */
13255 +               assert("jmacd-74412",
13256 +                      ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13257 +
13258 +               if (ret)
13259 +                       goto exit;
13260 +
13261 +               uber = uber_lock.node;
13262 +
13263 +               write_lock_tree(tree);
13264 +               tree->root_block = blk;
13265 +               write_unlock_tree(tree);
13266 +
13267 +               znode_make_dirty(uber);
13268 +       }
13269 +
13270 +       ret = znode_rehash(node, &blk);
13271 +      exit:    /* NOTE(review): when reached after reiser4_alloc_block() succeeded, nothing below visibly frees @blk -- confirm */
13272 +       if (ret) {
13273 +               /* Get flush reserved block back if something fails, because
13274 +                * callers assume that on error block wasn't relocated and its
13275 +                * flush reserved block wasn't used. */
13276 +               if (flush_reserved_used) {
13277 +                       /*
13278 +                        * ok, we failed to move node into relocate
13279 +                        * set. Restore status quo.
13280 +                        */
13281 +                       grabbed2flush_reserved((__u64) 1);
13282 +                       ZF_SET(node, JNODE_FLUSH_RESERVED);
13283 +               }
13284 +       }
13285 +       zrelse(node);
13286 +       done_lh(&uber_lock);
13287 +       grabbed2free_mark(grabbed);
13288 +       return ret;
13289 +}
13290 +
13291 +/* JNODE INTERFACE */
13292 +
13293 +/* Get the parent of @node locked and set the child's coordinate in the parent; a
13294 +   formatted @node must already be locked (assertion edward-54) and must not be the
13295 +   root (assertion jmacd-2061).  This function may cause atom fusion.  NOTE(review):
13296 +   the earlier remark that it is only used for read locks (so fusion only occurs when
13297 +   the parent is already dirty) does not match scan_unformatted(), which passes ZNODE_WRITE_LOCK. */
13298 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13299 +   pointer in jnodes. */
13300 +static int
13301 +jnode_lock_parent_coord(jnode * node,
13302 +                       coord_t * coord,
13303 +                       lock_handle * parent_lh,
13304 +                       load_count * parent_zh,
13305 +                       znode_lock_mode parent_mode, int try)
13306 +{
13307 +       int ret;
13308 +
13309 +       assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13310 +       assert("edward-54", jnode_is_unformatted(node)
13311 +              || znode_is_any_locked(JZNODE(node)));
13312 +
13313 +       if (!jnode_is_znode(node)) {
13314 +               reiser4_key key;
13315 +               tree_level stop_level = TWIG_LEVEL;
13316 +               lookup_bias bias = FIND_EXACT;
13317 +
13318 +               assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13319 +
13320 +               /* The case when node is not znode, but can have parent coord
13321 +                  (unformatted node, node which represents cluster page,
13322 +                  etc..).  Generate a key for the appropriate entry, search
13323 +                  in the tree using coord_by_key, which handles locking for
13324 +                  us. */
13325 +
13326 +               /*
13327 +                * nothing is locked at this moment, so, nothing prevents
13328 +                * concurrent truncate from removing jnode from inode. To
13329 +                * prevent this spin-lock jnode. jnode can be truncated just
13330 +                * after call to the jnode_build_key(), but this is ok,
13331 +                * because coord_by_key() will just fail to find appropriate
13332 +                * extent.
13333 +                */
13334 +               spin_lock_jnode(node);
13335 +               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13336 +                       jnode_build_key(node, &key);
13337 +                       ret = 0;
13338 +               } else
13339 +                       ret = RETERR(-ENOENT);
13340 +               spin_unlock_jnode(node);
13341 +
13342 +               if (ret != 0)
13343 +                       return ret;
13344 +
13345 +               if (jnode_is_cluster_page(node))
13346 +                       stop_level = LEAF_LEVEL;
13347 +
13348 +               assert("jmacd-1812", coord != NULL);
13349 +
13350 +               ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13351 +                                  parent_mode, bias, stop_level, stop_level,
13352 +                                  CBK_UNIQUE, NULL /*ra_info */ );
13353 +               switch (ret) {
13354 +               case CBK_COORD_NOTFOUND:
13355 +                       assert("edward-1038",
13356 +                              ergo(jnode_is_cluster_page(node),
13357 +                                   JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13358 +                       if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13359 +                               warning("nikita-3177", "Parent not found");
13360 +                       return ret;
13361 +               case CBK_COORD_FOUND:
13362 +                       if (coord->between != AT_UNIT) {
13363 +                               /* The lookup did not land exactly on a unit: release the parent lock and fail with -ENOENT. */
13364 +                               done_lh(parent_lh);
13365 +                               if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13366 +                                       warning("nikita-3178",
13367 +                                               "Found but not happy: %i",
13368 +                                               coord->between);
13369 +                               }
13370 +                               return RETERR(-ENOENT);
13371 +                       }
13372 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13373 +                       if (ret != 0)
13374 +                               return ret;
13375 +                       /* if (jnode_is_cluster_page(node)) {
13376 +                          races with write() are possible
13377 +                          check_child_cluster (parent_lh->node);
13378 +                          }
13379 +                        */
13380 +                       break;
13381 +               default:
13382 +                       return ret;
13383 +               }
13384 +
13385 +       } else {
13386 +               int flags;
13387 +               znode *z;
13388 +
13389 +               z = JZNODE(node);
13390 +               /* Formatted node case: */
13391 +               assert("jmacd-2061", !znode_is_root(z));
13392 +
13393 +               flags = GN_ALLOW_NOT_CONNECTED;
13394 +               if (try)
13395 +                       flags |= GN_TRY_LOCK;
13396 +
13397 +               ret =
13398 +                   reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13399 +               if (ret != 0)
13400 +                       /* -E_REPEAT is ok here, it is handled by the caller. */
13401 +                       return ret;
13402 +
13403 +               /* Make the child's position "hint" up-to-date.  (Unless above
13404 +                  root, which caller must check.) */
13405 +               if (coord != NULL) {
13406 +
13407 +                       ret = incr_load_count_znode(parent_zh, parent_lh->node);
13408 +                       if (ret != 0) {
13409 +                               warning("jmacd-976812386",
13410 +                                       "incr_load_count_znode failed: %d",
13411 +                                       ret);
13412 +                               return ret;
13413 +                       }
13414 +
13415 +                       ret = find_child_ptr(parent_lh->node, z, coord);
13416 +                       if (ret != 0) {
13417 +                               warning("jmacd-976812",
13418 +                                       "find_child_ptr failed: %d", ret);
13419 +                               return ret;
13420 +                       }
13421 +               }
13422 +       }
13423 +
13424 +       return 0;
13425 +}
13426 +
13427 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13428 +   If there is no next neighbor or the neighbor is not in memory or if there is a
13429 +   neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13430 +   In some cases the slum may include nodes which are not dirty, if so @check_dirty should be 0 */
13431 +static int neighbor_in_slum(znode * node,      /* starting point */
13432 +                           lock_handle * lock, /* lock on starting point */
13433 +                           sideof side,        /* left or right direction we seek the next node in */
13434 +                           znode_lock_mode mode,       /* kind of lock we want */
13435 +                           int check_dirty, /* true if the neighbor should be dirty */
13436 +                           int use_upper_levels /* get neighbor by going through
13437 +                                                   upper levels */)
13438 +{
13439 +       int ret;
13440 +       int flags;
13441 +
13442 +       assert("jmacd-6334", znode_is_connected(node));
13443 +
13444 +       flags =  GN_SAME_ATOM | (side == LEFT_SIDE ? GN_GO_LEFT : 0);
13445 +       if (use_upper_levels)
13446 +               flags |= GN_CAN_USE_UPPER_LEVELS;
13447 +
13448 +       ret = reiser4_get_neighbor(lock, node, mode, flags);
13449 +       if (ret) {
13450 +               /* May return -ENOENT or -E_NO_NEIGHBOR; -ENOENT is reported as -E_NO_NEIGHBOR. */
13451 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
13452 +               if (ret == -ENOENT) {
13453 +                       ret = RETERR(-E_NO_NEIGHBOR);
13454 +               }
13455 +               return ret;
13456 +       }
13457 +       if (!check_dirty)
13458 +               return 0;
13459 +       /* Check dirty bit of locked znode, no races here */
13460 +       if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13461 +               return 0;
13462 +
13463 +       done_lh(lock);  /* neighbor is not dirty: drop its lock before reporting no neighbor */
13464 +       return RETERR(-E_NO_NEIGHBOR);
13465 +}
13466 +
13467 +/* Return true if two znodes have the same parent.  Both nodes must be write-locked by
13468 +   the caller (asserted below); the parent pointers are compared under the tree read lock. */
13469 +static int znode_same_parents(znode * a, znode * b)
13470 +{
13471 +       int result;
13472 +
13473 +       assert("jmacd-7011", znode_is_write_locked(a));
13474 +       assert("jmacd-7012", znode_is_write_locked(b));
13475 +
13476 +       /* We lock the whole tree for this check.... I really don't like whole tree
13477 +        * locks... -Hans */
13478 +       read_lock_tree(znode_get_tree(a));
13479 +       result = (znode_parent(a) == znode_parent(b));
13480 +       read_unlock_tree(znode_get_tree(a));
13481 +       return result;
13482 +}
13483 +
13484 +/* FLUSH SCAN */
13485 +
13486 +/* Initialize the flush_scan data structure: zero it, init both lock handles and both load counts, and mark parent_coord invalid. */
13487 +static void scan_init(flush_scan * scan)
13488 +{
13489 +       memset(scan, 0, sizeof(*scan));
13490 +       init_lh(&scan->node_lock);
13491 +       init_lh(&scan->parent_lock);
13492 +       init_load_count(&scan->parent_load);
13493 +       init_load_count(&scan->node_load);
13494 +       coord_init_invalid(&scan->parent_coord, NULL);
13495 +}
13496 +
13497 +/* Release the resources held by the flush scan: both load counts, the reference on scan->node (if any), and both lock handles. */
13498 +static void scan_done(flush_scan * scan)
13499 +{
13500 +       done_load_count(&scan->node_load);
13501 +       if (scan->node != NULL) {
13502 +               jput(scan->node);
13503 +               scan->node = NULL;
13504 +       }
13505 +       done_load_count(&scan->parent_load);
13506 +       done_lh(&scan->parent_lock);
13507 +       done_lh(&scan->node_lock);
13508 +}
13509 +
13510 +/* Returns true if flush scanning is finished: scan->stop is set, or a rightward scan has counted at least max_count nodes (the count limit is checked for RIGHT_SIDE only). */
13511 +int reiser4_scan_finished(flush_scan * scan)
13512 +{
13513 +       return scan->stop || (scan->direction == RIGHT_SIDE &&
13514 +                             scan->count >= scan->max_count);
13515 +}
13516 +
13517 +/* Return true if the scan should continue to the @tonode.  True if the node meets the
13518 +   same_slum_check condition.  If not, drop the reference on @tonode (jput) and stop the scan. */
13519 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
13520 +{
13521 +       int go = same_slum_check(scan->node, tonode, 1, 0);
13522 +
13523 +       if (!go) {
13524 +               scan->stop = 1;
13525 +               jput(tonode);
13526 +       }
13527 +
13528 +       return go;
13529 +}
13530 +
13531 +/* Set the current scan->node (taking over the reference the caller passes in, e.g. jref(node)
13532 +   in scan_left), add @add_count to the count (e.g., skipped unallocated nodes), jput the
13533 +   previous current, copy @parent into parent_coord if it is not NULL, and load the new node. */
13534 +int
13535 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
13536 +                const coord_t * parent)
13537 +{
13538 +       /* Release the old references, take the new reference. */
13539 +       done_load_count(&scan->node_load);
13540 +
13541 +       if (scan->node != NULL) {
13542 +               jput(scan->node);
13543 +       }
13544 +       scan->node = node;
13545 +       scan->count += add_count;
13546 +
13547 +       /* This next stmt is somewhat inefficient.  The reiser4_scan_extent() code could
13548 +          delay this update step until it finishes and update the parent_coord only once.
13549 +          It did that before, but there was a bug and this was the easiest way to make it
13550 +          correct. */
13551 +       if (parent != NULL) {
13552 +               coord_dup(&scan->parent_coord, parent);
13553 +       }
13554 +
13555 +       /* Failure may happen at the incr_load_count call, but the caller can assume the reference
13556 +          is safely taken.  The result of incr_load_count_jnode() is the return value. */
13557 +       return incr_load_count_jnode(&scan->node_load, node);
13558 +}
13559 +
13560 +/* Return true if scanning in the leftward direction, i.e. scan->direction is LEFT_SIDE. */
13561 +int reiser4_scanning_left(flush_scan * scan)
13562 +{
13563 +       return scan->direction == LEFT_SIDE;
13564 +}
13565 +
13566 +/* Performs leftward scanning starting from either kind of node.  Counts the starting
13567 +   node.  The right-scan object is passed in for the left-scan in order to copy the parent
13568 +   of an unformatted starting position.  This way we avoid searching for the unformatted
13569 +   node's parent when scanning in each direction.  If we search for the parent once it is
13570 +   set in both scan objects.  The limit parameter tells flush-scan when to stop.
13571 +
13572 +   Rapid scanning is used only during scan_left, where we are interested in finding the
13573 +   'leftpoint' where we begin flushing.  We are interested in stopping at the left child
13574 +   of a twig that does not have a dirty left neighbor.  THIS IS A SPECIAL CASE.  The
13575 +   problem is finding a way to flush only those nodes without unallocated children, and it
13576 +   is difficult to solve in the bottom-up flushing algorithm we are currently using.  The
13577 +   problem can be solved by scanning left at every level as we go upward, but this would
13578 +   basically bring us back to using a top-down allocation strategy, which we already tried
13579 +   (see BK history from May 2002), and has a different set of problems.  The top-down
13580 +   strategy makes avoiding unallocated children easier, but makes it difficult to
13581 +   properly flush dirty children with clean parents that would otherwise stop the
13582 +   top-down flush, only later to dirty the parent once the children are flushed.  So we
13583 +   solve the problem in the bottom-up algorithm with a special case for twigs and leaves
13584 +   only.
13585 +
13586 +   The first step in solving the problem is this rapid leftward scan.  After we determine
13587 +   that there are at least enough nodes counted to qualify for FLUSH_RELOCATE_THRESHOLD we
13588 +   are no longer interested in the exact count, we are only interested in finding the
13589 +   best place to start the flush.  We could choose one of two possibilities:
13590 +
13591 +   1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
13592 +   This requires checking one leaf per rapid-scan twig
13593 +
13594 +   2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
13595 +   to the left.  This requires checking possibly all of the in-memory children of each
13596 +   twig during the rapid scan.
13597 +
13598 +   For now we implement the first policy.
13599 +*/
13600 +static int
13601 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
13602 +{
13603 +       int ret = 0;
13604 +
13605 +       scan->max_count = limit;
13606 +       scan->direction = LEFT_SIDE;
13607 +
13608 +       ret = scan_set_current(scan, jref(node), 1, NULL);
13609 +       if (ret != 0) {
13610 +               return ret;
13611 +       }
13612 +
13613 +       ret = scan_common(scan, right);
13614 +       if (ret != 0) {
13615 +               return ret;
13616 +       }
13617 +
13618 +       /* Before rapid scanning, we need a lock on scan->node so that we can get its
13619 +          parent, only if formatted. */
13620 +       if (jnode_is_znode(scan->node)) {
13621 +               ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
13622 +                                         ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
13623 +       }
13624 +
13625 +       /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
13626 +       return ret;
13627 +}
13628 +
13629 +/* Performs rightward scanning... Does not count the starting node.  The limit parameter
13630 +   is described in scan_left.  If the starting node is unformatted then the
13631 +   parent_coord was already set during scan_left.  (The rapid_after parameter mentioned
13632 +   in earlier versions of this comment is not part of this function's signature.)
13633 +
13634 +   scan_right is only called if the scan_left operation does not count at least
13635 +   FLUSH_RELOCATE_THRESHOLD nodes for flushing.  Otherwise, the limit parameter is set to
13636 +   the difference between scan-left's count and FLUSH_RELOCATE_THRESHOLD, meaning
13637 +   scan-right counts as high as FLUSH_RELOCATE_THRESHOLD and then stops. */
13638 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
13639 +{
13640 +       int ret;
13641 +
13642 +       scan->max_count = limit;
13643 +       scan->direction = RIGHT_SIDE;
13644 +
13645 +       ret = scan_set_current(scan, jref(node), 0, NULL);
13646 +       if (ret != 0) {
13647 +               return ret;
13648 +       }
13649 +
13650 +       return scan_common(scan, NULL);
13651 +}
13652 +
13653 +/* Common code to perform left or right scanning.  @other is the opposite-direction scan object (scan_right passes NULL); it is handed to scan_unformatted(). */
13654 +static int scan_common(flush_scan * scan, flush_scan * other)
13655 +{
13656 +       int ret;
13657 +
13658 +       assert("nikita-2376", scan->node != NULL);
13659 +       assert("edward-54", jnode_is_unformatted(scan->node)
13660 +              || jnode_is_znode(scan->node));
13661 +
13662 +       /* Special case for starting at an unformatted node.  Optimization: we only want
13663 +          to search for the parent (which requires a tree traversal) once.  Obviously, we
13664 +          shouldn't have to call it once for the left scan and once for the right scan.
13665 +          For this reason, if we search for the parent during scan-left we then duplicate
13666 +          the coord/lock/load into the scan-right object. */
13667 +       if (jnode_is_unformatted(scan->node)) {
13668 +               ret = scan_unformatted(scan, other);
13669 +               if (ret != 0)
13670 +                       return ret;
13671 +       }
13672 +       /* This loop expects to start at a formatted position and performs chaining of
13673 +          formatted regions */
13674 +       while (!reiser4_scan_finished(scan)) {
13675 +
13676 +               ret = scan_formatted(scan);
13677 +               if (ret != 0) {
13678 +                       return ret;
13679 +               }
13680 +       }
13681 +
13682 +       return 0;
13683 +}
13684 +/* Find and lock the parent of scan->node (unless scan->parent_coord is already valid), then continue with scan_by_coord(). */
13685 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
13686 +{
13687 +       int ret = 0;
13688 +       int try = 0;
13689 +
13690 +       if (!coord_is_invalid(&scan->parent_coord))
13691 +               goto scan;
13692 +
13693 +       /* set parent coord from either a formatted or an unformatted position */
13694 +       if (!jnode_is_unformatted(scan->node)) {
13695 +               /* formatted position */
13696 +
13697 +               lock_handle lock;
13698 +               assert("edward-301", jnode_is_znode(scan->node));
13699 +               init_lh(&lock);
13700 +
13701 +               /*
13702 +                * when flush starts from unformatted node, first thing it
13703 +                * does is tree traversal to find formatted parent of starting
13704 +                * node. This parent is then kept locked across scans to the
13705 +                * left and to the right. This means that during scan to the
13706 +                * left we cannot take left-ward lock, because this is
13707 +                * dead-lock prone. So, if we are scanning to the left and
13708 +                * there is already lock held by this thread,
13709 +                * jnode_lock_parent_coord() should use try-lock.
13710 +                */
13711 +               try = reiser4_scanning_left(scan)
13712 +                   && !lock_stack_isclean(get_current_lock_stack());
13713 +               /* Need the node locked to get the parent lock, We have to
13714 +                  take write lock since there is at least one call path
13715 +                  where this znode is already write-locked by us. */
13716 +               ret =
13717 +                   longterm_lock_znode(&lock, JZNODE(scan->node),
13718 +                                       ZNODE_WRITE_LOCK,
13719 +                                       reiser4_scanning_left(scan) ?
13720 +                                       ZNODE_LOCK_LOPRI :
13721 +                                       ZNODE_LOCK_HIPRI);
13722 +               if (ret != 0)
13723 +                       /* EINVAL or E_DEADLOCK here mean... try again!  At this point we've
13724 +                          scanned too far and can't back out, just start over. */
13725 +                       return ret;
13726 +
13727 +               ret = jnode_lock_parent_coord(scan->node,
13728 +                                             &scan->parent_coord,
13729 +                                             &scan->parent_lock,
13730 +                                             &scan->parent_load,
13731 +                                             ZNODE_WRITE_LOCK, try);
13732 +
13733 +               /* FIXME(C): check EINVAL, E_DEADLOCK */
13734 +               done_lh(&lock);
13735 +               if (ret == -E_REPEAT) { /* -E_REPEAT is not an error here: stop the scan and report success */
13736 +                       scan->stop = 1;
13737 +                       return 0;
13738 +               }
13739 +               if (ret)
13740 +                       return ret;
13741 +
13742 +       } else {
13743 +               /* unformatted position */
13744 +
13745 +               ret =
13746 +                   jnode_lock_parent_coord(scan->node, &scan->parent_coord,
13747 +                                           &scan->parent_lock,
13748 +                                           &scan->parent_load,
13749 +                                           ZNODE_WRITE_LOCK, try);
13750 +
13751 +               if (IS_CBKERR(ret))
13752 +                       return ret;
13753 +
13754 +               if (ret == CBK_COORD_NOTFOUND)
13755 +                       /* FIXME(C): check EINVAL, E_DEADLOCK */
13756 +                       return ret;
13757 +
13758 +               /* parent was found */
13759 +               assert("jmacd-8661", other != NULL);
13760 +               /* Duplicate the reference into the other flush_scan. */
13761 +               coord_dup(&other->parent_coord, &scan->parent_coord);
13762 +               copy_lh(&other->parent_lock, &scan->parent_lock);
13763 +               copy_load_count(&other->parent_load, &scan->parent_load);
13764 +       }
13765 +      scan:
13766 +       return scan_by_coord(scan);
13767 +}
13768 +
13769 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
13770 +   (or right) pointers under tree lock as long as:
13771 +
13772 +   - node->left/right is non-NULL
13773 +   - node->left/right is connected, dirty
13774 +   - node->left/right belongs to the same atom
13775 +   - scan has not reached maximum count
13776 +*/
13777 +static int scan_formatted(flush_scan * scan)
13778 +{
13779 +       int ret;
13780 +       znode *neighbor = NULL;
13781 +
13782 +       assert("jmacd-1401", !reiser4_scan_finished(scan));
13783 +
13784 +       do {
13785 +               znode *node = JZNODE(scan->node);
13786 +
13787 +               /* Node should be connected, but if not stop the scan. */
13788 +               if (!znode_is_connected(node)) {
13789 +                       scan->stop = 1;
13790 +                       break;
13791 +               }
13792 +
13793 +               /* Lock the tree, check-for and reference the next sibling. */
13794 +               read_lock_tree(znode_get_tree(node));
13795 +
13796 +               /* It may be that a node is inserted or removed between a node and its
13797 +                  left sibling while the tree lock is released, but the flush-scan count
13798 +                  does not need to be precise.  Thus, we release the tree lock as soon as
13799 +                  we get the neighboring node. */
13800 +               neighbor =
13801 +                       reiser4_scanning_left(scan) ? node->left : node->right;
13802 +               if (neighbor != NULL) {
13803 +                       zref(neighbor);
13804 +               }
13805 +
13806 +               read_unlock_tree(znode_get_tree(node));
13807 +
13808 +               /* If neighbor is NULL at the leaf level, need to check for an unformatted
13809 +                  sibling using the parent--break in any case. */
13810 +               if (neighbor == NULL) {
13811 +                       break;
13812 +               }
13813 +
13814 +               /* Check the condition for going to the neighbor, break if it is not met.  This
13815 +                  also releases (jputs) the neighbor if false. */
13816 +               if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
13817 +                       break;
13818 +               }
13819 +
13820 +               /* Advance the flush_scan state to the neighbor, repeat. */
13821 +               ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
13822 +               if (ret != 0) {
13823 +                       return ret;
13824 +               }
13825 +
13826 +       } while (!reiser4_scan_finished(scan));
13827 +
13828 +       /* If neighbor is NULL then we reached the end of a formatted region, or else the
13829 +          sibling is out of memory, now check for an extent to the left (as long as
13830 +          LEAF_LEVEL). */
13831 +       if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
13832 +           || reiser4_scan_finished(scan)) {
13833 +               scan->stop = 1;
13834 +               return 0;
13835 +       }
13836 +       /* Otherwise, calls scan_by_coord for the right(left)most item of the
13837 +          left(right) neighbor on the parent level, then possibly continue. */
13838 +
13839 +       coord_init_invalid(&scan->parent_coord, NULL);
13840 +       return scan_unformatted(scan, NULL);
13841 +}
13842 +
13843 +/* NOTE-EDWARD:
13844 +   This scans adjacent items of the same type and calls scan flush plugin for each one.
13845 +   Performs left(right)ward scanning starting from a (possibly) unformatted node.  If we start
13846 +   from unformatted node, then we continue only if the next neighbor is also unformatted.
13847 +   When called from scan_formatted, we skip first iteration (to make sure that
13848 +   right(left)most item of the left(right) neighbor on the parent level is of the same
13849 +   type and set appropriate coord). */
13850 +static int scan_by_coord(flush_scan * scan)
13851 +{
13852 +       int ret = 0;
13853 +       int scan_this_coord;
13854 +       lock_handle next_lock;
13855 +       load_count next_load;
13856 +       coord_t next_coord;
13857 +       jnode *child;
13858 +       item_plugin *iplug;
13859 +
13860 +       init_lh(&next_lock);
13861 +       init_load_count(&next_load);
13862 +       scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
13863 +
13864 +       /* set initial item id */
13865 +       iplug = item_plugin_by_coord(&scan->parent_coord);
13866 +
13867 +       for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
13868 +               if (scan_this_coord) {
13869 +                       /* Here we expect that unit is scannable. it would not be so due
13870 +                        * to race with extent->tail conversion.  */
13871 +                       if (iplug->f.scan == NULL) {
13872 +                               scan->stop = 1;
13873 +                               ret = -E_REPEAT;
13874 +                               /* skip the check at the end. */
13875 +                               goto race;
13876 +                       }
13877 +
13878 +                       ret = iplug->f.scan(scan);
13879 +                       if (ret != 0)
13880 +                               goto exit;
13881 +
13882 +                       if (reiser4_scan_finished(scan)) {
13883 +                               checkchild(scan);
13884 +                               break;
13885 +                       }
13886 +               } else {
13887 +                       /* the same race against truncate as above is possible
13888 +                        * here, it seems */
13889 +
13890 +                       /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
13891 +                          the first coordinate. */
13892 +                       assert("jmacd-1231",
13893 +                              item_is_internal(&scan->parent_coord));
13894 +               }
13895 +
13896 +               if (iplug->f.utmost_child == NULL
13897 +                   || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
13898 +                       /* stop this coord and continue on parent level */
13899 +                       ret =
13900 +                           scan_set_current(scan,
13901 +                                            ZJNODE(zref
13902 +                                                   (scan->parent_coord.node)),
13903 +                                            1, NULL);
13904 +                       if (ret != 0)
13905 +                               goto exit;
13906 +                       break;
13907 +               }
13908 +
13909 +               /* Either way, the invariant is that scan->parent_coord is set to the
13910 +                  parent of scan->node. Now get the next unit. */
13911 +               coord_dup(&next_coord, &scan->parent_coord);
13912 +               coord_sideof_unit(&next_coord, scan->direction);
13913 +
13914 +               /* If off-the-end of the twig, try the next twig. */
13915 +               if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
13916 +                       /* We take the write lock because we may start flushing from this
13917 +                        * coordinate. */
13918 +                       ret = neighbor_in_slum(next_coord.node,
13919 +                                              &next_lock,
13920 +                                              scan->direction,
13921 +                                              ZNODE_WRITE_LOCK,
13922 +                                              1 /* check dirty */,
13923 +                                              0 /* don't go though upper
13924 +                                                   levels */);
13925 +                       if (ret == -E_NO_NEIGHBOR) {
13926 +                               scan->stop = 1;
13927 +                               ret = 0;
13928 +                               break;
13929 +                       }
13930 +
13931 +                       if (ret != 0) {
13932 +                               goto exit;
13933 +                       }
13934 +
13935 +                       ret = incr_load_count_znode(&next_load, next_lock.node);
13936 +                       if (ret != 0) {
13937 +                               goto exit;
13938 +                       }
13939 +
13940 +                       coord_init_sideof_unit(&next_coord, next_lock.node,
13941 +                                              sideof_reverse(scan->direction));
13942 +               }
13943 +
13944 +               iplug = item_plugin_by_coord(&next_coord);
13945 +
13946 +               /* Get the next child. */
13947 +               ret =
13948 +                   iplug->f.utmost_child(&next_coord,
13949 +                                         sideof_reverse(scan->direction),
13950 +                                         &child);
13951 +               if (ret != 0)
13952 +                       goto exit;
13953 +               /* If the next child is not in memory, or, item_utmost_child
13954 +                  failed (due to race with unlink, most probably), stop
13955 +                  here. */
13956 +               if (child == NULL || IS_ERR(child)) {
13957 +                       scan->stop = 1;
13958 +                       checkchild(scan);
13959 +                       break;
13960 +               }
13961 +
13962 +               assert("nikita-2374", jnode_is_unformatted(child)
13963 +                      || jnode_is_znode(child));
13964 +
13965 +               /* See if it is dirty, part of the same atom. */
13966 +               if (!reiser4_scan_goto(scan, child)) {
13967 +                       checkchild(scan);
13968 +                       break;
13969 +               }
13970 +
13971 +               /* If so, make this child current. */
13972 +               ret = scan_set_current(scan, child, 1, &next_coord);
13973 +               if (ret != 0)
13974 +                       goto exit;
13975 +
13976 +               /* Now continue.  If formatted we release the parent lock and return, then
13977 +                  proceed. */
13978 +               if (jnode_is_znode(child))
13979 +                       break;
13980 +
13981 +               /* Otherwise, repeat the above loop with next_coord. */
13982 +               if (next_load.node != NULL) {
13983 +                       done_lh(&scan->parent_lock);
13984 +                       move_lh(&scan->parent_lock, &next_lock);
13985 +                       move_load_count(&scan->parent_load, &next_load);
13986 +               }
13987 +       }
13988 +
13989 +       assert("jmacd-6233",
13990 +              reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
13991 +      exit:
13992 +       checkchild(scan);
13993 +      race:                    /* skip the above check  */
13994 +       if (jnode_is_znode(scan->node)) {
13995 +               done_lh(&scan->parent_lock);
13996 +               done_load_count(&scan->parent_load);
13997 +       }
13998 +
13999 +       done_load_count(&next_load);
14000 +       done_lh(&next_lock);
14001 +       return ret;
14002 +}
14003 +
14004 +/* FLUSH POS HELPERS */
14005 +
14006 +/* Initialize the fields of a flush_position. */
14007 +static void pos_init(flush_pos_t * pos)
14008 +{
14009 +       memset(pos, 0, sizeof *pos);
14010 +
14011 +       pos->state = POS_INVALID;
14012 +       coord_init_invalid(&pos->coord, NULL);
14013 +       init_lh(&pos->lock);
14014 +       init_load_count(&pos->load);
14015 +
14016 +       reiser4_blocknr_hint_init(&pos->preceder);
14017 +}
14018 +
14019 +/* The flush loop inside squalloc periodically checks pos_valid to
14020 +   determine when "enough flushing" has been performed.  This will return true until one
14021 +   of the following conditions is met:
14022 +
14023 +   1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14024 +   parameter, meaning we have flushed as many blocks as the kernel requested.  When
14025 +   flushing to commit, this parameter is NULL.
14026 +
14027 +   2. pos_stop() is called because squalloc discovers that the "next" node in the
14028 +   flush order is either non-existent, not dirty, or not in the same atom.
14029 +*/
14030 +
14031 +static int pos_valid(flush_pos_t * pos)
14032 +{
14033 +       return pos->state != POS_INVALID;
14034 +}
14035 +
14036 +/* Release any resources of a flush_position.  Called when jnode_flush finishes. */
14037 +static void pos_done(flush_pos_t * pos)
14038 +{
14039 +       pos_stop(pos);
14040 +       reiser4_blocknr_hint_done(&pos->preceder);
14041 +       if (convert_data(pos))
14042 +               free_convert_data(pos);
14043 +}
14044 +
14045 +/* Reset the point and parent.  Called during flush subroutines to terminate the
14046 +   squalloc loop. */
14047 +static int pos_stop(flush_pos_t * pos)
14048 +{
14049 +       pos->state = POS_INVALID;
14050 +       done_lh(&pos->lock);
14051 +       done_load_count(&pos->load);
14052 +       coord_init_invalid(&pos->coord, NULL);
14053 +
14054 +       if (pos->child) {
14055 +               jput(pos->child);
14056 +               pos->child = NULL;
14057 +       }
14058 +
14059 +       return 0;
14060 +}
14061 +
14062 +/* Return the flush_position's block allocator hint. */
14063 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14064 +{
14065 +       return &pos->preceder;
14066 +}
14067 +
14068 +flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14069 +{
14070 +       return pos->fq;
14071 +}
14072 +
14073 +/* Make Linus happy.
14074 +   Local variables:
14075 +   c-indentation-style: "K&R"
14076 +   mode-name: "LC"
14077 +   c-basic-offset: 8
14078 +   tab-width: 8
14079 +   fill-column: 90
14080 +   LocalWords:  preceder
14081 +   End:
14082 +*/
14083 diff -urN linux-2.6.27.orig/fs/reiser4/flush.h linux-2.6.27/fs/reiser4/flush.h
14084 --- linux-2.6.27.orig/fs/reiser4/flush.h        1970-01-01 03:00:00.000000000 +0300
14085 +++ linux-2.6.27/fs/reiser4/flush.h     2008-10-12 18:20:00.000000000 +0400
14086 @@ -0,0 +1,290 @@
14087 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14088 +
14089 +/* DECLARATIONS: */
14090 +
14091 +#if !defined(__REISER4_FLUSH_H__)
14092 +#define __REISER4_FLUSH_H__
14093 +
14094 +#include "plugin/cluster.h"
14095 +
14096 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14097 +   single level of the tree.  A flush-scan is used for counting the number of adjacent
14098 +   nodes to flush, which is used to determine whether we should relocate, and it is also
14099 +   used to find a starting point for flush.  A flush-scan object can scan in both right
14100 +   and left directions via the scan_left() and scan_right() interfaces.  The
14101 +   right- and left-variations are similar but perform different functions.  When scanning
14102 +   left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14103 +   When scanning right we are simply counting the number of adjacent, dirty nodes. */
14104 +struct flush_scan {
14105 +
14106 +       /* The current number of nodes scanned on this level. */
14107 +       unsigned count;
14108 +
14109 +       /* There may be a maximum number of nodes for a scan on any single level.  When
14110 +          going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14111 +       unsigned max_count;
14112 +
14113 +       /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14114 +       sideof direction;
14115 +
14116 +       /* Initially @stop is set to false then set true once some condition stops the
14117 +          search (e.g., we found a clean node before reaching max_count or we found a
14118 +          node belonging to another atom). */
14119 +       int stop;
14120 +
14121 +       /* The current scan position.  If @node is non-NULL then its reference count has
14122 +          been incremented to reflect this reference. */
14123 +       jnode *node;
14124 +
14125 +       /* A handle for zload/zrelse of current scan position node. */
14126 +       load_count node_load;
14127 +
14128 +       /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14129 +          node is locked using this lock handle.  The endpoint needs to be locked for
14130 +          transfer to the flush_position object after scanning finishes. */
14131 +       lock_handle node_lock;
14132 +
14133 +       /* When the position is unformatted, its parent, coordinate, and parent
14134 +          zload/zrelse handle. */
14135 +       lock_handle parent_lock;
14136 +       coord_t parent_coord;
14137 +       load_count parent_load;
14138 +
14139 +       /* The block allocator preceder hint.  Sometimes flush_scan determines what the
14140 +          preceder is and if so it sets it here, after which it is copied into the
14141 +          flush_position.  Otherwise, the preceder is computed later. */
14142 +       reiser4_block_nr preceder_blk;
14143 +};
14144 +
14145 +struct convert_item_info {
14146 +       dc_item_stat d_cur;     /* disk cluster state of the current item */
14147 +       dc_item_stat d_next;    /* disk cluster state of the next slum item */
14148 +       int cluster_shift;      /* disk cluster shift */
14149 +       flow_t flow;            /* disk cluster data */
14150 +};
14151 +
14152 +struct convert_info {
14153 +       int count;              /* for squalloc terminating */
14154 +       item_plugin *iplug;     /* current item plugin */
14155 +       struct convert_item_info *itm;  /* current item info */
14156 +       struct cluster_handle clust;    /* transform cluster */
14157 +};
14158 +
14159 +typedef enum flush_position_state {
14160 +       POS_INVALID,            /* Invalid or stopped pos, do not continue slum
14161 +                                * processing */
14162 +       POS_ON_LEAF,            /* pos points to already prepped, locked formatted node at
14163 +                                * leaf level */
14164 +       POS_ON_EPOINT,          /* pos keeps a lock on twig level, "coord" field is used
14165 +                                * to traverse unformatted nodes */
14166 +       POS_TO_LEAF,            /* pos is being moved to leaf level */
14167 +       POS_TO_TWIG,            /* pos is being moved to twig level */
14168 +       POS_END_OF_TWIG,        /* special case of POS_ON_EPOINT, when coord is after
14169 +                                * rightmost unit of the current twig */
14170 +       POS_ON_INTERNAL         /* same as POS_ON_LEAF, but points to internal node */
14171 +} flushpos_state_t;
14172 +
14173 +/* An encapsulation of the current flush point and all the parameters that are passed
14174 +   through the entire squeeze-and-allocate stage of the flush routine.  A single
14175 +   flush_position object is constructed after left- and right-scanning finishes. */
14176 +struct flush_position {
14177 +       flushpos_state_t state; /* one of the flushpos_state_t values */
14178 +
14179 +       coord_t coord;          /* coord to traverse unformatted nodes */
14180 +       lock_handle lock;       /* current lock we hold */
14181 +       load_count load;        /* load status for current locked formatted node  */
14182 +
14183 +       jnode *child;           /* for passing a reference to unformatted child
14184 +                                * across pos state changes */
14185 +
14186 +       reiser4_blocknr_hint preceder;  /* The flush 'hint' state. */
14187 +       int leaf_relocate;      /* True if enough leaf-level nodes were
14188 +                                * found to suggest a relocate policy. */
14189 +       int alloc_cnt;          /* The number of nodes allocated during squeeze and allocate. */
14190 +       int prep_or_free_cnt;   /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14191 +       flush_queue_t *fq;      /* flush queue; returned by reiser4_pos_fq() */
14192 +       long *nr_written;       /* number of nodes submitted to disk */
14193 +       int flags;              /* a copy of jnode_flush flags argument */
14194 +
14195 +       znode *prev_twig;       /* previous parent pointer value, used to catch
14196 +                                * processing of new twig node */
14197 +       struct convert_info *sq;        /* convert info */
14198 +
14199 +       unsigned long pos_in_unit;      /* for extents only. Position
14200 +                                          within an extent unit of first
14201 +                                          jnode of slum */
14202 +       long nr_to_write;       /* number of unformatted nodes to handle on flush */
14203 +};
14204 +
14205 +static inline int item_convert_count(flush_pos_t * pos)
14206 +{
14207 +       return pos->sq->count;
14208 +}
14209 +static inline void inc_item_convert_count(flush_pos_t * pos)
14210 +{
14211 +       pos->sq->count++;
14212 +}
14213 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14214 +{
14215 +       pos->sq->count = count;
14216 +}
14217 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14218 +{
14219 +       return pos->sq->iplug;
14220 +}
14221 +
14222 +static inline struct convert_info *convert_data(flush_pos_t * pos)
14223 +{
14224 +       return pos->sq;
14225 +}
14226 +
14227 +static inline struct convert_item_info *item_convert_data(flush_pos_t * pos)
14228 +{
14229 +       assert("edward-955", convert_data(pos));
14230 +       return pos->sq->itm;
14231 +}
14232 +
14233 +static inline struct tfm_cluster * tfm_cluster_sq(flush_pos_t * pos)
14234 +{
14235 +       return &pos->sq->clust.tc;
14236 +}
14237 +
14238 +static inline struct tfm_stream * tfm_stream_sq(flush_pos_t * pos,
14239 +                                               tfm_stream_id id)
14240 +{
14241 +       assert("edward-854", pos->sq != NULL);
14242 +       return get_tfm_stream(tfm_cluster_sq(pos), id);
14243 +}
14244 +
14245 +static inline int chaining_data_present(flush_pos_t * pos)
14246 +{
14247 +       return convert_data(pos) && item_convert_data(pos);
14248 +}
14249 +
14250 +/* Returns true if next node contains next item of the disk cluster
14251 +   so item convert data should be moved to the right slum neighbor.
14252 +*/
14253 +static inline int should_chain_next_node(flush_pos_t * pos)
14254 +{
14255 +       int result = 0;
14256 +
14257 +       assert("edward-1007", chaining_data_present(pos));
14258 +
14259 +       switch (item_convert_data(pos)->d_next) {
14260 +       case DC_CHAINED_ITEM:
14261 +               result = 1;
14262 +               break;
14263 +       case DC_AFTER_CLUSTER:
14264 +               break;
14265 +       default:
14266 +               impossible("edward-1009", "bad state of next slum item");
14267 +       }
14268 +       return result;
14269 +}
14270 +
14271 +/* update item state in a disk cluster to assign conversion mode */
14272 +static inline void
14273 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14274 +{
14275 +
14276 +       assert("edward-1010", chaining_data_present(pos));
14277 +
14278 +       if (this_node == 0) {
14279 +               /* next item is on the right neighbor */
14280 +               assert("edward-1011",
14281 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14282 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14283 +               assert("edward-1012",
14284 +                      item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14285 +
14286 +               item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14287 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14288 +       } else {
14289 +               /* next item is on the same node */
14290 +               assert("edward-1013",
14291 +                      item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14292 +                      item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14293 +               assert("edward-1227",
14294 +                      item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14295 +                      item_convert_data(pos)->d_next == DC_INVALID_STATE);
14296 +
14297 +               item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14298 +               item_convert_data(pos)->d_next = DC_INVALID_STATE;
14299 +       }
14300 +}
14301 +
14302 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14303 +{
14304 +       return znode_convertible(node);
14305 +}
14306 +
14307 +/* true if there is attached convert item info */
14308 +static inline int should_convert_next_node(flush_pos_t * pos)
14309 +{
14310 +       return convert_data(pos) && item_convert_data(pos);
14311 +}
14312 +
14313 +#define SQUALLOC_THRESHOLD 256
14314 +
14315 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14316 +{
14317 +       return convert_data(pos) &&
14318 +           !item_convert_data(pos) &&
14319 +           item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14320 +}
14321 +
14322 +#if 1
14323 +#define check_convert_info(pos)                                                \
14324 +do {                                                                   \
14325 +       if (unlikely(should_convert_next_node(pos))){                   \
14326 +               warning("edward-1006", "unprocessed chained data");     \
14327 +               printk("d_cur = %d, d_next = %d, flow.len = %llu\n",    \
14328 +                      item_convert_data(pos)->d_cur,                   \
14329 +                      item_convert_data(pos)->d_next,                  \
14330 +                      item_convert_data(pos)->flow.length);            \
14331 +       }                                                               \
14332 +} while (0)
14333 +#else
14334 +#define check_convert_info(pos)
14335 +#endif /* 1 (check is always compiled in, not tied to REISER4_DEBUG) */
14336 +
14337 +void free_convert_data(flush_pos_t * pos);
14338 +/* used in extent.c */
14339 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14340 +                    const coord_t * parent);
14341 +int reiser4_scan_finished(flush_scan * scan);
14342 +int reiser4_scanning_left(flush_scan * scan);
14343 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14344 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14345 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14346 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14347 +                              reiser4_key *stop_key);
14348 +extern int reiser4_init_fqs(void);
14349 +extern void reiser4_done_fqs(void);
14350 +
14351 +#if REISER4_DEBUG
14352 +
14353 +extern void reiser4_check_fq(const txn_atom *atom);
14354 +extern atomic_t flush_cnt;
14355 +
14356 +#define check_preceder(blk) /* statement-like: caller supplies the ';' */ \
14357 +assert("nikita-2588", (blk) < reiser4_block_count(reiser4_get_current_sb()))
14358 +extern void check_pos(flush_pos_t * pos);
14359 +#else
14360 +#define check_preceder(b) noop
14361 +#define check_pos(pos) noop
14362 +#endif
14363 +
14364 +/* __REISER4_FLUSH_H__ */
14365 +#endif
14366 +
14367 +/* Make Linus happy.
14368 +   Local variables:
14369 +   c-indentation-style: "K&R"
14370 +   mode-name: "LC"
14371 +   c-basic-offset: 8
14372 +   tab-width: 8
14373 +   fill-column: 90
14374 +   LocalWords:  preceder
14375 +   End:
14376 +*/
14377 diff -urN linux-2.6.27.orig/fs/reiser4/flush_queue.c linux-2.6.27/fs/reiser4/flush_queue.c
14378 --- linux-2.6.27.orig/fs/reiser4/flush_queue.c  1970-01-01 03:00:00.000000000 +0300
14379 +++ linux-2.6.27/fs/reiser4/flush_queue.c       2008-10-12 18:20:00.000000000 +0400
14380 @@ -0,0 +1,674 @@
14381 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14382 +
14383 +#include "debug.h"
14384 +#include "super.h"
14385 +#include "txnmgr.h"
14386 +#include "jnode.h"
14387 +#include "znode.h"
14388 +#include "page_cache.h"
14389 +#include "wander.h"
14390 +#include "vfs_ops.h"
14391 +#include "writeout.h"
14392 +#include "flush.h"
14393 +
14394 +#include <linux/bio.h>
14395 +#include <linux/mm.h>
14396 +#include <linux/pagemap.h>
14397 +#include <linux/blkdev.h>
14398 +#include <linux/writeback.h>
14399 +
14400 +/* A flush queue object is an accumulator for keeping jnodes prepared
14401 +   by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14402 +   kept on the flush queue until memory pressure or atom commit asks
14403 +   flush queues to write some or all of their jnodes. */
14404 +
14405 +/*
14406 +   LOCKING:
14407 +
14408 +   fq->guard spin lock protects fq->atom pointer and nothing else.  fq->prepped
14409 +   list protected by atom spin lock.  fq->prepped list uses the following
14410 +   locking:
14411 +
14412 +   two ways to protect fq->prepped list for read-only list traversal:
14413 +
14414 +   1. the atom is spin-locked.
14415 +   2. fq is IN_USE, atom->nr_running_queues increased.
14416 +
14417 +   and one for list modification:
14418 +
14419 +   1. atom is spin-locked and one condition is true: fq is IN_USE or
14420 +      atom->nr_running_queues == 0.
14421 +
14422 +   The deadlock-safe order for flush queues and atoms is: first lock atom, then
14423 +   lock flush queue, then lock jnode.
14424 +*/
14425 +
14426 +#define fq_in_use(fq)          ((fq)->state & FQ_IN_USE)
14427 +#define fq_ready(fq)           (!fq_in_use(fq))
14428 +
14429 +#define mark_fq_in_use(fq)     do { (fq)->state |= FQ_IN_USE;    } while (0)
14430 +#define mark_fq_ready(fq)      do { (fq)->state &= ~FQ_IN_USE;   } while (0)
14431 +
14432 +/* get lock on atom from locked flush queue object */
14433 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14434 +{
14435 +       /* This code is similar to jnode_get_atom(), look at it for the
14436 +        * explanation. */
14437 +       txn_atom *atom;
14438 +
14439 +       assert_spin_locked(&(fq->guard));
14440 +
14441 +       while (1) {
14442 +               atom = fq->atom;
14443 +               if (atom == NULL)
14444 +                       break;
14445 +
14446 +               if (spin_trylock_atom(atom))
14447 +                       break;
14448 +
14449 +               atomic_inc(&atom->refcount);
14450 +               spin_unlock(&(fq->guard));
14451 +               spin_lock_atom(atom);
14452 +               spin_lock(&(fq->guard));
14453 +
14454 +               if (fq->atom == atom) {
14455 +                       atomic_dec(&atom->refcount);
14456 +                       break;
14457 +               }
14458 +
14459 +               spin_unlock(&(fq->guard));
14460 +               atom_dec_and_unlock(atom);
14461 +               spin_lock(&(fq->guard));
14462 +       }
14463 +
14464 +       return atom;
14465 +}
14466 +
14467 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14468 +{
14469 +       txn_atom *atom;
14470 +
14471 +       spin_lock(&(fq->guard));
14472 +       atom = atom_locked_by_fq_nolock(fq);
14473 +       spin_unlock(&(fq->guard));
14474 +       return atom;
14475 +}
14476 +
14477 +static void init_fq(flush_queue_t * fq)
14478 +{
14479 +       memset(fq, 0, sizeof *fq);
14480 +
14481 +       atomic_set(&fq->nr_submitted, 0);
14482 +
14483 +       INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14484 +
14485 +       init_waitqueue_head(&fq->wait);
14486 +       spin_lock_init(&fq->guard);
14487 +}
14488 +
14489 +/* slab for flush queues */
14490 +static struct kmem_cache *fq_slab;
14491 +
14492 +/**
14493 + * reiser4_init_fqs - create flush queue cache
14494 + *
14495 + * Initializes slab cache of flush queues. It is part of reiser4 module
14496 + * initialization.
14497 + */
14498 +int reiser4_init_fqs(void)
14499 +{
14500 +       fq_slab = kmem_cache_create("fq",
14501 +                                   sizeof(flush_queue_t),
14502 +                                   0, SLAB_HWCACHE_ALIGN, NULL);
14503 +       if (fq_slab == NULL)
14504 +               return RETERR(-ENOMEM);
14505 +       return 0;
14506 +}
14507 +
14508 +/**
14509 + * reiser4_done_fqs - delete flush queue cache
14510 + *
14511 + * This is called on reiser4 module unloading or system shutdown.
14512 + */
14513 +void reiser4_done_fqs(void)
14514 +{
14515 +       destroy_reiser4_cache(&fq_slab);
14516 +}
14517 +
14518 +/* create new flush queue object */
14519 +static flush_queue_t *create_fq(gfp_t gfp)
14520 +{
14521 +       flush_queue_t *fq;
14522 +
14523 +       fq = kmem_cache_alloc(fq_slab, gfp);
14524 +       if (fq)
14525 +               init_fq(fq);
14526 +
14527 +       return fq;
14528 +}
14529 +
14530 +/* adjust atom's and flush queue's counters of queued nodes */
14531 +static void count_enqueued_node(flush_queue_t * fq)
14532 +{
14533 +       ON_DEBUG(fq->atom->num_queued++);
14534 +}
14535 +
14536 +static void count_dequeued_node(flush_queue_t * fq)
14537 +{
14538 +       assert("zam-993", fq->atom->num_queued > 0);
14539 +       ON_DEBUG(fq->atom->num_queued--);
14540 +}
14541 +
14542 +/* attach flush queue object to the atom */
14543 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
14544 +{
14545 +       assert_spin_locked(&(atom->alock));
14546 +       list_add(&fq->alink, &atom->flush_queues);
14547 +       fq->atom = atom;
14548 +       ON_DEBUG(atom->nr_flush_queues++);
14549 +}
14550 +
14551 +static void detach_fq(flush_queue_t * fq)
14552 +{
14553 +       assert_spin_locked(&(fq->atom->alock));
14554 +
14555 +       spin_lock(&(fq->guard));
14556 +       list_del_init(&fq->alink);
14557 +       assert("vs-1456", fq->atom->nr_flush_queues > 0);
14558 +       ON_DEBUG(fq->atom->nr_flush_queues--);
14559 +       fq->atom = NULL;
14560 +       spin_unlock(&(fq->guard));
14561 +}
14562 +
14563 +/* destroy flush queue object */
14564 +static void done_fq(flush_queue_t * fq)
14565 +{
14566 +       assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
14567 +       assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
14568 +
14569 +       kmem_cache_free(fq_slab, fq);
14570 +}
14571 +
14572 +/* set JNODE_FLUSH_QUEUED on @node and count it as enqueued via count_enqueued_node(@fq) */
14573 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
14574 +{
14575 +       JF_SET(node, JNODE_FLUSH_QUEUED);
14576 +       count_enqueued_node(fq);
14577 +}
14578 +
14579 +/* Putting jnode into the flush queue. Both atom and jnode should be
14580 +   spin-locked. */
14581 +void queue_jnode(flush_queue_t * fq, jnode * node)
14582 +{
14583 +       assert_spin_locked(&(node->guard));
14584 +       assert("zam-713", node->atom != NULL);
14585 +       assert_spin_locked(&(node->atom->alock));
14586 +       assert("zam-716", fq->atom != NULL);
14587 +       assert("zam-717", fq->atom == node->atom);
14588 +       assert("zam-907", fq_in_use(fq));
14589 +
14590 +       assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
14591 +       assert("zam-826", JF_ISSET(node, JNODE_RELOC));
14592 +       assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
14593 +       assert("vs-1481", NODE_LIST(node) != FQ_LIST);
14594 +
14595 +       mark_jnode_queued(fq, node);
14596 +       list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
14597 +
14598 +       ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
14599 +                            FQ_LIST, 1));
14600 +}
14601 +
14602 +/* repeatable process for waiting io completion on a flush queue object */
14603 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
14604 +{
14605 +       assert("zam-738", fq->atom != NULL);
14606 +       assert_spin_locked(&(fq->atom->alock));
14607 +       assert("zam-736", fq_in_use(fq));
14608 +       assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
14609 +
14610 +       if (atomic_read(&fq->nr_submitted) != 0) {
14611 +               struct super_block *super;
14612 +
14613 +               spin_unlock_atom(fq->atom);
14614 +
14615 +               assert("nikita-3013", reiser4_schedulable());
14616 +
14617 +               super = reiser4_get_current_sb();
14618 +
14619 +               /* FIXME: this is instead of blk_run_queues() */
14620 +               blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
14621 +
14622 +               if (!(super->s_flags & MS_RDONLY))
14623 +                       wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
14624 +
14625 +               /* Ask the caller to re-acquire the locks and call this
14626 +                  function again. Note: this technique is commonly used in
14627 +                  the txnmgr code. */
14628 +               return -E_REPEAT;
14629 +       }
14630 +
14631 +       *nr_io_errors += atomic_read(&fq->nr_errors);
14632 +       return 0;
14633 +}
14634 +
14635 +/* wait on I/O completion, re-submit dirty nodes to write */
14636 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
14637 +{
14638 +       int ret;
14639 +       txn_atom *atom = fq->atom;
14640 +
14641 +       assert("zam-801", atom != NULL);
14642 +       assert_spin_locked(&(atom->alock));
14643 +       assert("zam-762", fq_in_use(fq));
14644 +
14645 +       ret = wait_io(fq, nr_io_errors);
14646 +       if (ret)
14647 +               return ret;
14648 +
14649 +       detach_fq(fq);
14650 +       done_fq(fq);
14651 +
14652 +       reiser4_atom_send_event(atom);
14653 +
14654 +       return 0;
14655 +}
14656 +
14657 +/* wait for all i/o for given atom to be completed; actually do one iteration
14658 +   of that and return -E_REPEAT if more iterations are needed */
14659 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
14660 +{
14661 +       flush_queue_t *fq;
14662 +
14663 +       assert_spin_locked(&(atom->alock));
14664 +
14665 +       if (list_empty_careful(&atom->flush_queues))
14666 +               return 0;
14667 +
14668 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
14669 +               if (fq_ready(fq)) {
14670 +                       int ret;
14671 +
14672 +                       mark_fq_in_use(fq);
14673 +                       assert("vs-1247", fq->owner == NULL);
14674 +                       ON_DEBUG(fq->owner = current);
14675 +                       ret = finish_fq(fq, nr_io_errors);
14676 +
14677 +                       if (*nr_io_errors)
14678 +                               reiser4_handle_error();
14679 +
14680 +                       if (ret) {
14681 +                               reiser4_fq_put(fq);
14682 +                               return ret;
14683 +                       }
14684 +
14685 +                       spin_unlock_atom(atom);
14686 +
14687 +                       return -E_REPEAT;
14688 +               }
14689 +       }
14690 +
14691 +       /* All flush queues are in use; atom remains locked */
14692 +       return -EBUSY;
14693 +}
14694 +
14695 +/* wait for all i/o of current atom; returns -EIO if i/o errors were counted */
14696 +int current_atom_finish_all_fq(void)
14697 +{
14698 +       txn_atom *atom;
14699 +       int nr_io_errors = 0;
14700 +       int ret = 0;
14701 +
14702 +       do {
14703 +               while (1) {
14704 +                       atom = get_current_atom_locked();
14705 +                       ret = finish_all_fq(atom, &nr_io_errors);
14706 +                       if (ret != -EBUSY)
14707 +                               break;
14708 +                       reiser4_atom_wait_event(atom);
14709 +               }
14710 +       } while (ret == -E_REPEAT);
14711 +
14712 +       /* we do not need locked atom after this function finishes, SUCCESS or
14713 +          -EBUSY are two return codes when atom remains locked after
14714 +          finish_all_fq */
14715 +       if (!ret)
14716 +               spin_unlock_atom(atom);
14717 +
14718 +       assert_spin_not_locked(&(atom->alock));
14719 +
14720 +       if (ret)
14721 +               return ret;
14722 +
14723 +       if (nr_io_errors)
14724 +               return RETERR(-EIO);
14725 +
14726 +       return 0;
14727 +}
14728 +
14729 +/* set ->atom of each jnode on @list to @atom, under that jnode's spin lock */
14730 +static void
14731 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
14732 +{
14733 +       jnode *cur;
14734 +
14735 +       list_for_each_entry(cur, list, capture_link) {
14736 +               spin_lock_jnode(cur);
14737 +               cur->atom = atom;
14738 +               spin_unlock_jnode(cur);
14739 +       }
14740 +}
14741 +
14742 +/* atom fusion: move flush queues of @from, and their jnodes, over to @to */
14743 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
14744 +{
14745 +       flush_queue_t *fq;
14746 +
14747 +       assert_spin_locked(&(to->alock));
14748 +       assert_spin_locked(&(from->alock));
14749 +
14750 +       list_for_each_entry(fq, &from->flush_queues, alink) {
14751 +               scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
14752 +               spin_lock(&(fq->guard));
14753 +               fq->atom = to;
14754 +               spin_unlock(&(fq->guard));
14755 +       }
14756 +
14757 +       list_splice_init(&from->flush_queues, to->flush_queues.prev);
14758 +
14759 +#if REISER4_DEBUG
14760 +       to->num_queued += from->num_queued;
14761 +       to->nr_flush_queues += from->nr_flush_queues;
14762 +       from->nr_flush_queues = 0;
14763 +#endif
14764 +}
14765 +
14766 +#if REISER4_DEBUG
14767 +int atom_fq_parts_are_clean(txn_atom * atom)
14768 +{      /* debug helper: non-zero iff @atom has no flush queues attached */
14769 +       assert("zam-915", atom != NULL);
14770 +       return list_empty_careful(&atom->flush_queues);
14771 +}
14772 +#endif /* REISER4_DEBUG */
14773 +/* Bio i/o completion routine for reiser4 write operations. */
14774 +static void
14775 +end_io_handler(struct bio *bio, int err)
14776 +{
14777 +       int i;
14778 +       int nr_errors = 0;
14779 +       flush_queue_t *fq;
14780 +
14781 +       assert("zam-958", bio->bi_rw & WRITE);
14782 +
14783 +       if (err == -EOPNOTSUPP)
14784 +               set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
14785 +
14786 +       /* we expect that bio->bi_private is set to NULL or fq object which is
14787 +        * used for synchronization and error counting. */
14788 +       fq = bio->bi_private;
14789 +       /* Finish writeback of each page; on failed bio every page gets an error. */
14790 +       for (i = 0; i < bio->bi_vcnt; i += 1) {
14791 +               struct page *pg = bio->bi_io_vec[i].bv_page;
14792 +
14793 +               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
14794 +                       SetPageError(pg);
14795 +                       nr_errors++;
14796 +               }
14797 +
14798 +               {
14799 +                       /* jnode WRITEBACK ("write is in progress bit") is
14800 +                        * atomically cleared here. */
14801 +                       jnode *node;
14802 +
14803 +                       assert("zam-736", pg != NULL);
14804 +                       assert("zam-736", PagePrivate(pg));
14805 +                       node = jprivate(pg);
14806 +
14807 +                       JF_CLR(node, JNODE_WRITEBACK);
14808 +               }
14809 +
14810 +               end_page_writeback(pg);
14811 +               page_cache_release(pg);
14812 +       }
14813 +
14814 +       if (fq) {
14815 +               /* count i/o errors in fq object */
14816 +               atomic_add(nr_errors, &fq->nr_errors);
14817 +
14818 +               /* If all write requests registered in this "fq" are done we
14819 +                * wake up the waiter. */
14820 +               if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
14821 +                       wake_up(&fq->wait);
14822 +       }
14823 +
14824 +       bio_put(bio);
14825 +}
14826 +
14827 +/* Make @fq (may be NULL) the completion context of @bio, install
14828 +   end_io_handler, and add @bio's bi_vcnt to @fq->nr_submitted */
14829 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
14830 +{
14831 +       bio->bi_private = fq;
14832 +       bio->bi_end_io = end_io_handler;
14833 +
14834 +       if (fq)
14835 +               atomic_add(bio->bi_vcnt, &fq->nr_submitted);
14836 +}
14837 +
14838 +/* Move all nodes on @fq's list back to the atom's dirty or clean lists. */
14839 +static void release_prepped_list(flush_queue_t * fq)
14840 +{
14841 +       txn_atom *atom;
14842 +
14843 +       assert("zam-904", fq_in_use(fq));
14844 +       atom = atom_locked_by_fq(fq);
14845 +
14846 +       while (!list_empty(ATOM_FQ_LIST(fq))) {
14847 +               jnode *cur;
14848 +
14849 +               cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
14850 +               list_del_init(&cur->capture_link);
14851 +
14852 +               count_dequeued_node(fq);
14853 +               spin_lock_jnode(cur);
14854 +               assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
14855 +               assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
14856 +               assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
14857 +               JF_CLR(cur, JNODE_FLUSH_QUEUED);
14858 +
14859 +               if (JF_ISSET(cur, JNODE_DIRTY)) {
14860 +                       list_add_tail(&cur->capture_link,
14861 +                                     ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
14862 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14863 +                                            DIRTY_LIST, 1));
14864 +               } else {
14865 +                       list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
14866 +                       ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
14867 +                                            CLEAN_LIST, 1));
14868 +               }
14869 +
14870 +               spin_unlock_jnode(cur);
14871 +       }
14872 +
14873 +       if (--atom->nr_running_queues == 0)
14874 +               reiser4_atom_send_event(atom);
14875 +
14876 +       spin_unlock_atom(atom); /* pairs with atom_locked_by_fq() */
14877 +}
14878 +
14879 +/* Submit write requests for nodes on the already filled flush queue @fq.
14880 +
14881 +   @fq: flush queue object which contains jnodes we can (and will) write.
14882 +   @nr_submitted, @flags: handed to write_jnode_list() unchanged.
14883 +   @return: the value returned by write_jnode_list(). */
14884 +int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
14885 +{
14886 +       int ret;
14887 +       txn_atom *atom;
14888 +
14889 +       while (1) {
14890 +               atom = atom_locked_by_fq(fq);
14891 +               assert("zam-924", atom);
14892 +               /* with WRITEOUT_SINGLE_STREAM, do not write fq in parallel. */
14893 +               if (atom->nr_running_queues == 0
14894 +                   || !(flags & WRITEOUT_SINGLE_STREAM))
14895 +                       break;
14896 +               reiser4_atom_wait_event(atom);
14897 +       }
14898 +
14899 +       atom->nr_running_queues++;
14900 +       spin_unlock_atom(atom);
14901 +
14902 +       ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
14903 +       release_prepped_list(fq);
14904 +
14905 +       return ret;
14906 +}
14907 +
14908 +/* Getting flush queue object for exclusive use by one thread. May require
14909 +   several iterations which is indicated by -E_REPEAT return code.
14910 +
14911 +   This function does not contain code for obtaining an atom lock because an
14912 +   atom lock is obtained by different ways in different parts of reiser4,
14913 +   usually it is current atom, but we need a possibility for getting fq for the
14914 +   atom of given jnode. */
14915 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
14916 +{
14917 +       flush_queue_t *fq;
14918 +
14919 +       assert_spin_locked(&(atom->alock));
14920 +
14921 +       fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
14922 +       while (&atom->flush_queues != &fq->alink) {
14923 +               spin_lock(&(fq->guard));
14924 +
14925 +               if (fq_ready(fq)) {
14926 +                       mark_fq_in_use(fq);
14927 +                       assert("vs-1246", fq->owner == NULL);
14928 +                       ON_DEBUG(fq->owner = current);
14929 +                       spin_unlock(&(fq->guard));
14930 +
14931 +                       if (*new_fq)
14932 +                               done_fq(*new_fq);       /* spare fq is not needed */
14933 +
14934 +                       *new_fq = fq;
14935 +
14936 +                       return 0;
14937 +               }
14938 +
14939 +               spin_unlock(&(fq->guard));
14940 +
14941 +               fq = list_entry(fq->alink.next, flush_queue_t, alink);
14942 +       }
14943 +
14944 +       /* Use previously allocated fq object */
14945 +       if (*new_fq) {
14946 +               mark_fq_in_use(*new_fq);
14947 +               assert("vs-1248", (*new_fq)->owner == 0);
14948 +               ON_DEBUG((*new_fq)->owner = current);
14949 +               attach_fq(atom, *new_fq);
14950 +
14951 +               return 0;
14952 +       }
14953 +
14954 +       spin_unlock_atom(atom);
14955 +
14956 +       *new_fq = create_fq(gfp);
14957 +
14958 +       if (*new_fq == NULL)
14959 +               return RETERR(-ENOMEM);
14960 +
14961 +       return RETERR(-E_REPEAT);       /* atom was unlocked; caller retries */
14962 +}
14963 +
14964 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
14965 +{      /* gfp mask is taken from reiser4_ctx_gfp_mask_get() */
14966 +       return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
14967 +}
14968 +
14969 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue object for
14970 +   the current atom. On success fq->atom remains locked; else ERR_PTR(). */
14971 +flush_queue_t *get_fq_for_current_atom(void)
14972 +{
14973 +       flush_queue_t *fq = NULL;
14974 +       txn_atom *atom;
14975 +       int ret;
14976 +
14977 +       do {
14978 +               atom = get_current_atom_locked();
14979 +               ret = reiser4_fq_by_atom(atom, &fq);
14980 +       } while (ret == -E_REPEAT);
14981 +
14982 +       if (ret)
14983 +               return ERR_PTR(ret);
14984 +       return fq;
14985 +}
14986 +
14987 +/* Release @fq after exclusive use (mark it ready); its node list must be empty */
14988 +void reiser4_fq_put_nolock(flush_queue_t *fq)
14989 +{
14990 +       assert("zam-747", fq->atom != NULL);
14991 +       assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
14992 +       mark_fq_ready(fq);
14993 +       assert("vs-1245", fq->owner == current);
14994 +       ON_DEBUG(fq->owner = NULL);
14995 +}
14996 +
14997 +void reiser4_fq_put(flush_queue_t * fq)
14998 +{      /* reiser4_fq_put_nolock() done under fq->guard and the atom lock */
14999 +       txn_atom *atom;
15000 +
15001 +       spin_lock(&(fq->guard));
15002 +       atom = atom_locked_by_fq_nolock(fq);
15003 +
15004 +       assert("zam-746", atom != NULL);
15005 +
15006 +       reiser4_fq_put_nolock(fq);
15007 +       reiser4_atom_send_event(atom);
15008 +
15009 +       spin_unlock(&(fq->guard));
15010 +       spin_unlock_atom(atom);
15011 +}
15012 +
15013 +/* Part of atom object initialization: set up the (initially empty) list
15014 +   head of the atom's flush queues */
15015 +
15016 +void init_atom_fq_parts(txn_atom *atom)
15017 +{
15018 +       INIT_LIST_HEAD(&atom->flush_queues);
15019 +}
15020 +
15021 +#if REISER4_DEBUG
15022 +
15023 +void reiser4_check_fq(const txn_atom *atom)
15024 +{
15025 +       /* count nodes on all of @atom's flush queues; warn if != atom->fq */
15026 +       flush_queue_t *fq;
15027 +       int count;
15028 +       struct list_head *pos;
15029 +
15030 +       count = 0;
15031 +       list_for_each_entry(fq, &atom->flush_queues, alink) {
15032 +               spin_lock(&(fq->guard));
15033 +               /* calculate number of jnodes on fq's list of prepped jnodes */
15034 +               list_for_each(pos, ATOM_FQ_LIST(fq))
15035 +                       count++;
15036 +               spin_unlock(&(fq->guard));
15037 +       }
15038 +       if (count != atom->fq)
15039 +               warning("", "fq counter %d, real %d\n", atom->fq, count);
15040 +
15041 +}
15042 +
15043 +#endif /* REISER4_DEBUG */
15044 +
15045 +/*
15046 + * Local variables:
15047 + * c-indentation-style: "K&R"
15048 + * mode-name: "LC"
15049 + * c-basic-offset: 8
15050 + * tab-width: 8
15051 + * fill-column: 79
15052 + * scroll-step: 1
15053 + * End:
15054 + */
15055 diff -urN linux-2.6.27.orig/fs/reiser4/forward.h linux-2.6.27/fs/reiser4/forward.h
15056 --- linux-2.6.27.orig/fs/reiser4/forward.h      1970-01-01 03:00:00.000000000 +0300
15057 +++ linux-2.6.27/fs/reiser4/forward.h   2008-10-12 18:20:00.000000000 +0400
15058 @@ -0,0 +1,252 @@
15059 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15060 +
15061 +/* Forward declarations. Thank you Kernighan. */
15062 +
15063 +#if !defined( __REISER4_FORWARD_H__ )
15064 +#define __REISER4_FORWARD_H__
15065 +
15066 +#include <asm/errno.h>
15067 +#include <linux/types.h>
15068 +
15069 +typedef struct zlock zlock;
15070 +typedef struct lock_stack lock_stack;
15071 +typedef struct lock_handle lock_handle;
15072 +typedef struct znode znode;
15073 +typedef struct flow flow_t;
15074 +typedef struct coord coord_t;
15075 +typedef struct tree_access_pointer tap_t;
15076 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15077 +typedef union reiser4_plugin reiser4_plugin;
15078 +typedef __u16 reiser4_plugin_id;
15079 +typedef __u64 reiser4_plugin_groups;
15080 +typedef struct item_plugin item_plugin;
15081 +typedef struct jnode_plugin jnode_plugin;
15082 +typedef struct reiser4_item_data reiser4_item_data;
15083 +typedef union reiser4_key reiser4_key;
15084 +typedef struct reiser4_tree reiser4_tree;
15085 +typedef struct carry_cut_data carry_cut_data;
15086 +typedef struct carry_kill_data carry_kill_data;
15087 +typedef struct carry_tree_op carry_tree_op;
15088 +typedef struct carry_tree_node carry_tree_node;
15089 +typedef struct carry_plugin_info carry_plugin_info;
15090 +typedef struct reiser4_journal reiser4_journal;
15091 +typedef struct txn_atom txn_atom;
15092 +typedef struct txn_handle txn_handle;
15093 +typedef struct txn_mgr txn_mgr;
15094 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15095 +typedef struct reiser4_context reiser4_context;
15096 +typedef struct carry_level carry_level;
15097 +typedef struct blocknr_set_entry blocknr_set_entry;
15098 +/* super_block->s_fs_info points to this */
15099 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15100 +/* next two objects are fields of reiser4_super_info_data */
15101 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15102 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15103 +
15104 +typedef struct flush_scan flush_scan;
15105 +typedef struct flush_position flush_pos_t;
15106 +
15107 +typedef unsigned short pos_in_node_t;
15108 +#define MAX_POS_IN_NODE 65535
15109 +
15110 +typedef struct jnode jnode;
15111 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15112 +
15113 +typedef struct uf_coord uf_coord_t;
15114 +typedef struct hint hint_t;
15115 +
15116 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15117 +
15118 +struct inode;
15119 +struct page;
15120 +struct file;
15121 +struct dentry;
15122 +struct super_block;
15123 +
15124 +/* return values of coord_by_key(). cbk == coord_by_key */
15125 +typedef enum {
15126 +       CBK_COORD_FOUND = 0,
15127 +       CBK_COORD_NOTFOUND = -ENOENT,
15128 +} lookup_result;
15129 +
15130 +/* results of lookup with directory file */
15131 +typedef enum {
15132 +       FILE_NAME_FOUND = 0,
15133 +       FILE_NAME_NOTFOUND = -ENOENT,
15134 +       FILE_IO_ERROR = -EIO,   /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15135 +       FILE_OOM = -ENOMEM      /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15136 +} file_lookup_result;
15137 +
15138 +/* behaviors of lookup. If coord we are looking for is actually in a tree,
15139 +    both coincide. */
15140 +typedef enum {
15141 +       /* search exactly for the coord with key given */
15142 +       FIND_EXACT,
15143 +       /* search for coord with the maximal key not greater than one
15144 +          given */
15145 +       FIND_MAX_NOT_MORE_THAN  /*LEFT_SLANT_BIAS */
15146 +} lookup_bias;
15147 +
15148 +typedef enum {
15149 +       /* number of leaf level of the tree
15150 +          The fake root has (tree_level=0). */
15151 +       LEAF_LEVEL = 1,
15152 +
15153 +       /* number of level one above leaf level of the tree.
15154 +
15155 +          It is supposed that internal tree used by reiser4 to store file
15156 +          system data and meta data will have height 2 initially (when
15157 +          created by mkfs).
15158 +        */
15159 +       TWIG_LEVEL = 2,
15160 +} tree_level;
15161 +
15162 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15163 +   array, since the zero'th level is not used. */
15164 +#define REAL_MAX_ZTREE_HEIGHT     (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15165 +
15166 +/* enumeration of possible mutual position of item and coord.  This enum is
15167 +    return type of ->is_in_item() item plugin method which see. */
15168 +typedef enum {
15169 +       /* coord is on the left of an item */
15170 +       IP_ON_THE_LEFT,
15171 +       /* coord is inside item */
15172 +       IP_INSIDE,
15173 +       /* coord is inside item, but to the right of the rightmost unit of
15174 +          this item */
15175 +       IP_RIGHT_EDGE,
15176 +       /* coord is on the right of an item */
15177 +       IP_ON_THE_RIGHT
15178 +} interposition;
15179 +
15180 +/* type of lock to acquire on znode before returning it to caller */
15181 +typedef enum {
15182 +       ZNODE_NO_LOCK = 0,
15183 +       ZNODE_READ_LOCK = 1,
15184 +       ZNODE_WRITE_LOCK = 2,
15185 +} znode_lock_mode;
15186 +
15187 +/* type of lock request */
15188 +typedef enum {
15189 +       ZNODE_LOCK_LOPRI = 0,
15190 +       ZNODE_LOCK_HIPRI = (1 << 0),
15191 +
15192 +       /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15193 +          waiting for the lock to become available.  If the lock is unavailable, reiser4_znode_lock will immediately
15194 +          return the value -E_REPEAT. */
15195 +       ZNODE_LOCK_NONBLOCK = (1 << 1),
15196 +       /* An option for longterm_lock_znode which prevents atom fusion */
15197 +       ZNODE_LOCK_DONT_FUSE = (1 << 2)
15198 +} znode_lock_request;
15199 +
15200 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15201 +
15202 +/* used to specify direction of shift. These must be 1 (left) and -1 (right) */
15203 +typedef enum {
15204 +       SHIFT_LEFT = 1,
15205 +       SHIFT_RIGHT = -1
15206 +} shift_direction;
15207 +
15208 +typedef enum {
15209 +       LEFT_SIDE,
15210 +       RIGHT_SIDE
15211 +} sideof;
15212 +
15213 +#define round_up( value, order )                                               \
15214 +       ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) &        \
15215 +                            ~( ( order ) - 1 ) ) )
15216 +
15217 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15218 +typedef enum {
15219 +       /* unit of internal item is moved */
15220 +       SUBTREE_MOVED = 0,
15221 +       /* nothing else can be squeezed into left neighbor */
15222 +       SQUEEZE_TARGET_FULL = 1,
15223 +       /* all content of node is squeezed into its left neighbor */
15224 +       SQUEEZE_SOURCE_EMPTY = 2,
15225 +       /* one more item is copied (this is only returned by
15226 +          allocate_and_copy_extent to squalloc_twig) */
15227 +       SQUEEZE_CONTINUE = 3
15228 +} squeeze_result;
15229 +
15230 +/* Do not change items ids. If you do - there will be format change */
15231 +typedef enum {
15232 +       STATIC_STAT_DATA_ID = 0x0,
15233 +       SIMPLE_DIR_ENTRY_ID = 0x1,
15234 +       COMPOUND_DIR_ID = 0x2,
15235 +       NODE_POINTER_ID = 0x3,
15236 +       EXTENT_POINTER_ID = 0x5,
15237 +       FORMATTING_ID = 0x6,
15238 +       CTAIL_ID = 0x7,
15239 +       BLACK_BOX_ID = 0x8,
15240 +       LAST_ITEM_ID = 0x9
15241 +} item_id;
15242 +
15243 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15244 +   whether commit() was called or VM memory pressure was applied. */
15245 +typedef enum {
15246 +       /* submit flush queue to disk at jnode_flush completion */
15247 +       JNODE_FLUSH_WRITE_BLOCKS = 1,
15248 +
15249 +       /* flush is called for commit */
15250 +       JNODE_FLUSH_COMMIT = 2,
15251 +       /* not implemented */
15252 +       JNODE_FLUSH_MEMORY_FORMATTED = 4,
15253 +
15254 +       /* not implemented */
15255 +       JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15256 +} jnode_flush_flags;
15257 +
15258 +/* Flags to insert/paste carry operations. Currently they are only used in
15259 +   flushing code, but in future, they can be used to optimize for repetitive
15260 +   accesses.  */
15261 +typedef enum {
15262 +       /* carry is not allowed to shift data to the left when trying to find
15263 +          free space  */
15264 +       COPI_DONT_SHIFT_LEFT = (1 << 0),
15265 +       /* carry is not allowed to shift data to the right when trying to find
15266 +          free space  */
15267 +       COPI_DONT_SHIFT_RIGHT = (1 << 1),
15268 +       /* carry is not allowed to allocate new node(s) when trying to find
15269 +          free space */
15270 +       COPI_DONT_ALLOCATE = (1 << 2),
15271 +       /* try to load left neighbor if it is not in a cache */
15272 +       COPI_LOAD_LEFT = (1 << 3),
15273 +       /* try to load right neighbor if it is not in a cache */
15274 +       COPI_LOAD_RIGHT = (1 << 4),
15275 +       /* shift insertion point to the left neighbor */
15276 +       COPI_GO_LEFT = (1 << 5),
15277 +       /* shift insertion point to the right neighbor */
15278 +       COPI_GO_RIGHT = (1 << 6),
15279 +       /* try to step back into original node if insertion into new node
15280 +          fails after shifting data there. */
15281 +       COPI_STEP_BACK = (1 << 7)
15282 +} cop_insert_flag;
15283 +
15284 +typedef enum {
15285 +       SAFE_UNLINK,            /* safe-link for unlink */
15286 +       SAFE_TRUNCATE           /* safe-link for truncate */
15287 +} reiser4_safe_link_t;
15288 +
15289 +/* this is to show on which list of atom jnode is */
15290 +typedef enum {
15291 +       NOT_CAPTURED,
15292 +       DIRTY_LIST,
15293 +       CLEAN_LIST,
15294 +       FQ_LIST,
15295 +       WB_LIST,
15296 +       OVRWR_LIST
15297 +} atom_list;
15298 +
15299 +/* __REISER4_FORWARD_H__ */
15300 +#endif
15301 +
15302 +/* Make Linus happy.
15303 +   Local variables:
15304 +   c-indentation-style: "K&R"
15305 +   mode-name: "LC"
15306 +   c-basic-offset: 8
15307 +   tab-width: 8
15308 +   fill-column: 120
15309 +   End:
15310 +*/
15311 diff -urN linux-2.6.27.orig/fs/reiser4/fsdata.c linux-2.6.27/fs/reiser4/fsdata.c
15312 --- linux-2.6.27.orig/fs/reiser4/fsdata.c       1970-01-01 03:00:00.000000000 +0300
15313 +++ linux-2.6.27/fs/reiser4/fsdata.c    2008-10-12 18:20:00.000000000 +0400
15314 @@ -0,0 +1,804 @@
15315 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15316 + * reiser4/README */
15317 +
15318 +#include "fsdata.h"
15319 +#include "inode.h"
15320 +
15321 +
15322 +/* cache of dir_cursors */
15323 +static struct kmem_cache *d_cursor_cache;
15324 +
15325 +/* list of unused cursors */
15326 +static LIST_HEAD(cursor_cache);
15327 +
15328 +/* number of cursors in list of unused cursors */
15329 +static unsigned long d_cursor_unused = 0;
15330 +
15331 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15332 +DEFINE_SPINLOCK(d_lock);
15333 +
15334 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15335 +static int file_is_stateless(struct file *file);
15336 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15337 +static void kill_cursor(dir_cursor *);
15338 +
15339 +/**
15340 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15341 + * @nr: number of objects to free; 0 only queries the current count
15342 + * @mask: GFP mask (not used here)
15343 + *
15344 + * Shrinks d_cursor_cache. Scan LRU list of unused cursors, freeing requested
15345 + * number. Return number of still freeable cursors (d_cursor_unused).
15346 + */
15347 +static int d_cursor_shrink(int nr, gfp_t mask)
15348 +{
15349 +       if (nr != 0) {
15350 +               dir_cursor *scan;
15351 +               int killed;
15352 +
15353 +               killed = 0;
15354 +               spin_lock(&d_lock);
15355 +               while (!list_empty(&cursor_cache)) {
15356 +                       scan = list_entry(cursor_cache.next, dir_cursor, alist);
15357 +                       assert("nikita-3567", scan->ref == 0);
15358 +                       kill_cursor(scan);
15359 +                       ++killed;       /* NOTE(review): never read afterwards */
15360 +                       --nr;
15361 +                       if (nr == 0)
15362 +                               break;
15363 +               }
15364 +               spin_unlock(&d_lock);
15365 +       }
15366 +       return d_cursor_unused;
15367 +}
15368 +
15369 +/*
15370 + * actually, d_cursors are "priceless", because there is no way to
15371 + * recover information stored in them. On the other hand, we don't
15372 + * want to consume all kernel memory by them. As a compromise, just
15373 + * assign higher "seeks" value to d_cursor cache, so that it will be
15374 + * shrunk only if system is really tight on memory.
15375 + */
15376 +static struct shrinker d_cursor_shrinker = {
15377 +       .shrink = d_cursor_shrink,
15378 +       .seeks = DEFAULT_SEEKS << 3,
15379 +};
15380 +
15381 +/**
15382 + * reiser4_init_d_cursor - create d_cursor cache
15383 + *
15384 + * Initializes slab cache of d_cursors and registers its shrinker. It is part
15385 + * of reiser4 module initialization. Returns 0 or -ENOMEM.
15386 + */
15387 +int reiser4_init_d_cursor(void)
15388 +{
15389 +       d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15390 +                                          SLAB_HWCACHE_ALIGN, NULL);
15391 +       if (d_cursor_cache == NULL)
15392 +               return RETERR(-ENOMEM);
15393 +
15394 +       register_shrinker(&d_cursor_shrinker);
15395 +       return 0;
15396 +}
15397 +
15398 +/**
15399 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15400 + *
15401 + * This is called on reiser4 module unloading or system shutdown.
15402 + */
15403 +void reiser4_done_d_cursor(void)
15404 +{
15405 +       unregister_shrinker(&d_cursor_shrinker);
15406 +
15407 +       destroy_reiser4_cache(&d_cursor_cache);
15408 +}
15409 +
15410 +#define D_CURSOR_TABLE_SIZE (256)
15411 +
15412 +static inline unsigned long
15413 +d_cursor_hash(d_cursor_hash_table *table, const struct d_cursor_key *key)
15414 +{      /* bucket index from oid + cid; @table is not consulted */
15415 +       assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15416 +       return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15417 +}
15418 +
15419 +static inline int d_cursor_eq(const struct d_cursor_key *k1,
15420 +                             const struct d_cursor_key *k2)
15421 +{      /* keys are equal iff both cid and oid match */
15422 +       return k1->cid == k2->cid && k1->oid == k2->oid;
15423 +}
15424 +
15425 +/*
15426 + * define functions to manipulate reiser4 super block's hash table of
15427 + * dir_cursors
15428 + */
15429 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15430 +#define KFREE(ptr, size) kfree(ptr)
15431 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15432 +                     dir_cursor,
15433 +                     struct d_cursor_key,
15434 +                     key, hash, d_cursor_hash, d_cursor_eq);
15435 +#undef KFREE
15436 +#undef KMALLOC
15437 +
15438 +/**
15439 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15440 + * @super: super block to initialize
15441 + *
15442 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15443 + * of mount. Returns the result of d_cursor_hash_init().
15444 + */
15445 +int reiser4_init_super_d_info(struct super_block *super)
15446 +{
15447 +       struct d_cursor_info *p;
15448 +
15449 +       p = &get_super_private(super)->d_info;
15450 +
15451 +       INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15452 +       return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15453 +}
15454 +
15455 +/**
15456 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15457 + * @super: super block being umounted
15458 + *
15459 + * It is called on umount. Kills all directory cursors attached to super block.
15460 + */
15461 +void reiser4_done_super_d_info(struct super_block *super)
15462 +{
15463 +       struct d_cursor_info *d_info;
15464 +       dir_cursor *cursor, *next;
15465 +
15466 +       d_info = &get_super_private(super)->d_info;
15467 +       for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15468 +               kill_cursor(cursor);
15469 +
15470 +       BUG_ON(d_info->tree.rnode != NULL);
15471 +       d_cursor_hash_done(&d_info->table);
15472 +}
15473 +
15474 +/**
15475 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15476 + * @cursor: cursor to free
15477 + *
15478 + * Removes reiser4_file_fsdata attached to @cursor from readdir list of
15479 + * reiser4_inode, frees that reiser4_file_fsdata. Removes @cursor from
15480 + * indices, hash table, list of unused cursors and frees it.
15481 + */
15482 +static void kill_cursor(dir_cursor *cursor)
15483 +{
15484 +       unsigned long index;
15485 +
15486 +       assert("nikita-3566", cursor->ref == 0);
15487 +       assert("nikita-3572", cursor->fsdata != NULL);
15488 +
15489 +       index = (unsigned long)cursor->key.oid;
15490 +       list_del_init(&cursor->fsdata->dir.linkage);
15491 +       free_fsdata(cursor->fsdata);
15492 +       cursor->fsdata = NULL;
15493 +
15494 +       if (list_empty_careful(&cursor->list))
15495 +               /* this is last cursor for a file. Kill radix-tree entry */
15496 +               radix_tree_delete(&cursor->info->tree, index);
15497 +       else {
15498 +               void **slot;
15499 +
15500 +               /*
15501 +                * there are other cursors for the same oid.
15502 +                */
15503 +
15504 +               /*
15505 +                * if radix tree points to the cursor being removed, re-target
15506 +                * radix tree slot to the next cursor in the (non-empty as was
15507 +                * checked above) element of the circular list of all cursors
15508 +                * for this oid.
15509 +                */
15510 +               slot = radix_tree_lookup_slot(&cursor->info->tree, index);
15511 +               assert("nikita-3571", *slot != NULL);
15512 +               if (*slot == cursor)
15513 +                       *slot = list_entry(cursor->list.next, dir_cursor, list);
15514 +               /* remove cursor from circular list */
15515 +               list_del_init(&cursor->list);
15516 +       }
15517 +       /* remove cursor from the list of unused cursors */
15518 +       list_del_init(&cursor->alist);
15519 +       /* remove cursor from the hash table */
15520 +       d_cursor_hash_remove(&cursor->info->table, cursor);
15521 +       /* and free it */
15522 +       kmem_cache_free(d_cursor_cache, cursor);
15523 +       --d_cursor_unused;
15524 +}
15525 +
15526 +/* possible actions that can be performed on all cursors for the given file */
15527 +enum cursor_action {
15528 +       /*
15529 +        * load all detached state: this is called when stat-data is loaded
15530 +        * from the disk to recover information about all pending readdirs
15531 +        */
15532 +       CURSOR_LOAD,
15533 +       /*
15534 +        * detach all state from inode, leaving it in the cache. This is called
15535 +        * when inode is removed from memory by memory pressure
15536 +        */
15537 +       CURSOR_DISPOSE,
15538 +       /*
15539 +        * detach cursors from the inode, and free them. This is called when
15540 +        * inode is destroyed
15541 +        */
15542 +       CURSOR_KILL
15543 +};
15544 +
15545 +/*
15546 + * return the per-super-block d_cursor_info of the file system @inode is in.
15547 + */
15548 +static inline struct d_cursor_info *d_info(struct inode *inode)
15549 +{
15550 +       return &get_super_private(inode->i_sb)->d_info;
15551 +}
15552 +
15553 +/*
15554 + * find the cursor stored in @info's radix tree at @index (NULL if none).
15555 + */
15556 +static inline dir_cursor *lookup(struct d_cursor_info * info,
15557 +                                unsigned long index)
15558 +{
15559 +       return (dir_cursor *) radix_tree_lookup(&info->tree, index);
15560 +}
15561 +
15562 +/*
15563 + * attach @cursor to the radix tree at @index. There may be multiple cursors
15564 + * for the same oid; they are chained into a circular list via ->list.
15565 + */
15566 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
15567 +{
15568 +       dir_cursor *head;
15569 +
15570 +       head = lookup(cursor->info, index);
15571 +       if (head == NULL) {
15572 +               /* this is the first cursor for this index */
15573 +               INIT_LIST_HEAD(&cursor->list);
15574 +               radix_tree_insert(&cursor->info->tree, index, cursor);
15575 +       } else {
15576 +               /* some cursor already exists. Chain ours after it */
15577 +               list_add(&cursor->list, &head->list);
15578 +       }
15579 +}
15580 +
15581 +/*
15582 + * detach fsdata (if it has a cursor) from file descriptor, and put cursor on
15583 + * the "unused" list once unreferenced. Called when file is no longer in use.
15584 + */
15585 +static void clean_fsdata(struct file *file)
15586 +{
15587 +       dir_cursor *cursor;
15588 +       reiser4_file_fsdata *fsdata;
15589 +
15590 +       assert("nikita-3570", file_is_stateless(file));
15591 +
15592 +       fsdata = (reiser4_file_fsdata *) file->private_data;
15593 +       if (fsdata != NULL) {
15594 +               cursor = fsdata->cursor;
15595 +               if (cursor != NULL) {
15596 +                       spin_lock(&d_lock);
15597 +                       --cursor->ref;
15598 +                       if (cursor->ref == 0) {
15599 +                               list_add_tail(&cursor->alist, &cursor_cache);
15600 +                               ++d_cursor_unused;
15601 +                       }
15602 +                       spin_unlock(&d_lock);
15603 +                       file->private_data = NULL;
15604 +               }
15605 +       }
15606 +}
15607 +
15608 +/*
15609 + * global counter used to generate "client ids", which are encoded into the
15610 + * high bits of fpos. NOTE(review): bumped in insert_cursor() outside d_lock.
15611 + */
15612 +static __u32 cid_counter = 0;
15613 +#define CID_SHIFT (20)
15614 +#define CID_MASK  (0xfffffull)
15615 +
15616 +static void free_file_fsdata_nolock(struct file *);
15617 +
15618 +/**
15619 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
15620 + * @cursor: freshly allocated cursor; it is zeroed and filled in here
15621 + * @file: file that gets the new fsdata installed as ->private_data
15622 + * @inode: inode the cursor is created for; supplies oid and d_cursor_info
15623 + *
15624 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts cursor to
15625 + * reiser4 super block's hash table and radix tree, i.e., adds detachable
15626 + * readdir state to @file. Returns 0 or an error. NOTE(review): on error
15627 + * neither @cursor nor fsdata (if allocated) is freed by the code seen here.
15628 + */
15629 +static int insert_cursor(dir_cursor *cursor, struct file *file,
15630 +                        struct inode *inode)
15631 +{
15632 +       int result;
15633 +       reiser4_file_fsdata *fsdata;
15634 +
15635 +       memset(cursor, 0, sizeof *cursor);
15636 +
15637 +       /* this is either first call to readdir, or rewind. Anyway, create new
15638 +        * cursor. */
15639 +       fsdata = create_fsdata(NULL);
15640 +       if (fsdata != NULL) {
15641 +               result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
15642 +               if (result == 0) {
15643 +                       struct d_cursor_info *info;
15644 +                       oid_t oid;
15645 +
15646 +                       info = d_info(inode);
15647 +                       oid = get_inode_oid(inode);
15648 +                       /* cid occupies higher 12 bits of f->f_pos. Don't
15649 +                        * allow it to become negative: this confuses
15650 +                        * nfsd_readdir() */
15651 +                       cursor->key.cid = (++cid_counter) & 0x7ff;
15652 +                       cursor->key.oid = oid;
15653 +                       cursor->fsdata = fsdata;
15654 +                       cursor->info = info;
15655 +                       cursor->ref = 1;
15656 +
15657 +                       spin_lock_inode(inode);
15658 +                       /* install cursor as @file's private_data, discarding
15659 +                        * old one if necessary */
15660 +#if REISER4_DEBUG
15661 +                       if (file->private_data)
15662 +                               warning("", "file has fsdata already");
15663 +#endif
15664 +                       clean_fsdata(file);
15665 +                       free_file_fsdata_nolock(file);
15666 +                       file->private_data = fsdata;
15667 +                       fsdata->cursor = cursor;
15668 +                       spin_unlock_inode(inode);
15669 +                       spin_lock(&d_lock);
15670 +                       /* insert cursor into hash table */
15671 +                       d_cursor_hash_insert(&info->table, cursor);
15672 +                       /* and chain it into radix-tree */
15673 +                       bind_cursor(cursor, (unsigned long)oid);
15674 +                       spin_unlock(&d_lock);
15675 +                       radix_tree_preload_end();
15676 +                       file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
15677 +               }
15678 +       } else
15679 +               result = RETERR(-ENOMEM);
15680 +       return result;
15681 +}
15682 +
15683 +/**
15684 + * process_cursors - do action on each cursor attached to inode
15685 + * @inode: inode whose cursors are processed
15686 + * @act: action to do
15687 + *
15688 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
15689 + * and performs action specified by @act on each of cursors.
15690 + */
15691 +static void process_cursors(struct inode *inode, enum cursor_action act)
15692 +{
15693 +       oid_t oid;
15694 +       dir_cursor *start;
15695 +       struct list_head *head;
15696 +       reiser4_context *ctx;
15697 +       struct d_cursor_info *info;
15698 +
15699 +       /* this can be called by
15700 +        *
15701 +        * kswapd->...->prune_icache->..reiser4_destroy_inode
15702 +        *
15703 +        * without reiser4_context
15704 +        */
15705 +       ctx = reiser4_init_context(inode->i_sb);
15706 +       if (IS_ERR(ctx)) {
15707 +               warning("vs-23", "failed to init context");
15708 +               return;
15709 +       }
15710 +       /* NOTE(review): @inode was already dereferenced above; check is late */
15711 +       assert("nikita-3558", inode != NULL);
15712 +
15713 +       info = d_info(inode);
15714 +       oid = get_inode_oid(inode);
15715 +       spin_lock_inode(inode);
15716 +       head = get_readdir_list(inode);
15717 +       spin_lock(&d_lock);
15718 +       /* find any cursor for this oid: reference to it is hanging off radix
15719 +        * tree */
15720 +       start = lookup(info, (unsigned long)oid);
15721 +       if (start != NULL) {
15722 +               dir_cursor *scan;
15723 +               reiser4_file_fsdata *fsdata;
15724 +
15725 +               /* process circular list of cursors for this oid */
15726 +               scan = start;
15727 +               do {
15728 +                       dir_cursor *next;
15729 +
15730 +                       next = list_entry(scan->list.next, dir_cursor, list);
15731 +                       fsdata = scan->fsdata;
15732 +                       assert("nikita-3557", fsdata != NULL);
15733 +                       if (scan->key.oid == oid) {
15734 +                               switch (act) {
15735 +                               case CURSOR_DISPOSE:
15736 +                                       list_del_init(&fsdata->dir.linkage);
15737 +                                       break;
15738 +                               case CURSOR_LOAD:
15739 +                                       list_add(&fsdata->dir.linkage, head);
15740 +                                       break;
15741 +                               case CURSOR_KILL:
15742 +                                       kill_cursor(scan);
15743 +                                       break;
15744 +                               }
15745 +                       }
15746 +                       if (scan == next)
15747 +                               /* last cursor was just killed */
15748 +                               break;
15749 +                       scan = next;
15750 +               } while (scan != start);
15751 +       }
15752 +       spin_unlock(&d_lock);
15753 +       /* check that we killed 'em all */
15754 +       assert("nikita-3568",
15755 +              ergo(act == CURSOR_KILL,
15756 +                   list_empty_careful(get_readdir_list(inode))));
15757 +       assert("nikita-3569",
15758 +              ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
15759 +       spin_unlock_inode(inode);
15760 +       reiser4_exit_context(ctx);
15761 +}
15762 +
15763 +/**
15764 + * reiser4_dispose_cursors - removes cursors from inode's list
15765 + * @inode: inode to dispose cursors of
15766 + *
15767 + * For each of cursors corresponding to @inode - removes reiser4_file_fsdata
15768 + * attached to cursor from inode's readdir list. This is called when inode is
15769 + * removed from memory by memory pressure. The cursors themselves are kept.
15770 + */
15771 +void reiser4_dispose_cursors(struct inode *inode)
15772 +{
15773 +       process_cursors(inode, CURSOR_DISPOSE);
15774 +}
15775 +
15776 +/**
15777 + * reiser4_load_cursors - attach cursors to inode
15778 + * @inode: inode to load cursors to
15779 + *
15780 + * For each of cursors corresponding to @inode - links the cursor's
15781 + * reiser4_file_fsdata into inode's readdir list. This is done when inode is
15782 + * loaded into memory.
15783 + */
15784 +void reiser4_load_cursors(struct inode *inode)
15785 +{
15786 +       process_cursors(inode, CURSOR_LOAD);
15787 +}
15788 +
15789 +/**
15790 + * reiser4_kill_cursors - kill all inode cursors
15791 + * @inode: inode to kill cursors of
15792 + *
15793 + * Frees all cursors (and their fsdata) for this inode; called on its destroy.
15794 + */
15795 +void reiser4_kill_cursors(struct inode *inode)
15796 +{
15797 +       process_cursors(inode, CURSOR_KILL);
15798 +}
15799 +
15800 +/**
15801 + * file_is_stateless - test the ->stateless flag of @file's dentry fsdata
15802 + * @file: file whose dentry is examined
15803 + *
15804 + * True, if @file was created by NFS server "on demand" to serve one file
15805 + * system operation, so there may be "detached state" for underlying inode.
15806 + * NOTE(review): ERR_PTR from reiser4_get_dentry_fsdata() is not checked.
15807 + */
15808 +static int file_is_stateless(struct file *file)
15809 +{
15810 +       return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
15811 +}
15812 +
15813 +/**
15814 + * reiser4_get_dir_fpos - get directory position encoded in ->f_pos
15815 + * @dir: file whose ->f_pos is decoded
15816 + *
15817 + * Calculates ->fpos from user-supplied cookie. Normally it is dir->f_pos, but
15818 + * in the case of stateless directory operation (readdir-over-nfs), client id
15819 + * was encoded in the high bits of cookie and should be masked off.
15820 + */
15821 +loff_t reiser4_get_dir_fpos(struct file *dir)
15822 +{
15823 +       if (file_is_stateless(dir))
15824 +               return dir->f_pos & CID_MASK;
15825 +       else
15826 +               return dir->f_pos;
15827 +}
15828 +
15829 +/**
15830 + * reiser4_attach_fsdata - try to attach fsdata
15831 + * @file: file to attach detached readdir state to; no-op unless stateless
15832 + * @inode: inode whose oid, together with cid from ->f_pos, keys the cursor
15833 + *
15834 + * Finds (->f_pos != 0) or creates (->f_pos == 0) cursor for readdir-over-nfs.
15835 + */
15836 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
15837 +{
15838 +       loff_t pos;
15839 +       int result;
15840 +       dir_cursor *cursor;
15841 +
15842 +       /*
15843 +        * we are serialized by inode->i_mutex
15844 +        */
15845 +       if (!file_is_stateless(file))
15846 +               return 0;
15847 +
15848 +       pos = file->f_pos;
15849 +       result = 0;
15850 +       if (pos == 0) {
15851 +               /*
15852 +                * first call to readdir (or rewind to the beginning of
15853 +                * directory). NOTE(review): cursor leaks if insertion fails
15854 +                */
15855 +               cursor = kmem_cache_alloc(d_cursor_cache,
15856 +                                         reiser4_ctx_gfp_mask_get());
15857 +               if (cursor != NULL)
15858 +                       result = insert_cursor(cursor, file, inode);
15859 +               else
15860 +                       result = RETERR(-ENOMEM);
15861 +       } else {
15862 +               /* try to find existing cursor; a miss is not an error */
15863 +               struct d_cursor_key key;
15864 +
15865 +               key.cid = pos >> CID_SHIFT;
15866 +               key.oid = get_inode_oid(inode);
15867 +               spin_lock(&d_lock);
15868 +               cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
15869 +               if (cursor != NULL) {
15870 +                       /* cursor was found */
15871 +                       if (cursor->ref == 0) {
15872 +                               /* move it from unused list */
15873 +                               list_del_init(&cursor->alist);
15874 +                               --d_cursor_unused;
15875 +                       }
15876 +                       ++cursor->ref;
15877 +               }
15878 +               spin_unlock(&d_lock);
15879 +               if (cursor != NULL) {
15880 +                       spin_lock_inode(inode);
15881 +                       assert("nikita-3556", cursor->fsdata->back == NULL);
15882 +                       clean_fsdata(file);
15883 +                       free_file_fsdata_nolock(file);
15884 +                       file->private_data = cursor->fsdata;
15885 +                       spin_unlock_inode(inode);
15886 +               }
15887 +       }
15888 +       return result;
15889 +}
15890 +
15891 +/**
15892 + * reiser4_detach_fsdata - drop @file's reference to its readdir cursor
15893 + * @file: file to detach from; nothing is done unless it is stateless
15894 + *
15895 + * detach fsdata, if necessary
15896 + */
15897 +void reiser4_detach_fsdata(struct file *file)
15898 +{
15899 +       struct inode *inode;
15900 +
15901 +       if (!file_is_stateless(file))
15902 +               return;
15903 +
15904 +       inode = file->f_dentry->d_inode;
15905 +       spin_lock_inode(inode);
15906 +       clean_fsdata(file);
15907 +       spin_unlock_inode(inode);
15908 +}
15909 +
15910 +/* slab for reiser4_dentry_fsdata */
15911 +static struct kmem_cache *dentry_fsdata_cache;
15912 +
15913 +/**
15914 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
15915 + *
15916 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
15917 + * part of reiser4 module initialization. Returns 0 or -ENOMEM.
15918 + */
15919 +int reiser4_init_dentry_fsdata(void)
15920 +{
15921 +       dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
15922 +                                          sizeof(struct reiser4_dentry_fsdata),
15923 +                                          0,
15924 +                                          SLAB_HWCACHE_ALIGN |
15925 +                                          SLAB_RECLAIM_ACCOUNT,
15926 +                                          NULL);
15927 +       if (dentry_fsdata_cache == NULL)
15928 +               return RETERR(-ENOMEM);
15929 +       return 0;
15930 +}
15931 +
15932 +/**
15933 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
15934 + *
15935 + * Dual to reiser4_init_dentry_fsdata(); called on module unload or shutdown.
15936 + */
15937 +void reiser4_done_dentry_fsdata(void)
15938 +{
15939 +       destroy_reiser4_cache(&dentry_fsdata_cache);
15940 +}
15941 +
15942 +/**
15943 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
15944 + * @dentry: queried dentry
15945 + *
15946 + * Allocates (zeroed) if necessary and returns per-dentry data that we attach
15947 + * to each dentry. Returns ERR_PTR(-ENOMEM) if the allocation fails.
15948 + */
15949 +struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
15950 +{
15951 +       assert("nikita-1365", dentry != NULL);
15952 +
15953 +       if (dentry->d_fsdata == NULL) {
15954 +               dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
15955 +                                                   reiser4_ctx_gfp_mask_get());
15956 +               if (dentry->d_fsdata == NULL)
15957 +                       return ERR_PTR(RETERR(-ENOMEM));
15958 +               memset(dentry->d_fsdata, 0,
15959 +                      sizeof(struct reiser4_dentry_fsdata));
15960 +       }
15961 +       return dentry->d_fsdata;
15962 +}
15963 +
15964 +/**
15965 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
15966 + * @dentry: dentry to free fsdata of
15967 + *
15968 + * Detaches and frees fs-specific dentry data; no-op if none is attached
15969 + */
15970 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
15971 +{
15972 +       if (dentry->d_fsdata != NULL) {
15973 +               kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
15974 +               dentry->d_fsdata = NULL;
15975 +       }
15976 +}
15977 +
15978 +/* slab for reiser4_file_fsdata */
15979 +static struct kmem_cache *file_fsdata_cache;
15980 +
15981 +/**
15982 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
15983 + *
15984 + * Initializes slab cache of structures attached to file->private_data. It is
15985 + * part of reiser4 module initialization. Returns 0 or -ENOMEM.
15986 + */
15987 +int reiser4_init_file_fsdata(void)
15988 +{
15989 +       file_fsdata_cache = kmem_cache_create("file_fsdata",
15990 +                                             sizeof(reiser4_file_fsdata),
15991 +                                             0,
15992 +                                             SLAB_HWCACHE_ALIGN |
15993 +                                             SLAB_RECLAIM_ACCOUNT, NULL);
15994 +       if (file_fsdata_cache == NULL)
15995 +               return RETERR(-ENOMEM);
15996 +       return 0;
15997 +}
15998 +
15999 +/**
16000 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16001 + *
16002 + * Dual to reiser4_init_file_fsdata(); called on module unload or shutdown.
16003 + */
16004 +void reiser4_done_file_fsdata(void)
16005 +{
16006 +       destroy_reiser4_cache(&file_fsdata_cache);
16007 +}
16008 +
16009 +/**
16010 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16011 + * @file: what to create file_fsdata for, may be NULL
16012 + *
16013 + * Allocates and zero-initializes reiser4_file_fsdata; NULL on failure.
16014 + */
16015 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16016 +{
16017 +       reiser4_file_fsdata *fsdata;
16018 +
16019 +       fsdata = kmem_cache_alloc(file_fsdata_cache,
16020 +                                 reiser4_ctx_gfp_mask_get());
16021 +       if (fsdata != NULL) {
16022 +               memset(fsdata, 0, sizeof *fsdata);
16023 +               fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16024 +               fsdata->back = file;
16025 +               INIT_LIST_HEAD(&fsdata->dir.linkage);
16026 +       }
16027 +       return fsdata;
16028 +}
16029 +
16030 +/**
16031 + * free_fsdata - free reiser4_file_fsdata
16032 + * @fsdata: object to free
16033 + *
16034 + * Dual to create_fsdata(). @fsdata must not be NULL (BUG_ON otherwise).
16035 + */
16036 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16037 +{
16038 +       BUG_ON(fsdata == NULL);
16039 +       kmem_cache_free(file_fsdata_cache, fsdata);
16040 +}
16041 +
16042 +/**
16043 + * reiser4_get_file_fsdata - get fs-specific file data
16044 + * @file: queried file
16045 + *
16046 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16047 + * to @file under inode spinlock; returns ERR_PTR(-ENOMEM) on failure.
16048 + */
16049 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16050 +{
16051 +       assert("nikita-1603", file != NULL);
16052 +
16053 +       if (file->private_data == NULL) {
16054 +               reiser4_file_fsdata *fsdata;
16055 +               struct inode *inode;
16056 +
16057 +               fsdata = create_fsdata(file);
16058 +               if (fsdata == NULL)
16059 +                       return ERR_PTR(RETERR(-ENOMEM));
16060 +
16061 +               inode = file->f_dentry->d_inode;
16062 +               spin_lock_inode(inode);
16063 +               if (file->private_data == NULL) {
16064 +                       file->private_data = fsdata;
16065 +                       fsdata = NULL;
16066 +               }
16067 +               spin_unlock_inode(inode);
16068 +               if (fsdata != NULL)
16069 +                       /* other thread initialized ->fsdata; drop our copy */
16070 +                       kmem_cache_free(file_fsdata_cache, fsdata);
16071 +       }
16072 +       assert("nikita-2665", file->private_data != NULL);
16073 +       return file->private_data;
16074 +}
16075 +
16076 +/**
16077 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16078 + * @file: file to detach fsdata from; caller holds its inode's spinlock
16079 + *
16080 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16081 + * readdir list, frees it if it is not linked to d_cursor object.
16082 + */
16083 +static void free_file_fsdata_nolock(struct file *file)
16084 +{
16085 +       reiser4_file_fsdata *fsdata;
16086 +
16087 +       assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16088 +       fsdata = file->private_data;
16089 +       if (fsdata != NULL) {
16090 +               list_del_init(&fsdata->dir.linkage);
16091 +               if (fsdata->cursor == NULL)
16092 +                       free_fsdata(fsdata);
16093 +       }
16094 +       file->private_data = NULL;
16095 +}
16096 +
16097 +/**
16098 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16099 + * @file: file whose ->private_data is detached
16100 + *
16101 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16102 + */
16103 +void reiser4_free_file_fsdata(struct file *file)
16104 +{
16105 +       spin_lock_inode(file->f_dentry->d_inode);
16106 +       free_file_fsdata_nolock(file);
16107 +       spin_unlock_inode(file->f_dentry->d_inode);
16108 +}
16109 +
16110 +/*
16111 + * Local variables:
16112 + * c-indentation-style: "K&R"
16113 + * mode-name: "LC"
16114 + * c-basic-offset: 8
16115 + * tab-width: 8
16116 + * fill-column: 79
16117 + * End:
16118 + */
16119 diff -urN linux-2.6.27.orig/fs/reiser4/fsdata.h linux-2.6.27/fs/reiser4/fsdata.h
16120 --- linux-2.6.27.orig/fs/reiser4/fsdata.h       1970-01-01 03:00:00.000000000 +0300
16121 +++ linux-2.6.27/fs/reiser4/fsdata.h    2008-10-12 18:20:00.000000000 +0400
16122 @@ -0,0 +1,205 @@
16123 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16124 + * reiser4/README */
16125 +
16126 +#if !defined( __REISER4_FSDATA_H__ )
16127 +#define __REISER4_FSDATA_H__
16128 +
16129 +#include "debug.h"
16130 +#include "kassign.h"
16131 +#include "seal.h"
16132 +#include "type_safe_hash.h"
16133 +#include "plugin/file/file.h"
16134 +#include "readahead.h"
16135 +
16136 +/*
16137 + * comment about reiser4_dentry_fsdata
16138 + *
16139 + *
16140 + */
16141 +
16142 +/*
16143 + * locking: fields of per file descriptor readdir_pos and ->f_pos are
16144 + * protected by ->i_mutex on inode. Under this lock following invariant
16145 + * holds:
16146 + *
16147 + *     file descriptor is "looking" at the entry_no-th directory entry from
16148 + *     the beginning of directory. This entry has key dir_entry_key and is
16149 + *     pos-th entry within duplicate-key sequence.
16150 + *
16151 + */
16152 +
16153 +/* logical position within directory: entry key and rank among duplicates */
16154 +struct dir_pos {
16155 +       /* key of directory entry (actually, part of a key sufficient to
16156 +          identify directory entry)  */
16157 +       de_id dir_entry_key;
16158 +       /* ordinal number of directory entry among all entries with the same
16159 +          key. (Starting from 0.) */
16160 +       unsigned pos;
16161 +};
16162 +
16163 +struct readdir_pos {
16164 +       /* f_pos corresponding to this readdir position */
16165 +       __u64 fpos;
16166 +       /* logical position within directory */
16167 +       struct dir_pos position;
16168 +       /* ordinal number of this directory entry, counted from the
16169 +          beginning of directory  */
16170 +       __u64 entry_no;
16171 +};
16172 +
16173 +/*
16174 + * this is used to speed up lookups of a directory entry: on initial call to
16175 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16176 + * in struct dentry and reused later to avoid tree traversals.
16177 + */
16178 +struct de_location {
16179 +       /* seal covering directory entry */
16180 +       seal_t entry_seal;
16181 +       /* coord of directory entry */
16182 +       coord_t entry_coord;
16183 +       /* ordinal number of directory entry among all entries with the same
16184 +          key. (Starting from 0.) */
16185 +       int pos;
16186 +};
16187 +
16188 +/**
16189 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16190 + *
16191 + * This is allocated dynamically and released in d_op->d_release()
16192 + *
16193 + * Currently it contains cached location (hint) of directory entry and the
16194 + * ->stateless flag; other information is expected to be accumulated here.
16195 + */
16196 +struct reiser4_dentry_fsdata {
16197 +       /*
16198 +        * here will go fields filled by ->lookup() to speedup next
16199 +        * create/unlink, like blocknr of znode with stat-data, or key of
16200 +        * stat-data.
16201 +        */
16202 +       struct de_location dec;
16203 +       int stateless;          /* created through reiser4_decode_fh, needs special
16204 +                                * treatment in readdir. */
16205 +};
16206 +
16207 +extern int reiser4_init_dentry_fsdata(void);
16208 +extern void reiser4_done_dentry_fsdata(void);
16209 +extern struct reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16210 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16211 +
16212 +/**
16213 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16214 + *
16215 + * This is allocated dynamically and released in inode->i_fop->release
16216 + */
16217 +typedef struct reiser4_file_fsdata {
16218 +       /*
16219 +        * pointer back to the struct file which this reiser4_file_fsdata is
16220 +        * part of; NULL for fsdata owned by a detached readdir cursor
16221 +        */
16222 +       struct file *back;
16223 +       /* detached cursor for stateless readdir. */
16224 +       struct dir_cursor *cursor;
16225 +       /*
16226 +        * We need both directory and regular file parts here, because there
16227 +        * are file system objects that are files and directories.
16228 +        */
16229 +       struct {
16230 +               /*
16231 +                * position in directory. It is updated each time directory is
16232 +                * modified
16233 +                */
16234 +               struct readdir_pos readdir;
16235 +               /* head of this list is reiser4_inode->lists.readdir_list */
16236 +               struct list_head linkage;
16237 +       } dir;
16238 +       /* hints to speed up operations with regular files: read and write. */
16239 +       struct {
16240 +               hint_t hint;
16241 +       } reg;
16242 +       struct reiser4_file_ra_state ra1;
16243 +
16244 +} reiser4_file_fsdata;
16245 +
16246 +extern int reiser4_init_file_fsdata(void);
16247 +extern void reiser4_done_file_fsdata(void);
16248 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16249 +extern void reiser4_free_file_fsdata(struct file *);
16250 +
16251 +/*
16252 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16253 + * used to address problem reiser4 has with readdir accesses via NFS. See
16254 + * plugin/file_ops_readdir.c for more details.
16255 + */
16256 +struct d_cursor_key{
16257 +       __u16 cid; /* client id, carried in the high bits of ->f_pos */
16258 +       __u64 oid; /* object id of the inode the cursor belongs to */
16259 +};
16260 +
16261 +/*
16262 + * define structures d_cursor_hash_table and d_cursor_hash_link, used to
16263 + * maintain hash table of dir_cursor-s in reiser4's super block
16264 + */
16265 +typedef struct dir_cursor dir_cursor;
16266 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16267 +
16268 +struct dir_cursor {
16269 +       int ref; /* users of the cursor; at 0 it sits on the unused list */
16270 +       reiser4_file_fsdata *fsdata; /* detached readdir state */
16271 +
16272 +       /* link to reiser4 super block hash table of cursors */
16273 +       d_cursor_hash_link hash;
16274 +
16275 +       /*
16276 +        * this is to link cursors to reiser4 super block's radix tree of
16277 +        * cursors if there are more than one cursor of the same objectid
16278 +        */
16279 +       struct list_head list;
16280 +       struct d_cursor_key key;
16281 +       struct d_cursor_info *info;
16282 +       /* list of unused cursors */
16283 +       struct list_head alist;
16284 +};
16285 +
16286 +extern int reiser4_init_d_cursor(void);
16287 +extern void reiser4_done_d_cursor(void);
16288 +/* per-super-block d_cursor resources (hash table and radix tree) */
16289 +extern int reiser4_init_super_d_info(struct super_block *);
16290 +extern void reiser4_done_super_d_info(struct super_block *);
16291 +/* readdir state of "stateless" files */
16292 +extern loff_t reiser4_get_dir_fpos(struct file *);
16293 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16294 +extern void reiser4_detach_fsdata(struct file *);
16295 +
16296 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16297 +   more details */
16298 +void reiser4_dispose_cursors(struct inode *inode);
16299 +void reiser4_load_cursors(struct inode *inode);
16300 +void reiser4_kill_cursors(struct inode *inode);
16301 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16302 +                            int offset, int adj);
16303 +
16304 +/*
16305 + * this structure is embedded into reiser4_super_info_data. It maintains
16306 + * d_cursors (detached readdir state). See plugin/file_ops_readdir.c.
16307 + */
16308 +struct d_cursor_info {
16309 +       d_cursor_hash_table table;
16310 +       struct radix_tree_root tree;
16311 +};
16312 +
16313 +/* spinlock protecting readdir cursors (tables, ->ref and the unused list) */
16314 +extern spinlock_t d_lock;
16315 +
16316 +/* __REISER4_FSDATA_H__ */
16317 +#endif
16318 +
16319 +/*
16320 + * Local variables:
16321 + * c-indentation-style: "K&R"
16322 + * mode-name: "LC"
16323 + * c-basic-offset: 8
16324 + * tab-width: 8
16325 + * fill-column: 120
16326 + * End:
16327 + */
16328 diff -urN linux-2.6.27.orig/fs/reiser4/init_super.c linux-2.6.27/fs/reiser4/init_super.c
16329 --- linux-2.6.27.orig/fs/reiser4/init_super.c   1970-01-01 03:00:00.000000000 +0300
16330 +++ linux-2.6.27/fs/reiser4/init_super.c        2008-10-12 18:20:00.000000000 +0400
16331 @@ -0,0 +1,751 @@
16332 +/* Copyright by Hans Reiser, 2003 */
16333 +
16334 +#include "super.h"
16335 +#include "inode.h"
16336 +#include "plugin/plugin_set.h"
16337 +
16338 +#include <linux/swap.h>
16339 +
16340 +/**
16341 + * reiser4_init_fs_info - allocate reiser4 specific super block
16342 + * @super: super block of filesystem
16343 + *
16344 + * Allocates and initializes reiser4_super_info_data, attaches it to
16345 + * super->s_fs_info, initializes structures maintaining d_cursor-s.
16346 + */
16347 +int reiser4_init_fs_info(struct super_block *super)
16348 +{
16349 +       reiser4_super_info_data *sbinfo;
16350 +       int result;
16351 +
16352 +       sbinfo = kzalloc(sizeof(*sbinfo), reiser4_ctx_gfp_mask_get());
16353 +       if (!sbinfo)
16354 +               return RETERR(-ENOMEM);
16355 +
16356 +       super->s_fs_info = sbinfo;
16357 +       super->s_op = NULL;
16358 +       ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16359 +       ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16360 +       mutex_init(&sbinfo->delete_mutex);
16361 +       spin_lock_init(&(sbinfo->guard));
16362 +       /* initialize per-super-block d_cursor resources; undo on failure */
16363 +       result = reiser4_init_super_d_info(super);
16364 +       if (result != 0) {
16365 +               /* as after a failed kzalloc: nothing stays attached */
16366 +               super->s_fs_info = NULL;
16367 +               kfree(sbinfo);
16368 +       }
16369 +       return result;
16370 +}
16370 +
16371 +/**
16372 + * reiser4_done_fs_info - free reiser4 specific super block
16373 + * @super: super block of filesystem
16374 + *
16375 + * Releases structures maintaining d_cursor-s, performs some sanity checks,
16376 + * frees reiser4_super_info_data and resets super->s_fs_info to NULL.
16377 + */
16378 +void reiser4_done_fs_info(struct super_block *super)
16379 +{
16380 +       assert("zam-990", super->s_fs_info != NULL);
16381 +
16382 +       /* release per-super-block d_cursor resources */
16383 +       reiser4_done_super_d_info(super);
16384 +
16385 +       /* make sure no jnodes are left and current context has no atom */
16386 +       assert("", list_empty(&get_super_private(super)->all_jnodes));
16387 +       assert("", get_current_context()->trans->atom == NULL);
16388 +       reiser4_check_block_counters(super);
16389 +       kfree(super->s_fs_info);
16390 +       super->s_fs_info = NULL;
16391 +}
16392 +
16393 +/* type of option parseable by parse_option() */
16394 +typedef enum {
16395 +       /* value of option is arbitrary string; a pointer to it is stored */
16396 +       OPT_STRING,
16397 +
16398 +       /*
16399 +        * option specifies bit in a bitmask. When option is set - bit in
16400 +        * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16401 +        * dont_load_bitmap, atomic_write.
16402 +        */
16403 +       OPT_BIT,
16404 +
16405 +       /*
16406 +        * value of option is parsed with an sscanf() format. Examples are
16407 +        * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16408 +        */
16409 +       OPT_FORMAT,
16410 +
16411 +       /*
16412 +        * option can take one of predefined values. Example is onerror=panic or
16413 +        * onerror=remount-ro
16414 +        */
16415 +       OPT_ONEOF,
16416 +} opt_type_t;
16417 +
16418 +#if 0
16419 +struct opt_bitmask_bit {
16420 +       const char *bit_name;
16421 +       int bit_nr;
16422 +};
16423 +#endif
16424 +
16425 +/* description of option parseable by parse_option() */
16426 +struct opt_desc {
16427 +       /* option name.
16428 +
16429 +          parsed portion of string has a form "name=value".
16430 +        */
16431 +       const char *name;
16432 +       /* type of option; selects which member of @u is used */
16433 +       opt_type_t type;
16434 +       union {
16435 +               /* where to store value of string option (type == OPT_STRING) */
16436 +               char **string;
16437 +               /* description of bits for bit option (type == OPT_BIT) */
16438 +               struct {
16439 +                       int nr;  /* bit number handed to set_bit() */
16440 +                       void *addr;  /* bitmask handed to set_bit() */
16441 +               } bit;
16442 +               /* description of format and targets for format option (type
16443 +                  == OPT_FORMAT) */
16444 +               struct {
16445 +                       const char *format;  /* sscanf() format */
16446 +                       int nr_args;  /* value sscanf() has to return */
16447 +                       void *arg1;
16448 +                       void *arg2;
16449 +                       void *arg3;
16450 +                       void *arg4;
16451 +               } f;
16452 +               struct {  /* allowed values (type == OPT_ONEOF) */
16453 +                       int *result;  /* receives index of the matched entry */
16454 +                       const char *list[10];  /* scanned up to first NULL */
16455 +               } oneof;
16456 +               struct {
16457 +                       void *addr;
16458 +                       int nr_bits;
16459 +                       //struct opt_bitmask_bit *bits;
16460 +               } bitmask;  /* not referenced by parse_option() */
16461 +       } u;
16462 +};
16463 +
16464 +/**
16465 + * parse_option - parse one option
16466 + * @opt_string: starting point of parsing
16467 + * @opt: option description
16468 + *
16469 + * foo=bar,
16470 + * ^   ^  ^
16471 + * |   |  +-- replaced to '\0'
16472 + * |   +-- val_start
16473 + * +-- opt_string
16474 + * Cuts @opt_string at '=' and handles the value according to @opt->type.
16475 + */
16476 +static int parse_option(char *opt_string, struct opt_desc *opt)
16477 +{
16478 +       char *val_start;
16479 +       int result;
16480 +       const char *err_msg;
16481 +
16482 +       /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16483 +
16484 +       val_start = strchr(opt_string, '=');
16485 +       if (val_start != NULL) {
16486 +               *val_start = '\0';
16487 +               ++val_start;
16488 +       }
16489 +
16490 +       err_msg = NULL;
16491 +       result = 0;
16492 +       switch (opt->type) {
16493 +       case OPT_STRING:
16494 +               if (val_start == NULL) {
16495 +                       err_msg = "String arg missing";
16496 +                       result = RETERR(-EINVAL);
16497 +               } else
16498 +                       *opt->u.string = val_start;
16499 +               break;
16500 +       case OPT_BIT:
16501 +               if (val_start != NULL)  /* warn only: bit is NOT set, result 0 */
16502 +                       err_msg = "Value ignored";
16503 +               else
16504 +                       set_bit(opt->u.bit.nr, opt->u.bit.addr);
16505 +               break;
16506 +       case OPT_FORMAT:
16507 +               if (val_start == NULL) {
16508 +                       err_msg = "Formatted arg missing";
16509 +                       result = RETERR(-EINVAL);
16510 +                       break;
16511 +               }
16512 +               if (sscanf(val_start, opt->u.f.format,
16513 +                          opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
16514 +                          opt->u.f.arg4) != opt->u.f.nr_args) {
16515 +                       err_msg = "Wrong conversion";
16516 +                       result = RETERR(-EINVAL);
16517 +               }
16518 +               break;
16519 +       case OPT_ONEOF:
16520 +               {
16521 +                       int i = 0;
16522 +
16523 +                       if (val_start == NULL) {
16524 +                               err_msg = "Value is missing";
16525 +                               result = RETERR(-EINVAL);
16526 +                               break;
16527 +                       }
16528 +                       err_msg = "Wrong option value";
16529 +                       result = RETERR(-EINVAL);
16530 +                       while (opt->u.oneof.list[i]) {
16531 +                               if (!strcmp(opt->u.oneof.list[i], val_start)) {
16532 +                                       result = 0;
16533 +                                       err_msg = NULL;
16534 +                                       *opt->u.oneof.result = i;
16535 +                                       break;
16536 +                               }
16537 +                               i++;
16538 +                       }
16539 +                       break;
16540 +               }
16541 +       default:
16542 +               wrong_return_value("nikita-2100", "opt -> type");
16543 +               break;
16544 +       }
16545 +       if (err_msg != NULL) {
16546 +               warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
16547 +                       err_msg, opt->name, val_start ? "=" : "",
16548 +                       val_start ? : "");
16549 +       }
16550 +       return result;
16551 +}
16552 +
16553 +/**
16554 + * parse_options - parse reiser4 mount options
16555 + * @opt_string: starting point
16556 + * @opts: array of option description
16557 + * @nr_opts: number of elements in @opts
16558 + *
16559 + * Parses comma separated list of options; option names must match in full.
16560 + */
16561 +static int parse_options(char *opt_string, struct opt_desc *opts, int nr_opts)
16562 +{
16563 +       int result = 0;
16564 +
16565 +       while ((result == 0) && opt_string && *opt_string) {
16566 +               int j;
16567 +               char *next;
16568 +
16569 +               next = strchr(opt_string, ',');
16570 +               if (next != NULL) {
16571 +                       *next = '\0';
16572 +                       ++next;
16573 +               }
16574 +               for (j = 0; j < nr_opts; ++j) {
16575 +                       size_t len = strlen(opts[j].name);
16576 +
16577 +                       if (!strncmp(opt_string, opts[j].name, len) &&
16578 +                           (opt_string[len] == '=' || !opt_string[len])) {
16579 +                               result = parse_option(opt_string, &opts[j]);
16580 +                               break;
16581 +                       }
16582 +               }
16583 +               if (j == nr_opts) {
16584 +                       warning("nikita-2307", "Unrecognized option: \"%s\"",
16585 +                               opt_string);
16586 +                       /* -EINVAL is the traditional reply to a bad option */
16587 +                       result = RETERR(-EINVAL);
16588 +               }
16589 +               opt_string = next;
16590 +       }
16591 +       return result;
16592 +}
16593 +
16594 +#define NUM_OPT( label, fmt, addr )                            \
16595 +               {                                               \
16596 +                       .name = ( label ),                      \
16597 +                       .type = OPT_FORMAT,                     \
16598 +                       .u = {                                  \
16599 +                               .f = {                          \
16600 +                                       .format  = ( fmt ),     \
16601 +                                       .nr_args = 1,           \
16602 +                                       .arg1 = ( addr ),       \
16603 +                                       .arg2 = NULL,           \
16604 +                                       .arg3 = NULL,           \
16605 +                                       .arg4 = NULL            \
16606 +                               }                               \
16607 +                       }                                       \
16608 +               }
16609 +
16610 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
16611 +
16612 +#define BIT_OPT(label, bitnr)                                  \
16613 +       {                                                       \
16614 +               .name = label,                                  \
16615 +               .type = OPT_BIT,                                \
16616 +               .u = {                                          \
16617 +                       .bit = {                                \
16618 +                               .nr = bitnr,                    \
16619 +                               .addr = &sbinfo->fs_flags       \
16620 +                       }                                       \
16621 +               }                                               \
16622 +       }
16623 +
16624 +#define MAX_NR_OPTIONS (30)
16625 +
16626 +/**
16627 + * reiser4_init_super_data - initialize reiser4 private super block
16628 + * @super: super block to initialize
16629 + * @opt_string: list of reiser4 mount options
16630 + *
16631 + * Sets various reiser4 parameters to default values. Parses mount options and
16632 + * overwrites default settings. Returns 0 on success, error code otherwise.
16633 + */
16634 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
16635 +{
16636 +       int result;
16637 +       struct opt_desc *opts, *p;
16638 +       reiser4_super_info_data *sbinfo = get_super_private(super);
16639 +
16640 +       /* initialize super, export, dentry operations */
16641 +       sbinfo->ops.super = reiser4_super_operations;
16642 +       sbinfo->ops.export = reiser4_export_operations;
16643 +       sbinfo->ops.dentry = reiser4_dentry_operations;
16644 +       super->s_op = &sbinfo->ops.super;
16645 +       super->s_export_op = &sbinfo->ops.export;
16646 +
16647 +       /* initialize transaction manager parameters to default values */
16648 +       sbinfo->tmgr.atom_max_size = totalram_pages / 4;
16649 +       sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
16650 +       sbinfo->tmgr.atom_min_size = 256;
16651 +       sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
16652 +
16653 +       /* initialize cbk cache parameter */
16654 +       sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
16655 +
16656 +       /* initialize flush parameters */
16657 +       sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
16658 +       sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
16659 +       sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
16660 +       sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
16661 +
16662 +       sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
16663 +
16664 +       /* preliminary tree initializations */
16665 +       sbinfo->tree.super = super;
16666 +       sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
16667 +       sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
16668 +       sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
16669 +       sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
16670 +       rwlock_init(&(sbinfo->tree.tree_lock));
16671 +       spin_lock_init(&(sbinfo->tree.epoch_lock));
16672 +
16673 +       /* initialize default readahead params */
16674 +       sbinfo->ra_params.max = num_physpages / 4;
16675 +       sbinfo->ra_params.flags = 0;
16676 +
16677 +       /* allocate memory for structure describing reiser4 mount options */
16678 +       opts = kmalloc(sizeof(struct opt_desc) * MAX_NR_OPTIONS,
16679 +                      reiser4_ctx_gfp_mask_get());
16680 +       if (opts == NULL)
16681 +               return RETERR(-ENOMEM);
16682 +
16683 +       /* initialize structure describing reiser4 mount options */
16684 +       p = opts;
16685 +
16686 +#if REISER4_DEBUG
16687 +#  define OPT_ARRAY_CHECK if ((p) >= (opts) + MAX_NR_OPTIONS) {        \
16688 +               warning ("zam-1046", "opt array is overloaded"); break; \
16689 +       }
16690 +#else
16691 +#   define OPT_ARRAY_CHECK noop
16692 +#endif
16693 +
16694 +#define PUSH_OPT(...)                          \
16695 +do {                                           \
16696 +       struct opt_desc o = __VA_ARGS__;        \
16697 +       OPT_ARRAY_CHECK;                        \
16698 +       *p ++ = o;                              \
16699 +} while (0)
16700 +
16701 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
16702 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
16703 +
16704 +       /*
16705 +        * tmgr.atom_max_size=N
16706 +        * Atoms containing more than N blocks will be forced to commit. N is
16707 +        * decimal.
16708 +        */
16709 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
16710 +       /*
16711 +        * tmgr.atom_max_age=N
16712 +        * Atoms older than N seconds will be forced to commit. N is decimal.
16713 +        */
16714 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
16715 +       /*
16716 +        * tmgr.atom_min_size=N
16717 +        * In committing an atom to free dirty pages, force the atom less than
16718 +        * N in size to fuse with another one.
16719 +        */
16720 +       PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
16721 +       /*
16722 +        * tmgr.atom_max_flushers=N
16723 +        * limit of concurrent flushers for one atom. 0 means no limit.
16724 +        */
16725 +       PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
16726 +       /*
16727 +        * tree.cbk_cache.nr_slots=N
16728 +        * Number of slots in the cbk cache.
16729 +        */
16730 +       PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
16731 +       /*
16732 +        * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
16733 +        * leaf-level blocks it will force them to be relocated.
16734 +        */
16735 +       PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
16736 +       /*
16737 +        * If flush can find a block allocation closer than at most
16738 +        * FLUSH_RELOCATE_DISTANCE from the preceder it will relocate to that
16739 +        * position.
16740 +        */
16741 +       PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
16742 +       /*
16743 +        * If we have written this much or more blocks before encountering busy
16744 +        * jnode in flush list - abort flushing hoping that next time we get
16745 +        * called this jnode will be clean already, and we will save some
16746 +        * seeks.
16747 +        */
16748 +       PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
16749 +       /* The maximum number of nodes to scan left on a level during flush. */
16750 +       PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
16751 +       /* preferred IO size */
16752 +       PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
16753 +       /* carry flags used for insertion of new nodes */
16754 +       PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
16755 +       /* carry flags used for insertion of new extents */
16756 +       PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
16757 +       /* carry flags used for paste operations */
16758 +       PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
16759 +       /* carry flags used for insert operations */
16760 +       PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
16761 +
16762 +#ifdef CONFIG_REISER4_BADBLOCKS
16763 +       /*
16764 +        * Alternative master superblock location in case its original
16765 +        * location is not writeable/accessible. This is offset in BYTES.
16766 +        */
16767 +       PUSH_SB_FIELD_OPT(altsuper, "%lu");
16768 +#endif
16769 +
16770 +       /* turn on BSD-style gid assignment */
16771 +       PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
16772 +       /* turn on 32 bit times */
16773 +       PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
16774 +       /*
16775 +        * Don't load all bitmap blocks at mount time, it is useful for
16776 +        * machines with tiny RAM and large disks.
16777 +        */
16778 +       PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
16779 +       /* disable transaction commits during write() */
16780 +       PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
16781 +       /* disable use of write barriers in the reiser4 log writer. */
16782 +       PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
16783 +
16784 +       PUSH_OPT(
16785 +       {
16786 +               /*
16787 +                * tree traversal readahead parameters:
16788 +                * -o readahead=MAXNUM:FLAGS
16789 +                * MAXNUM - max number of nodes to request readahead for: -1UL
16790 +                * will set it to max_sane_readahead()
16791 +                * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
16792 +                * CONTINUE_ON_PRESENT
16793 +                */
16794 +               .name = "readahead",
16795 +               .type = OPT_FORMAT,
16796 +               .u = {
16797 +                       .f = {
16798 +                               .format = "%u:%u",
16799 +                               .nr_args = 2,
16800 +                               .arg1 = &sbinfo->ra_params.max,
16801 +                               .arg2 = &sbinfo->ra_params.flags,
16802 +                               .arg3 = NULL,
16803 +                               .arg4 = NULL
16804 +                       }
16805 +               }
16806 +       }
16807 +       );
16808 +
16809 +       /* What to do in case of fs error */
16810 +       PUSH_OPT(
16811 +       {
16812 +               .name = "onerror",
16813 +               .type = OPT_ONEOF,
16814 +               .u = {
16815 +                       .oneof = {
16816 +                               .result = &sbinfo->onerror,
16817 +                               .list = {
16818 +                                       "panic", "remount-ro", NULL
16819 +                               },
16820 +                       }
16821 +               }
16822 +       }
16823 +       );
16824 +
16825 +       /* modify default settings to values set by mount options */
16826 +       result = parse_options(opt_string, opts, p - opts);
16827 +       kfree(opts);
16828 +       if (result != 0)
16829 +               return result;
16830 +
16831 +       /* correct settings to sanity values */
16832 +       sbinfo->tmgr.atom_max_age *= HZ;
16833 +       if (sbinfo->tmgr.atom_max_age <= 0)
16834 +               /* overflow */
16835 +               sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
16836 +
16837 +       /* round optimal io size down to a multiple of 1 << VFS_BLKSIZE_BITS */
16838 +       sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
16839 +       sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
16840 +       if (sbinfo->optimal_io_size == 0) {
16841 +               warning("nikita-2497", "optimal_io_size is too small");
16842 +               return RETERR(-EINVAL);
16843 +       }
16844 +       return result;
16845 +}
16846 +
16847 +/**
16848 + * reiser4_init_read_super - read reiser4 master super block
16849 + * @super: super block to fill
16850 + * @silent: if 0 - print warnings
16851 + *
16852 + * Reads reiser4 master super block either from predefined location or from
16853 + * location specified by altsuper mount option, initializes disk format plugin.
16854 + */
16855 +int reiser4_init_read_super(struct super_block *super, int silent)
16856 +{
16857 +       struct buffer_head *super_bh;
16858 +       struct reiser4_master_sb *master_sb;
16859 +       reiser4_super_info_data *sbinfo = get_super_private(super);
16860 +       unsigned long blocksize;
16861 +
16862 + read_super_block:
16863 +#ifdef CONFIG_REISER4_BADBLOCKS
16864 +       if (sbinfo->altsuper)
16865 +               /*
16866 +                * read reiser4 master super block at position specified by
16867 +                * mount option
16868 +                */
16869 +               super_bh = sb_bread(super,
16870 +                                   (sector_t)(sbinfo->altsuper / super->s_blocksize));
16871 +       else
16872 +#endif
16873 +               /* read reiser4 master super block at 16-th 4096 block */
16874 +               super_bh = sb_bread(super,
16875 +                                   (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
16876 +       if (!super_bh)
16877 +               return RETERR(-EIO);
16878 +
16879 +       master_sb = (struct reiser4_master_sb *)super_bh->b_data;
16880 +       /* check reiser4 magic string */
16881 +       if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
16882 +                    sizeof(REISER4_SUPER_MAGIC_STRING))) {
16883 +               /* reiser4 master super block contains filesystem blocksize */
16884 +               blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
16885 +
16886 +               if (blocksize != PAGE_CACHE_SIZE) {
16887 +                       /*
16888 +                        * currently reiser4's blocksize must be equal to
16889 +                        * pagesize
16890 +                        */
16891 +                       if (!silent)
16892 +                               warning("nikita-2609",
16893 +                                       "%s: wrong block size %ld\n", super->s_id,
16894 +                                       blocksize);
16895 +                       brelse(super_bh);
16896 +                       return RETERR(-EINVAL);
16897 +               }
16898 +               if (blocksize != super->s_blocksize) {
16899 +                       /*
16900 +                        * filesystem uses different blocksize. Reread master
16901 +                        * super block with correct blocksize
16902 +                        */
16903 +                       brelse(super_bh);
16904 +                       if (!sb_set_blocksize(super, (int)blocksize))
16905 +                               return RETERR(-EINVAL);
16906 +                       goto read_super_block;
16907 +               }
16908 +
16909 +               sbinfo->df_plug =
16910 +                       disk_format_plugin_by_id(
16911 +                               le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16912 +               if (sbinfo->df_plug == NULL) {
16913 +                       if (!silent)
16914 +                               warning("nikita-26091",
16915 +                                       "%s: unknown disk format plugin %d\n",
16916 +                                       super->s_id,
16917 +                                       le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
16918 +                       brelse(super_bh);
16919 +                       return RETERR(-EINVAL);
16920 +               }
16921 +               sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
16922 +               brelse(super_bh);
16923 +               return 0;
16924 +       }
16925 +
16926 +       /* there is no reiser4 on the device */
16927 +       if (!silent)
16928 +               warning("nikita-2608",
16929 +                       "%s: wrong master super block magic", super->s_id);
16930 +       brelse(super_bh);
16931 +       return RETERR(-EINVAL);
16932 +}
16933 +
16934 +static struct {  /* plugin (type, id) pair kept for each PSET_* slot */
16935 +       reiser4_plugin_type type;
16936 +       reiser4_plugin_id id;
16937 +} default_plugins[PSET_LAST] = {
16938 +       [PSET_FILE] = {
16939 +               .type = REISER4_FILE_PLUGIN_TYPE,
16940 +               .id = UNIX_FILE_PLUGIN_ID
16941 +       },
16942 +       [PSET_DIR] = {
16943 +               .type = REISER4_DIR_PLUGIN_TYPE,
16944 +               .id = HASHED_DIR_PLUGIN_ID
16945 +       },
16946 +       [PSET_HASH] = {
16947 +               .type = REISER4_HASH_PLUGIN_TYPE,
16948 +               .id = R5_HASH_ID
16949 +       },
16950 +       [PSET_FIBRATION] = {
16951 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
16952 +               .id = FIBRATION_DOT_O
16953 +       },
16954 +       [PSET_PERM] = {
16955 +               .type = REISER4_PERM_PLUGIN_TYPE,
16956 +               .id = NULL_PERM_ID
16957 +       },
16958 +       [PSET_FORMATTING] = {
16959 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
16960 +               .id = SMALL_FILE_FORMATTING_ID
16961 +       },
16962 +       [PSET_SD] = {
16963 +               .type = REISER4_ITEM_PLUGIN_TYPE,
16964 +               .id = STATIC_STAT_DATA_ID
16965 +       },
16966 +       [PSET_DIR_ITEM] = {
16967 +               .type = REISER4_ITEM_PLUGIN_TYPE,
16968 +               .id = COMPOUND_DIR_ID
16969 +       },
16970 +       [PSET_CIPHER] = {
16971 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
16972 +               .id = NONE_CIPHER_ID
16973 +       },
16974 +       [PSET_DIGEST] = {
16975 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
16976 +               .id = SHA256_32_DIGEST_ID
16977 +       },
16978 +       [PSET_COMPRESSION] = {
16979 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
16980 +               .id = LZO1_COMPRESSION_ID
16981 +       },
16982 +       [PSET_COMPRESSION_MODE] = {
16983 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
16984 +               .id = CONVX_COMPRESSION_MODE_ID
16985 +       },
16986 +       [PSET_CLUSTER] = {
16987 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
16988 +               .id = CLUSTER_64K_ID
16989 +       },
16990 +       [PSET_CREATE] = {
16991 +               .type = REISER4_FILE_PLUGIN_TYPE,
16992 +               .id = UNIX_FILE_PLUGIN_ID
16993 +       }
16994 +};
16995 +
16996 +/* return plugin_by_id() of the default_plugins[] entry for @memb */
16997 +reiser4_plugin *get_default_plugin(pset_member memb)
16998 +{
16999 +       return plugin_by_id(default_plugins[memb].type,
17000 +                           default_plugins[memb].id);
17001 +}
17002 +
17003 +/**
17004 + * reiser4_init_root_inode - obtain inode of root directory
17005 + * @super: super block of filesystem
17006 + *
17007 + * Obtains inode of root directory (reading it from disk), initializes plugin
17008 + * set if it was not initialized; a failure to do so is returned to the caller.
17009 + */
17010 +int reiser4_init_root_inode(struct super_block *super)
17011 +{
17012 +       reiser4_super_info_data *sbinfo = get_super_private(super);
17013 +       struct inode *inode;
17014 +       int result = 0;
17015 +
17016 +       inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17017 +       if (IS_ERR(inode))
17018 +               return RETERR(PTR_ERR(inode));
17019 +
17020 +       super->s_root = d_alloc_root(inode);
17021 +       if (!super->s_root) {
17022 +               iput(inode);
17023 +               return RETERR(-ENOMEM);
17024 +       }
17025 +
17026 +       super->s_root->d_op = &sbinfo->ops.dentry;
17027 +
17028 +       if (!is_inode_loaded(inode)) {
17029 +               pset_member memb;
17030 +               plugin_set *pset;
17031 +
17032 +               pset = reiser4_inode_data(inode)->pset;
17033 +               for (memb = 0; memb < PSET_LAST; ++memb) {
17034 +
17035 +                       if (aset_get(pset, memb) != NULL)
17036 +                               continue;
17037 +
17038 +                       result = grab_plugin_pset(inode, NULL, memb);
17039 +                       if (result != 0)
17040 +                               break;
17041 +
17042 +                       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17043 +               }
17044 +
17045 +               if (result == 0) {
17046 +                       if (REISER4_DEBUG) {
17047 +                               for (memb = 0; memb < PSET_LAST; ++memb)
17048 +                                       assert("nikita-3500",
17049 +                                              aset_get(pset, memb) != NULL);
17050 +                       }
17051 +               } else
17052 +                       warning("nikita-3448", "Cannot set plugins of root: %i",
17053 +                               result);
17054 +               reiser4_iget_complete(inode);
17055 +
17056 +               /* The default pset kept in the root dir may have been changed
17057 +                  (length is unknown): call update_sd, unless pset setup failed
17058 +                  - its error must not be overwritten below. */
17059 +               if (result == 0 &&
17060 +                   !reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17061 +                       result = reiser4_grab_space(
17062 +                               inode_file_plugin(inode)->estimate.update(inode),
17063 +                               BA_CAN_COMMIT);
17064 +                       if (result == 0)
17065 +                               result = reiser4_update_sd(inode);
17066 +                       all_grabbed2free();
17067 +               }
17068 +       }
17069 +
17070 +       super->s_maxbytes = MAX_LFS_FILESIZE;
17071 +       return result;
17072 +}
17073 +
17074 +/*
17075 + * Local variables:
17076 + * c-indentation-style: "K&R"
17077 + * mode-name: "LC"
17078 + * c-basic-offset: 8
17079 + * tab-width: 8
17080 + * fill-column: 79
17081 + * End:
17082 + */
17083 diff -urN linux-2.6.27.orig/fs/reiser4/inode.c linux-2.6.27/fs/reiser4/inode.c
17084 --- linux-2.6.27.orig/fs/reiser4/inode.c        1970-01-01 03:00:00.000000000 +0300
17085 +++ linux-2.6.27/fs/reiser4/inode.c     2008-10-12 18:20:00.000000000 +0400
17086 @@ -0,0 +1,709 @@
17087 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17088 +
17089 +/* Inode specific operations. */
17090 +
17091 +#include "forward.h"
17092 +#include "debug.h"
17093 +#include "key.h"
17094 +#include "kassign.h"
17095 +#include "coord.h"
17096 +#include "seal.h"
17097 +#include "dscale.h"
17098 +#include "plugin/item/item.h"
17099 +#include "plugin/security/perm.h"
17100 +#include "plugin/plugin.h"
17101 +#include "plugin/object.h"
17102 +#include "znode.h"
17103 +#include "vfs_ops.h"
17104 +#include "inode.h"
17105 +#include "super.h"
17106 +#include "reiser4.h"
17107 +
17108 +#include <linux/fs.h>          /* for struct super_block,  address_space */
17109 +
17110 +/* Return the reiser4 internal tree of the file system that @inode belongs to. */
17111 +/* Audited by: green(2002.06.17) */
17112 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried; must be non-NULL */ )
17113 +{
17114 +       assert("nikita-256", inode != NULL);
17115 +       assert("nikita-257", inode->i_sb != NULL);
17116 +       return reiser4_get_tree(inode->i_sb);   /* the tree is looked up through the super block */
17117 +}
17118 +
17119 +/* Return a pointer to the ->flags word kept in the reiser4-specific part of @inode. */
17120 +static inline unsigned long *inode_flags(const struct inode *const inode)
17121 +{
17122 +       assert("nikita-2842", inode != NULL);
17123 +       return &reiser4_inode_data(inode)->flags;
17124 +}
17125 +
17126 +/* Set reiser4-specific flag @f in @inode; @f is the bit number handed to set_bit(). */
17127 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17128 +{
17129 +       assert("nikita-2248", inode != NULL);
17130 +       set_bit((int)f, inode_flags(inode));
17131 +}
17132 +
17133 +/* Clear reiser4-specific flag @f in @inode; @f is the bit number handed to clear_bit(). */
17134 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17135 +{
17136 +       assert("nikita-2250", inode != NULL);
17137 +       clear_bit((int)f, inode_flags(inode));
17138 +}
17139 +
17140 +/* Return the result of test_bit() for reiser4-specific flag @f of @inode (true if set). */
17141 +int reiser4_inode_get_flag(const struct inode *inode,
17142 +                          reiser4_file_plugin_flags f)
17143 +{
17144 +       assert("nikita-2251", inode != NULL);
17145 +       return test_bit((int)f, inode_flags(inode));
17146 +}
17147 +
17148 +/* Convert @oid to an inode number by a plain cast to ino_t (no hashing, unlike oid_to_uino()). */
17149 +ino_t oid_to_ino(oid_t oid)
17150 +{
17151 +       return (ino_t) oid;
17152 +}
17153 +
17154 +/* Convert @oid to the inode number that is shown to user space. */
17155 +ino_t oid_to_uino(oid_t oid)
17156 +{
17157 +       /* A reiser4 object is uniquely identified by its oid, which is a 64 bit
17158 +          quantity. The kernel in-memory inode is indexed (in the hash table)
17159 +          by the 32 bit i_ino field, but this is not a problem, because there
17160 +          is a way to further distinguish inodes with identical inode numbers
17161 +          (the find_actor supplied to iget()).
17162 +
17163 +          But user space expects a unique 32 bit inode number. Obviously this
17164 +          is impossible. The work-around is to somehow hash the oid into a
17165 +          user visible inode number.
17166 +        */
17167 +       oid_t max_ino = (ino_t) ~ 0;    /* all-ones value of ino_t, widened to oid_t */
17168 +
17169 +       if (REISER4_INO_IS_OID || (oid <= max_ino))
17170 +               return oid;
17171 +       else
17172 +               /* This is remotely similar to the algorithm used to find the
17173 +                  next pid for a process: after wrap-around start from some
17174 +                  offset rather than from 0. The idea is that there are some
17175 +                  long living objects with which we don't want to collide.
17176 +                */
17177 +               return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17178 +}
17179 +
17180 +/* Check that @inode is on a reiser4 file-system; a NULL @inode yields false. */
17181 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17182 +{
17183 +       return inode != NULL && is_reiser4_super(inode->i_sb);
17184 +}
17185 +
17186 +/* Maximal length of a name that can be stored in directory @inode: the
17187 +   result of the directory item plugin's ->s.dir.max_name_len() method when
17188 +   that method is set, 255 otherwise. Used during file creation and lookup. */
17189 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17190 +{
17191 +       assert("nikita-287", is_reiser4_inode(inode));
17192 +       assert("nikita-1710", inode_dir_item_plugin(inode));
17193 +       if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17194 +               return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17195 +       else
17196 +               return 255;
17197 +}
17198 +
17199 +#if REISER4_USE_COLLISION_LIMIT
17200 +/* Maximal number of hash collisions for directory @dir, read from its ->plugin.max_collisions. */
17201 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17202 +{
17203 +       assert("nikita-1711", dir != NULL);
17204 +       return reiser4_inode_data(dir)->plugin.max_collisions;
17205 +}
17206 +#endif  /*  REISER4_USE_COLLISION_LIMIT  */
17207 +
17208 +/* Install file, inode, and address_space operations on @inode, depending on
17209 +   its mode. Returns 0, or -EINVAL (after marking @inode bad) for an unknown mode. */
17210 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17211 +                   reiser4_object_create_data * data   /* parameters to create
17212 +                                                        * object; may be NULL */ )
17213 +{
17214 +       reiser4_super_info_data *sinfo; /* NOTE(review): assigned below but never read in this function */
17215 +       file_plugin *fplug;
17216 +       dir_plugin *dplug;
17217 +
17218 +       fplug = inode_file_plugin(inode);
17219 +       dplug = inode_dir_plugin(inode);
17220 +
17221 +       sinfo = get_super_private(inode->i_sb);
17222 +
17223 +       switch (inode->i_mode & S_IFMT) {
17224 +       case S_IFSOCK:
17225 +       case S_IFBLK:
17226 +       case S_IFCHR:
17227 +       case S_IFIFO:
17228 +               {
17229 +                       dev_t rdev;     /* to keep gcc happy */
17230 +
17231 +                       assert("vs-46", fplug != NULL);
17232 +                       /* ugly hack with rdev: without @data the device number comes from the inode itself */
17233 +                       if (data == NULL) {
17234 +                               rdev = inode->i_rdev;
17235 +                               inode->i_rdev = 0;
17236 +                       } else
17237 +                               rdev = data->rdev;
17238 +                       inode->i_blocks = 0;
17239 +                       assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17240 +                       inode->i_op = file_plugins[fplug->h.id].inode_ops;
17241 +                       /* initialize inode->i_fop and inode->i_rdev for block and char
17242 +                          devices */
17243 +                       init_special_inode(inode, inode->i_mode, rdev);
17244 +                       /* all address space operations are null */
17245 +                       inode->i_mapping->a_ops =
17246 +                           file_plugins[fplug->h.id].as_ops;
17247 +                       break;
17248 +               }
17249 +       case S_IFLNK:
17250 +               assert("vs-46", fplug != NULL);
17251 +               assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17252 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
17253 +               inode->i_fop = NULL;    /* symlinks get no file operations */
17254 +               /* all address space operations are null */
17255 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17256 +               break;
17257 +       case S_IFDIR:
17258 +               assert("vs-46", dplug != NULL);
17259 +               assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17260 +                                dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17261 +               inode->i_op = dir_plugins[dplug->h.id].inode_ops;       /* directories take all three tables from the dir plugin */
17262 +               inode->i_fop = dir_plugins[dplug->h.id].file_ops;
17263 +               inode->i_mapping->a_ops = dir_plugins[dplug->h.id].as_ops;
17264 +               break;
17265 +       case S_IFREG:
17266 +               assert("vs-46", fplug != NULL);
17267 +               assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17268 +                                fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17269 +               inode->i_op = file_plugins[fplug->h.id].inode_ops;
17270 +               inode->i_fop = file_plugins[fplug->h.id].file_ops;
17271 +               inode->i_mapping->a_ops = file_plugins[fplug->h.id].as_ops;
17272 +               break;
17273 +       default:
17274 +               warning("nikita-291", "wrong file mode: %o for %llu",
17275 +                       inode->i_mode,
17276 +                       (unsigned long long)get_inode_oid(inode));
17277 +               reiser4_make_bad_inode(inode);
17278 +               return RETERR(-EINVAL);
17279 +       }
17280 +       return 0;
17281 +}
17282 +
17283 +/* Initialize inode from disk data. Called with inode locked.
17284 +   Return inode locked. Returns 0 or the first error encountered. */
17285 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17286 +                     coord_t * coord /* coord of stat data */ )
17287 +{
17288 +       int result;
17289 +       item_plugin *iplug;
17290 +       void *body;
17291 +       int length;
17292 +       reiser4_inode *state;
17293 +
17294 +       assert("nikita-292", coord != NULL);
17295 +       assert("nikita-293", inode != NULL);
17296 +
17297 +       coord_clear_iplug(coord);
17298 +       result = zload(coord->node);    /* paired with zrelse() at the end of this function */
17299 +       if (result)
17300 +               return result;
17301 +       iplug = item_plugin_by_coord(coord);
17302 +       body = item_body_by_coord(coord);
17303 +       length = item_length_by_coord(coord);
17304 +
17305 +       assert("nikita-295", iplug != NULL);
17306 +       assert("nikita-296", body != NULL);
17307 +       assert("nikita-297", length > 0);
17308 +
17309 +       /* inode is under I_LOCK now */
17310 +
17311 +       state = reiser4_inode_data(inode);
17312 +       /* call stat-data plugin method to load sd content into inode */
17313 +       result = iplug->s.sd.init_inode(inode, body, length);
17314 +       set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));        /* NOTE(review): executed even when ->init_inode() failed */
17315 +       if (result == 0) {
17316 +               result = setup_inode_ops(inode, NULL);
17317 +               if (result == 0 && inode->i_sb->s_root &&
17318 +                   inode->i_sb->s_root->d_inode)
17319 +                       result = finish_pset(inode);    /* only once the super block has a root dentry with an inode */
17320 +       }
17321 +       zrelse(coord->node);
17322 +       return result;
17323 +}
17324 +
17325 +/* Read `inode' from the disk. This is what was previously in
17326 +   reiserfs_read_inode2(). On failure the inode is marked bad via
17327 +   reiser4_make_bad_inode() and the error is returned.
17328 +   Must be called with inode locked. Return inode still locked.
17329 +*/
17330 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17331 +                     const reiser4_key * key /* key of stat data */ ,
17332 +                     int silent /* passed through to lookup_sd() */)
17333 +{
17334 +       int result;
17335 +       lock_handle lh;
17336 +       reiser4_inode *info;
17337 +       coord_t coord;
17338 +
17339 +       assert("nikita-298", inode != NULL);
17340 +       assert("nikita-1945", !is_inode_loaded(inode));
17341 +
17342 +       info = reiser4_inode_data(inode);
17343 +       assert("nikita-300", info->locality_id != 0);
17344 +
17345 +       coord_init_zero(&coord);
17346 +       init_lh(&lh);
17347 +       /* locate stat-data in a tree and return znode locked */
17348 +       result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17349 +       assert("nikita-301", !is_inode_loaded(inode));
17350 +       if (result == 0) {
17351 +               /* use stat-data plugin to load sd into inode. */
17352 +               result = init_inode(inode, &coord);
17353 +               if (result == 0) {
17354 +                       /* initialize stat-data seal and remember the sd coord */
17355 +                       spin_lock_inode(inode);
17356 +                       reiser4_seal_init(&info->sd_seal, &coord, key);
17357 +                       info->sd_coord = coord;
17358 +                       spin_unlock_inode(inode);
17359 +
17360 +                       /* call file plugin's method to initialize plugin
17361 +                        * specific part of inode (the method is optional) */
17362 +                       if (inode_file_plugin(inode)->init_inode_data)
17363 +                               inode_file_plugin(inode)->init_inode_data(inode,
17364 +                                                                         NULL,
17365 +                                                                         0);
17366 +                       /* load detached directory cursors for stateless
17367 +                        * directory readers (NFS). */
17368 +                       reiser4_load_cursors(inode);
17369 +
17370 +                       /* Check the opened inode for consistency. */
17371 +                       result =
17372 +                           get_super_private(inode->i_sb)->df_plug->
17373 +                           check_open(inode);
17374 +               }
17375 +       }
17376 +       /* lookup_sd() doesn't release coord because we want znode
17377 +          stay read-locked while stat-data fields are accessed in
17378 +          init_inode() */
17379 +       done_lh(&lh);
17380 +
17381 +       if (result != 0)
17382 +               reiser4_make_bad_inode(inode);
17383 +       return result;
17384 +}
17385 +
17386 +/* Initialise new reiser4 inode being inserted into hash table; always returns 0. */
17387 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17388 +                            void *opaque       /* key of stat data passed to the
17389 +                                                * iget5_locked as cookie */ )
17390 +{
17391 +       reiser4_key *key;
17392 +
17393 +       assert("nikita-1995", inode != NULL);
17394 +       assert("nikita-1996", opaque != NULL);
17395 +       key = opaque;
17396 +       set_inode_oid(inode, get_key_objectid(key));    /* oid comes from the key's objectid */
17397 +       reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17398 +       return 0;
17399 +}
17400 +
17401 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17402 +
17403 +   This function is called by iget5_locked() to distinguish reiser4 inodes
17404 +   having the same inode numbers. Such inodes can only exist due to some error
17405 +   condition. One of them should be bad. Inodes with identical inode numbers
17406 +   (objectids) are distinguished by their packing locality.
17407 +   Returns non-zero iff @inode matches the stat data key passed in @opaque.
17408 +*/
17409 +static int reiser4_inode_find_actor(struct inode *inode        /* inode from hash table to
17410 +                                                        * check */ ,
17411 +                                   void *opaque        /* "cookie" passed to
17412 +                                                        * iget5_locked(). This is stat data
17413 +                                                        * key */ )
17414 +{
17415 +       reiser4_key *key;
17416 +
17417 +       key = opaque;
17418 +       return
17419 +           /* oid is unique, so first term is enough, actually. */
17420 +           get_inode_oid(inode) == get_key_objectid(key) &&
17421 +           /*
17422 +            * also, locality should be checked, but locality is stored in
17423 +            * the reiser4-specific part of the inode, and actor can be
17424 +            * called against arbitrary inode that happened to be in this
17425 +            * hash chain. Hence we first have to check that this is
17426 +            * reiser4 inode at least. is_reiser4_inode() is probably too
17427 +            * early to call, as inode may have ->i_op not yet
17428 +            * initialised.
17429 +            */
17430 +           is_reiser4_super(inode->i_sb) &&
17431 +           /*
17432 +            * usually objectid is unique, but pseudo files use counter to
17433 +            * generate objectid. All pseudo files are placed into special
17434 +            * (otherwise unused) locality.
17435 +            */
17436 +           reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17437 +}
17438 +
17439 +/* hook for kmem_cache_create: initializes the ->loading mutex of @info */
17440 +void loading_init_once(reiser4_inode * info)
17441 +{
17442 +       mutex_init(&info->loading);
17443 +}
17444 +
17445 +/* for reiser4_alloc_inode: asserts that the ->loading mutex of @info is not held */
17446 +void loading_alloc(reiser4_inode * info)
17447 +{
17448 +       assert("vs-1717", !mutex_is_locked(&info->loading));
17449 +}
17450 +
17451 +/* for reiser4_destroy: asserts that the ->loading mutex of @info is not held */
17452 +void loading_destroy(reiser4_inode * info)
17453 +{
17454 +       assert("vs-1717a", !mutex_is_locked(&info->loading));
17455 +}
17456 +
17457 +static void loading_begin(reiser4_inode * info) /* take the ->loading mutex of @info */
17458 +{
17459 +       mutex_lock(&info->loading);
17460 +}
17461 +
17462 +static void loading_end(reiser4_inode * info)  /* release the ->loading mutex of @info */
17463 +{
17464 +       mutex_unlock(&info->loading);
17465 +}
17466 +
17467 +/**
17468 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17469 + * @super: super block of filesystem
17470 + * @key: key of inode's stat-data
17471 + * @silent: passed through to read_inode()
17472 + *
17473 + * This is our helper function a la iget(). It is called by lookup_common()
17474 + * and reiser4_read_super(). Returns the inode or an ERR_PTR(); for an inode
17475 + * read here, ->loading stays held for reiser4_iget_complete() to release.
17476 + */
17477 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17478 +                          int silent)
17479 +{
17480 +       struct inode *inode;
17481 +       int result;
17482 +       reiser4_inode *info;
17483 +
17484 +       assert("nikita-302", super != NULL);
17485 +       assert("nikita-303", key != NULL);
17486 +
17487 +       result = 0;
17488 +
17489 +       /* call iget(). Our ->read_inode() is dummy, so this will either
17490 +          find inode in cache or return uninitialised inode */
17491 +       inode = iget5_locked(super,
17492 +                            (unsigned long)get_key_objectid(key),
17493 +                            reiser4_inode_find_actor,
17494 +                            init_locked_inode, (reiser4_key *) key);
17495 +       if (inode == NULL)
17496 +               return ERR_PTR(RETERR(-ENOMEM));
17497 +       if (is_bad_inode(inode)) {
17498 +               warning("nikita-304", "Bad inode found");
17499 +               reiser4_print_key("key", key);
17500 +               iput(inode);
17501 +               return ERR_PTR(RETERR(-EIO));
17502 +       }
17503 +
17504 +       info = reiser4_inode_data(inode);
17505 +
17506 +       /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
17507 +          loaded and initialized inode from just allocated inode. If
17508 +          REISER4_LOADED bit is not set, reiser4_iget() completes loading under
17509 +          info->loading.  The place in reiser4 which uses not initialized inode
17510 +          is the reiser4 repacker, see repacker-related functions in
17511 +          plugin/item/extent.c */
17512 +       if (!is_inode_loaded(inode)) {
17513 +               loading_begin(info);
17514 +               if (!is_inode_loaded(inode)) {  /* re-check now that ->loading is held */
17515 +                       /* locking: iget5_locked returns locked inode */
17516 +                       assert("nikita-1941", !is_inode_loaded(inode));
17517 +                       assert("nikita-1949",
17518 +                              reiser4_inode_find_actor(inode,
17519 +                                                       (reiser4_key *) key));
17520 +                       /* now, inode has objectid as ->i_ino and locality in
17521 +                          reiser4-specific part. This is enough for
17522 +                          read_inode() to read stat data from the disk */
17523 +                       result = read_inode(inode, key, silent);
17524 +               } else
17525 +                       loading_end(info);      /* inode got loaded while we waited for ->loading */
17526 +       }
17527 +
17528 +       if (inode->i_state & I_NEW)
17529 +               unlock_new_inode(inode);
17530 +
17531 +       if (is_bad_inode(inode)) {
17532 +               assert("vs-1717", result != 0); /* NOTE(review): label "vs-1717" is reused below and in loading_alloc() */
17533 +               loading_end(info);
17534 +               iput(inode);
17535 +               inode = ERR_PTR(result);
17536 +       } else if (REISER4_DEBUG) {
17537 +               reiser4_key found_key;
17538 +
17539 +               assert("vs-1717", result == 0);
17540 +               build_sd_key(inode, &found_key);        /* debug only: compare the inode's sd key with the requested key */
17541 +               if (!keyeq(&found_key, key)) {
17542 +                       warning("nikita-305", "Wrong key in sd");
17543 +                       reiser4_print_key("sought for", key);
17544 +                       reiser4_print_key("found", &found_key);
17545 +               }
17546 +               if (inode->i_nlink == 0) {
17547 +                       warning("nikita-3559", "Unlinked inode found: %llu\n",
17548 +                               (unsigned long long)get_inode_oid(inode));
17549 +               }
17550 +       }
17551 +       return inode;
17552 +}
17553 +
17554 +/* reiser4_iget() may return not fully initialized inode, this function should
17555 + * be called after one completes reiser4 inode initializing. It is a no-op if REISER4_LOADED is already set. */
17556 +void reiser4_iget_complete(struct inode *inode)
17557 +{
17558 +       assert("zam-988", is_reiser4_inode(inode));
17559 +
17560 +       if (!is_inode_loaded(inode)) {
17561 +               reiser4_inode_set_flag(inode, REISER4_LOADED);
17562 +               loading_end(reiser4_inode_data(inode)); /* drops the ->loading mutex */
17563 +       }
17564 +}
17565 +
17566 +void reiser4_make_bad_inode(struct inode *inode)       /* clear REISER4_LOADED, then mark @inode bad via make_bad_inode() */
17567 +{
17568 +       assert("nikita-1934", inode != NULL);
17569 +
17570 +       /* clear LOADED bit */
17571 +       reiser4_inode_clr_flag(inode, REISER4_LOADED);
17572 +       make_bad_inode(inode);
17573 +       return;
17574 +}
17575 +
17576 +file_plugin *inode_file_plugin(const struct inode * inode)     /* ->file member of @inode's plugin set (pset) */
17577 +{
17578 +       assert("nikita-1997", inode != NULL);
17579 +       return reiser4_inode_data(inode)->pset->file;
17580 +}
17581 +
17582 +dir_plugin *inode_dir_plugin(const struct inode * inode)       /* ->dir member of @inode's plugin set (pset) */
17583 +{
17584 +       assert("nikita-1998", inode != NULL);
17585 +       return reiser4_inode_data(inode)->pset->dir;
17586 +}
17587 +
17588 +formatting_plugin *inode_formatting_plugin(const struct inode * inode) /* ->formatting member of @inode's pset */
17589 +{
17590 +       assert("nikita-2000", inode != NULL);
17591 +       return reiser4_inode_data(inode)->pset->formatting;
17592 +}
17593 +
17594 +hash_plugin *inode_hash_plugin(const struct inode * inode)     /* ->hash member of @inode's plugin set (pset) */
17595 +{
17596 +       assert("nikita-2001", inode != NULL);
17597 +       return reiser4_inode_data(inode)->pset->hash;
17598 +}
17599 +
17600 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)   /* ->fibration member of @inode's pset */
17601 +{
17602 +       assert("nikita-2001", inode != NULL);   /* NOTE(review): label "nikita-2001" is also used in inode_hash_plugin() */
17603 +       return reiser4_inode_data(inode)->pset->fibration;
17604 +}
17605 +
17606 +cipher_plugin *inode_cipher_plugin(const struct inode * inode) /* ->cipher member of @inode's plugin set (pset) */
17607 +{
17608 +       assert("edward-36", inode != NULL);
17609 +       return reiser4_inode_data(inode)->pset->cipher;
17610 +}
17611 +
17612 +compression_plugin *inode_compression_plugin(const struct inode * inode)       /* ->compression member of @inode's pset */
17613 +{
17614 +       assert("edward-37", inode != NULL);
17615 +       return reiser4_inode_data(inode)->pset->compression;
17616 +}
17617 +
17618 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
17619 +                                                      inode)   /* ->compression_mode member of @inode's pset */
17620 +{
17621 +       assert("edward-1330", inode != NULL);
17622 +       return reiser4_inode_data(inode)->pset->compression_mode;
17623 +}
17624 +
17625 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)       /* ->cluster member of @inode's plugin set (pset) */
17626 +{
17627 +       assert("edward-1328", inode != NULL);
17628 +       return reiser4_inode_data(inode)->pset->cluster;
17629 +}
17630 +
17631 +file_plugin *inode_create_plugin(const struct inode * inode)   /* ->create member of @inode's plugin set (pset) */
17632 +{
17633 +       assert("edward-1329", inode != NULL);
17634 +       return reiser4_inode_data(inode)->pset->create;
17635 +}
17636 +
17637 +digest_plugin *inode_digest_plugin(const struct inode * inode) /* ->digest member of @inode's plugin set (pset) */
17638 +{
17639 +       assert("edward-86", inode != NULL);
17640 +       return reiser4_inode_data(inode)->pset->digest;
17641 +}
17642 +
17643 +item_plugin *inode_sd_plugin(const struct inode * inode)       /* ->sd (stat-data item) member of @inode's pset */
17644 +{
17645 +       assert("vs-534", inode != NULL);
17646 +       return reiser4_inode_data(inode)->pset->sd;
17647 +}
17648 +
17649 +item_plugin *inode_dir_item_plugin(const struct inode * inode) /* ->dir_item member of @inode's plugin set (pset) */
17650 +{
17651 +       assert("vs-534", inode != NULL);        /* NOTE(review): label "vs-534" is also used in inode_sd_plugin() */
17652 +       return reiser4_inode_data(inode)->pset->dir_item;
17653 +}
17654 +
17655 +file_plugin *child_create_plugin(const struct inode * inode)   /* ->create member of @inode's hset (not its pset) */
17656 +{
17657 +       assert("edward-1329", inode != NULL);   /* NOTE(review): label "edward-1329" is also used in inode_create_plugin() */
17658 +       return reiser4_inode_data(inode)->hset->create;
17659 +}
17660 +
17661 +void inode_set_extension(struct inode *inode, sd_ext_bits ext) /* set bit @ext in ->extmask; inode spin lock must be held */
17662 +{
17663 +       reiser4_inode *state;
17664 +
17665 +       assert("nikita-2716", inode != NULL);
17666 +       assert("nikita-2717", ext < LAST_SD_EXTENSION);
17667 +       assert("nikita-3491", spin_inode_is_locked(inode));
17668 +
17669 +       state = reiser4_inode_data(inode);
17670 +       state->extmask |= 1 << ext;
17671 +       /* force re-calculation of stat-data length on next call to
17672 +          update_sd(). */
17673 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17674 +}
17675 +
17676 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext) /* clear bit @ext in ->extmask; inode spin lock must be held */
17677 +{
17678 +       reiser4_inode *state;
17679 +
17680 +       assert("vpf-1926", inode != NULL);
17681 +       assert("vpf-1927", ext < LAST_SD_EXTENSION);
17682 +       assert("vpf-1928", spin_inode_is_locked(inode));
17683 +
17684 +       state = reiser4_inode_data(inode);
17685 +       state->extmask &= ~(1 << ext);
17686 +       /* force re-calculation of stat-data length on next call to
17687 +          update_sd(). */
17688 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17689 +}
17690 +
17691 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)       /* takes no lock itself; see inode_check_scale() */
17692 +{
17693 +       assert("edward-1287", inode != NULL);
17694 +       if (!dscale_fit(old, new))      /* when dscale_fit() fails the cached stat-data length is invalidated */
17695 +               reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17696 +       return;
17697 +}
17698 +
17699 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)      /* inode_check_scale_nolock() under the inode spin lock */
17700 +{
17701 +       assert("nikita-2875", inode != NULL);
17702 +       spin_lock_inode(inode);
17703 +       inode_check_scale_nolock(inode, old, new);
17704 +       spin_unlock_inode(inode);
17705 +}
17706 +
17707 +/*
17708 + * Initialize ->ordering field of inode, which defines how file stat-data and
17709 + * body are ordered within a tree relative to other objects of the same parent
17710 + * directory. The ordering is taken from the entry key (@create) or the sd key.
17711 + */
17712 +void
17713 +init_inode_ordering(struct inode *inode,
17714 +                   reiser4_object_create_data * crd, int create)
17715 +{
17716 +       reiser4_key key;
17717 +
17718 +       if (create) {
17719 +               struct inode *parent;
17720 +
17721 +               parent = crd->parent;   /* @crd is only dereferenced on the @create path */
17722 +               assert("nikita-3224", inode_dir_plugin(parent) != NULL);
17723 +               inode_dir_plugin(parent)->build_entry_key(parent,
17724 +                                                         &crd->dentry->d_name,
17725 +                                                         &key);
17726 +       } else {
17727 +               coord_t *coord;
17728 +
17729 +               coord = &reiser4_inode_data(inode)->sd_coord;
17730 +               coord_clear_iplug(coord);
17731 +               /* safe to use ->sd_coord, because node is under long term
17732 +                * lock */
17733 +               WITH_DATA(coord->node, item_key_by_coord(coord, &key));
17734 +       }
17735 +
17736 +       set_inode_ordering(inode, get_key_ordering(&key));
17737 +}
17738 +
17739 +znode *inode_get_vroot(struct inode *inode)    /* zlook() of the block recorded in ->vroot; NULL if it equals UBER_TREE_ADDR */
17740 +{
17741 +       reiser4_block_nr blk;
17742 +       znode *result;
17743 +
17744 +       spin_lock_inode(inode);
17745 +       blk = reiser4_inode_data(inode)->vroot; /* copy taken under the inode spin lock */
17746 +       spin_unlock_inode(inode);
17747 +       if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
17748 +               result = zlook(reiser4_tree_by_inode(inode), &blk);
17749 +       else
17750 +               result = NULL;
17751 +       return result;
17752 +}
17753 +
17754 +void inode_set_vroot(struct inode *inode, znode *vroot)        /* record the block number of @vroot in ->vroot under the inode spin lock */
17755 +{
17756 +       spin_lock_inode(inode);
17757 +       reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
17758 +       spin_unlock_inode(inode);
17759 +}
17760 +
17761 +#if REISER4_DEBUG
17762 +
17763 +void reiser4_inode_invariant(const struct inode *inode)        /* debug only: asserts that the inode spin lock is held */
17764 +{
17765 +       assert("nikita-3077", spin_inode_is_locked(inode));
17766 +}
17767 +
17768 +int inode_has_no_jnodes(reiser4_inode * r4_inode)      /* debug only: true if the jnode tree's ->rnode is NULL and ->nr_jnodes is 0 */
17769 +{
17770 +       return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
17771 +               r4_inode->nr_jnodes == 0;
17772 +}
17773 +
17774 +#endif
17775 +
17776 +/* Return 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise. */
17777 +/* FIXME: shouldn't it be dir plugin method? */
17778 +int is_dir_empty(const struct inode *dir)
17779 +{
17780 +       assert("nikita-1976", dir != NULL);
17781 +
17782 +       /* rely on our method to maintain directory i_size being equal to the
17783 +          number of entries; at most 2 entries means the directory is empty. */
17784 +       return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
17785 +}
17786 +
17787 +/* Make Linus happy.
17788 +   Local variables:
17789 +   c-indentation-style: "K&R"
17790 +   mode-name: "LC"
17791 +   c-basic-offset: 8
17792 +   tab-width: 8
17793 +   fill-column: 120
17794 +   End:
17795 +*/
17796 diff -urN linux-2.6.27.orig/fs/reiser4/inode.h linux-2.6.27/fs/reiser4/inode.h
17797 --- linux-2.6.27.orig/fs/reiser4/inode.h        1970-01-01 03:00:00.000000000 +0300
17798 +++ linux-2.6.27/fs/reiser4/inode.h     2008-10-12 18:20:00.000000000 +0400
17799 @@ -0,0 +1,449 @@
17800 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17801 +
17802 +/* Inode functions. */
17803 +
17804 +#if !defined( __REISER4_INODE_H__ )
17805 +#define __REISER4_INODE_H__
17806 +
17807 +#include "forward.h"
17808 +#include "debug.h"
17809 +#include "key.h"
17810 +#include "seal.h"
17811 +#include "plugin/plugin.h"
17812 +#include "plugin/file/cryptcompress.h"
17813 +#include "plugin/file/file.h"
17814 +#include "plugin/dir/dir.h"
17815 +#include "plugin/plugin_set.h"
17816 +#include "plugin/security/perm.h"
17817 +#include "vfs_ops.h"
17818 +#include "jnode.h"
17819 +#include "fsdata.h"
17820 +
17821 +#include <linux/types.h>       /* for __u?? , ino_t */
17822 +#include <linux/fs.h>          /* for struct super_block, struct
17823 +                                * rw_semaphore, etc  */
17824 +#include <linux/spinlock.h>
17825 +#include <asm/types.h>
17826 +
17827 +/* reiser4-specific inode flags. They are "transient" and are not
17828 +   supposed to be stored on disk. Used to trace "state" of
17829 +   inode. Each value is a bit number within the reiser4_inode ->flags word.
17830 +*/
17831 +typedef enum {
17832 +       /* this is light-weight inode, inheriting some state from its
17833 +          parent  */
17834 +       REISER4_LIGHT_WEIGHT = 0,
17835 +       /* stat data wasn't yet created */
17836 +       REISER4_NO_SD = 1,
17837 +       /* internal immutable flag. Currently is only used
17838 +          to avoid race condition during file creation.
17839 +          See comment in create_object(). */
17840 +       REISER4_IMMUTABLE = 2,
17841 +       /* inode was read from storage */
17842 +       REISER4_LOADED = 3,
17843 +       /* this bit is set for symlinks. inode->i_private points to target
17844 +          name of symlink. */
17845 +       REISER4_GENERIC_PTR_USED = 4,
17846 +       /* set if size of stat-data item for this inode is known. If this is
17847 +        * set we can avoid recalculating size of stat-data on each update. */
17848 +       REISER4_SDLEN_KNOWN = 5,
17849 +       /* reiser4_inode->crypt points to the crypto stat */
17850 +       REISER4_CRYPTO_STAT_LOADED = 6,
17851 +       /* cryptcompress_inode_data points to the secret key */
17852 +       REISER4_SECRET_KEY_INSTALLED = 7,
17853 +       /* File (possibly) has pages corresponding to the tail items, that
17854 +        * were created by ->readpage. It is set by mmap_unix_file() and
17855 +        * sendfile_unix_file(). This bit is inspected by write_unix_file and
17856 +        * kill-hook of tail items. It is never cleared once set. This bit is
17857 +        * modified and inspected under i_mutex. */
17858 +       REISER4_HAS_MMAP = 8,
17859 +       REISER4_PART_MIXED = 9, /* NOTE(review): meaning is not documented here */
17860 +       REISER4_PART_IN_CONV = 10,      /* NOTE(review): meaning is not documented here */
17861 +       /* This flag indicates that file plugin conversion is in progress */
17862 +       REISER4_FILE_CONV_IN_PROGRESS = 11
17863 +} reiser4_file_plugin_flags;
17864 +
17865 +/* state associated with each inode.
17866 +   reiser4 inode.
17867 +
17868 +   NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
17869 +   be of the same size. File-system allocates inodes by itself through
17870 +   s_op->alloc_inode() method. So, it is possible to adjust size of inode
17871 +   at the time of its creation.
17872 +
17873 +   Invariants involving parts of this data-type:
17874 +
17875 +      [inode->eflushed]
17876 +
17877 +*/
17878 +
17879 +typedef struct reiser4_inode reiser4_inode;
17880 +/* return pointer to reiser4-specific part of inode */
17881 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17882 +                                               /* inode queried */ );
17883 +
17884 +#if BITS_PER_LONG == 64
17885 +
17886 +#define REISER4_INO_IS_OID (1) /* whole oid is stored in ->i_ino, see [sg]et_inode_oid() */
17887 +typedef struct {; /* NOTE(review): empty struct with a stray ';' - presumably relies on a GNU C extension, confirm */
17888 +} oid_hi_t;
17889 +
17890 +/* BITS_PER_LONG == 64 */
17891 +#else
17892 +
17893 +#define REISER4_INO_IS_OID (0) /* high part of oid is kept in reiser4_inode->oid_hi */
17894 +typedef __u32 oid_hi_t;
17895 +
17896 +/* BITS_PER_LONG == 64 */
17897 +#endif
17898 +
17899 +struct reiser4_inode {
17900 +       /* spin lock protecting fields of this structure. */
17901 +       spinlock_t guard;
17902 +       /* main plugin set that control the file
17903 +          (see comments in plugin/plugin_set.c) */
17904 +       plugin_set *pset;
17905 +       /* plugin set for inheritance
17906 +          (see comments in plugin/plugin_set.c) */
17907 +       plugin_set *hset;
17908 +       /* high 32 bits of object id (oid_hi_t is an empty type if BITS_PER_LONG == 64) */
17909 +       oid_hi_t oid_hi;
17910 +       /* seal for stat-data */
17911 +       seal_t sd_seal;
17912 +       /* locality id for this file */
17913 +       oid_t locality_id;
17914 +#if REISER4_LARGE_KEY
17915 +       __u64 ordering; /* accessed through get_inode_ordering()/set_inode_ordering() */
17916 +#endif
17917 +       /* coord of stat-data in sealed node */
17918 +       coord_t sd_coord;
17919 +       /* bit-mask of stat-data extensions used by this file */
17920 +       __u64 extmask;
17921 +       /* bitmask of non-default plugins for this inode */
17922 +       __u16 plugin_mask;
17923 +       /* bitmask of set heir plugins for this inode. */
17924 +       __u16 heir_mask;
17925 +       union {
17926 +               struct list_head readdir_list; /* returned by get_readdir_list() */
17927 +               struct list_head not_used;
17928 +       } lists;
17929 +       /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
17930 +       unsigned long flags;
17931 +       union {
17932 +               /* fields specific to unix_file plugin */
17933 +               struct unix_file_info unix_file_info;
17934 +               /* fields specific to cryptcompress file plugin */
17935 +               struct cryptcompress_info cryptcompress_info;
17936 +       } file_plugin_data;
17937 +
17938 +       /* this semaphore is to serialize readers and writers of @pset->file
17939 +        * when file plugin conversion is enabled
17940 +        */
17941 +       struct rw_semaphore conv_sem;
17942 +
17943 +       /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
17944 +          tagged in that tree by EFLUSH_TAG_ANONYMOUS */
17945 +       struct radix_tree_root jnodes_tree;
17946 +#if REISER4_DEBUG
17947 +       /* number of unformatted node jnodes of this file in jnode hash table */
17948 +       unsigned long nr_jnodes;
17949 +#endif
17950 +
17951 +       /* block number of virtual root for this object. See comment above
17952 +        * fs/reiser4/search.c:handle_vroot() */
17953 +       reiser4_block_nr vroot;
17954 +       struct mutex loading; /* NOTE(review): presumably managed by loading_init_once()/loading_alloc()/loading_destroy() - confirm */
17955 +};
17956 +
17957 +void loading_init_once(reiser4_inode *);
17958 +void loading_alloc(reiser4_inode *);
17959 +void loading_destroy(reiser4_inode *);
17960 +
17961 +struct reiser4_inode_object { /* reiser4 part and VFS inode, converted to each other with container_of() */
17962 +       /* private part */
17963 +       reiser4_inode p;
17964 +       /* generic fields not specific to reiser4, but used by VFS */
17965 +       struct inode vfs_inode;
17966 +};
17967 +
17968 +/* return pointer to the reiser4 specific portion of @inode: ->p of the enclosing struct reiser4_inode_object */
17969 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
17970 +                                               /* inode queried */ )
17971 +{
17972 +       assert("nikita-254", inode != NULL);
17973 +       return &container_of(inode, struct reiser4_inode_object, vfs_inode)->p;
17974 +}
17975 +
17976 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
17977 +                                                  r4_inode /* inode queried */
17978 +                                                  )
17979 +{ /* inverse of reiser4_inode_data(): VFS inode embedded next to @r4_inode */
17980 +       return &container_of(r4_inode, struct reiser4_inode_object, p)->vfs_inode;
17981 +}
17982 +
17983 +/*
17984 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
17985 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
17986 + * bits.
17987 + *
17988 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
17989 + * of inode, otherwise whole oid is stored in i_ino.
17990 + *
17991 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
17992 + */
17993 +
17994 +#define OID_HI_SHIFT (sizeof(ino_t) * 8) /* width of ino_t in bits, assuming 8-bit bytes */
17995 +
17996 +#if REISER4_INO_IS_OID
17997 +
17998 +static inline oid_t get_inode_oid(const struct inode *inode)
17999 +{
18000 +       return inode->i_ino; /* whole oid lives in ->i_ino */
18001 +}
18002 +
18003 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18004 +{
18005 +       inode->i_ino = oid; /* ->oid_hi is not used in this configuration */
18006 +}
18007 +
18008 +/* REISER4_INO_IS_OID */
18009 +#else
18010 +
18011 +static inline oid_t get_inode_oid(const struct inode *inode)
18012 +{
18013 +       return /* combine high bits from reiser4 part with low bits from ->i_ino */
18014 +           ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18015 +           inode->i_ino;
18016 +}
18017 +
18018 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18019 +{
18020 +       assert("nikita-2519", inode != NULL);
18021 +       inode->i_ino = (ino_t) (oid); /* low bits */
18022 +       reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT; /* high bits */
18023 +       assert("nikita-2521", get_inode_oid(inode) == (oid)); /* round-trip check */
18024 +}
18025 +
18026 +/* REISER4_INO_IS_OID */
18027 +#endif
18028 +
18029 +static inline oid_t get_inode_locality(const struct inode *inode)
18030 +{ /* return ->locality_id stored in the reiser4-specific part of @inode */
18031 +       return reiser4_inode_data(inode)->locality_id;
18032 +}
18033 +
18034 +#if REISER4_LARGE_KEY /* ->ordering field exists in reiser4_inode only in this case */
18035 +static inline __u64 get_inode_ordering(const struct inode *inode)
18036 +{
18037 +       return reiser4_inode_data(inode)->ordering;
18038 +}
18039 +
18040 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18041 +{
18042 +       reiser4_inode_data(inode)->ordering = ordering; /* writes reiser4 part only; @inode itself stays const */
18043 +}
18044 +
18045 +#else /* !REISER4_LARGE_KEY: getter yields 0, setter expands to noop */
18046 +
18047 +#define get_inode_ordering(inode) (0)
18048 +#define set_inode_ordering(inode, val) noop
18049 +
18050 +#endif
18051 +
18052 +/* return VFS inode of the reiser4_inode_object in which @uf_info is embedded as p.file_plugin_data.unix_file_info */
18053 +static inline struct inode *
18054 +unix_file_info_to_inode(const struct unix_file_info * uf_info)
18055 +{
18056 +       return &container_of(uf_info, struct reiser4_inode_object,
18057 +                            p.file_plugin_data.unix_file_info)->vfs_inode;
18058 +}
18059 +
18060 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18061 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18062 +
18063 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18064 +
18065 +#if REISER4_DEBUG
18066 +extern void reiser4_inode_invariant(const struct inode *inode);
18067 +extern int inode_has_no_jnodes(reiser4_inode *);
18068 +#else
18069 +#define reiser4_inode_invariant(inode) noop
18070 +#endif
18071 +
18072 +static inline int spin_inode_is_locked(const struct inode *inode)
18073 +{ /* asserts that ->guard of @inode is held; always returns 1 (presumably for use inside assert() - confirm) */
18074 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18075 +       return 1;
18076 +}
18077 +
18078 +/**
18079 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18080 + * @inode: inode to lock
18081 + *
18082 + * In debug mode it checks that lower priority locks are not held and
18083 + * increments reiser4_context's lock counters on which lock ordering checking
18084 + * is based.
18085 + */
18086 +static inline void spin_lock_inode(struct inode *inode)
18087 +{
18088 +       assert("", LOCK_CNT_NIL(spin_locked)); /* NOTE(review): assertion label is empty, unlike others in this file */
18089 +       /* check lock ordering */
18090 +       assert_spin_not_locked(&d_lock);
18091 +
18092 +       spin_lock(&reiser4_inode_data(inode)->guard);
18093 +
18094 +       LOCK_CNT_INC(spin_locked_inode);
18095 +       LOCK_CNT_INC(spin_locked);
18096 +
18097 +       reiser4_inode_invariant(inode); /* checked with ->guard held */
18098 +}
18099 +
18100 +/**
18101 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18102 + * @inode: inode to unlock
18103 + *
18104 + * In debug mode it checks that spinlock is held and decrements
18105 + * reiser4_context's lock counters on which lock ordering checking is based.
18106 + */
18107 +static inline void spin_unlock_inode(struct inode *inode)
18108 +{
18109 +       assert_spin_locked(&reiser4_inode_data(inode)->guard);
18110 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18111 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18112 +
18113 +       reiser4_inode_invariant(inode); /* checked while ->guard is still held */
18114 +
18115 +       LOCK_CNT_DEC(spin_locked_inode);
18116 +       LOCK_CNT_DEC(spin_locked);
18117 +
18118 +       spin_unlock(&reiser4_inode_data(inode)->guard);
18119 +}
18120 +
18121 +extern znode *inode_get_vroot(struct inode *inode);
18122 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18123 +
18124 +extern int reiser4_max_filename_len(const struct inode *inode);
18125 +extern int max_hash_collisions(const struct inode *dir);
18126 +extern void reiser4_unlock_inode(struct inode *inode);
18127 +extern int is_reiser4_inode(const struct inode *inode);
18128 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18129 +extern struct inode *reiser4_iget(struct super_block *super,
18130 +                                 const reiser4_key * key, int silent);
18131 +extern void reiser4_iget_complete(struct inode *inode);
18132 +extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18133 +extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18134 +extern int reiser4_inode_get_flag(const struct inode *inode,
18135 +                                 reiser4_file_plugin_flags f);
18136 +
18137 +/* has inode been initialized? Returns result of testing REISER4_LOADED flag on @inode. */
18138 +static inline int
18139 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18140 +{
18141 +       assert("nikita-1120", inode != NULL);
18142 +       return reiser4_inode_get_flag(inode, REISER4_LOADED);
18143 +}
18144 +
18145 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18146 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18147 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18148 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18149 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18150 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18151 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18152 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18153 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18154 +                                                             *inode);
18155 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18156 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18157 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18158 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18159 +extern file_plugin *child_create_plugin(const struct inode *inode);
18160 +
18161 +extern void reiser4_make_bad_inode(struct inode *inode);
18162 +
18163 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18164 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18165 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18166 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18167 +
18168 +#define INODE_SET_SIZE(i, value) /* inode_check_scale(), then i_size_write(); @i, @value evaluated once */ \
18169 +({                                                     \
18170 +       struct inode *__i;                              \
18171 +       typeof(value) __v;                              \
18172 +                                                       \
18173 +       __i = (i);                                      \
18174 +       __v = (value);                                  \
18175 +       inode_check_scale(__i, __i->i_size, __v);       \
18176 +       i_size_write(__i, __v);                         \
18177 +})
18178 +
18179 +/*
18180 + * update field @field in inode @i to contain value @value, after passing old and new values to inode_check_scale().
18181 + */
18182 +#define INODE_SET_FIELD(i, field, value)               \
18183 +({                                                     \
18184 +       struct inode *__i;                              \
18185 +       typeof(value) __v;                              \
18186 +                                                       \
18187 +       __i = (i);                                      \
18188 +       __v = (value);                                  \
18189 +       inode_check_scale(__i, __i->field, __v);        \
18190 +       __i->field = __v;                               \
18191 +})
18192 +
18193 +#define INODE_INC_FIELD(i, field) /* inode_check_scale(old, old + 1), then pre-increment @field of @i */ \
18194 +({                                                             \
18195 +       struct inode *__i;                                      \
18196 +                                                               \
18197 +       __i = (i);                                              \
18198 +       inode_check_scale(__i, __i->field, __i->field + 1);     \
18199 +       ++ __i->field;                                          \
18200 +})
18201 +
18202 +#define INODE_DEC_FIELD(i, field) /* inode_check_scale(old, old - 1), then pre-decrement @field of @i */ \
18203 +({                                                             \
18204 +       struct inode *__i;                                      \
18205 +                                                               \
18206 +       __i = (i);                                              \
18207 +       inode_check_scale(__i, __i->field, __i->field - 1);     \
18208 +       -- __i->field;                                          \
18209 +})
18210 +
18211 +/* Return ->lists.readdir_list of @inode. See comment before reiser4_readdir_common() for description. */
18212 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18213 +{
18214 +       return &reiser4_inode_data(inode)->lists.readdir_list;
18215 +}
18216 +
18217 +extern void init_inode_ordering(struct inode *inode,
18218 +                               reiser4_object_create_data * crd, int create);
18219 +
18220 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18221 +{ /* return per-inode radix tree of jnodes, given VFS inode */
18222 +       return &reiser4_inode_data(inode)->jnodes_tree;
18223 +}
18224 +
18225 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18226 +                                                                 * r4_inode)
18227 +{ /* return per-inode radix tree of jnodes, given reiser4-specific part of inode */
18228 +       return &r4_inode->jnodes_tree;
18229 +}
18230 +
18231 +#if REISER4_DEBUG
18232 +extern void print_inode(const char *prefix, const struct inode *i);
18233 +#endif
18234 +
18235 +int is_dir_empty(const struct inode *);
18236 +
18237 +/* __REISER4_INODE_H__ */
18238 +#endif
18239 +
18240 +/* Make Linus happy.
18241 +   Local variables:
18242 +   c-indentation-style: "K&R"
18243 +   mode-name: "LC"
18244 +   c-basic-offset: 8
18245 +   tab-width: 8
18246 +   fill-column: 120
18247 +   End:
18248 +*/
18249 diff -urN linux-2.6.27.orig/fs/reiser4/ioctl.h linux-2.6.27/fs/reiser4/ioctl.h
18250 --- linux-2.6.27.orig/fs/reiser4/ioctl.h        1970-01-01 03:00:00.000000000 +0300
18251 +++ linux-2.6.27/fs/reiser4/ioctl.h     2008-10-12 18:20:00.000000000 +0400
18252 @@ -0,0 +1,41 @@
18253 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18254 + * reiser4/README */
18255 +
18256 +#if !defined( __REISER4_IOCTL_H__ )
18257 +#define __REISER4_IOCTL_H__
18258 +
18259 +#include <linux/fs.h>
18260 +
18261 +/*
18262 + * ioctl(2) command used to "unpack" a reiser4 file, that is, convert it into
18263 + * extents and fix it in this state. This is used by applications that rely on
18264 + *
18265 + *     . files being block aligned, and
18266 + *
18267 + *     . files never migrating on disk
18268 + *
18269 + * for example, boot loaders (LILO) need this.
18270 + *
18271 + * This ioctl should be used as
18272 + *
18273 + *     result = ioctl(fd, REISER4_IOC_UNPACK);
18274 + *
18275 + * The file behind descriptor fd will be converted to extents (if necessary),
18276 + * and its stat-data will be updated so that it will never be converted back
18277 + * into tails again.
18278 + */
18279 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
18280 +
18281 +/* __REISER4_IOCTL_H__ */
18282 +#endif
18283 +
18284 +/* Make Linus happy.
18285 +   Local variables:
18286 +   c-indentation-style: "K&R"
18287 +   mode-name: "LC"
18288 +   c-basic-offset: 8
18289 +   tab-width: 8
18290 +   fill-column: 120
18291 +   scroll-step: 1
18292 +   End:
18293 +*/
18294 diff -urN linux-2.6.27.orig/fs/reiser4/jnode.c linux-2.6.27/fs/reiser4/jnode.c
18295 --- linux-2.6.27.orig/fs/reiser4/jnode.c        1970-01-01 03:00:00.000000000 +0300
18296 +++ linux-2.6.27/fs/reiser4/jnode.c     2008-10-12 18:20:00.000000000 +0400
18297 @@ -0,0 +1,1924 @@
18298 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18299 + * reiser4/README */
18300 +/* Jnode manipulation functions. */
18301 +/* Jnode is entity used to track blocks with data and meta-data in reiser4.
18302 +
18303 +   In particular, jnodes are used to track transactional information
18304 +   associated with each block. Each znode contains jnode as ->zjnode field.
18305 +
18306 +   Jnode stands for either Josh or Journal node.
18307 +*/
18308 +
18309 +/*
18310 + * Taxonomy.
18311 + *
18312 + *     Jnode represents block containing data or meta-data. There are jnodes
18313 + *     for:
18314 + *
18315 + *         unformatted blocks (jnodes proper). There are plans, however to
18316 + *         have a handle per extent unit rather than per each unformatted
18317 + *         block, because there are so many of them.
18318 + *
18319 + *         For bitmaps. Each bitmap is actually represented by two jnodes--one
18320 + *         for working and another for "commit" data, together forming bnode.
18321 + *
18322 + *         For io-heads. These are used by log writer.
18323 + *
18324 + *         For formatted nodes (znode). See comment at the top of znode.c for
18325 + *         details specific to the formatted nodes (znodes).
18326 + *
18327 + * Node data.
18328 + *
18329 + *     Jnode provides access to the data of node it represents. Data are
18330 + *     stored in a page. Page is kept in a page cache. This means, that jnodes
18331 + *     are highly interconnected with page cache and VM internals.
18332 + *
18333 + *     jnode has a pointer to page (->pg) containing its data. Pointer to data
18334 + *     themselves is cached in ->data field to avoid frequent calls to
18335 + *     page_address().
18336 + *
18337 + *     jnode and page are attached to each other by jnode_attach_page(). This
18338 + *     function places pointer to jnode in set_page_private(), sets PG_private
18339 + *     flag and increments page counter.
18340 + *
18341 + *     Opposite operation is performed by page_clear_jnode().
18342 + *
18343 + *     jnode->pg is protected by jnode spin lock, and page->private is
18344 + *     protected by page lock. See comment at the top of page_cache.c for
18345 + *     more.
18346 + *
18347 + *     page can be detached from jnode for two reasons:
18348 + *
18349 + *         . jnode is removed from a tree (file is truncated, or formatted
18350 + *         node is removed by balancing).
18351 + *
18352 + *         . during memory pressure, VM calls ->releasepage() method
18353 + *         (reiser4_releasepage()) to evict page from memory.
18354 + *
18355 + *    (there, of course, is also umount, but this is special case we are not
18356 + *    concerned with here).
18357 + *
18358 + *    To protect jnode page from eviction, one calls jload() function that
18359 + *    "pins" page in memory (loading it if necessary), increments
18360 + *    jnode->d_count, and kmap()s page. Page is unpinned through call to
18361 + *    jrelse().
18362 + *
18363 + * Jnode life cycle.
18364 + *
18365 + *    jnode is created, placed in hash table, and, optionally, in per-inode
18366 + *    radix tree. Page can be attached to jnode, pinned, released, etc.
18367 + *
18368 + *    When jnode is captured into atom its reference counter is
18369 + *    increased. While being part of an atom, jnode can be "early
18370 + *    flushed". This means that as part of flush procedure, jnode is placed
18371 + *    into "relocate set", and its page is submitted to the disk. After io
18372 + *    completes, page can be detached, then loaded again, re-dirtied, etc.
18373 + *
18374 + *    Thread acquires reference to jnode by calling jref() and releases it by
18375 + *    jput(). When last reference is removed, jnode is still retained in
18376 + *    memory (cached) if it has page attached, _unless_ it is scheduled for
18377 + *    destruction (has JNODE_HEARD_BANSHEE bit set).
18378 + *
18379 + *    Tree read-write lock was used as "existential" lock for jnodes. That is,
18380 + *    jnode->x_count could be changed from 0 to 1 only under tree write lock,
18381 + *    that is, tree lock protected unreferenced jnodes stored in the hash
18382 + *    table, from recycling.
18383 + *
18384 + *    This resulted in high contention on tree lock, because jref()/jput() is
18385 + *    frequent operation. To ameliorate this problem, RCU is used: when jput()
18386 + *    is just about to release last reference on jnode it sets JNODE_RIP bit
18387 + *    on it, and then proceed with jnode destruction (removing jnode from hash
18388 + *    table, cbk_cache, detaching page, etc.). All places that change jnode
18389 + *    reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18390 + *    cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18391 + *    jnode_rip_check() function), and pretend that nothing was found in hash
18392 + *    table if bit is set.
18393 + *
18394 + *    jput defers actual return of jnode into slab cache to some later time
18395 + *    (by call_rcu()), this guarantees that other threads can safely continue
18396 + *    working with JNODE_RIP-ped jnode.
18397 + *
18398 + */
18399 +
18400 +#include "reiser4.h"
18401 +#include "debug.h"
18402 +#include "dformat.h"
18403 +#include "jnode.h"
18404 +#include "plugin/plugin_header.h"
18405 +#include "plugin/plugin.h"
18406 +#include "txnmgr.h"
18407 +/*#include "jnode.h"*/
18408 +#include "znode.h"
18409 +#include "tree.h"
18410 +#include "tree_walk.h"
18411 +#include "super.h"
18412 +#include "inode.h"
18413 +#include "page_cache.h"
18414 +
18415 +#include <asm/uaccess.h>       /* UML needs this for PAGE_OFFSET */
18416 +#include <linux/types.h>
18417 +#include <linux/slab.h>
18418 +#include <linux/pagemap.h>
18419 +#include <linux/swap.h>
18420 +#include <linux/fs.h>          /* for struct address_space  */
18421 +#include <linux/writeback.h>   /* for inode_lock */
18422 +
18423 +static struct kmem_cache *_jnode_slab = NULL;
18424 +
18425 +static void jnode_set_type(jnode * node, jnode_type type);
18426 +static int jdelete(jnode * node);
18427 +static int jnode_try_drop(jnode * node);
18428 +
18429 +#if REISER4_DEBUG
18430 +static int jnode_invariant(jnode * node, int tlocked, int jlocked);
18431 +#endif
18432 +
18433 +/* true if valid page is attached to jnode, as recorded by JNODE_PARSED flag */
18434 +static inline int jnode_is_parsed(jnode * node)
18435 +{
18436 +       return JF_ISSET(node, JNODE_PARSED);
18437 +}
18438 +
18439 +/* hash table support */
18440 +
18441 +/* compare two jnode keys for equality (->index and ->objectid only). Used by hash-table macros */
18442 +static inline int jnode_key_eq(const struct jnode_key * k1,
18443 +                              const struct jnode_key * k2)
18444 +{
18445 +       assert("nikita-2350", k1 != NULL);
18446 +       assert("nikita-2351", k2 != NULL);
18447 +
18448 +       return (k1->index == k2->index && k1->objectid == k2->objectid);
18449 +}
18450 +
18451 +/* Hash jnode by its key (objectid plus index, masked by table size). Used by hash-table macros */
18452 +static inline __u32 jnode_key_hashfn(j_hash_table * table,
18453 +                                    const struct jnode_key * key)
18454 +{
18455 +       assert("nikita-2352", key != NULL);
18456 +       assert("nikita-3346", IS_POW(table->_buckets));
18457 +
18458 +       /* yes, this is a remarkably simple (if not stupid) hash function. */
18459 +       return (key->objectid + key->index) & (table->_buckets - 1);
18460 +}
18461 +
18462 +/* The hash table definition. KMALLOC/KFREE are defined only around it, presumably as its allocator hooks - confirm */
18463 +#define KMALLOC(size) reiser4_vmalloc(size)
18464 +#define KFREE(ptr, size) vfree(ptr)
18465 +TYPE_SAFE_HASH_DEFINE(j, jnode, struct jnode_key, key.j, link.j,
18466 +                     jnode_key_hashfn, jnode_key_eq);
18467 +#undef KFREE
18468 +#undef KMALLOC
18469 +
18470 +/* call this to initialise jnode hash table of @tree; returns result of j_hash_init() */
18471 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18472 +{
18473 +       assert("nikita-2359", tree != NULL);
18474 +       return j_hash_init(&tree->jhash_table, 16384);
18475 +}
18476 +
18477 +/* call this to destroy jnode hash table. This is called during umount. Always returns 0. */
18478 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18479 +{
18480 +       j_hash_table *jtable;
18481 +       jnode *node;
18482 +       jnode *next;
18483 +
18484 +       assert("nikita-2360", tree != NULL);
18485 +
18486 +       /*
18487 +        * Scan hash table and jdrop() all jnodes; each must be unreferenced.
18488 +        */
18489 +       jtable = &tree->jhash_table;
18490 +       if (jtable->_table) { /* nothing to do if the table was never set up */
18491 +               for_all_in_htable(jtable, j, node, next) {
18492 +                       assert("nikita-2361", !atomic_read(&node->x_count));
18493 +                       jdrop(node);
18494 +               }
18495 +
18496 +               j_hash_done(&tree->jhash_table);
18497 +       }
18498 +       return 0;
18499 +}
18500 +
18501 +/**
18502 + * init_jnodes - create jnode cache
18503 + *
18504 + * Initializes slab cache jnodes. It is part of reiser4 module initialization.
18505 + * Returns 0 on success, RETERR(-ENOMEM) if the cache cannot be created. */
18506 +int init_jnodes(void)
18507 +{
18508 +       assert("umka-168", _jnode_slab == NULL);
18509 +
18510 +       _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
18511 +                                       SLAB_HWCACHE_ALIGN |
18512 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
18513 +       if (_jnode_slab == NULL)
18514 +               return RETERR(-ENOMEM);
18515 +
18516 +       return 0;
18517 +}
18518 +
18519 +/**
18520 + * done_jnodes - delete jnode cache
18521 + *
18522 + * This is called on reiser4 module unloading or system shutdown.
18523 + */
18524 +void done_jnodes(void)
18525 +{
18526 +       destroy_reiser4_cache(&_jnode_slab);
18527 +}
18528 +
18529 +/* Initialize a jnode: zero it, then set type, counters, locks, tree and lists. */
18530 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
18531 +{
18532 +       assert("umka-175", node != NULL);
18533 +
18534 +       memset(node, 0, sizeof(jnode));
18535 +       ON_DEBUG(node->magic = JMAGIC);
18536 +       jnode_set_type(node, type);
18537 +       atomic_set(&node->d_count, 0);
18538 +       atomic_set(&node->x_count, 0);
18539 +       spin_lock_init(&node->guard);
18540 +       spin_lock_init(&node->load);
18541 +       node->atom = NULL;
18542 +       node->tree = tree;
18543 +       INIT_LIST_HEAD(&node->capture_link);
18544 +
18545 +       ASSIGN_NODE_LIST(node, NOT_CAPTURED);
18546 +
18547 +       INIT_RCU_HEAD(&node->rcu);
18548 +
18549 +#if REISER4_DEBUG
18550 +       { /* debug only: link new jnode into per-super-block ->all_jnodes list; undone by jnode_done() */
18551 +               reiser4_super_info_data *sbinfo;
18552 +
18553 +               sbinfo = get_super_private(tree->super);
18554 +               spin_lock_irq(&sbinfo->all_guard);
18555 +               list_add(&node->jnodes, &sbinfo->all_jnodes);
18556 +               spin_unlock_irq(&sbinfo->all_guard);
18557 +       }
18558 +#endif
18559 +}
18560 +
18561 +#if REISER4_DEBUG
18562 +/*
18563 + * Remove jnode from ->all_jnodes list of @tree's super block, under ->all_guard.
18564 + */
18565 +static void jnode_done(jnode * node, reiser4_tree * tree)
18566 +{
18567 +       reiser4_super_info_data *sbinfo;
18568 +
18569 +       sbinfo = get_super_private(tree->super);
18570 +
18571 +       spin_lock_irq(&sbinfo->all_guard);
18572 +       assert("nikita-2422", !list_empty(&node->jnodes));
18573 +       list_del_init(&node->jnodes); /* leaves ->jnodes empty, as jfree() asserts */
18574 +       spin_unlock_irq(&sbinfo->all_guard);
18575 +}
18576 +#endif
18577 +
18578 +/* return already existing jnode of page; @pg must be locked, PagePrivate and have non-NULL jprivate() */
18579 +jnode *jnode_by_page(struct page *pg)
18580 +{
18581 +       assert("nikita-2066", pg != NULL);
18582 +       assert("nikita-2400", PageLocked(pg));
18583 +       assert("nikita-2068", PagePrivate(pg));
18584 +       assert("nikita-2067", jprivate(pg) != NULL);
18585 +       return jprivate(pg);
18586 +}
18587 +
18588 +/* exported functions to allocate/free jnode objects outside this file. jalloc() result is checked for NULL by callers. */
18589 +jnode *jalloc(void)
18590 +{
18591 +       jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
18592 +       return jal; /* not initialized here; see jnode_init() */
18593 +}
18594 +
18595 +/* return jnode back to the slab allocator; @node must be uncaptured, off ->jnodes list and without page */
18596 +inline void jfree(jnode * node)
18597 +{
18598 +       assert("zam-449", node != NULL);
18599 +
18600 +       assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
18601 +                              NODE_LIST(node) == NOT_CAPTURED));
18602 +       assert("nikita-3222", list_empty(&node->jnodes));
18603 +       assert("nikita-3221", jnode_page(node) == NULL);
18604 +
18605 +       /* not yet phash_jnode_destroy(node); */
18606 +
18607 +       kmem_cache_free(_jnode_slab, node);
18608 +}
18609 +
18610 +/*
18611 + * This function is supplied as RCU callback (see jnode_free()). It frees
18612 + * jnode or znode, depending on jnode type; JNODE_INODE is not expected here.
18613 + */
18614 +static void jnode_free_actor(struct rcu_head *head)
18615 +{
18616 +       jnode *node;
18617 +       jnode_type jtype;
18618 +
18619 +       node = container_of(head, jnode, rcu);
18620 +       jtype = jnode_get_type(node);
18621 +
18622 +       ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
18623 +
18624 +       switch (jtype) {
18625 +       case JNODE_IO_HEAD:
18626 +       case JNODE_BITMAP:
18627 +       case JNODE_UNFORMATTED_BLOCK:
18628 +               jfree(node); /* plain jnode */
18629 +               break;
18630 +       case JNODE_FORMATTED_BLOCK:
18631 +               zfree(JZNODE(node)); /* jnode embedded into znode */
18632 +               break;
18633 +       case JNODE_INODE:
18634 +       default:
18635 +               wrong_return_value("nikita-3197", "Wrong jnode type");
18636 +       }
18637 +}
18638 +
18639 +/*
18640 + * Free a jnode. Unless @jtype is JNODE_INODE, post jnode_free_actor() as an
18641 + * RCU callback; a JNODE_INODE jnode is only passed to jnode_list_remove().
18642 + */
18643 +static inline void jnode_free(jnode * node, jnode_type jtype)
18644 +{
18645 +       if (jtype != JNODE_INODE) {
18646 +               /*assert("nikita-3219", list_empty(&node->rcu.list)); */
18647 +               call_rcu(&node->rcu, jnode_free_actor);
18648 +       } else
18649 +               jnode_list_remove(node);
18650 +}
18651 +
18652 +/* allocate new unformatted jnode */
18653 +static jnode *jnew_unformatted(void)
18654 +{
18655 +       jnode *jal;
18656 +
18657 +       jal = jalloc();
18658 +       if (jal == NULL)
18659 +               return NULL;
18660 +
18661 +       jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
18662 +       jal->key.j.mapping = NULL;
18663 +       jal->key.j.index = (unsigned long)-1;
18664 +       jal->key.j.objectid = 0;
18665 +       return jal;
18666 +}
18667 +
18668 +/* look for jnode with given objectid and index within hash table */
18669 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
18670 +{
18671 +       struct jnode_key jkey;
18672 +       jnode *node;
18673 +
18674 +       assert("nikita-2353", tree != NULL);
18675 +
18676 +       jkey.objectid = objectid;
18677 +       jkey.index = index;
18678 +
18679 +       /*
18680 +        * hash table is _not_ protected by any lock during lookups. All we
18681 +        * have to do is to disable preemption to keep RCU happy.
18682 +        */
18683 +
18684 +       rcu_read_lock();
18685 +       node = j_hash_find(&tree->jhash_table, &jkey);
18686 +       if (node != NULL) {
18687 +               /* protect @node from recycling */
18688 +               jref(node);
18689 +               assert("nikita-2955", jnode_invariant(node, 0, 0));
18690 +               node = jnode_rip_check(tree, node);
18691 +       }
18692 +       rcu_read_unlock();
18693 +       return node;
18694 +}
18695 +
18696 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
18697 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
18698 +{
18699 +       assert("vs-1694", mapping->host != NULL);
18700 +
18701 +       return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
18702 +}
18703 +
18704 +jnode *jfind(struct address_space * mapping, unsigned long index)
18705 +{
18706 +       reiser4_tree *tree;
18707 +       jnode *node;
18708 +
18709 +       assert("vs-1694", mapping->host != NULL);
18710 +       tree = reiser4_tree_by_inode(mapping->host);
18711 +
18712 +       read_lock_tree(tree);
18713 +       node = jfind_nolock(mapping, index);
18714 +       if (node != NULL)
18715 +               jref(node);
18716 +       read_unlock_tree(tree);
18717 +       return node;
18718 +}
18719 +
18720 +static void inode_attach_jnode(jnode * node)
18721 +{
18722 +       struct inode *inode;
18723 +       reiser4_inode *info;
18724 +       struct radix_tree_root *rtree;
18725 +
18726 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18727 +       assert("zam-1043", node->key.j.mapping != NULL);
18728 +       inode = node->key.j.mapping->host;
18729 +       info = reiser4_inode_data(inode);
18730 +       rtree = jnode_tree_by_reiser4_inode(info);
18731 +       if (rtree->rnode == NULL) {
18732 +               /* prevent inode from being pruned when it has jnodes attached
18733 +                  to it */
18734 +               spin_lock_irq(&inode->i_data.tree_lock);
18735 +               inode->i_data.nrpages++;
18736 +               spin_unlock_irq(&inode->i_data.tree_lock);
18737 +       }
18738 +       assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
18739 +       check_me("zam-1045",
18740 +                !radix_tree_insert(rtree, node->key.j.index, node));
18741 +       ON_DEBUG(info->nr_jnodes++);
18742 +}
18743 +
18744 +static void inode_detach_jnode(jnode * node)
18745 +{
18746 +       struct inode *inode;
18747 +       reiser4_inode *info;
18748 +       struct radix_tree_root *rtree;
18749 +
18750 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18751 +       assert("zam-1044", node->key.j.mapping != NULL);
18752 +       inode = node->key.j.mapping->host;
18753 +       info = reiser4_inode_data(inode);
18754 +       rtree = jnode_tree_by_reiser4_inode(info);
18755 +
18756 +       assert("zam-1051", info->nr_jnodes != 0);
18757 +       assert("zam-1052", rtree->rnode != NULL);
18758 +       ON_DEBUG(info->nr_jnodes--);
18759 +
18760 +       /* delete jnode from inode's radix tree of jnodes */
18761 +       check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
18762 +       if (rtree->rnode == NULL) {
18763 +               /* inode can be pruned now */
18764 +               spin_lock_irq(&inode->i_data.tree_lock);
18765 +               inode->i_data.nrpages--;
18766 +               spin_unlock_irq(&inode->i_data.tree_lock);
18767 +       }
18768 +}
18769 +
18770 +/* put jnode into hash table (where they can be found by flush who does not know
18771 +   mapping) and to inode's tree of jnodes (where they can be found (hopefully
18772 +   faster) in places where mapping is known). Currently it is used by
18773 +   fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when new jnode is
18774 +   created */
18775 +static void
18776 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
18777 +                      unsigned long index)
18778 +{
18779 +       j_hash_table *jtable;
18780 +
18781 +       assert("vs-1446", jnode_is_unformatted(node));
18782 +       assert("vs-1442", node->key.j.mapping == 0);
18783 +       assert("vs-1443", node->key.j.objectid == 0);
18784 +       assert("vs-1444", node->key.j.index == (unsigned long)-1);
18785 +       assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
18786 +
18787 +       node->key.j.mapping = mapping;
18788 +       node->key.j.objectid = get_inode_oid(mapping->host);
18789 +       node->key.j.index = index;
18790 +
18791 +       jtable = &jnode_get_tree(node)->jhash_table;
18792 +
18793 +       /* race with some other thread inserting jnode into the hash table is
18794 +        * impossible, because we keep the page lock. */
18795 +       /*
18796 +        * following assertion no longer holds because of RCU: it is possible
18797 +        * jnode is in the hash table, but with JNODE_RIP bit set.
18798 +        */
18799 +       /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
18800 +       j_hash_insert_rcu(jtable, node);
18801 +       inode_attach_jnode(node);
18802 +}
18803 +
18804 +static void unhash_unformatted_node_nolock(jnode * node)
18805 +{
18806 +       assert("vs-1683", node->key.j.mapping != NULL);
18807 +       assert("vs-1684",
18808 +              node->key.j.objectid ==
18809 +              get_inode_oid(node->key.j.mapping->host));
18810 +
18811 +       /* remove jnode from hash-table */
18812 +       j_hash_remove_rcu(&node->tree->jhash_table, node);
18813 +       inode_detach_jnode(node);
18814 +       node->key.j.mapping = NULL;
18815 +       node->key.j.index = (unsigned long)-1;
18816 +       node->key.j.objectid = 0;
18817 +
18818 +}
18819 +
18820 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
18821 +   reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
18822 +   reiser4_uncapture_jnode */
18823 +void unhash_unformatted_jnode(jnode * node)
18824 +{
18825 +       assert("vs-1445", jnode_is_unformatted(node));
18826 +
18827 +       write_lock_tree(node->tree);
18828 +       unhash_unformatted_node_nolock(node);
18829 +       write_unlock_tree(node->tree);
18830 +}
18831 +
18832 +/*
18833 + * Look for a jnode with given index in the radix tree of jnodes of the
18834 + * given inode/mapping. If not found, insert a newly allocated jnode into the
18835 + * hash table and the radix tree. A jnode not inserted is freed again.
18836 + */
18837 +static jnode *find_get_jnode(reiser4_tree * tree,
18838 +                            struct address_space *mapping,
18839 +                            oid_t oid, unsigned long index)
18840 +{
18841 +       jnode *result;
18842 +       jnode *shadow;
18843 +       int preload;
18844 +
18845 +       result = jnew_unformatted();
18846 +       if (unlikely(result == NULL))
18847 +               return ERR_PTR(RETERR(-ENOMEM));
18848 +
18849 +       preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
18850 +       if (preload != 0) {
18851 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18852 +               return ERR_PTR(preload);
18853 +       }
18854 +
18855 +       write_lock_tree(tree);
18856 +       shadow = jfind_nolock(mapping, index);
18857 +       if (likely(shadow == NULL)) {
18858 +               /* add new jnode to hash table and inode's radix tree of jnodes */
18859 +               jref(result);
18860 +               hash_unformatted_jnode(result, mapping, index);
18861 +       } else {
18862 +               /* jnode is found in inode's radix tree of jnodes */
18863 +               jref(shadow);
18864 +               jnode_free(result, JNODE_UNFORMATTED_BLOCK);
18865 +               assert("vs-1498", shadow->key.j.mapping == mapping);
18866 +               result = shadow;
18867 +       }
18868 +       write_unlock_tree(tree);
18869 +       assert("nikita-2955",
18870 +              ergo(result != NULL, jnode_invariant(result, 0, 0)));
18871 +       radix_tree_preload_end();
18872 +       return result;
18873 +}
18874 +
18875 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
18876 +   creates) referenced jnode of locked page @pg, attached to that page, or
18877 +   ERR_PTR() on failure. @tree passed in is overwritten using @pg. */
18878 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
18879 +{
18880 +       /*
18881 +        * There are two ways to create jnode: starting with pre-existing page
18882 +        * and without page.
18883 +        *
18884 +        * When page already exists, jnode is created
18885 +        * (jnode_of_page()->do_jget()) under page lock. This is done in
18886 +        * ->writepage(), or when capturing anonymous page dirtied through
18887 +        * mmap.
18888 +        *
18889 +        * Jnode without page is created by index_extent_jnode().
18890 +        */
18891 +
18892 +       jnode *result;
18893 +       oid_t oid;
18894 +
18895 +       assert("umka-176", pg != NULL);
18896 +       assert("nikita-2394", PageLocked(pg));
18897 +
18898 +       result = jprivate(pg);
18899 +       if (likely(result != NULL))
18900 +               return jref(result);
18901 +
18902 +       oid = get_inode_oid(pg->mapping->host);
18903 +       tree = reiser4_tree_by_page(pg);
18904 +
18905 +       /* check inode's radix tree of jnodes first */
18906 +       result = jfind(pg->mapping, pg->index);
18907 +       if (unlikely(result != NULL)) {
18908 +               spin_lock_jnode(result);
18909 +               jnode_attach_page(result, pg);
18910 +               spin_unlock_jnode(result);
18911 +               result->key.j.mapping = pg->mapping;
18912 +               return result;
18913 +       }
18914 +
18915 +       /* since page is locked, jnode should be allocated with GFP_NOFS flag */
18916 +       reiser4_ctx_gfp_mask_force(GFP_NOFS);
18917 +       result = find_get_jnode(tree, pg->mapping, oid, pg->index);
18918 +       if (unlikely(IS_ERR(result)))
18919 +               return result;
18920 +       /* attach jnode to page */
18921 +       spin_lock_jnode(result);
18922 +       jnode_attach_page(result, pg);
18923 +       spin_unlock_jnode(result);
18924 +       return result;
18925 +}
18926 +
18927 +/*
18928 + * return jnode for @pg, creating it if necessary.
18929 + */
18930 +jnode *jnode_of_page(struct page * pg)
18931 +{
18932 +       jnode *result;
18933 +
18934 +       assert("umka-176", pg != NULL);
18935 +       assert("nikita-2394", PageLocked(pg));
18936 +
18937 +       result = do_jget(reiser4_tree_by_page(pg), pg);
18938 +
18939 +       if (REISER4_DEBUG && !IS_ERR(result)) {
18940 +               assert("nikita-3210", result == jprivate(pg));
18941 +               assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
18942 +               if (jnode_is_unformatted(jprivate(pg))) {
18943 +                       assert("nikita-2364",
18944 +                              jprivate(pg)->key.j.index == pg->index);
18945 +                       assert("nikita-2367",
18946 +                              jprivate(pg)->key.j.mapping == pg->mapping);
18947 +                       assert("nikita-2365",
18948 +                              jprivate(pg)->key.j.objectid ==
18949 +                              get_inode_oid(pg->mapping->host));
18950 +                       assert("vs-1200",
18951 +                              jprivate(pg)->key.j.objectid ==
18952 +                              pg->mapping->host->i_ino);
18953 +                       assert("nikita-2356",
18954 +                              jnode_is_unformatted(jnode_by_page(pg)));
18955 +               }
18956 +               assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
18957 +       }
18958 +       return result;
18959 +}
18960 +
18961 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
18962 + * page.*/
18963 +void jnode_attach_page(jnode * node, struct page *pg)
18964 +{
18965 +       assert("nikita-2060", node != NULL);
18966 +       assert("nikita-2061", pg != NULL);
18967 +
18968 +       assert("nikita-2050", jprivate(pg) == 0ul);
18969 +       assert("nikita-2393", !PagePrivate(pg));
18970 +       assert("vs-1741", node->pg == NULL);
18971 +
18972 +       assert("nikita-2396", PageLocked(pg));
18973 +       assert_spin_locked(&(node->guard));
18974 +
18975 +       page_cache_get(pg);
18976 +       set_page_private(pg, (unsigned long)node);
18977 +       node->pg = pg;
18978 +       SetPagePrivate(pg);
18979 +}
18980 +
18981 +/* Dual to jnode_attach_page: break a binding between page and jnode */
18982 +void page_clear_jnode(struct page *page, jnode * node)
18983 +{
18984 +       assert("nikita-2424", page != NULL);
18985 +       assert("nikita-2425", PageLocked(page));
18986 +       assert("nikita-2426", node != NULL);
18987 +       assert_spin_locked(&(node->guard));
18988 +       assert("nikita-2428", PagePrivate(page));
18989 +
18990 +       assert("nikita-3551", !PageWriteback(page));
18991 +
18992 +       JF_CLR(node, JNODE_PARSED);
18993 +       set_page_private(page, 0ul);
18994 +       ClearPagePrivate(page);
18995 +       node->pg = NULL;
18996 +       page_cache_release(page);
18997 +}
18998 +
18999 +#if 0
19000 +/* it is only used in one place to handle error */
19001 +void
19002 +page_detach_jnode(struct page *page, struct address_space *mapping,
19003 +                 unsigned long index)
19004 +{
19005 +       assert("nikita-2395", page != NULL);
19006 +
19007 +       lock_page(page);
19008 +       if ((page->mapping == mapping) && (page->index == index)
19009 +           && PagePrivate(page)) {
19010 +               jnode *node;
19011 +
19012 +               node = jprivate(page);
19013 +               spin_lock_jnode(node);
19014 +               page_clear_jnode(page, node);
19015 +               spin_unlock_jnode(node);
19016 +       }
19017 +       unlock_page(page);
19018 +}
19019 +#endif  /*  0  */
19020 +
19021 +/* return @node page locked.
19022 +
19023 +   Locking ordering requires that one first takes page lock and afterwards
19024 +   spin lock on node attached to this page. Sometimes it is necessary to go in
19025 +   the opposite direction. This is done through standard trylock-and-release
19026 +   loop.
19027 +*/
19028 +static struct page *jnode_lock_page(jnode * node)
19029 +{
19030 +       struct page *page;
19031 +
19032 +       assert("nikita-2052", node != NULL);
19033 +       assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19034 +
19035 +       while (1) {
19036 +
19037 +               spin_lock_jnode(node);
19038 +               page = jnode_page(node);
19039 +               if (page == NULL) {
19040 +                       break;
19041 +               }
19042 +
19043 +               /* no need to page_cache_get( page ) here, because page cannot
19044 +                  be evicted from memory without detaching it from jnode and
19045 +                  this requires spin lock on jnode that we already hold.
19046 +                */
19047 +               if (trylock_page(page)) {
19048 +                       /* We won a lock on jnode page, proceed. */
19049 +                       break;
19050 +               }
19051 +
19052 +               /* Page is locked by someone else. */
19053 +               page_cache_get(page);
19054 +               spin_unlock_jnode(node);
19055 +               wait_on_page_locked(page);
19056 +               /* it is possible that page was detached from jnode and
19057 +                  returned to the free pool, or re-assigned while we were
19058 +                  waiting on locked bit. This will be rechecked on the next
19059 +                  loop iteration.
19060 +                */
19061 +               page_cache_release(page);
19062 +
19063 +               /* try again */
19064 +       }
19065 +       return page;
19066 +}
19067 +
19068 +/*
19069 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode, to verify
19070 + * validity of jnode content.
19071 + */
19072 +static inline int jparse(jnode * node)
19073 +{
19074 +       int result;
19075 +
19076 +       assert("nikita-2466", node != NULL);
19077 +
19078 +       spin_lock_jnode(node);
19079 +       if (likely(!jnode_is_parsed(node))) {
19080 +               result = jnode_ops(node)->parse(node);
19081 +               if (likely(result == 0))
19082 +                       JF_SET(node, JNODE_PARSED);
19083 +       } else
19084 +               result = 0;
19085 +       spin_unlock_jnode(node);
19086 +       return result;
19087 +}
19088 +
19089 +/* Lock a page attached to jnode, create and attach page to jnode if it had no
19090 + * one. */
19091 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19092 +{
19093 +       struct page *page;
19094 +
19095 +       spin_lock_jnode(node);
19096 +       page = jnode_page(node);
19097 +
19098 +       if (page == NULL) {
19099 +               spin_unlock_jnode(node);
19100 +               page = find_or_create_page(jnode_get_mapping(node),
19101 +                                          jnode_get_index(node), gfp_flags);
19102 +               if (page == NULL)
19103 +                       return ERR_PTR(RETERR(-ENOMEM));
19104 +       } else {
19105 +               if (trylock_page(page)) {
19106 +                       spin_unlock_jnode(node);
19107 +                       return page;
19108 +               }
19109 +               page_cache_get(page);
19110 +               spin_unlock_jnode(node);
19111 +               lock_page(page);
19112 +               assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19113 +       }
19114 +
19115 +       spin_lock_jnode(node);
19116 +       if (!jnode_page(node))
19117 +               jnode_attach_page(node, page);
19118 +       spin_unlock_jnode(node);
19119 +
19120 +       page_cache_release(page);
19121 +       assert("zam-894", jnode_page(node) == page);
19122 +       return page;
19123 +}
19124 +
19125 +/* Start read of jnode's locked page; an up-to-date page is just unlocked. */
19126 +static int jnode_start_read(jnode * node, struct page *page)
19127 +{
19128 +       assert("zam-893", PageLocked(page));
19129 +
19130 +       if (PageUptodate(page)) {
19131 +               unlock_page(page);
19132 +               return 0;
19133 +       }
19134 +       return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19135 +}
19136 +
19137 +#if REISER4_DEBUG
19138 +static void check_jload(jnode * node, struct page *page)
19139 +{
19140 +       if (jnode_is_znode(node)) {
19141 +               node40_header *nh;
19142 +               znode *z;
19143 +
19144 +               z = JZNODE(node);
19145 +               if (znode_is_any_locked(z)) {
19146 +                       nh = (node40_header *) kmap(page);
19147 +                       /* this only works for node40-only file systems. For
19148 +                        * debugging. */
19149 +                       assert("nikita-3253",
19150 +                              z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19151 +                       kunmap(page);
19152 +               }
19153 +               assert("nikita-3565", znode_invariant(z));
19154 +       }
19155 +}
19156 +#else
19157 +#define check_jload(node, page) noop
19158 +#endif
19159 +
19160 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19161 + * to call jload() shortly. This will bring appropriate portion of jnode into
19162 + * CPU cache. */
19163 +void jload_prefetch(jnode * node)
19164 +{
19165 +       prefetchw(&node->x_count);
19166 +}
19167 +
19168 +/* load jnode's data into memory */
19169 +int jload_gfp(jnode * node /* node to load */ ,
19170 +             gfp_t gfp_flags /* allocation flags */ ,
19171 +             int do_kmap /* true if page should be kmapped */ )
19172 +{
19173 +       struct page *page;
19174 +       int result = 0;
19175 +       int parsed;
19176 +
19177 +       assert("nikita-3010", reiser4_schedulable());
19178 +
19179 +       prefetchw(&node->pg);
19180 +
19181 +       /* taking d-reference implies taking x-reference. */
19182 +       jref(node);
19183 +
19184 +       /*
19185 +        * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19186 +        * should be atomic, otherwise there is a race against
19187 +        * reiser4_releasepage().
19188 +        */
19189 +       spin_lock(&(node->load));
19190 +       add_d_ref(node);
19191 +       parsed = jnode_is_parsed(node);
19192 +       spin_unlock(&(node->load));
19193 +
19194 +       if (unlikely(!parsed)) {
19195 +               page = jnode_get_page_locked(node, gfp_flags);
19196 +               if (unlikely(IS_ERR(page))) {
19197 +                       result = PTR_ERR(page);
19198 +                       goto failed;
19199 +               }
19200 +
19201 +               result = jnode_start_read(node, page);
19202 +               if (unlikely(result != 0))
19203 +                       goto failed;
19204 +
19205 +               wait_on_page_locked(page);
19206 +               if (unlikely(!PageUptodate(page))) {
19207 +                       result = RETERR(-EIO);
19208 +                       goto failed;
19209 +               }
19210 +
19211 +               if (do_kmap)
19212 +                       node->data = kmap(page);
19213 +
19214 +               result = jparse(node);
19215 +               if (unlikely(result != 0)) {
19216 +                       if (do_kmap)
19217 +                               kunmap(page);
19218 +                       goto failed;
19219 +               }
19220 +               check_jload(node, page);
19221 +       } else {
19222 +               page = jnode_page(node);
19223 +               check_jload(node, page);
19224 +               if (do_kmap)
19225 +                       node->data = kmap(page);
19226 +       }
19227 +
19228 +       if (!is_writeout_mode())
19229 +               /* We do not mark pages active if jload is called as a part of
19230 +                * jnode_flush() or reiser4_write_logs().  Both jnode_flush()
19231 +                * and write_logs() add no value to cached data, there is no
19232 +                * sense to mark pages as active when they go to disk, it just
19233 +                * confuses vm scanning routines because clean page could be
19234 +                * moved out from inactive list as a result of this
19235 +                * mark_page_accessed() call. */
19236 +               mark_page_accessed(page);
19237 +
19238 +       return 0;
19239 +
19240 +      failed:
19241 +       jrelse_tail(node);
19242 +       return result;
19243 +
19244 +}
19245 +
19246 +/* start asynchronous reading for given jnode's page. */
19247 +int jstartio(jnode * node)
19248 +{
19249 +       struct page *page;
19250 +
19251 +       page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19252 +       if (IS_ERR(page))
19253 +               return PTR_ERR(page);
19254 +
19255 +       return jnode_start_read(node, page);
19256 +}
19257 +
19258 +/* Initialize a node by calling ->init() of jnode plugin instead of reading
19259 + * node from disk as in jload(). Returns 0 on success, error code otherwise. */
19260 +int jinit_new(jnode * node, gfp_t gfp_flags)
19261 +{
19262 +       struct page *page;
19263 +       int result;
19264 +
19265 +       jref(node);
19266 +       add_d_ref(node);
19267 +
19268 +       page = jnode_get_page_locked(node, gfp_flags);
19269 +       if (IS_ERR(page)) {
19270 +               result = PTR_ERR(page);
19271 +               goto failed;
19272 +       }
19273 +
19274 +       SetPageUptodate(page);
19275 +       unlock_page(page);
19276 +
19277 +       node->data = kmap(page);
19278 +
19279 +       if (!jnode_is_parsed(node)) {
19280 +               jnode_plugin *jplug = jnode_ops(node);
19281 +               spin_lock_jnode(node);
19282 +               result = jplug->init(node);
19283 +               spin_unlock_jnode(node);
19284 +               if (result) {
19285 +                       kunmap(page);
19286 +                       goto failed;
19287 +               }
19288 +               JF_SET(node, JNODE_PARSED);
19289 +       }
19290 +
19291 +       return 0;
19292 +       /* page is already kunmapped (or never mapped) here: no jrelse() */
19293 +      failed:
19294 +       jrelse_tail(node);
19295 +       return result;
19296 +}
19297 +
19298 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19299 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19300 +{
19301 +       assert("nikita-489", atomic_read(&node->d_count) > 0);
19302 +       atomic_dec(&node->d_count);
19303 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
19304 +               LOCK_CNT_DEC(d_refs);
19305 +       /* drop the jload_gfp()/jinit_new() reference last: @node may go away */
19306 +       jput(node);
19307 +}
19308 +
19309 +/* drop reference to node data. When last reference is dropped, data are
19310 +   unloaded. */
19311 +void jrelse(jnode * node /* jnode to release references to */ )
19312 +{
19313 +       struct page *page;
19314 +
19315 +       assert("nikita-487", node != NULL);
19316 +       assert_spin_not_locked(&(node->guard));
19317 +
19318 +       page = jnode_page(node);
19319 +       if (likely(page != NULL)) {
19320 +               /*
19321 +                * it is safe not to lock jnode here, because at this point
19322 +                * @node->d_count is greater than zero (if jrelse() is used
19323 +                * correctly, that is). JNODE_PARSED may be not set yet, if,
19324 +                * for example, we got here as a result of error handling path
19325 +                * in jload(). Anyway, page cannot be detached by
19326 +                * reiser4_releasepage(). truncate will invalidate page
19327 +                * regardless, but this should not be a problem.
19328 +                */
19329 +               kunmap(page);
19330 +       }
19331 +       jrelse_tail(node);
19332 +}
19333 +
19334 +/* called from jput_final() to wait until writeback of jnode's page is done */
19335 +static void jnode_finish_io(jnode * node)
19336 +{
19337 +       struct page *page;
19338 +
19339 +       assert("nikita-2922", node != NULL);
19340 +
19341 +       spin_lock_jnode(node);
19342 +       page = jnode_page(node);
19343 +       if (page != NULL) {
19344 +               page_cache_get(page);
19345 +               spin_unlock_jnode(node);
19346 +               wait_on_page_writeback(page);
19347 +               page_cache_release(page);
19348 +       } else
19349 +               spin_unlock_jnode(node);
19350 +}
19351 +
19352 +/*
19353 + * This is called by jput() when last reference to jnode is released. This is
19354 + * separate function, because we want fast path of jput() to be inline and,
19355 + * therefore, small.
19356 + */
19357 +void jput_final(jnode * node)
19358 +{
19359 +       int r_i_p;
19360 +
19361 +       /* A fast check for keeping node in cache. We always keep node in cache
19362 +        * if its page is present and node was not marked for deletion */
19363 +       if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19364 +               rcu_read_unlock();
19365 +               return;
19366 +       }
19367 +       r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19368 +       /*
19369 +        * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19370 +        * this case it is safe to access node after unlock.
19371 +        */
19372 +       rcu_read_unlock();
19373 +       if (r_i_p) {
19374 +               jnode_finish_io(node);
19375 +               if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19376 +                       /* node is removed from the tree. */
19377 +                       jdelete(node);
19378 +               else
19379 +                       jnode_try_drop(node);
19380 +       }
19381 +       /* if !r_i_p some other thread is already killing it */
19382 +}
19383 +
19384 +int jwait_io(jnode * node, int rw)
19385 +{
19386 +       struct page *page;
19387 +       int result;
19388 +
19389 +       assert("zam-447", node != NULL);
19390 +       assert("zam-448", jnode_page(node) != NULL);
19391 +
19392 +       page = jnode_page(node);
19393 +
19394 +       result = 0;
19395 +       if (rw == READ) {
19396 +               wait_on_page_locked(page);
19397 +       } else {
19398 +               assert("nikita-2227", rw == WRITE);
19399 +               wait_on_page_writeback(page);
19400 +       }
19401 +       if (PageError(page))
19402 +               result = RETERR(-EIO);
19403 +
19404 +       return result;
19405 +}
19406 +
19407 +/*
19408 + * jnode types and plugins.
19409 + *
19410 + * jnode by itself is a "base type". There are several different jnode
19411 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19412 + * has to do different things based on jnode type. In the standard reiser4 way
19413 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19414 + *
19415 + * Functions below deal with jnode types and define methods of jnode plugin.
19416 + *
19417 + */
19418 +
19419 +/* set jnode type. This is done during jnode initialization. */
19420 +static void jnode_set_type(jnode * node, jnode_type type)
19421 +{
19422 +       static unsigned long type_to_mask[] = {
19423 +               [JNODE_UNFORMATTED_BLOCK] = 1,
19424 +               [JNODE_FORMATTED_BLOCK] = 0,
19425 +               [JNODE_BITMAP] = 2,
19426 +               [JNODE_IO_HEAD] = 6,
19427 +               [JNODE_INODE] = 4
19428 +       };
19429 +
19430 +       assert("zam-647", type < LAST_JNODE_TYPE);
19431 +       assert("nikita-2815", !jnode_is_loaded(node));
19432 +       assert("nikita-3386", node->state == 0);
19433 +
19434 +       node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19435 +}
19436 +
19437 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19438 + * specific initialization. */
19439 +static int init_noinit(jnode * node UNUSED_ARG)
19440 +{
19441 +       return 0;
19442 +}
19443 +
19444 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19445 + * specific parsing. */
19446 +static int parse_noparse(jnode * node UNUSED_ARG)
19447 +{
19448 +       return 0;
19449 +}
19450 +
19451 +/* ->mapping() method for unformatted jnode */
19452 +struct address_space *mapping_jnode(const jnode * node)
19453 +{
19454 +       struct address_space *map;
19455 +
19456 +       assert("nikita-2713", node != NULL);
19457 +
19458 +       /* mapping is stored in jnode */
19459 +
19460 +       map = node->key.j.mapping;
19461 +       assert("nikita-2714", map != NULL);
19462 +       assert("nikita-2897", is_reiser4_inode(map->host));
19463 +       assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19464 +       return map;
19465 +}
19466 +
19467 +/* ->index() method for unformatted jnodes */
19468 +unsigned long index_jnode(const jnode * node)
19469 +{
19470 +       /* index is stored in jnode */
19471 +       return node->key.j.index;
19472 +}
19473 +
19474 +/* ->remove() method for unformatted jnodes */
19475 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19476 +{
19477 +       /* remove jnode from hash table and radix tree */
19478 +       if (node->key.j.mapping)
19479 +               unhash_unformatted_node_nolock(node);
19480 +}
19481 +
19482 +/* ->mapping() method for znodes */
19483 +static struct address_space *mapping_znode(const jnode * node)
19484 +{
19485 +       /* all znodes belong to fake inode */
19486 +       return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19487 +}
19488 +
19489 +/* ->index() method for znodes */
19490 +static unsigned long index_znode(const jnode * node)
19491 +{
19492 +       unsigned long addr;
19493 +       assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
19494 +
19495 +       /* index of znode is just its address (shifted) */
19496 +       addr = (unsigned long)node;
19497 +       return (addr - PAGE_OFFSET) >> znode_shift_order;
19498 +}
19499 +
19500 +/* ->mapping() method for bitmap jnode */
19501 +static struct address_space *mapping_bitmap(const jnode * node)
19502 +{
19503 +       /* all bitmap blocks belong to special bitmap inode */
19504 +       return get_super_private(jnode_get_tree(node)->super)->bitmap->
19505 +           i_mapping;
19506 +}
19507 +
19508 +/* ->index() method for jnodes that are indexed by address */
19509 +static unsigned long index_is_address(const jnode * node)
19510 +{
19511 +       unsigned long ind;
19512 +
19513 +       ind = (unsigned long)node;
19514 +       return ind - PAGE_OFFSET;
19515 +}
19516 +
19517 +/* resolve race with jput: returns @node, or NULL if it carries JNODE_RIP */
19518 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
19519 +{
19520 +       /*
19521 +        * This is used as part of RCU-based jnode handling.
19522 +        *
19523 +        * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
19524 +        * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
19525 +        * not protected during this, so concurrent thread may execute
19526 +        * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
19527 +        * freed in jput_final(). To avoid such races, jput_final() sets
19528 +        * JNODE_RIP on jnode (under tree lock). All places that work with
19529 +        * unreferenced jnodes call this function. It checks for JNODE_RIP bit
19530 +        * (first without the tree lock, then again under it), and if it is set,
19531 +        * releases reference acquired by the current thread and returns NULL.
19532 +        *
19533 +        * As a result, if jnode is being concurrently freed, NULL is returned
19534 +        * and caller should pretend that jnode wasn't found in the first
19535 +        * place.
19536 +        *
19537 +        * Otherwise it's safe to release "rcu-read-lock" and continue with
19538 +        * jnode.
19539 +        */
19540 +       if (unlikely(JF_ISSET(node, JNODE_RIP))) {
19541 +               read_lock_tree(tree);
19542 +               if (JF_ISSET(node, JNODE_RIP)) {
19543 +                       dec_x_ref(node);
19544 +                       node = NULL;
19545 +               }
19546 +               read_unlock_tree(tree);
19547 +       }
19548 +       return node;
19549 +}
19550 +
19551 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
19552 +{
19553 +       struct inode *inode;
19554 +       item_plugin *iplug;
19555 +       loff_t off;
19556 +
19557 +       assert("nikita-3092", node != NULL);
19558 +       assert("nikita-3093", key != NULL);
19559 +       assert("nikita-3094", jnode_is_unformatted(node));
19560 +
19561 +       off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
19562 +       inode = mapping_jnode(node)->host;
19563 +
19564 +       if (node->parent_item_id != 0)
19565 +               iplug = item_plugin_by_id(node->parent_item_id);
19566 +       else
19567 +               iplug = NULL;
19568 +
19569 +       if (iplug != NULL && iplug->f.key_by_offset)
19570 +               iplug->f.key_by_offset(inode, off, key);
19571 +       else {
19572 +               file_plugin *fplug;
19573 +
19574 +               fplug = inode_file_plugin(inode);
19575 +               assert("zam-1007", fplug != NULL);
19576 +               assert("zam-1008", fplug->key_by_inode != NULL);
19577 +
19578 +               fplug->key_by_inode(inode, off, key);
19579 +       }
19580 +
19581 +       return key;
19582 +}
19583 +
19584 +/* ->parse() method for formatted nodes */
19585 +static int parse_znode(jnode * node)
19586 +{
19587 +       return zparse(JZNODE(node));
19588 +}
19589 +
19590 +/* ->delete() method for formatted nodes */
19591 +static void delete_znode(jnode * node, reiser4_tree * tree)
19592 +{
19593 +       znode *z;
19594 +
19595 +       assert_rw_write_locked(&(tree->tree_lock));
19596 +       assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19597 +
19598 +       z = JZNODE(node);
19599 +       assert("vs-899", z->c_count == 0);
19600 +
19601 +       /* delete znode from sibling list. */
19602 +       sibling_list_remove(z);
19603 +
19604 +       znode_remove(z, tree);
19605 +}
19606 +
19607 +/* ->remove() method for formatted nodes */
19608 +static int remove_znode(jnode * node, reiser4_tree * tree)
19609 +{
19610 +       znode *z;
19611 +
19612 +       assert_rw_write_locked(&(tree->tree_lock));
19613 +       z = JZNODE(node);
19614 +
19615 +       if (z->c_count == 0) {
19616 +               /* detach znode from sibling list. */
19617 +               sibling_list_drop(z);
19618 +               /* this is called with tree spin-lock held, so call
19619 +                  znode_remove() directly (rather than znode_lock_remove()). */
19620 +               znode_remove(z, tree);
19621 +               return 0;
19622 +       }
19623 +       return RETERR(-EBUSY);
19624 +}
19625 +
19626 +/* ->init() method for formatted nodes */
19627 +static int init_znode(jnode * node)
19628 +{
19629 +       znode *z;
19630 +
19631 +       z = JZNODE(node);
19632 +       /* call node plugin to do actual initialization */
19633 +       return z->nplug->init(z);
19634 +}
19635 +
19636 +/* ->clone() method for formatted nodes */
19637 +static jnode *clone_formatted(jnode * node)
19638 +{
19639 +       znode *clone;
19640 +
19641 +       assert("vs-1430", jnode_is_znode(node));
19642 +       clone = zalloc(reiser4_ctx_gfp_mask_get());
19643 +       if (clone == NULL)
19644 +               return ERR_PTR(RETERR(-ENOMEM));
19645 +       zinit(clone, NULL, current_tree);
19646 +       jnode_set_block(ZJNODE(clone), jnode_get_block(node));
19647 +       /* ZJNODE(clone)->key.z is not initialized */
19648 +       clone->level = JZNODE(node)->level;
19649 +
19650 +       return ZJNODE(clone);
19651 +}
19652 +
19653 +/* jplug->clone for unformatted nodes */
19654 +static jnode *clone_unformatted(jnode * node)
19655 +{
19656 +       jnode *clone;
19657 +
19658 +       assert("vs-1431", jnode_is_unformatted(node));
19659 +       clone = jalloc();
19660 +       if (clone == NULL)
19661 +               return ERR_PTR(RETERR(-ENOMEM));
19662 +
19663 +       jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
19664 +       jnode_set_block(clone, jnode_get_block(node));
19665 +
19666 +       return clone;
19667 +
19668 +}
19669 +
19670 +/*
19671 + * Setup jnode plugin methods for various jnode types.
19672 + */
19673 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
19674 +       [JNODE_UNFORMATTED_BLOCK] = {
19675 +               .h = {
19676 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19677 +                       .id = JNODE_UNFORMATTED_BLOCK,
19678 +                       .pops = NULL,
19679 +                       .label = "unformatted",
19680 +                       .desc = "unformatted node",
19681 +                       .linkage = {NULL, NULL}
19682 +               },
19683 +               .init = init_noinit,
19684 +               .parse = parse_noparse,
19685 +               .mapping = mapping_jnode,
19686 +               .index = index_jnode,
19687 +               .clone = clone_unformatted
19688 +       },
19689 +       [JNODE_FORMATTED_BLOCK] = {
19690 +               .h = {
19691 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19692 +                       .id = JNODE_FORMATTED_BLOCK,
19693 +                       .pops = NULL,
19694 +                       .label = "formatted",
19695 +                       .desc = "formatted tree node",
19696 +                       .linkage = {NULL, NULL}
19697 +               },
19698 +               .init = init_znode,
19699 +               .parse = parse_znode,
19700 +               .mapping = mapping_znode,
19701 +               .index = index_znode,
19702 +               .clone = clone_formatted
19703 +       },
19704 +       [JNODE_BITMAP] = {
19705 +               .h = {
19706 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19707 +                       .id = JNODE_BITMAP,
19708 +                       .pops = NULL,
19709 +                       .label = "bitmap",
19710 +                       .desc = "bitmap node",
19711 +                       .linkage = {NULL, NULL}
19712 +               },
19713 +               .init = init_noinit,
19714 +               .parse = parse_noparse,
19715 +               .mapping = mapping_bitmap,
19716 +               .index = index_is_address,
19717 +               .clone = NULL
19718 +       },
19719 +       [JNODE_IO_HEAD] = {
19720 +               .h = {
19721 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19722 +                       .id = JNODE_IO_HEAD,
19723 +                       .pops = NULL,
19724 +                       .label = "io head",
19725 +                       .desc = "io head",
19726 +                       .linkage = {NULL, NULL}
19727 +               },
19728 +               .init = init_noinit,
19729 +               .parse = parse_noparse,
19730 +               .mapping = mapping_bitmap,
19731 +               .index = index_is_address,
19732 +               .clone = NULL
19733 +       },
19734 +       [JNODE_INODE] = {
19735 +               .h = {
19736 +                       .type_id = REISER4_JNODE_PLUGIN_TYPE,
19737 +                       .id = JNODE_INODE,
19738 +                       .pops = NULL,
19739 +                       .label = "inode",
19740 +                       .desc = "inode's builtin jnode",
19741 +                       .linkage = {NULL, NULL}
19742 +               },
19743 +               .init = NULL,
19744 +               .parse = NULL,
19745 +               .mapping = NULL,
19746 +               .index = NULL,
19747 +               .clone = NULL
19748 +       }
19749 +};
19750 +
19751 +/*
19752 + * jnode destruction.
19753 + *
19754 + * Thread may use a jnode after it acquired a reference to it. References are
19755 + * counted in ->x_count field. Reference protects jnode from being
19756 + * recycled. This is different from protecting jnode data (that are stored in
19757 + * jnode page) from being evicted from memory. Data are protected by jload()
19758 + * and released by jrelse().
19759 + *
19760 + * If thread already possesses a reference to the jnode it can acquire another
19761 + * one through jref(). Initial reference is obtained (usually) by locating
19762 + * jnode in some indexing structure that depends on jnode type: formatted
19763 + * nodes are kept in global hash table, where they are indexed by block
19764 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
19765 + * table, which is indexed by oid and offset within file, and in per-inode
19766 + * radix tree.
19767 + *
19768 + * Reference to jnode is released by jput(). If last reference is released,
19769 + * jput_final() is called. This function determines whether jnode has to be
19770 + * deleted (this happens when corresponding node is removed from the file
19771 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
19772 + * should be just "removed" (deleted from memory).
19773 + *
19774 + * Jnode destruction is a signally delicate dance because of locking and RCU.
19775 + */
19776 +
19777 +/*
19778 + * Returns true if jnode cannot be removed right now. This check is called
19779 + * under tree lock. If it returns false, jnode is irrevocably committed to be
19780 + * deleted/removed.
19781 + */
19782 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
19783 +{
19784 +       /* if other thread managed to acquire a reference to this jnode, don't
19785 +        * free it. */
19786 +       if (atomic_read(&node->x_count) > 0)
19787 +               return 1;
19788 +       /* also, don't free znode that has children in memory */
19789 +       if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
19790 +               return 1;
19791 +       return 0;
19792 +}
19793 +
19794 +/*
19795 + * this is called as part of removing jnode. Based on jnode type, call
19796 + * corresponding function that removes jnode from indices and returns it back
19797 + * to the appropriate slab (through RCU).
19798 + */
19799 +static inline void
19800 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
19801 +{
19802 +       switch (jtype) {
19803 +       case JNODE_UNFORMATTED_BLOCK:
19804 +               remove_jnode(node, tree);
19805 +               break;
19806 +       case JNODE_IO_HEAD:
19807 +       case JNODE_BITMAP:
19808 +               break;
19809 +       case JNODE_INODE:
19810 +               break;
19811 +       case JNODE_FORMATTED_BLOCK:
19812 +               remove_znode(node, tree);
19813 +               break;
19814 +       default:
19815 +               wrong_return_value("nikita-3196", "Wrong jnode type");
19816 +       }
19817 +}
19818 +
19819 +/*
19820 + * this is called as part of deleting jnode. Based on jnode type, call
19821 + * corresponding function that removes jnode from indices; the caller then
19822 + * returns it to the appropriate slab with jnode_free() (see jdelete()).
19823 + *
19824 + * This differs from jnode_remove() for formatted nodes (sibling list handling
19825 + * is different for removal and deletion) and for JNODE_INODE, rejected here.
19826 + */
19827 +static inline void
19828 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree)
19829 +{
19830 +       switch (jtype) {
19831 +       case JNODE_UNFORMATTED_BLOCK:
19832 +               remove_jnode(node, tree);
19833 +               break;
19834 +       case JNODE_IO_HEAD:
19835 +       case JNODE_BITMAP:
19836 +               break;
19837 +       case JNODE_FORMATTED_BLOCK:
19838 +               delete_znode(node, tree);
19839 +               break;
19840 +       case JNODE_INODE:
19841 +       default:
19842 +               wrong_return_value("nikita-3195", "Wrong jnode type");
19843 +       }
19844 +}
19845 +
19846 +#if REISER4_DEBUG
19847 +/*
19848 + * remove jnode from the debugging list of all jnodes hanging off super-block.
19849 + */
19850 +void jnode_list_remove(jnode * node)
19851 +{
19852 +       reiser4_super_info_data *sbinfo;
19853 +
19854 +       sbinfo = get_super_private(jnode_get_tree(node)->super);
19855 +
19856 +       spin_lock_irq(&sbinfo->all_guard);
19857 +       assert("nikita-2422", !list_empty(&node->jnodes));
19858 +       list_del_init(&node->jnodes);
19859 +       spin_unlock_irq(&sbinfo->all_guard);
19860 +}
19861 +#endif
19862 +
19863 +/*
19864 + * this is called by jput_final() to remove jnode when last reference to it is
19865 + * released.
19866 + */
19867 +static int jnode_try_drop(jnode * node)
19868 +{
19869 +       int result;
19870 +       reiser4_tree *tree;
19871 +       jnode_type jtype;
19872 +
19873 +       assert("nikita-2491", node != NULL);
19874 +       assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
19875 +
19876 +       tree = jnode_get_tree(node);
19877 +       jtype = jnode_get_type(node);
19878 +
19879 +       spin_lock_jnode(node);
19880 +       write_lock_tree(tree);
19881 +       /*
19882 +        * if jnode has a page---leave it alone. Memory pressure will
19883 +        * eventually kill page and jnode.
19884 +        */
19885 +       if (jnode_page(node) != NULL) {
19886 +               write_unlock_tree(tree);
19887 +               spin_unlock_jnode(node);
19888 +               JF_CLR(node, JNODE_RIP);
19889 +               return RETERR(-EBUSY);
19890 +       }
19891 +
19892 +       /* re-check ->x_count under tree lock. */
19893 +       result = jnode_is_busy(node, jtype);
19894 +       if (result == 0) {
19895 +               assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19896 +               assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
19897 +
19898 +               spin_unlock_jnode(node);
19899 +               /* no page and no references---despatch him. */
19900 +               jnode_remove(node, jtype, tree);
19901 +               write_unlock_tree(tree);
19902 +               jnode_free(node, jtype);
19903 +       } else {
19904 +               /* busy check failed: reference was acquired by concurrent
19905 +                * thread. */
19906 +               write_unlock_tree(tree);
19907 +               spin_unlock_jnode(node);
19908 +               JF_CLR(node, JNODE_RIP);
19909 +       }
19910 +       return result;
19911 +}
19912 +
19913 +/* jdelete() -- Delete jnode from the tree and file system */
19914 +static int jdelete(jnode * node /* jnode to finish with */ )
19915 +{
19916 +       struct page *page;
19917 +       int result;
19918 +       reiser4_tree *tree;
19919 +       jnode_type jtype;
19920 +
19921 +       assert("nikita-467", node != NULL);
19922 +       assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
19923 +
19924 +       jtype = jnode_get_type(node);
19925 +
19926 +       page = jnode_lock_page(node);
19927 +       assert_spin_locked(&(node->guard));
19928 +
19929 +       tree = jnode_get_tree(node);
19930 +
19931 +       write_lock_tree(tree);
19932 +       /* re-check ->x_count under tree lock. */
19933 +       result = jnode_is_busy(node, jtype);
19934 +       if (likely(!result)) {
19935 +               assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
19936 +               assert("jmacd-511", atomic_read(&node->d_count) == 0);
19937 +
19938 +               /* detach page */
19939 +               if (page != NULL) {
19940 +                       /*
19941 +                        * FIXME this is racy against jnode_extent_write().
19942 +                        */
19943 +                       page_clear_jnode(page, node);
19944 +               }
19945 +               spin_unlock_jnode(node);
19946 +               /* goodbye */
19947 +               jnode_delete(node, jtype, tree);
19948 +               write_unlock_tree(tree);
19949 +               jnode_free(node, jtype);
19950 +               /* @node is no longer valid pointer */
19951 +               if (page != NULL)
19952 +                       reiser4_drop_page(page);
19953 +       } else {
19954 +               /* busy check failed: reference was acquired by concurrent
19955 +                * thread. */
19956 +               JF_CLR(node, JNODE_RIP);
19957 +               write_unlock_tree(tree);
19958 +               spin_unlock_jnode(node);
19959 +               if (page != NULL)
19960 +                       unlock_page(page);
19961 +       }
19962 +       return result;
19963 +}
19964 +
19965 +/* drop jnode on the floor.
19966 +
19967 +   Return value:
19968 +
19969 +    -EBUSY:  failed to drop jnode, because there are still references to it
19970 +
19971 +    0:       successfully dropped jnode
19972 +
19973 +*/
19974 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
19975 +{
19976 +       struct page *page;
19977 +       jnode_type jtype;
19978 +       int result;
19979 +
19980 +       assert("zam-602", node != NULL);
19981 +       assert_rw_not_read_locked(&(tree->tree_lock));
19982 +       assert_rw_not_write_locked(&(tree->tree_lock));
19983 +       assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
19984 +
19985 +       jtype = jnode_get_type(node);
19986 +
19987 +       page = jnode_lock_page(node);
19988 +       assert_spin_locked(&(node->guard));
19989 +
19990 +       write_lock_tree(tree);
19991 +
19992 +       /* re-check ->x_count under tree lock. */
19993 +       result = jnode_is_busy(node, jtype);
19994 +       if (!result) {
19995 +               assert("nikita-2488", page == jnode_page(node));
19996 +               assert("nikita-2533", atomic_read(&node->d_count) == 0);
19997 +               if (page != NULL) {
19998 +                       assert("nikita-2126", !PageDirty(page));
19999 +                       assert("nikita-2127", PageUptodate(page));
20000 +                       assert("nikita-2181", PageLocked(page));
20001 +                       page_clear_jnode(page, node);
20002 +               }
20003 +               spin_unlock_jnode(node);
20004 +               jnode_remove(node, jtype, tree);
20005 +               write_unlock_tree(tree);
20006 +               jnode_free(node, jtype);
20007 +               if (page != NULL) {
20008 +                       reiser4_drop_page(page);
20009 +               }
20010 +       } else {
20011 +               /* busy check failed: reference was acquired by concurrent
20012 +                * thread. */
20013 +               JF_CLR(node, JNODE_RIP);
20014 +               write_unlock_tree(tree);
20015 +               spin_unlock_jnode(node);
20016 +               if (page != NULL)
20017 +                       unlock_page(page);
20018 +       }
20019 +       return result;
20020 +}
20021 +
20022 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20023 +   be 0 (where applicable).  */
20024 +void jdrop(jnode * node)
20025 +{
20026 +       jdrop_in_tree(node, jnode_get_tree(node));
20027 +}
20028 +
20029 +/* IO head jnode implementation: io heads are simple j-nodes (not in any hash
20030 +   table) used just for reading from and writing to disk. Returns a new io
20031 +   head for @block with a reference taken by jref(), or NULL if jalloc() fails. */
20032 +
20033 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20034 +{
20035 +       jnode *jal = jalloc();
20036 +
20037 +       /* allocation failed: return NULL without touching the refcount */
20038 +       if (jal == NULL)
20039 +               return NULL;
20040 +
20041 +       jnode_init(jal, current_tree, JNODE_IO_HEAD);
20042 +       jnode_set_block(jal, block);
20043 +       jref(jal);      /* reference handed to the caller */
20044 +       return jal;
20045 +}
20046 +
20047 +void reiser4_drop_io_head(jnode * node)
20048 +{
20049 +       assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20050 +
20051 +       jput(node);
20052 +       jdrop(node);
20053 +}
20054 +
20055 +/* protect jnode data from reiser4_releasepage() by pinning its page */
20056 +void pin_jnode_data(jnode * node)
20057 +{
20058 +       assert("zam-671", jnode_page(node) != NULL);
20059 +       page_cache_get(jnode_page(node));
20060 +}
20061 +
20062 +/* make jnode data free-able again */
20063 +void unpin_jnode_data(jnode * node)
20064 +{
20065 +       assert("zam-672", jnode_page(node) != NULL);
20066 +       page_cache_release(jnode_page(node));
20067 +}
20068 +
20069 +struct address_space *jnode_get_mapping(const jnode * node)
20070 +{
20071 +       assert("nikita-3162", node != NULL);
20072 +       return jnode_ops(node)->mapping(node);
20073 +}
20074 +
20075 +#if REISER4_DEBUG
20076 +/* debugging aid: jnode invariant; on failure *msg names the failed check */
20077 +int jnode_invariant_f(const jnode * node, char const **msg)
20078 +{
20079 +#define _ergo(ant, con)                                                \
20080 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20081 +#define _check(exp) ((*msg) = #exp, (exp))
20082 +
20083 +       return _check(node != NULL) &&
20084 +           /* [jnode-queued] */
20085 +           /* only relocated node can be queued, except that when znode
20086 +            * is being deleted, its JNODE_RELOC bit is cleared */
20087 +           _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20088 +                 JF_ISSET(node, JNODE_RELOC) ||
20089 +                 JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20090 +           _check(node->jnodes.prev != NULL) &&
20091 +           _check(node->jnodes.next != NULL) &&
20092 +           /* [jnode-dirty] invariant */
20093 +           /* dirty jnode is part of atom */
20094 +           _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20095 +           /* [jnode-oid] invariant */
20096 +           /* for unformatted node ->objectid and ->mapping fields are
20097 +            * consistent */
20098 +           _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20099 +                 node->key.j.objectid ==
20100 +                 get_inode_oid(node->key.j.mapping->host)) &&
20101 +           /* [jnode-atom-valid] invariant */
20102 +           /* node atom has valid state */
20103 +           _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20104 +           /* [jnode-page-binding] invariant */
20105 +           /* if node points to page, it points back to node */
20106 +           _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20107 +           /* [jnode-refs] invariant */
20108 +           /* only referenced jnode can be loaded */
20109 +           _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20110 +
20111 +}
20112 +
20113 +static const char *jnode_type_name(jnode_type type)
20114 +{
20115 +       switch (type) {
20116 +       case JNODE_UNFORMATTED_BLOCK:
20117 +               return "unformatted";
20118 +       case JNODE_FORMATTED_BLOCK:
20119 +               return "formatted";
20120 +       case JNODE_BITMAP:
20121 +               return "bitmap";
20122 +       case JNODE_IO_HEAD:
20123 +               return "io head";
20124 +       case JNODE_INODE:
20125 +               return "inode";
20126 +       case LAST_JNODE_TYPE:
20127 +               return "last";
20128 +       default:{
20129 +                       static char unknown[30];
20130 +
20131 +                       sprintf(unknown, "unknown %i", type);
20132 +                       return unknown;
20133 +               }
20134 +       }
20135 +}
20136 +
20137 +#define jnode_state_name( node, flag )                 \
20138 +       ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20139 +
20140 +/* debugging aid: output human readable information about @node */
20141 +static void info_jnode(const char *prefix /* prefix to print */ ,
20142 +                      const jnode * node /* node to print */ )
20143 +{
20144 +       assert("umka-068", prefix != NULL);
20145 +
20146 +       if (node == NULL) {
20147 +               printk("%s: null\n", prefix);
20148 +               return;
20149 +       }
20150 +
20151 +       printk
20152 +           ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20153 +            " block: %s, d_count: %d, x_count: %d, "
20154 +            "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20155 +            node->state,
20156 +            jnode_state_name(node, JNODE_PARSED),
20157 +            jnode_state_name(node, JNODE_HEARD_BANSHEE),
20158 +            jnode_state_name(node, JNODE_LEFT_CONNECTED),
20159 +            jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20160 +            jnode_state_name(node, JNODE_ORPHAN),
20161 +            jnode_state_name(node, JNODE_CREATED),
20162 +            jnode_state_name(node, JNODE_RELOC),
20163 +            jnode_state_name(node, JNODE_OVRWR),
20164 +            jnode_state_name(node, JNODE_DIRTY),
20165 +            jnode_state_name(node, JNODE_IS_DYING),
20166 +            jnode_state_name(node, JNODE_RIP),
20167 +            jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20168 +            jnode_state_name(node, JNODE_WRITEBACK),
20169 +            jnode_state_name(node, JNODE_NEW),
20170 +            jnode_state_name(node, JNODE_DKSET),
20171 +            jnode_state_name(node, JNODE_REPACK),
20172 +            jnode_state_name(node, JNODE_CLUSTER_PAGE),
20173 +            jnode_get_level(node), sprint_address(jnode_get_block(node)),
20174 +            atomic_read(&node->d_count), atomic_read(&node->x_count),
20175 +            jnode_page(node), node->atom, 0, 0,
20176 +            jnode_type_name(jnode_get_type(node)));
20177 +       if (jnode_is_unformatted(node)) {
20178 +               printk("inode: %llu, index: %lu, ",
20179 +                      node->key.j.objectid, node->key.j.index);
20180 +       }
20181 +}
20182 +
20183 +/* debugging aid: returns true if jnode invariant holds, warns otherwise */
20184 +static int jnode_invariant(jnode * node, int tlocked, int jlocked)
20185 +{
20186 +       char const *failed_msg;
20187 +       int result;
20188 +       reiser4_tree *tree;
20189 +
20190 +       /* validate @node before it is handed to jnode_get_tree() */
20191 +       assert("umka-063312", node != NULL);
20192 +       tree = jnode_get_tree(node);
20193 +       assert("umka-064321", tree != NULL);
20194 +
20195 +       if (!jlocked && !tlocked)
20196 +               spin_lock_jnode((jnode *) node);
20197 +       if (!tlocked)
20198 +               read_lock_tree(tree);
20199 +       result = jnode_invariant_f(node, &failed_msg);
20200 +       if (!result) {
20201 +               info_jnode("corrupted node", node);
20202 +               warning("jmacd-555", "Condition %s failed", failed_msg);
20203 +       }
20204 +       if (!tlocked)
20205 +               read_unlock_tree(tree);
20206 +       if (!jlocked && !tlocked)
20207 +               spin_unlock_jnode((jnode *) node);
20208 +       return result;
20209 +}
20210 +
20211 +#endif                         /* REISER4_DEBUG */
20212 +
20213 +/* Make Linus happy.
20214 +   Local variables:
20215 +   c-indentation-style: "K&R"
20216 +   mode-name: "LC"
20217 +   c-basic-offset: 8
20218 +   tab-width: 8
20219 +   fill-column: 80
20220 +   End:
20221 +*/
20222 diff -urN linux-2.6.27.orig/fs/reiser4/jnode.h linux-2.6.27/fs/reiser4/jnode.h
20223 --- linux-2.6.27.orig/fs/reiser4/jnode.h        1970-01-01 03:00:00.000000000 +0300
20224 +++ linux-2.6.27/fs/reiser4/jnode.h     2008-10-12 18:20:00.000000000 +0400
20225 @@ -0,0 +1,702 @@
20226 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20227 + * reiser4/README */
20228 +
20229 +/* Declaration of jnode. See jnode.c for details. */
20230 +
20231 +#ifndef __JNODE_H__
20232 +#define __JNODE_H__
20233 +
20234 +#include "forward.h"
20235 +#include "type_safe_hash.h"
20236 +#include "txnmgr.h"
20237 +#include "key.h"
20238 +#include "debug.h"
20239 +#include "dformat.h"
20240 +#include "page_cache.h"
20241 +#include "context.h"
20242 +
20243 +#include "plugin/plugin.h"
20244 +
20245 +#include <linux/fs.h>
20246 +#include <linux/mm.h>
20247 +#include <linux/spinlock.h>
20248 +#include <asm/atomic.h>
20249 +#include <linux/bitops.h>
20250 +#include <linux/list.h>
20251 +#include <linux/rcupdate.h>
20252 +
20253 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20254 +   nodes)  */
20255 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20256 +
20257 +/* declare hash table of znodes */
20258 +TYPE_SAFE_HASH_DECLARE(z, znode);
20259 +
20260 +struct jnode_key {
20261 +       __u64 objectid;
20262 +       unsigned long index;
20263 +       struct address_space *mapping;
20264 +};
20265 +
20266 +/*
20267 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20268 +   be exactly the node we use for unformatted tree nodes.
20269 +
20270 +   Jnode provides following basic functionality:
20271 +
20272 +   . reference counting and indexing.
20273 +
20274 +   . integration with page cache. Jnode has ->pg reference to which page can
20275 +   be attached.
20276 +
20277 +   . interface to transaction manager. It is jnode that is kept in transaction
20278 +   manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20279 +   means, there should be special type of jnode for inode.)
20280 +
20281 +   Locking:
20282 +
20283 +   Spin lock: the following fields are protected by the per-jnode spin lock:
20284 +
20285 +    ->state
20286 +    ->atom
20287 +    ->capture_link
20288 +
20289 +   Following fields are protected by the global tree lock:
20290 +
20291 +    ->link
20292 +    ->key.z (content of ->key.z is only changed in znode_rehash())
20293 +    ->key.j
20294 +
20295 +   Atomic counters
20296 +
20297 +    ->x_count
20298 +    ->d_count
20299 +
20300 +    ->pg, and ->data are protected by spin lock for unused jnode and are
20301 +    immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20302 +    is false).
20303 +
20304 +    ->tree is immutable after creation
20305 +
20306 +   Unclear
20307 +
20308 +    ->blocknr: should be under jnode spin-lock, but current interface is based
20309 +    on passing of block address.
20310 +
20311 +   If you ever need to spin lock two nodes at once, do this in "natural"
20312 +   memory order: lock znode with lower address first. (See lock_two_nodes().)
20313 +
20314 +   Invariants involving this data-type:
20315 +
20316 +      [jnode-dirty]
20317 +      [jnode-refs]
20318 +      [jnode-oid]
20319 +      [jnode-queued]
20320 +      [jnode-atom-valid]
20321 +      [jnode-page-binding]
20322 +*/
20323 +
20324 +struct jnode {
20325 +#if REISER4_DEBUG
20326 +#define JMAGIC 0x52654973      /* "ReIs" */
20327 +       int magic;
20328 +#endif
20329 +       /* FIRST CACHE LINE (16 bytes): data used by jload */
20330 +
20331 +       /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20332 +       /*   0 */ unsigned long state;
20333 +
20334 +       /* lock, protecting jnode's fields. */
20335 +       /*   4 */ spinlock_t load;
20336 +
20337 +       /* counter of references to jnode itself. Increased on jref().
20338 +          Decreased on jput().
20339 +        */
20340 +       /*   8 */ atomic_t x_count;
20341 +
20342 +       /* counter of references to jnode's data. Pin data page(s) in
20343 +          memory while this is greater than 0. Increased on jload().
20344 +          Decreased on jrelse().
20345 +        */
20346 +       /*   12 */ atomic_t d_count;
20347 +
20348 +       /* SECOND CACHE LINE: data used by hash table lookups */
20349 +
20350 +       /*   16 */ union {
20351 +               /* znodes are hashed by block number */
20352 +               reiser4_block_nr z;
20353 +               /* unformatted nodes are hashed by mapping plus offset */
20354 +               struct jnode_key j;
20355 +       } key;
20356 +
20357 +       /* THIRD CACHE LINE */
20358 +
20359 +       /*   32 */ union {
20360 +               /* pointers to maintain hash-table */
20361 +               z_hash_link z;
20362 +               j_hash_link j;
20363 +       } link;
20364 +
20365 +       /* pointer to jnode page.  */
20366 +       /*   36 */ struct page *pg;
20367 +       /* pointer to node itself. This is page_address(node->pg) when page is
20368 +          attached to the jnode
20369 +        */
20370 +       /*   40 */ void *data;
20371 +
20372 +       /*   44 */ reiser4_tree *tree;
20373 +
20374 +       /* FOURTH CACHE LINE: atom related fields */
20375 +
20376 +       /*   48 */ spinlock_t guard;
20377 +
20378 +       /* atom the block is in, if any */
20379 +       /*   52 */ txn_atom *atom;
20380 +
20381 +       /* capture list */
20382 +       /*   56 */ struct list_head capture_link;
20383 +
20384 +       /* FIFTH CACHE LINE */
20385 +
20386 +       /*   64 */ struct rcu_head rcu;
20387 +       /* crosses cache line */
20388 +
20389 +       /* SIXTH CACHE LINE */
20390 +
20391 +       /* the real blocknr (where io is going to/from) */
20392 +       /*   80 */ reiser4_block_nr blocknr;
20393 +       /* Parent item type, unformatted and CRC need it for offset => key conversion.  */
20394 +       /* NOTE: this parent_item_id looks like jnode type. */
20395 +       /*   88 */ reiser4_plugin_id parent_item_id;
20396 +       /*   92 */
20397 +#if REISER4_DEBUG
20398 +       /* list of all jnodes for debugging purposes. */
20399 +       struct list_head jnodes;
20400 +       /* how many times this jnode was written in one transaction */
20401 +       int written;
20402 +       /* this indicates which atom's list the jnode is on */
20403 +       atom_list list;
20404 +#endif
20405 +} __attribute__ ((aligned(16)));
20406 +
20407 +/*
20408 + * jnode types. Enumeration of existing jnode types.
20409 + */
20410 +typedef enum {
20411 +       JNODE_UNFORMATTED_BLOCK,        /* unformatted block */
20412 +       JNODE_FORMATTED_BLOCK,  /* formatted block, znode */
20413 +       JNODE_BITMAP,           /* bitmap */
20414 +       JNODE_IO_HEAD,          /* jnode representing a block in the
20415 +                                * wandering log */
20416 +       JNODE_INODE,            /* jnode embedded into inode */
20417 +       LAST_JNODE_TYPE
20418 +} jnode_type;
20419 +
20420 +/* jnode states */
20421 +typedef enum {
20422 +       /* jnode's page is loaded and data checked */
20423 +       JNODE_PARSED = 0,
20424 +       /* node was deleted, not all locks on it were released. This
20425 +          node is empty and is going to be removed from the tree
20426 +          shortly. */
20427 +       JNODE_HEARD_BANSHEE = 1,
20428 +       /* left sibling pointer is valid */
20429 +       JNODE_LEFT_CONNECTED = 2,
20430 +       /* right sibling pointer is valid */
20431 +       JNODE_RIGHT_CONNECTED = 3,
20432 +
20433 +       /* znode was just created and doesn't yet have a pointer from
20434 +          its parent */
20435 +       JNODE_ORPHAN = 4,
20436 +
20437 +       /* this node was created by its transaction and has not been assigned
20438 +          a block address. */
20439 +       JNODE_CREATED = 5,
20440 +
20441 +       /* this node is currently relocated */
20442 +       JNODE_RELOC = 6,
20443 +       /* this node is currently wandered */
20444 +       JNODE_OVRWR = 7,
20445 +
20446 +       /* this znode has been modified */
20447 +       JNODE_DIRTY = 8,
20448 +
20449 +       /* znode lock is being invalidated */
20450 +       JNODE_IS_DYING = 9,
20451 +
20452 +       /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20453 +
20454 +       /* jnode is queued for flushing. */
20455 +       JNODE_FLUSH_QUEUED = 12,
20456 +
20457 +       /* In the following bits jnode type is encoded. */
20458 +       JNODE_TYPE_1 = 13,
20459 +       JNODE_TYPE_2 = 14,
20460 +       JNODE_TYPE_3 = 15,
20461 +
20462 +       /* jnode is being destroyed */
20463 +       JNODE_RIP = 16,
20464 +
20465 +       /* znode was not captured during locking (it might so be because
20466 +          ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20467 +       JNODE_MISSED_IN_CAPTURE = 17,
20468 +
20469 +       /* write is in progress */
20470 +       JNODE_WRITEBACK = 18,
20471 +
20472 +       /* FIXME: now it is used by crypto-compress plugin only */
20473 +       JNODE_NEW = 19,
20474 +
20475 +       /* delimiting keys are already set for this znode. */
20476 +       JNODE_DKSET = 20,
20477 +
20478 +       /* when this bit is set page and jnode can not be disconnected */
20479 +       JNODE_WRITE_PREPARED = 21,
20480 +
20481 +       JNODE_CLUSTER_PAGE = 22,
20482 +       /* Jnode is marked for repacking, that means the reiser4 flush and the
20483 +        * block allocator should process this node in a special way */
20484 +       JNODE_REPACK = 23,
20485 +       /* node should be converted by flush in squalloc phase */
20486 +       JNODE_CONVERTIBLE = 24,
20487 +       /*
20488 +        * When jnode is dirtied for the first time in given transaction,
20489 +        * do_jnode_make_dirty() checks whether this jnode can possibly become
20490 +        * member of overwrite set. If so, this bit is set, and one block is
20491 +        * reserved in the ->flush_reserved space of atom.
20492 +        *
20493 +        * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
20494 +        *
20495 +        *     (1) flush decides that we want this block to go into relocate
20496 +        *     set after all.
20497 +        *
20498 +        *     (2) wandering log is allocated (by log writer)
20499 +        *
20500 +        *     (3) extent is allocated
20501 +        *
20502 +        */
20503 +       JNODE_FLUSH_RESERVED = 29
20504 +} reiser4_jnode_state;
20505 +
20506 +/* Macros for accessing the jnode state. */
20507 +
20508 +static inline void JF_CLR(jnode * j, int f)
20509 +{
20510 +       assert("unknown-1", j->magic == JMAGIC);
20511 +       clear_bit(f, &j->state);
20512 +}
20513 +static inline int JF_ISSET(const jnode * j, int f)
20514 +{
20515 +       assert("unknown-2", j->magic == JMAGIC);
20516 +       return test_bit(f, &((jnode *) j)->state);
20517 +}
20518 +static inline void JF_SET(jnode * j, int f)
20519 +{
20520 +       assert("unknown-3", j->magic == JMAGIC);
20521 +       set_bit(f, &j->state);
20522 +}
20523 +
20524 +static inline int JF_TEST_AND_SET(jnode * j, int f)
20525 +{
20526 +       assert("unknown-4", j->magic == JMAGIC);
20527 +       return test_and_set_bit(f, &j->state);
20528 +}
20529 +
20530 +static inline void spin_lock_jnode(jnode *node)
20531 +{
20532 +       /* check that spinlocks of lower priorities are not held */
20533 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
20534 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
20535 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
20536 +                   LOCK_CNT_NIL(rw_locked_dk) &&
20537 +                   LOCK_CNT_LT(spin_locked_jnode, 2)));
20538 +
20539 +       spin_lock(&(node->guard));
20540 +
20541 +       LOCK_CNT_INC(spin_locked_jnode);
20542 +       LOCK_CNT_INC(spin_locked);
20543 +}
20544 +
20545 +static inline void spin_unlock_jnode(jnode *node)
20546 +{
20547 +       assert_spin_locked(&(node->guard));
20548 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
20549 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
20550 +
20551 +       LOCK_CNT_DEC(spin_locked_jnode);
20552 +       LOCK_CNT_DEC(spin_locked);
20553 +
20554 +       spin_unlock(&(node->guard));
20555 +}
20556 +
20557 +static inline int jnode_is_in_deleteset(const jnode * node)
20558 +{
20559 +       return JF_ISSET(node, JNODE_RELOC);
20560 +}
20561 +
20562 +extern int init_jnodes(void);
20563 +extern void done_jnodes(void);
20564 +
20565 +/* Jnode routines */
20566 +extern jnode *jalloc(void);
20567 +extern void jfree(jnode * node) NONNULL;
20568 +extern jnode *jclone(jnode *);
20569 +extern jnode *jlookup(reiser4_tree * tree,
20570 +                     oid_t objectid, unsigned long ind) NONNULL;
20571 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
20572 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
20573 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
20574 +void jnode_attach_page(jnode * node, struct page *pg);
20575 +
20576 +void unhash_unformatted_jnode(jnode *);
20577 +extern jnode *page_next_jnode(jnode * node) NONNULL;
20578 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
20579 +extern void jnode_make_dirty(jnode * node) NONNULL;
20580 +extern void jnode_make_clean(jnode * node) NONNULL;
20581 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
20582 +extern void jnode_make_wander(jnode *) NONNULL;
20583 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
20584 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
20585 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
20586 +
20587 +/**
20588 + * jnode_get_block
20589 + * @node: jnode to query
20590 + *
20591 + */
20592 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
20593 +{
20594 +       assert("nikita-528", node != NULL);
20595 +
20596 +       return &node->blocknr;
20597 +}
20598 +
20599 +/**
20600 + * jnode_set_block
20601 + * @node: jnode to update
20602 + * @blocknr: new block nr
20603 + */
20604 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
20605 +{
20606 +       assert("nikita-2020", node != NULL);
20607 +       assert("umka-055", blocknr != NULL);
20608 +       node->blocknr = *blocknr;
20609 +}
20610 +
20611 +
20612 +/* block number for IO. The original note says this differs from
20613 + * jnode_get_block() for emergency-flushed (eflush) jnodes; as written here it
20614 + * always returns jnode_get_block(). The jnode spin lock must be held. */
20615 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
20616 +{
20617 +       assert("nikita-2768", node != NULL);
20618 +       assert_spin_locked(&(node->guard));
20619 +
20620 +       return jnode_get_block(node);
20621 +}
20622 +
20623 +/* Jnode flush interface. */
20624 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
20625 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
20626 +
20627 +/* FIXME-VS: these are used in plugin/item/extent.c */
20628 +
20629 +/* does extent_get_block have to be called */
20630 +#define jnode_mapped(node)     JF_ISSET (node, JNODE_MAPPED)
20631 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
20632 +
20633 +/* the node should be converted during flush squalloc phase */
20634 +#define jnode_convertible(node)        JF_ISSET (node, JNODE_CONVERTIBLE)
20635 +#define jnode_set_convertible(node)    JF_SET (node, JNODE_CONVERTIBLE)
20636 +
20637 +/* Macros to convert from jnode to znode, znode to jnode.  These are macros
20638 +   because C doesn't allow overloading of const prototypes. */
20639 +#define ZJNODE(x) (& (x) -> zjnode)
20640 +#define JZNODE(x)                                              \
20641 +({                                                             \
20642 +       typeof (x) __tmp_x;                                     \
20643 +                                                               \
20644 +       __tmp_x = (x);                                          \
20645 +       assert ("jmacd-1300", jnode_is_znode (__tmp_x));        \
20646 +       (znode*) __tmp_x;                                       \
20647 +})
20648 +
20649 +extern int jnodes_tree_init(reiser4_tree * tree);
20650 +extern int jnodes_tree_done(reiser4_tree * tree);
20651 +
20652 +#if REISER4_DEBUG
20653 +
20654 +extern int znode_is_any_locked(const znode * node);
20655 +extern void jnode_list_remove(jnode * node);
20656 +
20657 +#else
20658 +
20659 +#define jnode_list_remove(node) noop
20660 +
20661 +#endif
20662 +
20663 +int znode_is_root(const znode * node) NONNULL;
20664 +
20665 +/* bump reference counter on @node */
20666 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
20667 +{
20668 +       assert("nikita-1911", node != NULL);
20669 +
20670 +       atomic_inc(&node->x_count);
20671 +       LOCK_CNT_INC(x_refs);
20672 +}
20673 +
20674 +static inline void dec_x_ref(jnode * node)
20675 +{
20676 +       assert("nikita-3215", node != NULL);
20677 +       assert("nikita-3216", atomic_read(&node->x_count) > 0);
20678 +
20679 +       atomic_dec(&node->x_count);
20680 +       assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
20681 +       LOCK_CNT_DEC(x_refs);
20682 +}
20683 +
20684 +/* jref() - increase counter of references to jnode/znode (x_count) */
20685 +static inline jnode *jref(jnode * node)
20686 +{
20687 +       assert("jmacd-508", (node != NULL) && !IS_ERR(node));
20688 +       add_x_ref(node);
20689 +       return node;
20690 +}
20691 +
20692 +/* get the page of jnode */
20693 +static inline struct page *jnode_page(const jnode * node)
20694 +{
20695 +       return node->pg;
20696 +}
20697 +
20698 +/* return pointer to jnode data */
20699 +static inline char *jdata(const jnode * node)
20700 +{
20701 +       assert("nikita-1415", node != NULL);
20702 +       assert("nikita-3198", jnode_page(node) != NULL);
20703 +       return node->data;
20704 +}
20705 +
20706 +static inline int jnode_is_loaded(const jnode * node)
20707 +{
20708 +       assert("zam-506", node != NULL);
20709 +       return atomic_read(&node->d_count) > 0;
20710 +}
20711 +
20712 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
20713 +
20714 +static inline void jnode_set_reloc(jnode * node)
20715 +{
20716 +       assert("nikita-2431", node != NULL);
20717 +       assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
20718 +       JF_SET(node, JNODE_RELOC);
20719 +}
20720 +
20721 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
20722 +
20723 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
20724 +
20725 +static inline int jload(jnode *node)
20726 +{
20727 +       return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
20728 +}
20729 +
20730 +extern int jinit_new(jnode *, gfp_t) NONNULL;
20731 +extern int jstartio(jnode *) NONNULL;
20732 +
20733 +extern void jdrop(jnode *) NONNULL;
20734 +extern int jwait_io(jnode *, int rw) NONNULL;
20735 +
20736 +void jload_prefetch(jnode *);
20737 +
20738 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
20739 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
20740 +
20741 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
20742 +{
20743 +       assert("nikita-2691", node != NULL);
20744 +       return node->tree;
20745 +}
20746 +
20747 +extern void pin_jnode_data(jnode *);
20748 +extern void unpin_jnode_data(jnode *);
20749 +
20750 +/* map the JNODE_TYPE_1..JNODE_TYPE_3 bits of ->state to a jnode_type */
20751 +static inline jnode_type jnode_get_type(const jnode * node)
20752 +{
20753 +       static const unsigned long state_mask =
20754 +           (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
20755 +
20756 +       /* read-only table, indexed by JNODE_TYPE_3:JNODE_TYPE_2:JNODE_TYPE_1 */
20757 +       static const jnode_type mask_to_type[] = {
20758 +               /* 000 */
20759 +               [0] = JNODE_FORMATTED_BLOCK,
20760 +               /* 001 */
20761 +               [1] = JNODE_UNFORMATTED_BLOCK,
20762 +               /* 010 */
20763 +               [2] = JNODE_BITMAP,
20764 +               /* 011 */
20765 +               [3] = LAST_JNODE_TYPE,  /* invalid */
20766 +               /* 100 */
20767 +               [4] = JNODE_INODE,
20768 +               /* 101 */
20769 +               [5] = LAST_JNODE_TYPE,  /* invalid */
20770 +               /* 110 */
20771 +               [6] = JNODE_IO_HEAD,
20772 +               /* 111 */
20773 +               [7] = LAST_JNODE_TYPE,  /* invalid */
20774 +       };
20775 +
20776 +       return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
20777 +}
20778 +
20779 +/* returns true if node is a znode */
20780 +static inline int jnode_is_znode(const jnode * node)
20781 +{
20782 +       return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
20783 +}
20784 +
20785 +static inline int jnode_is_flushprepped(jnode * node)
20786 +{
20787 +       assert("jmacd-78212", node != NULL);
20788 +       assert_spin_locked(&(node->guard));
20789 +       return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
20790 +               JF_ISSET(node, JNODE_OVRWR);
20791 +}
20792 +
20793 +/* Return true if @node has already been processed by the squeeze and allocate
20794 +   process.  This implies the block address has been finalized for the
20795 +   duration of this atom (or it is clean and will remain in place).  If this
20796 +   returns true you may use the block number as a hint. */
20797 +static inline int jnode_check_flushprepped(jnode * node)
20798 +{
20799 +       int result;
20800 +
20801 +       /* It must be clean or relocated or wandered.  New allocations are set to relocate. */
20802 +       spin_lock_jnode(node);
20803 +       result = jnode_is_flushprepped(node);
20804 +       spin_unlock_jnode(node);
20805 +       return result;
20806 +}
20807 +
20808 +/* returns true if node is unformatted */
20809 +static inline int jnode_is_unformatted(const jnode * node)
20810 +{
20811 +       assert("jmacd-0123", node != NULL);
20812 +       return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
20813 +}
20814 +
20815 +/* returns true if node represents a cluster cache page */
20816 +static inline int jnode_is_cluster_page(const jnode * node)
20817 +{
20818 +       assert("edward-50", node != NULL);
20819 +       return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
20820 +}
20821 +
20822 +/* returns true if node is builtin inode's jnode */
20823 +static inline int jnode_is_inode(const jnode * node)
20824 +{
20825 +       assert("vs-1240", node != NULL);
20826 +       return jnode_get_type(node) == JNODE_INODE;
20827 +}
20828 +
20829 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
20830 +{
20831 +       assert("nikita-2367", type < LAST_JNODE_TYPE);
20832 +       return jnode_plugin_by_id((reiser4_plugin_id) type);
20833 +}
20834 +
20835 +static inline jnode_plugin *jnode_ops(const jnode * node)
20836 +{
20837 +       assert("nikita-2366", node != NULL);
20838 +
20839 +       return jnode_ops_of(jnode_get_type(node));
20840 +}
20841 +
20842 +/* Get the index of a block. */
20843 +static inline unsigned long jnode_get_index(jnode * node)
20844 +{
20845 +       return jnode_ops(node)->index(node);
20846 +}
20847 +
20848 +/* return true if "node" is the root */
20849 +static inline int jnode_is_root(const jnode * node)
20850 +{
20851 +       return jnode_is_znode(node) && znode_is_root(JZNODE(node));
20852 +}
20853 +
20854 +extern struct address_space *mapping_jnode(const jnode * node);
20855 +extern unsigned long index_jnode(const jnode * node);
20856 +
20857 +static inline void jput(jnode * node);
20858 +extern void jput_final(jnode * node);
20859 +
20860 +/* bump data counter on @node */
20861 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
20862 +{
20863 +       assert("nikita-1962", node != NULL);
20864 +
20865 +       atomic_inc(&node->d_count);
20866 +       if (jnode_is_unformatted(node) || jnode_is_znode(node))
20867 +               LOCK_CNT_INC(d_refs);
20868 +}
20869 +
20870 +/* jput() - decrement x_count reference counter on znode.
20871 +
20872 +   Count may drop to 0, jnode stays in cache until memory pressure causes the
20873 +   eviction of its page. The c_count variable also ensures that children are
20874 +   pressured out of memory before the parent. The jnode remains hashed as
20875 +   long as the VM allows its page to stay in memory.
20876 +*/
20877 +static inline void jput(jnode * node)
20878 +{
20879 +       assert("jmacd-509", node != NULL);
20880 +       assert("jmacd-510", atomic_read(&node->x_count) > 0);
20881 +       assert("zam-926", reiser4_schedulable());
20882 +       LOCK_CNT_DEC(x_refs);
20883 +
20884 +       rcu_read_lock();
20885 +       /*
20886 +        * we don't need any kind of lock here--jput_final() uses RCU.
20887 +        */
20888 +       if (unlikely(atomic_dec_and_test(&node->x_count))) {
20889 +               jput_final(node);
20890 +       } else
20891 +               rcu_read_unlock();
20892 +       assert("nikita-3473", reiser4_schedulable());
20893 +}
20894 +
20895 +extern void jrelse(jnode * node);
20896 +extern void jrelse_tail(jnode * node);
20897 +
20898 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
20899 +
20900 +/* resolve race with jput */
20901 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
20902 +{
20903 +       if (unlikely(JF_ISSET(node, JNODE_RIP)))
20904 +               node = jnode_rip_sync(tree, node);
20905 +       return node;
20906 +}
20907 +
20908 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
20909 +
20910 +#if REISER4_DEBUG
20911 +extern int jnode_invariant_f(const jnode *node, char const **msg);
20912 +#endif
20913 +
20914 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
20915 +
20916 +/* __JNODE_H__ */
20917 +#endif
20918 +
20919 +/* Make Linus happy.
20920 +   Local variables:
20921 +   c-indentation-style: "K&R"
20922 +   mode-name: "LC"
20923 +   c-basic-offset: 8
20924 +   tab-width: 8
20925 +   fill-column: 120
20926 +   End:
20927 +*/
20928 diff -urN linux-2.6.27.orig/fs/reiser4/kassign.c linux-2.6.27/fs/reiser4/kassign.c
20929 --- linux-2.6.27.orig/fs/reiser4/kassign.c      1970-01-01 03:00:00.000000000 +0300
20930 +++ linux-2.6.27/fs/reiser4/kassign.c   2008-10-12 18:20:00.000000000 +0400
20931 @@ -0,0 +1,677 @@
20932 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20933 + * reiser4/README */
20934 +
20935 +/* Key assignment policy implementation */
20936 +
20937 +/*
20938 + * In reiser4 every piece of file system data and meta-data has a key. Keys
20939 + * are used to store information in and retrieve it from reiser4 internal
20940 + * tree. In addition to this, keys define _ordering_ of all file system
20941 + * information: things having close keys are placed into the same or
20942 + * neighboring (in the tree order) nodes of the tree. As our block allocator
20943 + * tries to respect tree order (see flush.c), keys also define order in which
20944 + * things are laid out on the disk, and hence, affect performance directly.
20945 + *
20946 + * Obviously, assignment of keys to data and meta-data should be consistent
20947 + * across whole file system. Algorithm that calculates a key for a given piece
20948 + * of data or meta-data is referred to as "key assignment".
20949 + *
20950 + * Key assignment is too expensive to be implemented as a plugin (that is,
20951 + * with an ability to support different key assignment schemas in the same
20952 + * compiled kernel image). As a compromise, all key-assignment functions and
20953 + * data-structures are collected in this single file, so that modifications to
20954 + * key assignment algorithm can be localized. Additional changes may be
20955 + * required in key.[ch].
20956 + *
20957 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
20958 + * may guess, there is "Plan B" too.
20959 + *
20960 + */
20961 +
20962 +/*
20963 + * Additional complication with key assignment implementation is a requirement
20964 + * to support different key length.
20965 + */
20966 +
20967 +/*
20968 + *                   KEY ASSIGNMENT: PLAN A, LONG KEYS.
20969 + *
20970 + * DIRECTORY ITEMS
20971 + *
20972 + *  |       60     | 4 | 7 |1|   56        |        64        |        64       |
20973 + *  +--------------+---+---+-+-------------+------------------+-----------------+
20974 + *  |    dirid     | 0 | F |H|  prefix-1   |    prefix-2      |  prefix-3/hash  |
20975 + *  +--------------+---+---+-+-------------+------------------+-----------------+
20976 + *  |                  |                   |                  |                 |
20977 + *  |    8 bytes       |      8 bytes      |     8 bytes      |     8 bytes     |
20978 + *
20979 + * dirid         objectid of directory this item is for
20980 + *
20981 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
20982 + *
20983 + * H             1 if last 8 bytes of the key contain hash,
20984 + *               0 if last 8 bytes of the key contain prefix-3
20985 + *
20986 + * prefix-1      first 7 characters of file name.
20987 + *               Padded by zeroes if name is not long enough.
20988 + *
20989 + * prefix-2      next 8 characters of the file name.
20990 + *
20991 + * prefix-3      next 8 characters of the file name.
20992 + *
20993 + * hash          hash of the rest of file name (i.e., portion of file
20994 + *               name not included into prefix-1 and prefix-2).
20995 + *
20996 + * File names shorter than 23 (== 7 + 8 + 8) characters are completely encoded
20997 + * in the key. Such file names are called "short". They are distinguished by H
20998 + * bit set 0 in the key.
20999 + *
21000 + * Other file names are "long". For long name, H bit is 1, and first 15 (== 7
21001 + * + 8) characters are encoded in prefix-1 and prefix-2 portions of the
21002 + * key. Last 8 bytes of the key are occupied by hash of the remaining
21003 + * characters of the name.
21004 + *
21005 + * This key assignment reaches following important goals:
21006 + *
21007 + *     (1) directory entries are sorted in approximately lexicographical
21008 + *     order.
21009 + *
21010 + *     (2) collisions (when multiple directory items have the same key), while
21011 + *     principally unavoidable in a tree with fixed length keys, are rare.
21012 + *
21013 + * STAT DATA
21014 + *
21015 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21016 + *  +--------------+---+-----------------+---+--------------+-----------------+
21017 + *  |  locality id | 1 |    ordering     | 0 |  objectid    |        0        |
21018 + *  +--------------+---+-----------------+---+--------------+-----------------+
21019 + *  |                  |                 |                  |                 |
21020 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21021 + *
21022 + * locality id     object id of a directory where first name was created for
21023 + *                 the object
21024 + *
21025 + * ordering        copy of second 8-byte portion of the key of directory
21026 + *                 entry for the first name of this object. Ordering has a form
21027 + *                         {
21028 + *                                 fibration :7;
21029 + *                                 h         :1;
21030 + *                                 prefix1   :56;
21031 + *                         }
21032 + *                 see description of key for directory entry above.
21033 + *
21034 + * objectid        object id for this object
21035 + *
21036 + * This key assignment policy is designed to keep stat-data in the same order
21037 + * as corresponding directory items, thus speeding up readdir/stat types of
21038 + * workload.
21039 + *
21040 + * FILE BODY
21041 + *
21042 + *  |       60     | 4 |       64        | 4 |     60       |        64       |
21043 + *  +--------------+---+-----------------+---+--------------+-----------------+
21044 + *  |  locality id | 4 |    ordering     | 0 |  objectid    |      offset     |
21045 + *  +--------------+---+-----------------+---+--------------+-----------------+
21046 + *  |                  |                 |                  |                 |
21047 + *  |    8 bytes       |    8 bytes      |     8 bytes      |     8 bytes     |
21048 + *
21049 + * locality id     object id of a directory where first name was created for
21050 + *                 the object
21051 + *
21052 + * ordering        the same as in the key of stat-data for this object
21053 + *
21054 + * objectid        object id for this object
21055 + *
21056 + * offset          logical offset from the beginning of this file.
21057 + *                 Measured in bytes.
21058 + *
21059 + *
21060 + *                   KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21061 + *
21062 + * DIRECTORY ITEMS
21063 + *
21064 + *  |       60     | 4 | 7 |1|   56        |        64       |
21065 + *  +--------------+---+---+-+-------------+-----------------+
21066 + *  |    dirid     | 0 | F |H|  prefix-1   |  prefix-2/hash  |
21067 + *  +--------------+---+---+-+-------------+-----------------+
21068 + *  |                  |                   |                 |
21069 + *  |    8 bytes       |      8 bytes      |     8 bytes     |
21070 + *
21071 + * dirid         objectid of directory this item is for
21072 + *
21073 + * F             fibration, see fs/reiser4/plugin/fibration.[ch]
21074 + *
21075 + * H             1 if last 8 bytes of the key contain hash,
21076 + *               0 if last 8 bytes of the key contain prefix-2
21077 + *
21078 + * prefix-1      first 7 characters of file name.
21079 + *               Padded by zeroes if name is not long enough.
21080 + *
21081 + * prefix-2      next 8 characters of the file name.
21082 + *
21083 + * hash          hash of the rest of file name (i.e., portion of file
21084 + *               name not included into prefix-1).
21085 + *
21086 + * File names not longer than 15 (== 7 + 8) characters are completely encoded
21087 + * in the key. Such file names are called "short". They are distinguished by
21088 + * H bit being 0 in the key.
21089 + *
21090 + * Other file names are "long". For a long name, H bit is 1, and first 7
21091 + * characters are encoded in prefix-1 portion of the key. Last 8 bytes of the
21092 + * key are occupied by hash of the remaining characters of the name.
21093 + *
21094 + * STAT DATA
21095 + *
21096 + *  |       60     | 4 | 4 |     60       |        64       |
21097 + *  +--------------+---+---+--------------+-----------------+
21098 + *  |  locality id | 1 | 0 |  objectid    |        0        |
21099 + *  +--------------+---+---+--------------+-----------------+
21100 + *  |                  |                  |                 |
21101 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21102 + *
21103 + * locality id     object id of a directory where first name was created for
21104 + *                 the object
21105 + *
21106 + * objectid        object id for this object
21107 + *
21108 + * FILE BODY
21109 + *
21110 + *  |       60     | 4 | 4 |     60       |        64       |
21111 + *  +--------------+---+---+--------------+-----------------+
21112 + *  |  locality id | 4 | 0 |  objectid    |      offset     |
21113 + *  +--------------+---+---+--------------+-----------------+
21114 + *  |                  |                  |                 |
21115 + *  |    8 bytes       |     8 bytes      |     8 bytes     |
21116 + *
21117 + * locality id     object id of a directory where first name was created for
21118 + *                 the object
21119 + *
21120 + * objectid        object id for this object
21121 + *
21122 + * offset          logical offset from the beginning of this file.
21123 + *                 Measured in bytes.
21124 + *
21125 + *
21126 + */
21127 +
21128 +#include "debug.h"
21129 +#include "key.h"
21130 +#include "kassign.h"
21131 +#include "vfs_ops.h"
21132 +#include "inode.h"
21133 +#include "super.h"
21134 +#include "dscale.h"
21135 +
21136 +#include <linux/types.h>       /* for __u??  */
21137 +#include <linux/fs.h>          /* for struct super_block, etc  */
21138 +
21139 +/* bitmask for H bit (see comment at the beginning of this file) */
21140 +static const __u64 longname_mark = 0x0100000000000000ull;
21141 +/* bitmask for F and H portions of the key: top byte of a 64-bit element. */
21142 +static const __u64 fibration_mask = 0xff00000000000000ull;
21143 +
21144 +/* return 1 if name is not completely encoded in @key (H bit set), else 0 */
21145 +int is_longname_key(const reiser4_key * key)
21146 +{
21147 +       __u64 marked_el;
21148 +
21149 +       assert("nikita-2863", key != NULL);
21150 +       /* dump the unexpected key before the assertion below fires */
21151 +       if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21152 +               reiser4_print_key("oops", key);
21153 +       assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21154 +
21155 +       /* H bit lives in ordering (large keys) or in objectid (short keys) */
21156 +       marked_el = REISER4_LARGE_KEY ?
21157 +               get_key_ordering(key) : get_key_objectid(key);
21158 +
21159 +       return (marked_el & longname_mark) != 0;
21160 +}
21161 +
21162 +/* true if a name of @len characters cannot be stored in the key in full */
21163 +int is_longname(const char *name UNUSED_ARG, int len)
21164 +{
21165 +       /* 7 + 8 characters fit into a short key, 7 + 8 + 8 into a large one */
21166 +       const int longest_short_name = REISER4_LARGE_KEY ? 23 : 15;
21167 +
21168 +       return len > longest_short_name;
21169 +}
21170 +
21171 +/* code ascii string into __u64.
21172 +
21173 +   Put characters of @name into result one after another starting from
21174 +   @start_idx-th highest (arithmetically) byte; unused low bytes are zero.
21175 +   This produces endian-safe encoding. memcpy(3) will not do. Packing stops
21176 +   at the NUL terminator or when the result is full, whichever comes first.
21177 +*/
21178 +static __u64 pack_string(const char *name /* string to encode */ ,
21179 +                        int start_idx  /* highest byte in result from
21180 +                                        * which to start encoding */ )
21181 +{
21182 +       unsigned i;
21183 +       __u64 str = 0;
21184 +
21185 +       for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21186 +               str <<= 8;
21187 +               str |= (unsigned char)name[i];
21188 +       }
21189 +       if (i != 0)     /* i == 0: str is 0 already; avoid a shift by 64 bits */
21190 +               str <<= (sizeof str - i - start_idx) << 3;
21191 +       return str;
21192 +}
21193 +
21194 +/* opposite to pack_string(): stores non-zero bytes of @value, highest first,
21195 + * in @buf, NUL-terminates it and returns pointer to that terminating NUL */
21196 +char * reiser4_unpack_string(__u64 value, char *buf)
21197 +{
21198 +       do {
21199 +               *buf = value >> (64 - 8);       /* take the highest byte */
21200 +               if (*buf)
21201 +                       ++buf;                  /* keep it only if non-zero */
21202 +               value <<= 8;
21203 +       } while (value != 0);
21204 +       *buf = 0;
21205 +       return buf;
21206 +}
21207 +
21208 +/* decode name fully encoded in @key into @buf (as C string); returns @buf */
21209 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21210 +{
21211 +       char *c;
21212 +
21213 +       assert("nikita-2868", !is_longname_key(key));
21214 +       /* each unpack returns the end of what it wrote, so pieces concatenate */
21215 +       c = buf;
21216 +       if (REISER4_LARGE_KEY) {
21217 +               c = reiser4_unpack_string(get_key_ordering(key) &
21218 +                                         ~fibration_mask, c);
21219 +               c = reiser4_unpack_string(get_key_fulloid(key), c);
21220 +       } else
21221 +               c = reiser4_unpack_string(get_key_fulloid(key) &
21222 +                                         ~fibration_mask, c);
21223 +       reiser4_unpack_string(get_key_offset(key), c);
21224 +       return buf;
21225 +}
21226 +
21227 +/**
21228 + * complete_entry_key - calculate entry key by name
21229 + * @dir: directory where entry is (or will be) in
21230 + * @name: name to calculate key of
21231 + * @len: length of name
21232 + * @result: place to store result in
21233 + *
21234 + * Sets fields of entry key @result which depend on file name.
21235 + * When REISER4_LARGE_KEY is set three fields of @result are set: ordering,
21236 + * objectid and offset. Otherwise, objectid and offset are set.
21237 + */
21238 +void complete_entry_key(const struct inode *dir, const char *name,
21239 +                       int len, reiser4_key *result)
21240 +{
21241 +#if REISER4_LARGE_KEY
21242 +       __u64 ordering;
21243 +       __u64 objectid;
21244 +       __u64 offset;
21245 +
21246 +       assert("nikita-1139", dir != NULL);
21247 +       assert("nikita-1142", result != NULL);
21248 +       assert("nikita-2867", strlen(name) == len);
21249 +
21250 +       /*
21251 +        * key allocation algorithm for directory entries in case of large
21252 +        * keys:
21253 +        *
21254 +        * If name is not longer than 7 + 8 + 8 = 23 characters, put first 7
21255 +        * characters into ordering field of key, next 8 characters (if any)
21256 +        * into objectid field of key and next 8 ones (if any) into offset
21257 +        * field of key
21258 +        *
21259 +        * If file name is longer than 23 characters, put first 7 characters
21260 +        * into key's ordering, next 8 to objectid and hash of remaining
21261 +        * characters into offset field.
21262 +        *
21263 +        * To distinguish above cases, in latter set up unused high bit in
21264 +        * ordering field.
21265 +        */
21266 +
21267 +       /* [0-6] characters to ordering */
21268 +       ordering = pack_string(name, 1);
21269 +       if (len > 7) {
21270 +               /* [7-14] characters to objectid */
21271 +               objectid = pack_string(name + 7, 0);
21272 +               if (len > 15) {
21273 +                       if (len <= 23) {
21274 +                               /* [15-22] characters to offset */
21275 +                               offset = pack_string(name + 15, 0);
21276 +                       } else {
21277 +                               /* note in a key the fact that offset contains hash. */
21278 +                               ordering |= longname_mark;
21279 +
21280 +                               /* offset is the hash of the file name's tail. */
21281 +                               offset = inode_hash_plugin(dir)->hash(name + 15,
21282 +                                                                     len - 15);
21283 +                       }
21284 +               } else {
21285 +                       offset = 0ull;
21286 +               }
21287 +       } else {
21288 +               objectid = 0ull;
21289 +               offset = 0ull;
21290 +       }
21291 +
21292 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21293 +       ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21294 +
21295 +       set_key_ordering(result, ordering);
21296 +       set_key_fulloid(result, objectid);
21297 +       set_key_offset(result, offset);
21298 +       return;
21299 +
21300 +#else
21301 +       __u64 objectid;
21302 +       __u64 offset;
21303 +
21304 +       assert("nikita-1139", dir != NULL);
21305 +       assert("nikita-1142", result != NULL);
21306 +       assert("nikita-2867", strlen(name) == len);
21307 +
21308 +       /*
21309 +        * key allocation algorithm for directory entries in case of not large
21310 +        * keys:
21311 +        *
21312 +        * If name is not longer than 7 + 8 = 15 characters, put first 7
21313 +        * characters into objectid field of key, next 8 characters (if any)
21314 +        * into offset field of key
21315 +        *
21316 +        * If file name is longer than 15 characters, put first 7 characters
21317 +        * into key's objectid, and hash of remaining characters into offset
21318 +        * field.
21319 +        *
21320 +        * To distinguish above cases, in latter set up unused high bit in
21321 +        * objectid field.
21322 +        */
21323 +
21324 +       /* [0-6] characters to objectid */
21325 +       objectid = pack_string(name, 1);
21326 +       if (len > 7) {
21327 +               if (len <= 15) {
21328 +                       /* [7-14] characters to offset */
21329 +                       offset = pack_string(name + 7, 0);
21330 +               } else {
21331 +                       /* note in a key the fact that offset contains hash. */
21332 +                       objectid |= longname_mark;
21333 +
21334 +                       /* offset is the hash of the file name's tail. */
21335 +                       offset = inode_hash_plugin(dir)->hash(name + 7,
21336 +                                                             len - 7);
21337 +               }
21338 +       } else
21339 +               offset = 0ull;
21340 +
21341 +       assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21342 +       objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21343 +
21344 +       set_key_fulloid(result, objectid);
21345 +       set_key_offset(result, offset);
21346 +       return;
21347 +#endif                         /* ! REISER4_LARGE_KEY */
21348 +}
21349 +
21350 +/* true, if @key is the key of ".": ordering, objectid and offset all zero */
21351 +int is_dot_key(const reiser4_key * key /* key to check */ )
21352 +{
21353 +       assert("nikita-1717", key != NULL);
21354 +       assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21355 +       if (get_key_ordering(key) != 0ull || get_key_objectid(key) != 0ull)
21356 +               return 0;
21357 +       return get_key_offset(key) == 0ull;
21358 +}
21359 +
21360 +/* build key for stat-data.
21361 +
21362 +   Fill @result with the key of stat-data of @target and return @result. This
21363 +   should become sd plugin method in the future. For now, let it be here.
21364 +
21365 +*/
21366 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21367 +                         reiser4_key * result  /* resulting key of @target
21368 +                                                  stat-data */ )
21369 +{
21370 +       assert("nikita-261", result != NULL);
21371 +
21372 +       reiser4_key_init(result);
21373 +       set_key_locality(result, reiser4_inode_data(target)->locality_id);
21374 +       set_key_ordering(result, get_inode_ordering(target));
21375 +       set_key_objectid(result, get_inode_oid(target));
21376 +       set_key_type(result, KEY_SD_MINOR);
21377 +       set_key_offset(result, (__u64) 0);      /* stat-data key has zero offset */
21378 +       return result;
21379 +}
21380 +
21381 +/* encode part of key into &obj_key_id
21382 +
21383 +   This encodes into @id part of @key sufficient to restore @key later,
21384 +   given that latter is key of object (key of stat-data). Always returns 0.
21385 +
21386 +   See &obj_key_id
21387 +*/
21388 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21389 +                    obj_key_id * id /* id where key is encoded in */ )
21390 +{
21391 +       assert("nikita-1151", key != NULL);
21392 +       assert("nikita-1152", id != NULL);
21393 +
21394 +       memcpy(id, key, sizeof *id);    /* leading sizeof(obj_key_id) bytes of @key */
21395 +       return 0;
21396 +}
21397 +
21398 +/* encode reference to @obj in @id: build the stat-data key of @obj and
21399 +   store its obj_key_id form into @id. Like build_obj_key_id(), but takes
21400 +   an inode as parameter. Always returns 0. */
21401 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21402 +                      obj_key_id * id /* result */ )
21403 +{
21404 +       reiser4_key sd_key;     /* scratch stat-data key of @obj */
21405 +
21406 +       assert("nikita-1166", obj != NULL);
21407 +       assert("nikita-1167", id != NULL);
21408 +
21409 +       /* build_sd_key() returns its @result argument, so the two steps
21410 +        * chain; build_obj_key_id() always returns 0 */
21411 +       return build_obj_key_id(build_sd_key(obj, &sd_key), id);
21412 +}
21413 +
21414 +/* decode @id back into @key
21415 +
21416 +   Restore key of object stat-data from @id: @key is zeroed, then its leading
21417 +   bytes are taken from @id. Dual to build_obj_key_id(). Always returns 0.
21418 +*/
21419 +int extract_key_from_id(const obj_key_id * id  /* object key id to extract key
21420 +                                                * from */ ,
21421 +                       reiser4_key * key /* result */ )
21422 +{
21423 +       assert("nikita-1153", id != NULL);
21424 +       assert("nikita-1154", key != NULL);
21425 +
21426 +       reiser4_key_init(key);
21427 +       memcpy(key, id, sizeof *id);
21428 +       return 0;
21429 +}
21430 +
21431 +/* extract objectid of directory from key of directory entry within said
21432 +   directory: it is stored as the locality of the entry key.
21433 +   */
21434 +oid_t extract_dir_id_from_key(const reiser4_key * de_key       /* key of
21435 +                                                                * directory
21436 +                                                                * entry */ )
21437 +{
21438 +       assert("nikita-1314", de_key != NULL);
21439 +       return get_key_locality(de_key);
21440 +}
21441 +
21442 +/* encode into @id key of directory entry named @name in @dir.
21443 +
21444 +   Encode into @id information sufficient to later distinguish directory
21445 +   entries within the same directory. This is not whole key, because all
21446 +   directory entries within directory item share locality which is equal
21447 +   to objectid of their directory. Returns what build_de_id_by_key() returns.
21448 +
21449 +*/
21450 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21451 +               const struct qstr *name /* name of the directory
21452 +                                        * entry whose id is being
21453 +                                        * constructed */ ,
21454 +               de_id * id /* short key of directory entry */ )
21455 +{
21456 +       reiser4_key key;
21457 +
21458 +       assert("nikita-1290", dir != NULL);
21459 +       assert("nikita-1292", id != NULL);
21460 +
21461 +       /* NOTE-NIKITA this is suboptimal: full key is built only to be trimmed. */
21462 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21463 +       return build_de_id_by_key(&key, id);
21464 +}
21465 +
21466 +/* encode into @id key of directory entry, given its full key.
21467 +
21468 +   Copies into @id the sizeof(de_id) bytes that follow the first 64-bit
21469 +   element of @entry_key. That first element is not stored, because all
21470 +   directory entries within directory item share locality which is equal
21471 +   to objectid of their directory. Always returns 0.
21472 +
21473 +*/
21474 +int build_de_id_by_key(const reiser4_key * entry_key   /* full key of directory
21475 +                                                        * entry */ ,
21476 +                      de_id * id /* short key of directory entry */ )
21477 +{
21478 +       memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21479 +       return 0;
21480 +}
21481 +
21482 +/* restore from @id key of directory entry.
21483 +
21484 +   Function dual to build_de_id(): given @id and locality, build full
21485 +   key of directory entry within directory item.
21486 +   Key type is set to KEY_FILE_NAME_MINOR. Always returns 0.
21487 +*/
21488 +int extract_key_from_de_id(const oid_t locality        /* locality of directory
21489 +                                                * entry */ ,
21490 +                          const de_id * id /* directory entry id */ ,
21491 +                          reiser4_key * key /* result */ )
21492 +{
21493 +       /* no need to initialise key here: all fields are overwritten */
21494 +       memcpy(((__u64 *) key) + 1, id, sizeof *id);
21495 +       set_key_locality(key, locality);
21496 +       set_key_type(key, KEY_FILE_NAME_MINOR);
21497 +       return 0;
21498 +}
21499 +
21500 +/* compare two &de_id's; result is that of keycmp() on keys rebuilt from them */
21501 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
21502 +               const de_id * id2 /* second &de_id to compare */ )
21503 +{
21504 +       /* NOTE-NIKITA ugly implementation: rebuilds two full keys to compare */
21505 +       reiser4_key k1;
21506 +       reiser4_key k2;
21507 +
21508 +       extract_key_from_de_id((oid_t) 0, id1, &k1);
21509 +       extract_key_from_de_id((oid_t) 0, id2, &k2);
21510 +       return keycmp(&k1, &k2);
21511 +}
21512 +
21513 +/* compare &de_id with key; only key elements 1, 2 (and 3) take part */
21514 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
21515 +                   const reiser4_key * key /* key to compare */ )
21516 +{
21517 +       cmp_t result;
21518 +       reiser4_key *k1;
21519 +       /* step back one key element, so that k1->el[1] aliases start of @id */
21520 +       k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
21521 +       result = KEY_DIFF_EL(k1, key, 1);
21522 +       if (result == EQUAL_TO) {
21523 +               result = KEY_DIFF_EL(k1, key, 2);
21524 +               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
21525 +                       result = KEY_DIFF_EL(k1, key, 3);
21526 +               }
21527 +       }
21528 +       return result;
21529 +}
21530 +
21531 +/*
21532 + * return number of bytes build_inode_onwire() needs to encode @inode identity.
21533 + */
21534 +int inode_onwire_size(const struct inode *inode)
21535 +{
21536 +       int result;
21537 +
21538 +       result = dscale_bytes_to_write(get_inode_oid(inode));
21539 +       result += dscale_bytes_to_write(get_inode_locality(inode));
21540 +
21541 +       /*
21542 +        * ordering is large (it usually has highest bits set), so it makes
21543 +        * little sense to dscale it: it is stored at its full fixed size.
21544 +        */
21545 +       if (REISER4_LARGE_KEY)
21546 +               result += sizeof(get_inode_ordering(inode));
21547 +       return result;
21548 +}
21549 +
21550 +/*
21551 + * encode @inode identity (locality, oid[, ordering]) at @start; returns end
21552 + */
21553 +char *build_inode_onwire(const struct inode *inode, char *start)
21554 +{
21555 +       start += dscale_write(start, get_inode_locality(inode));
21556 +       start += dscale_write(start, get_inode_oid(inode));
21557 +
21558 +       if (REISER4_LARGE_KEY) {
21559 +               put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
21560 +               start += sizeof(get_inode_ordering(inode));
21561 +       }
21562 +       return start;
21563 +}
21564 +
21565 +/*
21566 + * decode into @key_id what build_inode_onwire() encoded at @addr; returns end
21567 + */
21568 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
21569 +{
21570 +       __u64 val;
21571 +
21572 +       addr += dscale_read(addr, &val);
21573 +       val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;       /* locality + type */
21574 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
21575 +       addr += dscale_read(addr, &val);
21576 +       put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
21577 +#if REISER4_LARGE_KEY
21578 +       memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
21579 +       addr += sizeof key_id->ordering;
21580 +#endif
21581 +       return addr;
21582 +}
21583 +
21584 +/*
21585 + * skip a key that was previously encoded by build_inode_onwire() at @addr and
21586 + * return address right after it. FIXME: handle IO errors.
21587 + */
21588 +char * locate_obj_key_id_onwire(char * addr)
21589 +{
21590 +       /* locality */
21591 +       addr += dscale_bytes_to_read(addr);
21592 +       /* objectid */
21593 +       addr += dscale_bytes_to_read(addr);
21594 +#if REISER4_LARGE_KEY
21595 +       addr += sizeof ((obj_key_id *)0)->ordering;
21596 +#endif
21597 +       return addr;
21598 +}
21599 +
21600 +/* Make Linus happy.
21601 +   Local variables:
21602 +   c-indentation-style: "K&R"
21603 +   mode-name: "LC"
21604 +   c-basic-offset: 8
21605 +   tab-width: 8
21606 +   fill-column: 120
21607 +   End:
21608 +*/
21609 diff -urN linux-2.6.27.orig/fs/reiser4/kassign.h linux-2.6.27/fs/reiser4/kassign.h
21610 --- linux-2.6.27.orig/fs/reiser4/kassign.h      1970-01-01 03:00:00.000000000 +0300
21611 +++ linux-2.6.27/fs/reiser4/kassign.h   2008-10-12 18:20:00.000000000 +0400
21612 @@ -0,0 +1,111 @@
21613 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21614 + * reiser4/README */
21615 +
21616 +/* Key assignment policy interface. See kassign.c for details. */
21617 +
21618 +#if !defined( __KASSIGN_H__ )
21619 +#define __KASSIGN_H__
21620 +
21621 +#include "forward.h"
21622 +#include "key.h"
21623 +#include "dformat.h"
21624 +
21625 +#include <linux/types.h>       /* for __u??  */
21626 +#include <linux/fs.h>          /* for struct super_block, etc  */
21627 +#include <linux/dcache.h>      /* for struct qstr */
21628 +
21629 +/* key assignment functions */
21630 +
21631 +/* Information from which key of file stat-data can be uniquely
21632 +   restored. This depends on key assignment policy for
21633 +   stat-data. Currently it's enough to store object id and locality id
21634 +   (60+60==120) bits, because minor packing locality and offset of
21635 +   stat-data key are always known constants: KEY_SD_MINOR and 0
21636 +   respectively. For simplicity 4 bits are wasted in each id, and just
21637 +   two 64 bit integers are stored.
21638 +
21639 +   This field has to be byte-aligned, because we don't want to waste
21640 +   space in directory entries. There is another side of a coin of
21641 +   course: we waste CPU and bus bandwidth instead, by copying data back
21642 +   and forth.
21643 +
21644 +   Next optimization: &obj_key_id is mainly used to address stat data from
21645 +   directory entries. Under the assumption that majority of files have
21646 +   only one name (one hard link) from *the* parent directory it seems reasonable
21647 +   to only store objectid of stat data and take its locality from key of
21648 +   directory item.
21649 +
21650 +   This requires some flag to be added to the &obj_key_id to distinguish
21651 +   between these two cases. Remaining bits in flag byte are then asking to be
21652 +   used to store file type.
21653 +
21654 +   This optimization requires changes in directory item handling code.
21655 +
21656 +*/
21657 +typedef struct obj_key_id {
21658 +       d8 locality[sizeof(__u64)];
21659 +        ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
21660 +           )
21661 +       d8 objectid[sizeof(__u64)];
21662 +}
21663 +obj_key_id;
21664 +
21665 +/* Information sufficient to uniquely identify directory entry within
21666 +   compressed directory item.
21667 +   This is the entry key minus its first element; see build_de_id_by_key().
21668 +   For alignment issues see &obj_key_id above.
21669 +*/
21670 +typedef struct de_id {
21671 +       ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
21672 +       d8 objectid[sizeof(__u64)];
21673 +       d8 offset[sizeof(__u64)];
21674 +}
21675 +de_id;
21676 +
21677 +extern int inode_onwire_size(const struct inode *obj);
21678 +extern char *build_inode_onwire(const struct inode *obj, char *area);
21679 +extern char *locate_obj_key_id_onwire(char *area);
21680 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
21681 +
21682 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
21683 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
21684 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
21685 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
21686 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
21687 +                      de_id * id);
21688 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
21689 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
21690 +                                 reiser4_key * key);
21691 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
21692 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
21693 +
21694 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
21695 +extern void build_entry_key_common(const struct inode *dir,
21696 +                                  const struct qstr *name,
21697 +                                  reiser4_key * result);
21698 +extern void build_entry_key_stable_entry(const struct inode *dir,
21699 +                                        const struct qstr *name,
21700 +                                        reiser4_key * result);
21701 +extern int is_dot_key(const reiser4_key * key);
21702 +extern reiser4_key *build_sd_key(const struct inode *target,
21703 +                                reiser4_key * result);
21704 +
21705 +extern int is_longname_key(const reiser4_key * key);
21706 +extern int is_longname(const char *name, int len);
21707 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
21708 +extern char *reiser4_unpack_string(__u64 value, char *buf);
21709 +extern void complete_entry_key(const struct inode *dir, const char *name,
21710 +                              int len, reiser4_key *result);
21711 +
21712 +/* __KASSIGN_H__ */
21713 +#endif
21714 +
21715 +/* Make Linus happy.
21716 +   Local variables:
21717 +   c-indentation-style: "K&R"
21718 +   mode-name: "LC"
21719 +   c-basic-offset: 8
21720 +   tab-width: 8
21721 +   fill-column: 120
21722 +   End:
21723 +*/
21724 diff -urN linux-2.6.27.orig/fs/reiser4/Kconfig linux-2.6.27/fs/reiser4/Kconfig
21725 --- linux-2.6.27.orig/fs/reiser4/Kconfig        1970-01-01 03:00:00.000000000 +0300
21726 +++ linux-2.6.27/fs/reiser4/Kconfig     2008-10-12 18:20:00.000000000 +0400
21727 @@ -0,0 +1,34 @@
21728 +config REISER4_FS
21729 +       tristate "Reiser4 (EXPERIMENTAL)"
21730 +       depends on EXPERIMENTAL
21731 +       select ZLIB_INFLATE
21732 +       select ZLIB_DEFLATE
21733 +       select LZO_COMPRESS
21734 +       select LZO_DECOMPRESS
21735 +       select CRYPTO
21736 +       help
21737 +         Reiser4 is a filesystem that performs all filesystem operations
21738 +         as atomic transactions, which means that it either performs a
21739 +         write, or it does not, and in the event of a crash it does not
21740 +         partially perform it or corrupt it.
21741 +
21742 +         It stores files in dancing trees, which are like balanced trees but
21743 +         faster.  It packs small files together so that they share blocks
21744 +         without wasting space.  This means you can use it to store really
21745 +         small files.  It also means that it saves you disk space.  It avoids
21746 +         hassling you with anachronisms like having a maximum number of
21747 +         inodes, and wasting space if you use less than that number.
21748 +
21749 +         Reiser4 is a distinct filesystem type from reiserfs (V3).
21750 +         It's therefore not possible to use reiserfs file systems
21751 +         with reiser4.
21752 +
21753 +         To learn more about reiser4, go to http://www.namesys.com
21754 +
21755 +config REISER4_DEBUG
21756 +       bool "Enable reiser4 debug mode"
21757 +       depends on REISER4_FS
21758 +       help
21759 +         Don't use this unless you are debugging reiser4.
21760 +
21761 +         If unsure, say N.
21762 diff -urN linux-2.6.27.orig/fs/reiser4/key.c linux-2.6.27/fs/reiser4/key.c
21763 --- linux-2.6.27.orig/fs/reiser4/key.c  1970-01-01 03:00:00.000000000 +0300
21764 +++ linux-2.6.27/fs/reiser4/key.c       2008-10-12 18:20:00.000000000 +0400
21765 @@ -0,0 +1,137 @@
21766 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21767 +
21768 +/* Key manipulations. */
21769 +
21770 +#include "debug.h"
21771 +#include "key.h"
21772 +#include "super.h"
21773 +#include "reiser4.h"
21774 +
21775 +#include <linux/types.h>       /* for __u??  */
21776 +
21777 +/* Minimal possible key: all components are zero. It is presumed that this is
21778 +   independent of key scheme. Handed out by reiser4_min_key(). */
21779 +static const reiser4_key MINIMAL_KEY = {
21780 +       .el = {
21781 +               0ull,
21782 +               ON_LARGE_KEY(0ull,)
21783 +               0ull,
21784 +               0ull
21785 +       }
21786 +};
21787 +
21788 +/* Maximal possible key: all components are ~0. It is presumed that this is
21789 +   independent of key scheme. Handed out by reiser4_max_key(). */
21790 +static const reiser4_key MAXIMAL_KEY = {
21791 +       .el = {
21792 +               __constant_cpu_to_le64(~0ull),
21793 +               ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
21794 +               __constant_cpu_to_le64(~0ull),
21795 +               __constant_cpu_to_le64(~0ull)
21796 +       }
21797 +};
21798 +
21799 +/* Initialize key: set all of its bytes to zero. */
21800 +void reiser4_key_init(reiser4_key * key /* key to init */ )
21801 +{
21802 +       assert("nikita-1169", key != NULL);
21803 +       memset(key, 0, sizeof *key);
21804 +}
21805 +
21806 +/* minimal possible key in the tree. Returns pointer to const static storage. */
21807 +const reiser4_key *reiser4_min_key(void)
21808 +{
21809 +       return &MINIMAL_KEY;
21810 +}
21811 +
21812 +/* maximum possible key in the tree. Returns pointer to const static storage. */
21813 +const reiser4_key *reiser4_max_key(void)
21814 +{
21815 +       return &MAXIMAL_KEY;
21816 +}
21817 +
21818 +#if REISER4_DEBUG
21819 +/* debugging aid: return symbolic name of key type ("unknown" if not known) */
21820 +static const char *type_name(unsigned int key_type /* key type */ )
21821 +{
21822 +       switch (key_type) {
21823 +       case KEY_FILE_NAME_MINOR:
21824 +               return "file name";
21825 +       case KEY_SD_MINOR:
21826 +               return "stat data";
21827 +       case KEY_ATTR_NAME_MINOR:
21828 +               return "attr name";
21829 +       case KEY_ATTR_BODY_MINOR:
21830 +               return "attr body";
21831 +       case KEY_BODY_MINOR:
21832 +               return "file body";
21833 +       default:
21834 +               return "unknown";
21835 +       }
21836 +}
21837 +
21838 +/* debugging aid: print @prefix and human readable information about @key */
21839 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
21840 +              const reiser4_key * key /* key to print */ )
21841 +{
21842 +       /* turn bold on */
21843 +       /* printf ("\033[1m"); */
21844 +       if (key == NULL)
21845 +               printk("%s: null key\n", prefix);
21846 +       else {
21847 +               if (REISER4_LARGE_KEY)
21848 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
21849 +                              get_key_locality(key),
21850 +                              get_key_type(key),
21851 +                              get_key_ordering(key),
21852 +                              get_key_band(key),
21853 +                              get_key_objectid(key), get_key_offset(key));
21854 +               else
21855 +                       printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
21856 +                              get_key_locality(key),
21857 +                              get_key_type(key),
21858 +                              get_key_band(key),
21859 +                              get_key_objectid(key), get_key_offset(key));
21860 +               /*
21861 +                * if this is a key of directory entry, try to decode part of
21862 +                * a name stored in the key (F and H bits included) and output it.
21863 +                */
21864 +               if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
21865 +                       char buf[DE_NAME_BUF_LEN];
21866 +                       char *c;
21867 +
21868 +                       c = buf;
21869 +                       c = reiser4_unpack_string(get_key_ordering(key), c);
21870 +                       reiser4_unpack_string(get_key_fulloid(key), c);
21871 +                       printk("[%s", buf);
21872 +                       if (is_longname_key(key))
21873 +                               /*
21874 +                                * only part of the name is stored in the key.
21875 +                                */
21876 +                               printk("...]\n");
21877 +                       else {
21878 +                               /*
21879 +                                * whole name is stored in the key: print its tail.
21880 +                                */
21881 +                               reiser4_unpack_string(get_key_offset(key), buf);
21882 +                               printk("%s]\n", buf);
21883 +                       }
21884 +               } else {
21885 +                       printk("[%s]\n", type_name(get_key_type(key)));
21886 +               }
21887 +       }
21888 +       /* turn bold off */
21889 +       /* printf ("\033[m\017"); */
21890 +}
21891 +
21892 +#endif
21893 +
21894 +/* Make Linus happy.
21895 +   Local variables:
21896 +   c-indentation-style: "K&R"
21897 +   mode-name: "LC"
21898 +   c-basic-offset: 8
21899 +   tab-width: 8
21900 +   fill-column: 120
21901 +   End:
21902 +*/
21903 diff -urN linux-2.6.27.orig/fs/reiser4/key.h linux-2.6.27/fs/reiser4/key.h
21904 --- linux-2.6.27.orig/fs/reiser4/key.h  1970-01-01 03:00:00.000000000 +0300
21905 +++ linux-2.6.27/fs/reiser4/key.h       2008-10-12 18:20:00.000000000 +0400
21906 @@ -0,0 +1,384 @@
21907 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
21908 +
21909 +/* Declarations of key-related data-structures and operations on keys. */
21910 +
21911 +#if !defined( __REISER4_KEY_H__ )
21912 +#define __REISER4_KEY_H__
21913 +
21914 +#include "dformat.h"
21915 +#include "forward.h"
21916 +#include "debug.h"
21917 +
21918 +#include <linux/types.h>       /* for __u??  */
21919 +
21920 +/* Operations on keys in reiser4 tree */
21921 +
21922 +/* No access to any of these fields shall be done except via a
21923 +   wrapping macro/function, and that wrapping macro/function shall
21924 +   convert to little endian order.  Compare keys will consider cpu byte order. */
21925 +
21926 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
21927 +   which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
21928 +   within that directory, and not near to the file itself.  It is interesting to consider whether this is the wrong
21929 +   approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
21930 +   right one.  */
21931 +
21932 +/* possible values for minor packing locality (4 bits required) */
21933 +typedef enum {
21934 +       /* file name */
21935 +       KEY_FILE_NAME_MINOR = 0,
21936 +       /* stat-data */
21937 +       KEY_SD_MINOR = 1,
21938 +       /* file attribute name */
21939 +       KEY_ATTR_NAME_MINOR = 2,
21940 +       /* file attribute value */
21941 +       KEY_ATTR_BODY_MINOR = 3,
21942 +       /* file body (tail or extent) */
21943 +       KEY_BODY_MINOR = 4,
21944 +} key_minor_locality;
21945 +
21946 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
21947 +   Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
21948 +   and by the repacker.  It is stylistically better to put aggregation information into the key.  Thus, if you want to
21949 +   segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
21950 +   block_alloc.c to check the node type when deciding where to allocate the node.
21951 +
21952 +   The need to randomly displace new directories and large files disturbs this symmetry unfortunately.  However, it
21953 +   should be noted that this is a need that is not clearly established given the existence of a repacker.  Also, in our
21954 +   current implementation tails have a different minor packing locality from extents, and no files have both extents and
21955 +   tails, so maybe symmetry can be had without performance cost after all.  Symmetry is what we ship for now....
21956 +*/
21957 +
21958 +/* Arbitrary major packing localities can be assigned to objects using
21959 +   the reiser4(filenameA/..packing<=some_number) system call.
21960 +
21961 +   In reiser4, the creat() syscall creates a directory
21962 +
21963 +   whose default flow (that which is referred to if the directory is
21964 +   read as a file) is the traditional unix file body.
21965 +
21966 +   whose directory plugin is the 'filedir'
21967 +
21968 +   whose major packing locality is that of the parent of the object created.
21969 +
21970 +   The static_stat item is a particular commonly used directory
21971 +   compression (the one for normal unix files).
21972 +
21973 +   The filedir plugin checks to see if the static_stat item exists.
21974 +   There is a unique key for static_stat.  If yes, then it uses the
21975 +   static_stat item for all of the values that it contains.  The
21976 +   static_stat item contains a flag for each stat it contains which
21977 +   indicates whether one should look outside the static_stat item for its
21978 +   contents.
21979 +*/
21980 +
21981 +/* Position of fields in reiser4_key. The value of each enumerator is the
21982 +    index within the key (viewed as an array of __u64's) of the element that
21983 +    holds the field. "Nth element" below is counted for keys without ordering. */
21984 +typedef enum {
21985 +       /* major "locale", aka dirid. Sits in 1st element */
21986 +       KEY_LOCALITY_INDEX = 0,
21987 +       /* minor "locale", aka item type. Sits in 1st element */
21988 +       KEY_TYPE_INDEX = 0,
21989 +       ON_LARGE_KEY(KEY_ORDERING_INDEX,)       /* presumably emitted only for large keys, moving all later indices up by one - confirm */
21990 +           /* "object band". Sits in 2nd element */
21991 +           KEY_BAND_INDEX,
21992 +       /* objectid. Sits in 2nd element */
21993 +       KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
21994 +       /* full objectid. Sits in 2nd element */
21995 +       KEY_FULLOID_INDEX = KEY_BAND_INDEX,
21996 +       /* Offset. Sits in 3rd element */
21997 +       KEY_OFFSET_INDEX,
21998 +       /* Name hash. Sits in 3rd element */
21999 +       KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22000 +       KEY_CACHELINE_END = KEY_OFFSET_INDEX,   /* element that prefetchkey() touches besides the first one */
22001 +       KEY_LAST_INDEX  /* number of elements; dimension of reiser4_key.el[] */
22002 +} reiser4_key_field_index;
22003 +
22004 +/* key in reiser4 internal "balanced" tree. It is just an array of
22005 +    KEY_LAST_INDEX 64bit integers in disk byte order (little-endian by
22006 +    default). This array is indexed by reiser4_key_field_index.  Each __u64
22007 +    within this array is called "element". Logical key components encoded
22008 +    within elements are called "fields".
22009 +
22010 +    We declare this as union with second component dummy to suppress
22011 +    inconvenient array<->pointer casts implied in C. */
22012 +union reiser4_key {
22013 +       __le64 el[KEY_LAST_INDEX];
22014 +       int pad;
22015 +};
22016 +
22017 +/* bitmasks showing where within reiser4_key particular key is stored. */
22018 +/* major locality occupies higher 60 bits of the first element */
22019 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22020 +
22021 +/* minor locality occupies lower 4 bits of the first element */
22022 +#define KEY_TYPE_MASK 0xfull
22023 +
22024 +/* controversial band occupies higher 4 bits of the 2nd element */
22025 +#define KEY_BAND_MASK 0xf000000000000000ull
22026 +
22027 +/* objectid occupies lower 60 bits of the 2nd element */
22028 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22029 +
22030 +/* full 64bit objectid*/
22031 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22032 +
22033 +/* offset is just the 3rd element itself */
22034 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22035 +
22036 +/* ordering is whole second element */
22037 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22038 +
22039 +/* how many bits a field value is shifted left within its key element when stored (getters shift right by as much) */
22040 +typedef enum {
22041 +       KEY_LOCALITY_SHIFT = 4,
22042 +       KEY_TYPE_SHIFT = 0,
22043 +       KEY_BAND_SHIFT = 60,
22044 +       KEY_OBJECTID_SHIFT = 0,
22045 +       KEY_FULLOID_SHIFT = 0,
22046 +       KEY_OFFSET_SHIFT = 0,
22047 +       KEY_ORDERING_SHIFT = 0,
22048 +} reiser4_key_field_shift;
22049 +
22050 +/* read element @off of @key: stored little-endian, possibly unaligned; returned in cpu order */
22051 +static inline __u64 get_key_el(const reiser4_key *key, reiser4_key_field_index off)
22052 +{
22053 +       assert("nikita-753", key != NULL);
22054 +       assert("nikita-754", off < KEY_LAST_INDEX);
22055 +       return le64_to_cpu(get_unaligned(key->el + off));
22056 +}
22057 +
22058 +/* store cpu-order @value into element @off of @key as little-endian, tolerating misalignment */
22059 +static inline void set_key_el(reiser4_key *key, reiser4_key_field_index off, __u64 value)
22060 +{
22061 +       assert("nikita-755", key != NULL);
22062 +       assert("nikita-756", off < KEY_LAST_INDEX);
22063 +       put_unaligned(cpu_to_le64(value), key->el + off);
22064 +}
22065 +
22066 +/* DEFINE_KEY_FIELD(L, U, T) defines get_key_L() and set_key_L() for the field
22067 +   located by KEY_U_INDEX, KEY_U_MASK and KEY_U_SHIFT; T is the field's C type */
22068 +#define DEFINE_KEY_FIELD( L, U, T )                                    \
22069 +static inline T get_key_ ## L ( const reiser4_key *key )               \
22070 +{                                                                      \
22071 +       assert( "nikita-750", key != NULL );                            \
22072 +       /* mask and shift in 64 bits first, only then narrow to T */    \
22073 +       return ( T ) ( ( get_key_el( key, KEY_ ## U ## _INDEX ) &       \
22074 +                KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT );         \
22075 +}                                                                      \
22076 +                                                                       \
22077 +static inline void set_key_ ## L ( reiser4_key *key, T loc )           \
22078 +{                                                                      \
22079 +       __u64 el;                                                       \
22080 +       __u64 bits;                                                     \
22081 +                                                                       \
22082 +       assert( "nikita-752", key != NULL );                            \
22083 +                                                                       \
22084 +       /* widen before shifting so a narrow T cannot lose bits */      \
22085 +       bits = ( ( __u64 ) loc ) << KEY_ ## U ## _SHIFT;                \
22086 +       el = get_key_el( key, KEY_ ## U ## _INDEX );                    \
22087 +       /* clear field bits in the key */                               \
22088 +       el &= ~KEY_ ## U ## _MASK;                                      \
22089 +       /* strictly this should be el |= bits & KEY_ ## U ## _MASK,     \
22090 +          but this function is time-critical, so the caller is         \
22091 +          trusted to pass a value that fits into the field.            \
22092 +          The assertion catches bits that land outside the mask. */    \
22093 +       assert( "nikita-759", ( bits & ~KEY_ ## U ## _MASK ) == 0 );    \
22094 +       el |= bits;                                                     \
22095 +       set_key_el( key, KEY_ ## U ## _INDEX, el );                     \
22096 +}
22097 +
22098 +typedef __u64 oid_t;
22099 +
22100 +/* define get_key_locality(), set_key_locality() */
22101 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22102 +/* define get_key_type(), set_key_type() */
22103 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22104 +/* define get_key_band(), set_key_band() */
22105 +DEFINE_KEY_FIELD(band, BAND, __u64);
22106 +/* define get_key_objectid(), set_key_objectid() */
22107 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22108 +/* define get_key_fulloid(), set_key_fulloid() */
22109 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22110 +/* define get_key_offset(), set_key_offset() */
22111 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22112 +#if (REISER4_LARGE_KEY)
22113 +/* large keys: define get_key_ordering(), set_key_ordering() over the ordering element */
22114 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22115 +#else  /* otherwise provide stubs with the same names so callers need no #if */
22116 +static inline __u64 get_key_ordering(const reiser4_key * key)
22117 +{
22118 +       return 0;       /* ordering always reads as 0; @key is not looked at */
22119 +}
22120 +
22121 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22122 +{      /* nothing is stored; @val is ignored */
22123 +}
22124 +#endif
22125 +
22126 +/* key comparison result */
22127 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22128 +       EQUAL_TO = 0,           /* if keys are equal */
22129 +       GREATER_THAN = +1       /* if first key is greater than second */
22130 +} cmp_t;
22131 +
22132 +void reiser4_key_init(reiser4_key * key);
22133 +
22134 +/* minimal possible key in the tree. Return pointer to the static storage. */
22135 +extern const reiser4_key *reiser4_min_key(void);
22136 +extern const reiser4_key *reiser4_max_key(void);
22137 +
22138 +/* helper macro for keycmp() */
22139 +#define KEY_DIFF(k1, k2, field)                                                        \
22140 +({                                                                             \
22141 +       typeof (get_key_ ## field (k1)) f1;                                     \
22142 +       typeof (get_key_ ## field (k2)) f2;                                     \
22143 +                                                                               \
22144 +       f1 = get_key_ ## field (k1);                                            \
22145 +       f2 = get_key_ ## field (k2);                                            \
22146 +                                                                               \
22147 +       (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN);         \
22148 +})
22149 +
22150 +/* helper macro for keycmp() */
22151 +#define KEY_DIFF_EL(k1, k2, off)                                               \
22152 +({                                                                             \
22153 +       __u64 e1;                                                               \
22154 +       __u64 e2;                                                               \
22155 +                                                                               \
22156 +       e1 = get_key_el(k1, off);                                               \
22157 +       e2 = get_key_el(k2, off);                                               \
22158 +                                                                               \
22159 +       (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN);         \
22160 +})
22161 +
22162 +/* compare `k1' and `k2'; returns LESS_THAN, EQUAL_TO or GREATER_THAN.
22163 +    This function is a heart of "key allocation policy". All you need to
22164 +    implement new policy is to add yet another clause here. */
22165 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22166 +                          const reiser4_key * k2 /* second key to compare */ )
22167 +{
22168 +       cmp_t result;
22169 +
22170 +       /*
22171 +        * This function is the heart of reiser4 tree-routines. Key comparison
22172 +        * is among most heavily used operations in the file system.
22173 +        */
22174 +
22175 +       assert("nikita-439", k1 != NULL);
22176 +       assert("nikita-440", k2 != NULL);
22177 +
22178 +       /* there is no actual branch here: condition is compile time constant
22179 +        * and constant folding and propagation ensures that only one branch
22180 +        * is actually compiled in. */
22181 +
22182 +       if (REISER4_PLANA_KEY_ALLOCATION) {
22183 +               /* if physical order of fields in a key is identical
22184 +                  with logical order, we can implement key comparison
22185 +                  as plain 64bit comparisons, one per key element. */
22186 +               /* logical order of fields in plan-a:
22187 +                  locality->type->objectid->offset. */
22188 +               /* compare locality and type at once */
22189 +               result = KEY_DIFF_EL(k1, k2, 0);
22190 +               if (result == EQUAL_TO) {
22191 +                       /* compare 2nd element: objectid (and band if it's there); presumably ordering for large keys */
22192 +                       result = KEY_DIFF_EL(k1, k2, 1);
22193 +                       /* compare the remaining elements: the 3rd, plus the 4th for large keys */
22194 +                       if (result == EQUAL_TO) {
22195 +                               result = KEY_DIFF_EL(k1, k2, 2);
22196 +                               if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22197 +                                       result = KEY_DIFF_EL(k1, k2, 3);
22198 +                               }
22199 +                       }
22200 +               }
22201 +       } else if (REISER4_3_5_KEY_ALLOCATION) {        /* field order here: locality, objectid, type, offset */
22202 +               result = KEY_DIFF(k1, k2, locality);
22203 +               if (result == EQUAL_TO) {
22204 +                       result = KEY_DIFF(k1, k2, objectid);
22205 +                       if (result == EQUAL_TO) {
22206 +                               result = KEY_DIFF(k1, k2, type);
22207 +                               if (result == EQUAL_TO)
22208 +                                       result = KEY_DIFF(k1, k2, offset);
22209 +                       }
22210 +               }
22211 +       } else
22212 +               impossible("nikita-441", "Unknown key allocation scheme!");
22213 +       return result;  /* NOTE(review): unset if impossible() can return - confirm it cannot */
22214 +}
22215 +
22216 +/* true if @k1 equals @k2; the keys are compared byte for byte with
22217 +   memcmp() over the whole key rather than through keycmp() */
22218 +static inline int keyeq(const reiser4_key *k1, const reiser4_key *k2)
22219 +{
22220 +       assert("nikita-1879", k1 != NULL);
22221 +       assert("nikita-1880", k2 != NULL);
22222 +       return memcmp(k1, k2, sizeof(*k1)) == 0;
22223 +}
22224 +
22225 +/* true if @k1 is less than @k2, i.e. keycmp() reports LESS_THAN;
22226 +   the result is 1 or 0 */
22227 +static inline int keylt(const reiser4_key *k1, const reiser4_key *k2)
22228 +{
22229 +       assert("nikita-1952", k1 != NULL);
22230 +       assert("nikita-1953", k2 != NULL);
22231 +       return (keycmp(k1, k2) == LESS_THAN) ? 1 : 0;
22232 +}
22233 +
22234 +/* true if @k1 is less than or equal to @k2, i.e. keycmp() does not
22235 +   report GREATER_THAN; the result is 1 or 0 */
22236 +static inline int keyle(const reiser4_key *k1, const reiser4_key *k2)
22237 +{
22238 +       assert("nikita-1954", k1 != NULL);
22239 +       assert("nikita-1955", k2 != NULL);
22240 +       return (keycmp(k1, k2) == GREATER_THAN) ? 0 : 1;
22241 +}
22242 +
22243 +/* true if @k1 is greater than @k2, i.e. keycmp() reports GREATER_THAN;
22244 +   the result is 1 or 0 */
22245 +static inline int keygt(const reiser4_key *k1, const reiser4_key *k2)
22246 +{
22247 +       assert("nikita-1959", k1 != NULL);
22248 +       assert("nikita-1960", k2 != NULL);
22249 +       return (keycmp(k1, k2) == GREATER_THAN) ? 1 : 0;
22250 +}
22251 +
22252 +/* true if @k1 is greater than or equal to @k2, i.e. keycmp() does not
22253 +   report LESS_THAN; the result is 1 or 0 */
22254 +static inline int keyge(const reiser4_key *k1, const reiser4_key *k2)
22255 +{
22256 +       assert("nikita-1956", k1 != NULL);
22257 +       /* 1957 -- October 4: sputnik launched, November 3: Laika */
22258 +       assert("nikita-1957", k2 != NULL);
22259 +       return (keycmp(k1, k2) == LESS_THAN) ? 0 : 1;
22260 +}
22261 +
22262 +static inline void prefetchkey(reiser4_key * key)      /* issue prefetch() for the memory of @key */
22263 +{
22264 +       prefetch(key);  /* start of the key (element 0) */
22265 +       prefetch(&key->el[KEY_CACHELINE_END]);  /* KEY_CACHELINE_END == KEY_OFFSET_INDEX: the offset element */
22266 +}
22267 +
22268 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22269 +           1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22270 +/* size of a buffer suitable to hold human readable key representation */
22271 +#define KEY_BUF_LEN (80)
22272 +
22273 +#if REISER4_DEBUG
22274 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22275 +#else
22276 +#define reiser4_print_key(p,k) noop
22277 +#endif
22278 +
22279 +/* __REISER4_KEY_H__ */
22280 +#endif
22281 +
22282 +/* Make Linus happy.
22283 +   Local variables:
22284 +   c-indentation-style: "K&R"
22285 +   mode-name: "LC"
22286 +   c-basic-offset: 8
22287 +   tab-width: 8
22288 +   fill-column: 120
22289 +   End:
22290 +*/
22291 diff -urN linux-2.6.27.orig/fs/reiser4/ktxnmgrd.c linux-2.6.27/fs/reiser4/ktxnmgrd.c
22292 --- linux-2.6.27.orig/fs/reiser4/ktxnmgrd.c     1970-01-01 03:00:00.000000000 +0300
22293 +++ linux-2.6.27/fs/reiser4/ktxnmgrd.c  2008-10-12 18:20:00.000000000 +0400
22294 @@ -0,0 +1,214 @@
22295 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22296 +/* Transaction manager daemon. */
22297 +
22298 +/*
22299 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22300 + * needed/important for the following reasons:
22301 + *
22302 + *     1. in reiser4 an atom is not committed immediately when the last
22303 + *     transaction handle closes, unless the atom is either too old or too
22304 + *     large (see atom_should_commit()). This is done to avoid committing
22305 + *     too frequently.
22306 + *
22307 + *     2. sometimes we don't want to commit atom when closing last transaction
22308 + *     handle even if it is old and fat enough. For example, because we are at
22309 + *     this point under directory semaphore, and committing would stall all
22310 + *     accesses to this directory.
22311 + *
22312 + * ktxnmgrd bides its time sleeping on a wait queue. When it awakes
22313 + * either due to (tunable) timeout or because it was explicitly woken up by
22314 + * call to ktxnmgrd_kick(), it scans list of all atoms and commits ones
22315 + * eligible.
22316 + *
22317 + */
22318 +
22319 +#include "debug.h"
22320 +#include "txnmgr.h"
22321 +#include "tree.h"
22322 +#include "ktxnmgrd.h"
22323 +#include "super.h"
22324 +#include "reiser4.h"
22325 +
22326 +#include <linux/sched.h>       /* for struct task_struct */
22327 +#include <linux/wait.h>
22328 +#include <linux/suspend.h>
22329 +#include <linux/kernel.h>
22330 +#include <linux/writeback.h>
22331 +#include <linux/kthread.h>
22332 +#include <linux/freezer.h>
22333 +
22334 +static int scan_mgr(struct super_block *);
22335 +
22336 +/*
22337 + * change current->comm to "<function>:<s_id>:<state>" so that ps, top, and
22338 + * friends will see changed state. Purely cosmetic. The macro reads a variable
22339 + * named `super' from the scope it is expanded in, so one must exist there.
22340 + */
22341 +#define set_comm( state )                                              \
22342 +       snprintf( current -> comm, sizeof( current -> comm ),   \
22343 +                 "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22344 +
22345 +/**
22346 + * ktxnmgrd - kernel txnmgr daemon
22347 + * @arg: pointer to super block
22348 + *
22349 + * Thread function handed to kthread_run() by reiser4_init_ktxnmgrd(). Sleeps on
22350 + * ctx->wait for up to ctx->timeout, then scans; returns 0 once told to stop.
22351 + */
22352 +static int ktxnmgrd(void *arg)
22353 +{
22354 +       struct super_block *super;
22355 +       ktxnmgrd_context *ctx;
22356 +       txn_mgr *mgr;
22357 +       int done = 0;
22358 +
22359 +       super = arg;
22360 +       mgr = &get_super_private(super)->tmgr;
22361 +
22362 +       /*
22363 +        * do_fork() just copies task_struct into the new thread, so clear
22364 +        * ->journal_info here. NOTE(review): presumably this is the field the
22365 +        * old wording called "->fs_context" (reiser4's per-thread context) - confirm.
22366 +        */
22367 +       current->journal_info = NULL;
22368 +       ctx = mgr->daemon;      /* set by reiser4_init_ktxnmgrd() before the thread is started */
22369 +       while (1) {
22370 +               try_to_freeze();
22371 +               set_comm("wait");
22372 +               {
22373 +                       DEFINE_WAIT(__wait);
22374 +
22375 +                       prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22376 +                       if (kthread_should_stop()) {
22377 +                               done = 1;
22378 +                       } else
22379 +                               schedule_timeout(ctx->timeout);
22380 +                       finish_wait(&ctx->wait, &__wait);
22381 +               }
22382 +               if (done)
22383 +                       break;
22384 +               set_comm("run");
22385 +               spin_lock(&ctx->guard);
22386 +               /*
22387 +                * woken by timeout or by ktxnmgrd_kick(): scan atoms and commit
22388 +                * old ones. NOTE(review): ->guard is taken above and again after
22389 +                * scan_mgr(); presumably commit_some_atoms() drops it - confirm.
22390 +                */
22391 +               do {
22392 +                       ctx->rescan = 0;
22393 +                       scan_mgr(super);
22394 +                       spin_lock(&ctx->guard);
22395 +                       if (ctx->rescan) {
22396 +                               /*
22397 +                                * NOTE(review): the old comment said the scan
22398 +                                * must be repeated if ->rescan was set while
22399 +                                * unlocked, yet "break" leaves the loop.
22400 +                                */
22401 +                               break;
22402 +                       }
22403 +               } while (ctx->rescan);
22404 +               spin_unlock(&ctx->guard);
22405 +       }
22406 +       return 0;
22407 +}
22408 +
22409 +#undef set_comm
22410 +
22411 +/**
22412 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22413 + * @super: pointer to super block
22414 + *
22415 + * Allocates a zeroed ktxnmgrd_context, attaches it to the transaction manager
22416 + * and starts the daemon. Returns 0, or RETERR() of the failure. Called on mount.
22417 + */
22418 +int reiser4_init_ktxnmgrd(struct super_block *super)
22419 +{
22420 +       txn_mgr *tmgr = &get_super_private(super)->tmgr;
22421 +       ktxnmgrd_context *daemon;
22422 +       int err;
22423 +
22424 +       assert("zam-1014", tmgr->daemon == NULL);
22425 +
22426 +       daemon = kzalloc(sizeof(*daemon), reiser4_ctx_gfp_mask_get());
22427 +       if (daemon == NULL)
22428 +               return RETERR(-ENOMEM);
22429 +
22430 +       assert("nikita-2442", daemon != NULL);
22431 +
22432 +       /* ktxnmgrd() picks the context up from tmgr->daemon, so it is */
22433 +       /* filled in and published before the thread is started */
22434 +       init_waitqueue_head(&daemon->wait);
22435 +       spin_lock_init(&daemon->guard);
22436 +       daemon->timeout = REISER4_TXNMGR_TIMEOUT;
22437 +       daemon->rescan = 1;
22438 +       tmgr->daemon = daemon;
22439 +
22440 +       daemon->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22441 +       if (!IS_ERR(daemon->tsk))
22442 +               return 0;
22443 +
22444 +       /* the thread did not start: unpublish and free the context */
22445 +       err = PTR_ERR(daemon->tsk);
22446 +       tmgr->daemon = NULL;
22447 +       kfree(daemon);
22448 +       return RETERR(err);
22449 +}
22450 +
22451 +void ktxnmgrd_kick(txn_mgr *mgr)       /* wake the daemon of @mgr so that it scans now instead of at timeout */
22452 +{
22453 +       assert("nikita-3234", mgr != NULL);
22454 +       assert("nikita-3235", mgr->daemon != NULL);
22455 +       wake_up(&mgr->daemon->wait);    /* ktxnmgrd() sleeps on this wait queue */
22456 +}
22457 +
22458 +int is_current_ktxnmgrd(void)  /* non-zero iff the calling task is the daemon recorded for get_current_super_private() */
22459 +{      /* NOTE(review): ->daemon is dereferenced unchecked; presumably only called while the daemon exists - confirm */
22460 +       return (get_current_super_private()->tmgr.daemon->tsk == current);
22461 +}
22462 +
22463 +/**
22464 + * scan_mgr - run one commit pass for a super block inside its own context
22465 + * @super: super block to commit atoms of
22466 + *
22467 + * Commits old atoms. Returns whatever commit_some_atoms() returns.
22468 + */
22469 +static int scan_mgr(struct super_block *super)
22470 +{
22471 +       reiser4_context stack_ctx;
22472 +       txn_mgr *tmgr;
22473 +       int result;
22474 +
22475 +       init_stack_context(&stack_ctx, super);
22476 +       tmgr = &get_super_private(super)->tmgr;
22477 +       result = commit_some_atoms(tmgr);
22478 +       reiser4_exit_context(&stack_ctx);
22479 +       return result;
22480 +}
22481 +
22482 +/**
22483 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22484 + * @super: super block whose daemon is shut down
22485 + *
22486 + * This is called on umount. Stops ktxnmgrd and frees its context.
22487 + */
22488 +void reiser4_done_ktxnmgrd(struct super_block *super)
22489 +{
22490 +       txn_mgr *tmgr = &get_super_private(super)->tmgr;
22491 +       ktxnmgrd_context *daemon;
22492 +
22493 +       assert("zam-1012", tmgr->daemon != NULL);
22494 +       daemon = tmgr->daemon;
22495 +       kthread_stop(daemon->tsk);
22496 +       kfree(daemon);
22497 +       tmgr->daemon = NULL;
22498 +}
22499 +
22500 +/*
22501 + * Local variables:
22502 + * c-indentation-style: "K&R"
22503 + * mode-name: "LC"
22504 + * c-basic-offset: 8
22505 + * tab-width: 8
22506 + * fill-column: 120
22507 + * End:
22508 + */
22509 diff -urN linux-2.6.27.orig/fs/reiser4/ktxnmgrd.h linux-2.6.27/fs/reiser4/ktxnmgrd.h
22510 --- linux-2.6.27.orig/fs/reiser4/ktxnmgrd.h     1970-01-01 03:00:00.000000000 +0300
22511 +++ linux-2.6.27/fs/reiser4/ktxnmgrd.h  2008-10-12 18:20:00.000000000 +0400
22512 @@ -0,0 +1,52 @@
22513 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22514 + * reiser4/README */
22515 +
22516 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22517 +
22518 +#ifndef __KTXNMGRD_H__
22519 +#define __KTXNMGRD_H__
22520 +
22521 +#include "txnmgr.h"
22522 +
22523 +#include <linux/fs.h>
22524 +#include <linux/wait.h>
22525 +#include <linux/completion.h>
22526 +#include <linux/spinlock.h>
22527 +#include <asm/atomic.h>
22528 +#include <linux/sched.h>       /* for struct task_struct */
22529 +
22530 +/* in this structure all data necessary to start up, shut down and communicate
22531 + * with ktxnmgrd are kept. Allocated by reiser4_init_ktxnmgrd(), freed by reiser4_done_ktxnmgrd(). */
22532 +struct ktxnmgrd_context {
22533 +       /* wait queue head on which ktxnmgrd sleeps; ktxnmgrd_kick() wakes it */
22534 +       wait_queue_head_t wait;
22535 +       /* spin lock protecting all fields of this structure */
22536 +       spinlock_t guard;
22537 +       /* timeout of sleeping on ->wait, passed to schedule_timeout() */
22538 +       signed long timeout;
22539 +       /* kernel thread running ktxnmgrd, as returned by kthread_run() */
22540 +       struct task_struct *tsk;
22541 +       /* list of all file systems served by this ktxnmgrd. NOTE(review): not touched by ktxnmgrd.c - possibly stale */
22542 +       struct list_head queue;
22543 +       /* should ktxnmgrd repeat scanning of atoms? Set to 1 at init, cleared before each scan */
22544 +       unsigned int rescan:1;
22545 +};
22546 +
22547 +extern int reiser4_init_ktxnmgrd(struct super_block *);
22548 +extern void reiser4_done_ktxnmgrd(struct super_block *);
22549 +
22550 +extern void ktxnmgrd_kick(txn_mgr * mgr);
22551 +extern int is_current_ktxnmgrd(void);
22552 +
22553 +/* __KTXNMGRD_H__ */
22554 +#endif
22555 +
22556 +/* Make Linus happy.
22557 +   Local variables:
22558 +   c-indentation-style: "K&R"
22559 +   mode-name: "LC"
22560 +   c-basic-offset: 8
22561 +   tab-width: 8
22562 +   fill-column: 120
22563 +   End:
22564 +*/
22565 diff -urN linux-2.6.27.orig/fs/reiser4/lock.c linux-2.6.27/fs/reiser4/lock.c
22566 --- linux-2.6.27.orig/fs/reiser4/lock.c 1970-01-01 03:00:00.000000000 +0300
22567 +++ linux-2.6.27/fs/reiser4/lock.c      2008-10-12 18:20:00.000000000 +0400
22568 @@ -0,0 +1,1232 @@
22569 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22570 + * reiser4/README */
22571 +
22572 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
22573 +   order.  V4 balances the tree from the bottom up, and searches the tree from
22574 +   the top down, and that is really the way we want it, so tradition won't work
22575 +   for us.
22576 +
22577 +   Instead we have two lock orderings, a high priority lock ordering, and a low
22578 +   priority lock ordering.  Each node in the tree has a lock in its znode.
22579 +
22580 +   Suppose we have a set of processes which lock (R/W) tree nodes. Each process
22581 +   has a set (maybe empty) of already locked nodes ("process locked set"). Each
22582 +   process may have a pending lock request to a node locked by another process.
22583 +   Note: we lock and unlock, but do not transfer locks: it is possible
22584 +   transferring locks instead would save some bus locking....
22585 +
22586 +   Deadlock occurs when we have a loop constructed from process locked sets and
22587 +   lock request vectors.
22588 +
22589 +   NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
22590 +   memory is extended with "znodes" with which we connect nodes with their left
22591 +   and right neighbors using sibling pointers stored in the znodes.  When we
22592 +   perform balancing operations we often go from left to right and from right to
22593 +   left.
22594 +
22595 +   +-P1-+          +-P3-+
22596 +   |+--+|   V1     |+--+|
22597 +   ||N1|| -------> ||N3||
22598 +   |+--+|          |+--+|
22599 +   +----+          +----+
22600 +     ^               |
22601 +     |V2             |V3
22602 +     |               v
22603 +   +---------P2---------+
22604 +   |+--+            +--+|
22605 +   ||N2|  --------  |N4||
22606 +   |+--+            +--+|
22607 +   +--------------------+
22608 +
22609 +   We solve this by ensuring that only low priority processes lock in top to
22610 +   bottom order and from right to left, and high priority processes lock from
22611 +   bottom to top and left to right.
22612 +
22613 +   ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
22614 +   kill those damn busy loops.
22615 +   ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
22616 +   stage) cannot be ordered that way. There are no rules what nodes can belong
22617 +   to the atom and what nodes cannot.  We cannot define what is right or left
22618 +   direction, what is top or bottom.  We can take immediate parent or side
22619 +   neighbor of one node, but nobody guarantees that, say, left neighbor node is
22620 +   not a far right neighbor for other nodes from the same atom.  It breaks
22621 +   deadlock avoidance rules and hi-low priority locking cannot be applied for
22622 +   atom locks.
22623 +
22624 +   How does it help to avoid deadlocks ?
22625 +
22626 +   Suppose we have a deadlock with n processes. Processes from one priority
22627 +   class never deadlock because they take locks in one consistent
22628 +   order.
22629 +
22630 +   So, any possible deadlock loop must have low priority as well as high
22631 +   priority processes.  There are no other lock priority levels except low and
22632 +   high. We know that any deadlock loop contains at least one node locked by a
22633 +   low priority process and requested by a high priority process. If this
22634 +   situation is caught and resolved it is sufficient to avoid deadlocks.
22635 +
22636 +   V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
22637 +
22638 +   The deadlock prevention algorithm is based on comparing
22639 +   priorities of node owners (processes which keep znode locked) and
22640 +   requesters (processes which want to acquire a lock on znode).  We
22641 +   implement a scheme where low-priority owners yield locks to
22642 +   high-priority requesters. We created a signal passing system that
22643 +   is used to ask low-priority processes to yield one or more locked
22644 +   znodes.
22645 +
22646 +   The condition when a znode needs to change its owners is described by the
22647 +   following formula:
22648 +
22649 +   #############################################
22650 +   #                                           #
22651 +   # (number of high-priority requesters) >  0 #
22652 +   #                AND                        #
22653 +   # (number of high-priority owners)     == 0 #
22654 +   #                                           #
22655 +   #############################################
22656 +
22657 +   Note that a low-priority process delays node releasing if another
22658 +   high-priority process owns this node.  So, slightly more strictly speaking,
22659 +   to have a deadlock capable cycle you must have a loop in which a high
22660 +   priority process is waiting on a low priority process to yield a node, which
22661 +   is slightly different from saying a high priority process is waiting on a
22662 +   node owned by a low priority process.
22663 +
22664 +   It is enough to avoid deadlocks if we prevent any low-priority process from
22665 +   falling asleep if its locked set contains a node which satisfies the
22666 +   deadlock condition.
22667 +
22668 +   That condition is implicitly or explicitly checked in all places where new
22669 +   high-priority requests may be added or removed from node request queue or
22670 +   high-priority process takes or releases a lock on node. The main
22671 +   goal of these checks is to never lose the moment when node becomes "has
22672 +   wrong owners" and send "must-yield-this-lock" signals to its low-pri owners
22673 +   at that time.
22674 +
22675 +   The information about received signals is stored in the per-process
22676 +   structure (lock stack) and analyzed before a low-priority process goes to
22677 +   sleep but after a "fast" attempt to lock a node fails. Any signal wakes
22678 +   the sleeping process up and forces it to re-check lock status and received
22679 +   signal info. If "must-yield-this-lock" signals were received the locking
22680 +   primitive (longterm_lock_znode()) fails with -E_DEADLOCK error code.
22681 +
22682 +   V4 LOCKING DRAWBACKS
22683 +
22684 +   If we have already balanced on one level, and we are propagating our changes
22685 +   upward to a higher level, it could be very messy to surrender all locks on
22686 +   the lower level because we put so much computational work into it, and
22687 +   reverting them to their state before they were locked might be very complex.
22688 +   We also don't want to acquire all locks before performing balancing because
22689 +   that would either be almost as much work as the balancing, or it would be
22690 +   too conservative and lock too much.  We want balancing to be done only at
22691 +   high priority.  Yet, we might want to go to the left one node and use some
22692 +   of its empty space... So we make one attempt at getting the node to the left
22693 +   using try_lock, and if it fails we do without it, because we didn't really
22694 +   need it, it was only a nice to have.
22695 +
22696 +   LOCK STRUCTURES DESCRIPTION
22697 +
22698 +   The following data structures are used in the reiser4 locking
22699 +   implementation:
22700 +
22701 +   All fields related to long-term locking are stored in znode->lock.
22702 +
22703 +   The lock stack is a per thread object.  It owns all znodes locked by the
22704 +   thread. One znode may be locked by several threads in case of read lock or
22705 +   one znode may be write locked by one thread several times. The special link
22706 +   objects (lock handles) support n<->m relation between znodes and lock
22707 +   owners.
22708 +
22709 +   <Thread 1>                       <Thread 2>
22710 +
22711 +   +---------+                     +---------+
22712 +   |  LS1    |                     |  LS2    |
22713 +   +---------+                     +---------+
22714 +       ^                                ^
22715 +       |---------------+                +----------+
22716 +       v               v                v          v
22717 +   +---------+      +---------+    +---------+   +---------+
22718 +   |  LH1    |      |   LH2   |    |  LH3    |   |   LH4   |
22719 +   +---------+      +---------+    +---------+   +---------+
22720 +       ^                   ^            ^           ^
22721 +       |                   +------------+           |
22722 +       v                   v                        v
22723 +   +---------+      +---------+                  +---------+
22724 +   |  Z1     |      |   Z2    |                  |  Z3     |
22725 +   +---------+      +---------+                  +---------+
22726 +
22727 +   Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
22728 +   picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
22729 +   LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it.  Znode
22730 +   Z1 is locked by only one thread, znode has only one lock handle LH1 on its
22731 +   list, similar situation is for Z3 which is locked by the thread 2 only. Z2
22732 +   is locked (for read) twice by different threads and two lock handles are on
22733 +   its list. Each lock handle represents a single relation of a locking of a
22734 +   znode by a thread. Locking of a znode is an establishing of a locking
22735 +   relation between the lock stack and the znode by adding of a new lock handle
22736 +   to a list of lock handles, the lock stack.  The lock stack links all lock
22737 +   handles for all znodes locked by the lock stack.  The znode list groups all
22738 +   lock handles for all locks stacks which locked the znode.
22739 +
22740 +   Yet another relation may exist between znode and lock owners.  If the lock
22741 +   procedure cannot immediately take a lock on an object, it adds the lock
22742 +   owner to a special `requestors' list belonging to the znode.  That list
22743 +   represents a queue of pending lock requests.  Because one lock owner may
22744 +   request only one lock object at a time, it is a 1->n relation between lock
22745 +   objects and lock owners, implemented as described above. Full information
22746 +   (priority, pointers to lock and link objects) about each lock request is
22747 +   stored in the lock owner structure, in the `request' field.
22748 +
22749 +   SHORT_TERM LOCKING
22750 +
22751 +   This is a list of primitive operations over lock stacks / lock handles /
22752 +   znodes and locking descriptions for them.
22753 +
22754 +   1. locking / unlocking which is done by two list insertion/deletion, one
22755 +      to/from znode's list of lock handles, another one is to/from lock stack's
22756 +      list of lock handles.  The first insertion is protected by
22757 +      znode->lock.guard spinlock.  The list owned by the lock stack can be
22758 +      modified only by thread who owns the lock stack and nobody else can
22759 +      modify/read it. There is nothing to be protected by a spinlock or
22760 +      something else.
22761 +
22762 +   2. adding/removing a lock request to/from znode requesters list. The rule is
22763 +      that znode->lock.guard spinlock should be taken for this.
22764 +
22765 +   3. we can traverse list of lock handles and use references to lock stacks who
22766 +      locked given znode if znode->lock.guard spinlock is taken.
22767 +
22768 +   4. If a lock stack is associated with a znode as a lock requestor or lock
22769 +      owner, its existence is guaranteed by znode->lock.guard spinlock.  Some of
22770 +      its (lock stack's) fields should be protected from being accessed in
22771 +      parallel by two or more threads. Please look at the lock_stack structure
22772 +      definition for the info on how those fields are protected. */
22773 +
22774 +/* Znode lock and capturing intertwining. */
22775 +/* In the current implementation we capture formatted nodes before locking
22776 +   them. Take a look at longterm_lock_znode(): the reiser4_try_capture() request
22777 +   precedes locking requests.  The longterm_lock_znode function unconditionally
22778 +   captures the znode before even checking locking conditions.
22779 +
22780 +   Another variant is to capture znode after locking it.  It was not tested, but
22781 +   at least one deadlock condition is supposed to be there.  One thread has
22782 +   locked a znode (Node-1) and calls reiser4_try_capture() for it.
22783 +   reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
22784 +   Second thread is a flushing thread, its current atom is the atom Node-1
22785 +   belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
22786 +   is locked by the first thread.  The described situation is a deadlock. */
22787 +
22788 +#include "debug.h"
22789 +#include "txnmgr.h"
22790 +#include "znode.h"
22791 +#include "jnode.h"
22792 +#include "tree.h"
22793 +#include "plugin/node/node.h"
22794 +#include "super.h"
22795 +
22796 +#include <linux/spinlock.h>
22797 +
22798 +#if REISER4_DEBUG /* forward declaration of a debug-only checker used in assertions */
22799 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
22800 +                                   znode_lock_request);
22801 +#endif /* REISER4_DEBUG */
22802 +
22803 +/* Returns the lock owner (lock stack) associated with the current thread */
22804 +lock_stack *get_current_lock_stack(void)
22805 +{
22806 +       return &get_current_context()->stack; /* ->stack is a member of the current context */
22807 +}
22808 +
22809 +/* Signals every not-yet-signaled owner of @node about a possible deadlock */
22810 +static void wake_up_all_lopri_owners(znode * node)
22811 +{
22812 +       lock_handle *handle;
22813 +
22814 +       assert_spin_locked(&(node->lock.guard)); /* caller must hold the zlock */
22815 +       list_for_each_entry(handle, &node->lock.owners, owners_link) {
22816 +               assert("nikita-1832", handle->node == node);
22817 +               /* count each handle's signal in owner->nr_signaled only once */
22818 +               if (!handle->signaled) {
22819 +                       handle->signaled = 1;
22820 +                       atomic_inc(&handle->owner->nr_signaled);
22821 +                       /* wake up the process owning this lock handle */
22822 +                       reiser4_wake_up(handle->owner);
22823 +               }
22824 +       }
22825 +}
22826 +
22827 +/* Adds a lock to a lock owner: records @owner and @node in @handle and puts
22828 +   the handle on the two lists all links are on (the tail of the doubly linked
22829 +   list that forms the lock_stack, and the head of the doubly linked list of
22830 +   owners attached to the znode's lock).  Caller must hold node->lock.guard.
22831 +*/
22832 +static inline void
22833 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
22834 +{
22835 +       assert("jmacd-810", handle->owner == NULL); /* handle must be free */
22836 +       assert_spin_locked(&(node->lock.guard));
22837 +
22838 +       handle->owner = owner;
22839 +       handle->node = node;
22840 +
22841 +       assert("reiser4-4",
22842 +              ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
22843 +
22844 +       /* add lock handle to the end of lock_stack's list of locks */
22845 +       list_add_tail(&handle->locks_link, &owner->locks);
22846 +       ON_DEBUG(owner->nr_locks++); /* lock counter is maintained via ON_DEBUG only */
22847 +       reiser4_ctx_gfp_mask_set();
22848 +
22849 +       /* add lock handle to the head of znode's list of owners */
22850 +       list_add(&handle->owners_link, &node->lock.owners);
22851 +       handle->signaled = 0; /* a fresh link carries no pending signal */
22852 +}
22853 +
22854 +/* Breaks the relation between a lock and its owner; caller holds the zlock */
22855 +static inline void unlink_object(lock_handle * handle)
22856 +{
22857 +       assert("zam-354", handle->owner != NULL);
22858 +       assert("nikita-1608", handle->node != NULL);
22859 +       assert_spin_locked(&(handle->node->lock.guard));
22860 +       assert("nikita-1829", handle->owner == get_current_lock_stack());
22861 +       assert("reiser4-5", handle->owner->nr_locks > 0);
22862 +
22863 +       /* remove lock handle from lock_stack's list of locks */
22864 +       list_del(&handle->locks_link);
22865 +       ON_DEBUG(handle->owner->nr_locks--);
22866 +       reiser4_ctx_gfp_mask_set();
22867 +       assert("reiser4-6",
22868 +              ergo(list_empty_careful(&handle->owner->locks),
22869 +                   handle->owner->nr_locks == 0));
22870 +       /* remove lock handle from znode's list of owners */
22871 +       list_del(&handle->owners_link);
22872 +       /* a NULL ->node indicates that the lock handle is free now */
22873 +       handle->node = NULL;
22874 +#if REISER4_DEBUG /* reset the rest so link_object()'s "jmacd-810" check passes on reuse */
22875 +       INIT_LIST_HEAD(&handle->locks_link);
22876 +       INIT_LIST_HEAD(&handle->owners_link);
22877 +       handle->owner = NULL;
22878 +#endif
22879 +}
22880 +
22881 +/* Takes the lock described by owner->request; caller knows it can be taken */
22882 +static void lock_object(lock_stack * owner)
22883 +{
22884 +       struct lock_request *request;
22885 +       znode *node;
22886 +
22887 +       request = &owner->request;
22888 +       node = request->node;
22889 +       assert_spin_locked(&(node->lock.guard));
22890 +       if (request->mode == ZNODE_READ_LOCK) {
22891 +               node->lock.nr_readers++; /* read locks count upwards */
22892 +       } else {
22893 +               /* check that we have not switched from read to write lock */
22894 +               assert("nikita-1840", node->lock.nr_readers <= 0);
22895 +               /* We allow recursive locking; a node can be locked several
22896 +                  times for write by the same process (count goes negative) */
22897 +               node->lock.nr_readers--;
22898 +       }
22899 +
22900 +       link_object(request->handle, owner, node);
22901 +
22902 +       if (owner->curpri) { /* non-zero ->curpri marks a high priority owner */
22903 +               node->lock.nr_hipri_owners++;
22904 +       }
22905 +}
22906 +
22907 +/* Returns true if @owner already owns the first handle on the requested node */
22908 +static int recursive(lock_stack * owner)
22909 +{
22910 +       int ret;
22911 +       znode *node;
22912 +       lock_handle *lh;
22913 +
22914 +       node = owner->request.node;
22915 +
22916 +       /* Owners list is not empty for a locked node */
22917 +       assert("zam-314", !list_empty_careful(&node->lock.owners));
22918 +       assert("nikita-1841", owner == get_current_lock_stack());
22919 +       assert_spin_locked(&(node->lock.guard));
22920 +
22921 +       lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
22922 +       ret = (lh->owner == owner); /* only the first owner handle is examined */
22923 +
22924 +       /* Recursive read locking should be done the usual way */
22925 +       assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
22926 +       /* mixing of read/write locks is not allowed */
22927 +       assert("zam-341", !ret || znode_is_wlocked(node));
22928 +
22929 +       return ret;
22930 +}
22931 +
22932 +#if REISER4_DEBUG
22933 +/* Returns true if @node is locked (in any mode) by the calling thread. */
22934 +int znode_is_any_locked(const znode * node)
22935 +{
22936 +       lock_handle *handle;
22937 +       lock_stack *stack;
22938 +       int ret;
22939 +
22940 +       if (!znode_is_locked(node)) { /* not locked by anybody at all */
22941 +               return 0;
22942 +       }
22943 +
22944 +       stack = get_current_lock_stack();
22945 +
22946 +       spin_lock_stack(stack);
22947 +
22948 +       ret = 0;
22949 +
22950 +       list_for_each_entry(handle, &stack->locks, locks_link) { /* scan this thread's handles */
22951 +               if (handle->node == node) {
22952 +                       ret = 1;
22953 +                       break;
22954 +               }
22955 +       }
22956 +
22957 +       spin_unlock_stack(stack);
22958 +
22959 +       return ret;
22960 +}
22961 +
22962 +#endif /* REISER4_DEBUG */
22963 +
22964 +/* Returns true if a write lock on @node is held by the calling thread. */
22965 +int znode_is_write_locked(const znode * node)
22966 +{
22967 +       lock_stack *stack;
22968 +       lock_handle *handle;
22969 +
22970 +       assert("jmacd-8765", node != NULL);
22971 +
22972 +       if (!znode_is_wlocked(node)) { /* not write locked by anybody */
22973 +               return 0;
22974 +       }
22975 +
22976 +       stack = get_current_lock_stack();
22977 +
22978 +       /*
22979 +        * When znode is write locked, all owner handles point to the same lock
22980 +        * stack. Get pointer to lock stack from the first lock handle from
22981 +        * znode's owner list. (No lock is taken inside this function.)
22982 +        */
22983 +       handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
22984 +
22985 +       return (handle->owner == stack);
22986 +}
22987 +
22988 +/* This "deadlock" condition is the essential part of reiser4 locking
22989 +   implementation. This condition is checked explicitly by calling
22990 +   check_deadlock_condition() or implicitly in all places where znode lock
22991 +   state (set of owners and request queue) is changed. Locking code is
22992 +   designed to use this condition to trigger the procedure of passing an object
22993 +   from low priority owner(s) to high priority one(s).
22994 +
22995 +   The procedure results in passing an event (setting lock_handle->signaled
22996 +   flag), counting this event in the nr_signaled field of the owner's lock
22997 +   stack object and waking up the owner's process.
22998 +*/
22999 +static inline int check_deadlock_condition(znode * node)
23000 +{
23001 +       assert_spin_locked(&(node->lock.guard));
23002 +       return node->lock.nr_hipri_requests > 0 /* someone hipri is waiting ... */
23003 +           && node->lock.nr_hipri_owners == 0; /* ... and no hipri owner holds it */
23004 +}
23005 +/* True if a read request meets pending hipri write requests on a non-write-locked node */
23006 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23007 +{
23008 +       zlock * lock = &node->lock;
23009 +
23010 +       return mode == ZNODE_READ_LOCK &&
23011 +               lock -> nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23012 +}
23013 +
23014 +/* checks lock/request compatibility: returns 0 if the lock can be taken now */
23015 +static int can_lock_object(lock_stack * owner)
23016 +{
23017 +       znode *node = owner->request.node;
23018 +
23019 +       assert_spin_locked(&(node->lock.guard));
23020 +
23021 +       /* A node flagged JNODE_IS_DYING is refused with -EINVAL. */
23022 +       if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23023 +               return RETERR(-EINVAL);
23024 +
23025 +       /* Do not ever try to take a lock if we are going in low priority
23026 +          direction and the node has a high priority request without high
23027 +          priority owners. */
23028 +       if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23029 +               return RETERR(-E_REPEAT);
23030 +       if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23031 +               return RETERR(-E_REPEAT);
23032 +       if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23033 +               return RETERR(-E_REPEAT);
23034 +       return 0;
23035 +}
23036 +
23037 +/* Raises the process to high priority. It clears the "signaled" flags
23038 +   because a znode locked by a high-priority process can't satisfy our
23039 +   "deadlock condition". */
23040 +static void set_high_priority(lock_stack * owner)
23041 +{
23042 +       assert("nikita-1846", owner == get_current_lock_stack());
23043 +       /* Do nothing if current priority is already high */
23044 +       if (!owner->curpri) {
23045 +               /* We don't need locking for owner->locks list, because this
23046 +                * function is only called with the lock stack of the current
23047 +                * thread, and no other thread can play with owner->locks list
23048 +                * and/or change ->node pointers of lock handles in this list.
23049 +                *
23050 +                * (Interrupts also are not involved.)
23051 +                */
23052 +               lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23053 +               while (&owner->locks != &item->locks_link) { /* until back at the list head */
23054 +                       znode *node = item->node;
23055 +
23056 +                       spin_lock_zlock(&node->lock);
23057 +
23058 +                       node->lock.nr_hipri_owners++;
23059 +
23060 +                       /* we can safely set signaled to zero, because the
23061 +                          previous statement (nr_hipri_owners ++) guarantees
23062 +                          that signaled will never be set again. */
23063 +                       item->signaled = 0;
23064 +                       spin_unlock_zlock(&node->lock);
23065 +
23066 +                       item = list_entry(item->locks_link.next, lock_handle, locks_link);
23067 +               }
23068 +               owner->curpri = 1;
23069 +               atomic_set(&owner->nr_signaled, 0); /* all per-handle signals were cleared above */
23070 +       }
23071 +}
23072 +
23073 +/* Drops the process to low priority, self-signaling where a deadlock looms. */
23074 +static void set_low_priority(lock_stack * owner)
23075 +{
23076 +       assert("nikita-3075", owner == get_current_lock_stack());
23077 +       /* Do nothing if current priority is already low */
23078 +       if (owner->curpri) {
23079 +               /* scan all locks (lock handles) held by @owner, which is
23080 +                  actually the current thread, and check whether we are
23081 +                  reaching a deadlock possibility anywhere.
23082 +                */
23083 +               lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23084 +               while (&owner->locks != &handle->locks_link) {
23085 +                       znode *node = handle->node;
23086 +                       spin_lock_zlock(&node->lock);
23087 +                       /* this thread just was hipri owner of @node, so
23088 +                          nr_hipri_owners has to be greater than zero. */
23089 +                       assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23090 +                       node->lock.nr_hipri_owners--;
23091 +                       /* If we have deadlock condition, adjust the nr_signaled
23092 +                          field. It is enough to set "signaled" flag only for
23093 +                          current process, other low-pri owners will be
23094 +                          signaled and woken up after current process unlocks
23095 +                          this object and any high-priority requestor takes
23096 +                          control. */
23097 +                       if (check_deadlock_condition(node)
23098 +                           && !handle->signaled) {
23099 +                               handle->signaled = 1;
23100 +                               atomic_inc(&owner->nr_signaled);
23101 +                       }
23102 +                       spin_unlock_zlock(&node->lock);
23103 +                       handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23104 +               }
23105 +               owner->curpri = 0;
23106 +       }
23107 +}
23108 +/* Takes @requestor off its node's requestors list, fixing the hipri request counters */
23109 +static void remove_lock_request(lock_stack * requestor)
23110 +{
23111 +       zlock * lock = &requestor->request.node->lock;
23112 +
23113 +       if (requestor->curpri) { /* only high priority requests are counted */
23114 +               assert("nikita-1838", lock->nr_hipri_requests > 0);
23115 +               lock->nr_hipri_requests--;
23116 +               if (requestor->request.mode == ZNODE_WRITE_LOCK)
23117 +                       lock->nr_hipri_write_requests --;
23118 +       }
23119 +       list_del(&requestor->requestors_link);
23120 +}
23121 +/* Fails every pending lock request on @node with -EINVAL and wakes its requestor */
23122 +static void invalidate_all_lock_requests(znode * node)
23123 +{
23124 +       lock_stack *requestor, *tmp;
23125 +
23126 +       assert_spin_locked(&(node->lock.guard));
23127 +
23128 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) { /* _safe: entries are removed while iterating */
23129 +               remove_lock_request(requestor);
23130 +               requestor->request.ret_code = -EINVAL;
23131 +               reiser4_wake_up(requestor);
23132 +               requestor->request.mode = ZNODE_NO_LOCK;
23133 +       }
23134 +}
23135 +/* Grants pending lock requests on @node, in queue order, to requestors that can lock now */
23136 +static void dispatch_lock_requests(znode * node)
23137 +{
23138 +       lock_stack *requestor, *tmp;
23139 +
23140 +       assert_spin_locked(&(node->lock.guard));
23141 +
23142 +       list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23143 +               if (znode_is_write_locked(node)) /* stop once the calling thread holds a write lock */
23144 +                       break;
23145 +               if (!can_lock_object(requestor)) { /* 0 from can_lock_object() means "can lock" */
23146 +                       lock_object(requestor);
23147 +                       remove_lock_request(requestor);
23148 +                       requestor->request.ret_code = 0;
23149 +                       reiser4_wake_up(requestor);
23150 +                       requestor->request.mode = ZNODE_NO_LOCK;
23151 +               }
23152 +       }
23153 +}
23154 +
23155 +/* release the long-term lock held through @handle (see longterm_lock_znode()) */
23156 +void longterm_unlock_znode(lock_handle * handle)
23157 +{
23158 +       znode *node = handle->node; /* NOTE(review): @handle is dereferenced before the "jmacd-1021" NULL check */
23159 +       lock_stack *oldowner = handle->owner;
23160 +       int hipri;
23161 +       int readers;
23162 +       int rdelta;
23163 +       int youdie;
23164 +
23165 +       /*
23166 +        * this is time-critical and highly optimized code. Modify carefully.
23167 +        */
23168 +
23169 +       assert("jmacd-1021", handle != NULL);
23170 +       assert("jmacd-1022", handle->owner != NULL);
23171 +       assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23172 +
23173 +       assert("zam-130", oldowner == get_current_lock_stack());
23174 +
23175 +       LOCK_CNT_DEC(long_term_locked_znode);
23176 +
23177 +       /*
23178 +        * to minimize amount of operations performed under lock, pre-compute
23179 +        * all variables used within critical section. This makes code
23180 +        * obscure. Note that these reads happen before the zlock is taken.
23181 +        */
23182 +
23183 +       /* was this lock of hi or lo priority */
23184 +       hipri = oldowner->curpri ? 1 : 0;
23185 +       /* number of readers (not positive when the node is write locked) */
23186 +       readers = node->lock.nr_readers;
23187 +       /* +1 if write lock, -1 if read lock */
23188 +       rdelta = (readers > 0) ? -1 : +1;
23189 +       /* true if node is to die and write lock is released */
23190 +       youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23191 +
23192 +       spin_lock_zlock(&node->lock);
23193 +
23194 +       assert("zam-101", znode_is_locked(node));
23195 +
23196 +       /* Adjust a number of high priority owners of this lock */
23197 +       assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23198 +       node->lock.nr_hipri_owners -= hipri;
23199 +
23200 +       /* Handle znode deallocation on last write-lock release. */
23201 +       if (znode_is_wlocked_once(node)) {
23202 +               if (youdie) {
23203 +                       forget_znode(handle); /* NOTE(review): no unlock on this path here; presumably forget_znode() drops the zlock -- confirm */
23204 +                       assert("nikita-2191", znode_invariant(node));
23205 +                       zput(node);
23206 +                       return;
23207 +               }
23208 +       }
23209 +
23210 +       if (handle->signaled) /* undo the count taken when this handle was signaled */
23211 +               atomic_dec(&oldowner->nr_signaled);
23212 +
23213 +       /* Unlocking means owner<->object link deletion */
23214 +       unlink_object(handle);
23215 +
23216 +       /* This is enough to be sure whether an object is completely
23217 +          unlocked. */
23218 +       node->lock.nr_readers += rdelta;
23219 +
23220 +       /* If the node is locked it must have an owners list.  Likewise, if
23221 +          the node is unlocked it must have an empty owners list. */
23222 +       assert("zam-319", equi(znode_is_locked(node),
23223 +                              !list_empty_careful(&node->lock.owners)));
23224 +
23225 +#if REISER4_DEBUG
23226 +       if (!znode_is_locked(node))
23227 +               ++node->times_locked;
23228 +#endif
23229 +
23230 +       /* If there are pending lock requests we wake up a requestor */
23231 +       if (!znode_is_wlocked(node))
23232 +               dispatch_lock_requests(node);
23233 +       if (check_deadlock_condition(node)) /* remaining owners must yield to a hipri requestor */
23234 +               wake_up_all_lopri_owners(node);
23235 +       spin_unlock_zlock(&node->lock);
23236 +
23237 +       /* minus one reference from handle->node */
23238 +       assert("nikita-2190", znode_invariant(node));
23239 +       ON_DEBUG(check_lock_data());
23240 +       ON_DEBUG(check_lock_node_data(node));
23241 +       zput(node);
23242 +}
23243 +
23244 +/* final portion of longterm-lock: locks if @ok == 0, drops the zlock, returns @ok */
23245 +static int
23246 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23247 +{
23248 +       znode *node = owner->request.node; /* note: @mode is not used in this function */
23249 +
23250 +       assert_spin_locked(&(node->lock.guard));
23251 +
23252 +       /* If we broke with (ok == 0) it means we can_lock, now do it. */
23253 +       if (ok == 0) {
23254 +               lock_object(owner);
23255 +               owner->request.mode = 0; /* presumably the same as ZNODE_NO_LOCK -- TODO confirm */
23256 +               /* count a reference from lockhandle->node
23257 +
23258 +                  znode was already referenced at the entry to this function,
23259 +                  hence taking spin-lock here is not necessary (see comment
23260 +                  in the zref()).
23261 +                */
23262 +               zref(node);
23263 +
23264 +               LOCK_CNT_INC(long_term_locked_znode);
23265 +       }
23266 +       spin_unlock_zlock(&node->lock); /* the zlock taken by the caller is released here */
23267 +       ON_DEBUG(check_lock_data());
23268 +       ON_DEBUG(check_lock_node_data(node));
23269 +       return ok;
23270 +}
23271 +
23272 +/*
23273 + * version of longterm_lock_znode() optimized for the most common case: read
23274 + * lock without any special flags, as taken by every (frequent) tree traversal
23275 + * on the root node. Returns 1 if caller has to fall back to the slow path.
23276 + */
23277 +static int longterm_lock_tryfast(lock_stack * owner)
23278 +{
23279 +       int result;
23280 +       znode *node;
23281 +       zlock *lock;
23282 +
23283 +       node = owner->request.node;
23284 +       lock = &node->lock;
23285 +
23286 +       assert("nikita-3340", reiser4_schedulable());
23287 +       assert("nikita-3341", request_is_deadlock_safe(node,
23288 +                                                      ZNODE_READ_LOCK,
23289 +                                                      ZNODE_LOCK_LOPRI));
23290 +       spin_lock_zlock(lock);
23291 +       result = can_lock_object(owner);
23292 +       spin_unlock_zlock(lock);
23293 +
23294 +       if (likely(result != -EINVAL)) {
23295 +               spin_lock_znode(node);
23296 +               result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23297 +               spin_unlock_znode(node);
23298 +               spin_lock_zlock(lock);
23299 +               if (unlikely(result != 0)) {
23300 +                       owner->request.mode = 0;
23301 +               } else {
23302 +                       result = can_lock_object(owner);
23303 +                       if (unlikely(result == -E_REPEAT)) {
23304 +                               /* fall back to longterm_lock_znode() */
23305 +                               spin_unlock_zlock(lock);
23306 +                               return 1;
23307 +                       }
23308 +               }
23309 +               return lock_tail(owner, result, ZNODE_READ_LOCK);
23310 +       } else
23311 +               return 1;
23312 +}
23313 +
23314 +/* locks given lock object */
23315 +int longterm_lock_znode(
23316 +                              /* local link object (allocated by lock owner thread, usually on its own
23317 +                               * stack) */
23318 +                              lock_handle * handle,
23319 +                              /* znode we want to lock. */
23320 +                              znode * node,
23321 +                              /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23322 +                              znode_lock_mode mode,
23323 +                              /* bit-mask of ZNODE_LOCK_HIPRI, ZNODE_LOCK_NONBLOCK, ZNODE_LOCK_DONT_FUSE. */
23324 +                              znode_lock_request request) {
23325 +       int ret;
23326 +       int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23327 +       int non_blocking = 0;
23328 +       int has_atom;
23329 +       txn_capture cap_flags;
23330 +       zlock *lock;
23331 +       txn_handle *txnh;
23332 +       tree_level level;
23333 +
23334 +       /* Get current process context */
23335 +       lock_stack *owner = get_current_lock_stack();
23336 +
23337 +       /* Check that the lock handle is initialized and isn't already being
23338 +        * used. */
23339 +       assert("jmacd-808", handle->owner == NULL);
23340 +       assert("nikita-3026", reiser4_schedulable());
23341 +       assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23342 +       assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23343 +       /* long term locks are not allowed in the VM contexts (->writepage(),
23344 +        * prune_{d,i}cache()).
23345 +        *
23346 +        * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23347 +        * bug caused by d_splice_alias() only working for directories.
23348 +        */
23349 +       assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23350 +       assert ("zam-1055", mode != ZNODE_NO_LOCK);
23351 +
23352 +       cap_flags = 0;
23353 +       if (request & ZNODE_LOCK_NONBLOCK) {
23354 +               cap_flags |= TXN_CAPTURE_NONBLOCKING;
23355 +               non_blocking = 1;
23356 +       }
23357 +
23358 +       if (request & ZNODE_LOCK_DONT_FUSE)
23359 +               cap_flags |= TXN_CAPTURE_DONT_FUSE;
23360 +
23361 +       /* If we are changing our process priority we must adjust a number
23362 +          of high priority owners for each znode that we already lock */
23363 +       if (hipri) {
23364 +               set_high_priority(owner);
23365 +       } else {
23366 +               set_low_priority(owner);
23367 +       }
23368 +
23369 +       level = znode_get_level(node);
23370 +
23371 +       /* Fill request structure with our values. */
23372 +       owner->request.mode = mode;
23373 +       owner->request.handle = handle;
23374 +       owner->request.node = node;
23375 +
23376 +       txnh = get_current_context()->trans;
23377 +       lock = &node->lock;
23378 +
23379 +       if (mode == ZNODE_READ_LOCK && request == 0) {
23380 +               ret = longterm_lock_tryfast(owner);
23381 +               if (ret <= 0)
23382 +                       return ret;
23383 +       }
23384 +
23385 +       has_atom = (txnh->atom != NULL);
23386 +
23387 +       /* Synchronize on node's zlock guard lock. */
23388 +       spin_lock_zlock(lock);
23389 +
23390 +       if (znode_is_locked(node) &&
23391 +           mode == ZNODE_WRITE_LOCK && recursive(owner))
23392 +               return lock_tail(owner, 0, mode);
23393 +
23394 +       for (;;) {
23395 +               /* Check the lock's availability: if it is unavailable we get
23396 +                  -E_REPEAT, 0 indicates "can_lock", otherwise the node is
23397 +                  invalid (-EINVAL).  */
23398 +               ret = can_lock_object(owner);
23399 +
23400 +               if (unlikely(ret == -EINVAL)) {
23401 +                       /* @node is dying. Leave it alone. */
23402 +                       break;
23403 +               }
23404 +
23405 +               if (unlikely(ret == -E_REPEAT && non_blocking)) {
23406 +                       /* either locking of @node by the current thread will
23407 +                        * lead to the deadlock, or lock modes are
23408 +                        * incompatible. */
23409 +                       break;
23410 +               }
23411 +
23412 +               assert("nikita-1844", (ret == 0)
23413 +                      || ((ret == -E_REPEAT) && !non_blocking));
23414 +               /* If we can get the lock... Try to capture first before
23415 +                  taking the lock. */
23416 +
23417 +               /* first handle commonest case where node and txnh are already
23418 +                * in the same atom. */
23419 +               /* safe to do without taking locks, because:
23420 +                *
23421 +                * 1. read of aligned word is atomic with respect to writes to
23422 +                * this word
23423 +                *
23424 +                * 2. false negatives are handled in reiser4_try_capture().
23425 +                *
23426 +                * 3. false positives are impossible.
23427 +                *
23428 +                * PROOF: left as an exercise to the curious reader.
23429 +                *
23430 +                * Just kidding. Here is one:
23431 +                *
23432 +                * At the time T0 txnh->atom is stored in txnh_atom.
23433 +                *
23434 +                * At the time T1 node->atom is stored in node_atom.
23435 +                *
23436 +                * At the time T2 we observe that
23437 +                *
23438 +                *     txnh_atom != NULL && node_atom == txnh_atom.
23439 +                *
23440 +                * Imagine that at this moment we acquire node and txnh spin
23441 +                * lock in this order. Suppose that under spin lock we have
23442 +                *
23443 +                *     node->atom != txnh->atom,                       (S1)
23444 +                *
23445 +                * at the time T3.
23446 +                *
23447 +                * txnh->atom != NULL still, because txnh is open by the
23448 +                * current thread.
23449 +                *
23450 +                * Suppose node->atom == NULL, that is, node was un-captured
23451 +                * between T1, and T3. But un-capturing of formatted node is
23452 +                * always preceded by the call to reiser4_invalidate_lock(),
23453 +                * which marks znode as JNODE_IS_DYING under zlock spin
23454 +                * lock. Contradiction, because can_lock_object() above checks
23455 +                * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23456 +                *
23457 +                * Suppose that node->atom != node_atom, that is, atom, node
23458 +                * belongs to was fused into another atom: node_atom was fused
23459 +                * into node->atom. Atom of txnh was equal to node_atom at T2,
23460 +                * which means that under spin lock, txnh->atom == node->atom,
23461 +                * because txnh->atom can only follow fusion
23462 +                * chain. Contradicts S1.
23463 +                *
23464 +                * The same for hypothesis txnh->atom != txnh_atom. Hence,
23465 +                * node->atom == node_atom == txnh_atom == txnh->atom. Again
23466 +                * contradicts S1. Hence S1 is false. QED.
23467 +                *
23468 +                */
23469 +
23470 +               if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23471 +                       ;
23472 +               } else {
23473 +                       /*
23474 +                        * unlock zlock spin lock here. It is possible for
23475 +                        * longterm_unlock_znode() to sneak in here, but there
23476 +                        * is no harm: reiser4_invalidate_lock() will mark znode
23477 +                        * as JNODE_IS_DYING and this will be noted by
23478 +                        * can_lock_object() below.
23479 +                        */
23480 +                       spin_unlock_zlock(lock);
23481 +                       spin_lock_znode(node);
23482 +                       ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23483 +                       spin_unlock_znode(node);
23484 +                       spin_lock_zlock(lock);
23485 +                       if (unlikely(ret != 0)) {
23486 +                               /* In the failure case, the txnmgr releases
23487 +                                  the znode's lock (or in some cases, it was
23488 +                                  released a while ago).  There's no need to
23489 +                                  reacquire it so we should return here,
23490 +                                  avoid releasing the lock. */
23491 +                               owner->request.mode = 0;
23492 +                               break;
23493 +                       }
23494 +
23495 +                       /* Check the lock's availability again -- this is
23496 +                          because under some circumstances the capture code
23497 +                          has to release and reacquire the znode spinlock. */
23498 +                       ret = can_lock_object(owner);
23499 +               }
23500 +
23501 +               /* This time, a return of (ret == 0) means we can lock, so we
23502 +                  should break out of the loop. */
23503 +               if (likely(ret != -E_REPEAT || non_blocking))
23504 +                       break;
23505 +
23506 +               /* Lock is unavailable, we have to wait. */
23507 +               ret = reiser4_prepare_to_sleep(owner);
23508 +               if (unlikely(ret != 0))
23509 +                       break;
23510 +
23511 +               assert_spin_locked(&(node->lock.guard));
23512 +               if (hipri) {
23513 +                       /* If we are going in high priority direction then
23514 +                          increase high priority requests counter for the
23515 +                          node */
23516 +                       lock->nr_hipri_requests++;
23517 +                       if (mode == ZNODE_WRITE_LOCK)
23518 +                               lock->nr_hipri_write_requests ++;
23519 +                       /* If there are no high priority owners for a node,
23520 +                          then immediately wake up low priority owners, so
23521 +                          they can detect possible deadlock */
23522 +                       if (lock->nr_hipri_owners == 0)
23523 +                               wake_up_all_lopri_owners(node);
23524 +               }
23525 +               list_add_tail(&owner->requestors_link, &lock->requestors);
23526 +
23527 +               /* Ok, here we have prepared a lock request, so unlock
23528 +                  a znode ... */
23529 +               spin_unlock_zlock(lock);
23530 +               /* ... and sleep */
23531 +               reiser4_go_to_sleep(owner);
23532 +               if (owner->request.mode == ZNODE_NO_LOCK)
23533 +                       goto request_is_done;
23534 +               spin_lock_zlock(lock);
23535 +               if (owner->request.mode == ZNODE_NO_LOCK) {
23536 +                       spin_unlock_zlock(lock);
23537 +               request_is_done:
23538 +                       if (owner->request.ret_code == 0) {
23539 +                               LOCK_CNT_INC(long_term_locked_znode);
23540 +                               zref(node);
23541 +                       }
23542 +                       return owner->request.ret_code;
23543 +               }
23544 +               remove_lock_request(owner);
23545 +       }
23546 +
23547 +       return lock_tail(owner, ret, mode);
23548 +}
23549 +
23550 +/* lock object invalidation means changing of lock object state to `INVALID'
23551 +   and waiting for all other processes to cancel their lock requests. */
23552 +void reiser4_invalidate_lock(lock_handle * handle      /* path to lock
23553 +                                                        * owner and lock
23554 +                                                        * object is being
23555 +                                                        * invalidated. */ )
23556 +{
23557 +       znode *node = handle->node;
23558 +       lock_stack *owner = handle->owner;
23559 +
23560 +       assert("zam-325", owner == get_current_lock_stack());
23561 +       assert("zam-103", znode_is_write_locked(node));
23562 +       assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
23563 +       assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
23564 +       assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
23565 +       assert("nikita-3097", znode_is_wlocked_once(node));
23566 +       assert_spin_locked(&(node->lock.guard));
23567 +
23568 +       if (handle->signaled)
23569 +               atomic_dec(&owner->nr_signaled);
23570 +
23571 +       ZF_SET(node, JNODE_IS_DYING);
23572 +       unlink_object(handle);
23573 +       node->lock.nr_readers = 0;
23574 +
23575 +       invalidate_all_lock_requests(node);
23576 +       spin_unlock_zlock(&node->lock);
23577 +}
23578 +
23579 +/* Initializes lock_stack. */
23580 +void init_lock_stack(lock_stack * owner        /* pointer to
23581 +                                        * allocated
23582 +                                        * structure. */ )
23583 +{
23584 +       INIT_LIST_HEAD(&owner->locks);
23585 +       INIT_LIST_HEAD(&owner->requestors_link);
23586 +       spin_lock_init(&owner->sguard);
23587 +       owner->curpri = 1;
23588 +       init_waitqueue_head(&owner->wait);
23589 +}
23590 +
23591 +/* Initializes lock object. */
23592 +void reiser4_init_lock(zlock * lock    /* pointer on allocated
23593 +                                        * uninitialized lock object
23594 +                                        * structure. */ )
23595 +{
23596 +       memset(lock, 0, sizeof(zlock));
23597 +       spin_lock_init(&lock->guard);
23598 +       INIT_LIST_HEAD(&lock->requestors);
23599 +       INIT_LIST_HEAD(&lock->owners);
23600 +}
23601 +
23602 +/* Hand the lock held via @old over to @new (@unlink_old != 0, presumably to move
23603 +   a handle between stack and heap), or lock once more via @new (@unlink_old == 0). */
23604 +static void
23605 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
23606 +{
23607 +       znode *node = old->node;
23608 +       lock_stack *owner = old->owner;
23609 +       int signaled;
23610 +
23611 +       /* locks_list, modified by link_object() is not protected by
23612 +          anything. This is valid because only current thread ever modifies
23613 +          locks_list of its lock_stack.
23614 +        */
23615 +       assert("nikita-1827", owner == get_current_lock_stack());
23616 +       assert("nikita-1831", new->owner == NULL);
23617 +
23618 +       spin_lock_zlock(&node->lock);
23619 +
23620 +       signaled = old->signaled;
23621 +       if (unlink_old) {
23622 +               unlink_object(old);
23623 +       } else {
23624 +               if (node->lock.nr_readers > 0) {
23625 +                       node->lock.nr_readers += 1;
23626 +               } else {
23627 +                       node->lock.nr_readers -= 1;
23628 +               }
23629 +               if (signaled) {
23630 +                       atomic_inc(&owner->nr_signaled);
23631 +               }
23632 +               if (owner->curpri) {
23633 +                       node->lock.nr_hipri_owners += 1;
23634 +               }
23635 +               LOCK_CNT_INC(long_term_locked_znode);
23636 +
23637 +               zref(node);
23638 +       }
23639 +       link_object(new, owner, node);
23640 +       new->signaled = signaled;
23641 +
23642 +       spin_unlock_zlock(&node->lock);
23643 +}
23644 +
23645 +void move_lh(lock_handle * new, lock_handle * old)
23646 +{
23647 +       move_lh_internal(new, old, /*unlink_old */ 1);
23648 +}
23649 +
23650 +void copy_lh(lock_handle * new, lock_handle * old)
23651 +{
23652 +       move_lh_internal(new, old, /*unlink_old */ 0);
23653 +}
23654 +
23655 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
23656 +int reiser4_check_deadlock(void)
23657 +{
23658 +       lock_stack *owner = get_current_lock_stack();
23659 +       return atomic_read(&owner->nr_signaled) != 0;
23660 +}
23661 +
23662 +/* Before going to sleep we re-check "release lock" requests which might come from threads with hi-pri lock
23663 +   priorities. */
23664 +int reiser4_prepare_to_sleep(lock_stack * owner)
23665 +{
23666 +       assert("nikita-1847", owner == get_current_lock_stack());
23667 +
23668 +       /* We return -E_DEADLOCK if one or more "give me the lock" messages are
23669 +        * counted in nr_signaled */
23670 +       if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
23671 +               assert("zam-959", !owner->curpri);
23672 +               return RETERR(-E_DEADLOCK);
23673 +       }
23674 +       return 0;
23675 +}
23676 +
23677 +/* Wakes up a single thread */
23678 +void __reiser4_wake_up(lock_stack * owner)
23679 +{
23680 +       atomic_set(&owner->wakeup, 1);
23681 +       wake_up(&owner->wait);
23682 +}
23683 +
23684 +/* Puts a thread to sleep */
23685 +void reiser4_go_to_sleep(lock_stack * owner)
23686 +{
23687 +       /* Well, we might sleep here, so holding of any spinlocks is no-no */
23688 +       assert("nikita-3027", reiser4_schedulable());
23689 +
23690 +       wait_event(owner->wait, atomic_read(&owner->wakeup));
23691 +       atomic_set(&owner->wakeup, 0);
23692 +}
23693 +
23694 +int lock_stack_isclean(lock_stack * owner)
23695 +{
23696 +       if (list_empty_careful(&owner->locks)) {
23697 +               assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
23698 +               return 1;
23699 +       }
23700 +
23701 +       return 0;
23702 +}
23703 +
23704 +#if REISER4_DEBUG
23705 +
23706 +/*
23707 + * debugging functions
23708 + */
23709 +
23710 +static void list_check(struct list_head *head)
23711 +{
23712 +       struct list_head *pos;
23713 +
23714 +       list_for_each(pos, head)
23715 +               assert("", (pos->prev != NULL && pos->next != NULL &&
23716 +                           pos->prev->next == pos && pos->next->prev == pos));
23717 +}
23718 +
23719 +/* check consistency of locking data-structures hanging off the @stack */
23720 +static void check_lock_stack(lock_stack * stack)
23721 +{
23722 +       spin_lock_stack(stack);
23723 +       /* check that stack->locks is not corrupted */
23724 +       list_check(&stack->locks);
23725 +       spin_unlock_stack(stack);
23726 +}
23727 +
23728 +/* check consistency of locking data structures */
23729 +void check_lock_data(void)
23730 +{
23731 +       check_lock_stack(&get_current_context()->stack);
23732 +}
23733 +
23734 +/* check consistency of locking data structures for @node */
23735 +void check_lock_node_data(znode * node)
23736 +{
23737 +       spin_lock_zlock(&node->lock);
23738 +       list_check(&node->lock.owners);
23739 +       list_check(&node->lock.requestors);
23740 +       spin_unlock_zlock(&node->lock);
23741 +}
23742 +
23743 +/* check that given lock request is deadlock safe. This check is, of course,
23744 + * not exhaustive. */
23745 +static int
23746 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
23747 +                        znode_lock_request request)
23748 +{
23749 +       lock_stack *owner;
23750 +
23751 +       owner = get_current_lock_stack();
23752 +       /*
23753 +        * check that hipri lock request is not issued when there are locked
23754 +        * nodes at the higher levels.
23755 +        */
23756 +       if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
23757 +           znode_get_level(node) != 0) {
23758 +               lock_handle *item;
23759 +
23760 +               list_for_each_entry(item, &owner->locks, locks_link) {
23761 +                       znode *other;
23762 +
23763 +                       other = item->node;
23764 +
23765 +                       if (znode_get_level(other) == 0)
23766 +                               continue;
23767 +                       if (znode_get_level(other) > znode_get_level(node))
23768 +                               return 0;
23769 +               }
23770 +       }
23771 +       return 1;
23772 +}
23773 +
23774 +#endif
23775 +
23776 +/* return pointer to static storage with name of lock_mode; unknown modes
23777 +   are formatted into one shared static buffer. For debugging */
23778 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
23779 +{
23780 +       if (lock == ZNODE_READ_LOCK)
23781 +               return "read";
23782 +       else if (lock == ZNODE_WRITE_LOCK)
23783 +               return "write";
23784 +       else {
23785 +               static char buf[30];
23786 +
23787 +               snprintf(buf, sizeof(buf), "unknown: %i", lock);
23788 +               return buf;
23789 +       }
23790 +}
23791 +
23792 +/* Make Linus happy.
23793 +   Local variables:
23794 +   c-indentation-style: "K&R"
23795 +   mode-name: "LC"
23796 +   c-basic-offset: 8
23797 +   tab-width: 8
23798 +   fill-column: 79
23799 +   End:
23800 +*/
23801 diff -urN linux-2.6.27.orig/fs/reiser4/lock.h linux-2.6.27/fs/reiser4/lock.h
23802 --- linux-2.6.27.orig/fs/reiser4/lock.h 1970-01-01 03:00:00.000000000 +0300
23803 +++ linux-2.6.27/fs/reiser4/lock.h      2008-10-12 18:20:00.000000000 +0400
23804 @@ -0,0 +1,249 @@
23805 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
23806 +
23807 +/* Long term locking data structures. See lock.c for details. */
23808 +
23809 +#ifndef __LOCK_H__
23810 +#define __LOCK_H__
23811 +
23812 +#include "forward.h"
23813 +#include "debug.h"
23814 +#include "dformat.h"
23815 +#include "key.h"
23816 +#include "coord.h"
23817 +#include "plugin/node/node.h"
23818 +#include "txnmgr.h"
23819 +#include "readahead.h"
23820 +
23821 +#include <linux/types.h>
23822 +#include <linux/spinlock.h>
23823 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
23824 +#include <asm/atomic.h>
23825 +#include <linux/wait.h>
23826 +
23827 +/* Per-znode lock object */
23828 +struct zlock {
23829 +       spinlock_t guard;       /* the zlock spin lock, see spin_lock_zlock() */
23830 +       /* The number of readers if positive; the number of recursively taken
23831 +          write locks if negative. Protected by zlock spin lock. */
23832 +       int nr_readers;
23833 +       /* A number of processes (lock_stacks) that have this object
23834 +          locked with high priority */
23835 +       unsigned nr_hipri_owners;
23836 +       /* A number of attempts to lock znode in high priority direction */
23837 +       unsigned nr_hipri_requests;
23838 +       /* A number of those high priority attempts that ask for a write
23839 +          lock */
23840 +       unsigned nr_hipri_write_requests;
23841 +       struct list_head owners; /* lock_handles of all lock_stacks that have this locked */
23842 +       /* A linked list of lock_stacks that wait for this lock */
23843 +       struct list_head requestors;
23844 +};
23845 +
23846 +static inline void spin_lock_zlock(zlock *lock)
23847 +{
23848 +       /* check that zlock is not locked */
23849 +       assert("", LOCK_CNT_NIL(spin_locked_zlock));
23850 +       /* check that spinlocks of lower priorities are not held */
23851 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
23852 +
23853 +       spin_lock(&lock->guard);
23854 +
23855 +       LOCK_CNT_INC(spin_locked_zlock);
23856 +       LOCK_CNT_INC(spin_locked);
23857 +}
23858 +
23859 +static inline void spin_unlock_zlock(zlock *lock)
23860 +{
23861 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
23862 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
23863 +
23864 +       LOCK_CNT_DEC(spin_locked_zlock);
23865 +       LOCK_CNT_DEC(spin_locked);
23866 +
23867 +       spin_unlock(&lock->guard);
23868 +}
23869 +
23870 +#define lock_is_locked(lock)          ((lock)->nr_readers != 0)
23871 +#define lock_is_rlocked(lock)         ((lock)->nr_readers > 0)
23872 +#define lock_is_wlocked(lock)         ((lock)->nr_readers < 0)
23873 +#define lock_is_wlocked_once(lock)    ((lock)->nr_readers == -1)
23874 +#define lock_can_be_rlocked(lock)     ((lock)->nr_readers >=0)
23875 +#define lock_mode_compatible(lock, mode)                               \
23876 +             (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
23877 +              ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
23878 +
23879 +/* Since we have R/W znode locks we need additional bidirectional `link'
23880 +   objects to implement n<->m relationship between lock owners and lock
23881 +   objects. We call them `lock handles'.
23882 +
23883 +   Locking: see lock.c/"SHORT-TERM LOCKING"
23884 +*/
23885 +struct lock_handle {
23886 +       /* This flag indicates that a signal to yield a lock was passed to
23887 +          lock owner and counted in owner->nr_signaled
23888 +
23889 +          Locking: this is accessed under spin lock on ->node.
23890 +        */
23891 +       int signaled;
23892 +       /* A link to owner of a lock */
23893 +       lock_stack *owner;
23894 +       /* A link to znode locked */
23895 +       znode *node;
23896 +       /* A list of all locks for a process */
23897 +       struct list_head locks_link;
23898 +       /* A list of all owners for a znode */
23899 +       struct list_head owners_link;
23900 +};
23901 +
23902 +struct lock_request {
23903 +       /* A pointer to uninitialized link object */
23904 +       lock_handle *handle;
23905 +       /* A pointer to the object we want to lock */
23906 +       znode *node;
23907 +       /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
23908 +       znode_lock_mode mode;
23909 +       /* how dispatch_lock_requests() returns lock request result code */
23910 +       int ret_code;
23911 +};
23912 +
23913 +/* A lock stack structure for accumulating locks owned by a process */
23914 +struct lock_stack {
23915 +       /* A guard lock protecting a lock stack */
23916 +       spinlock_t sguard;
23917 +       /* number of znodes which were requested by high priority processes */
23918 +       atomic_t nr_signaled;
23919 +       /* Current priority of a process
23920 +
23921 +          This is only accessed by the current thread and thus requires no
23922 +          locking.
23923 +        */
23924 +       int curpri;
23925 +       /* A list of all locks owned by this process. Elements can be added to
23926 +        * this list only by the current thread. ->node pointers in this list
23927 +        * can be only changed by the current thread. */
23928 +       struct list_head locks;
23929 +       /* When lock_stack waits for the lock, it puts itself on double-linked
23930 +          requestors list of that lock */
23931 +       struct list_head requestors_link;
23932 +       /* Current lock request info.
23933 +
23934 +          This is only accessed by the current thread and thus requires no
23935 +          locking.
23936 +        */
23937 +       struct lock_request request;
23938 +       /* the following two fields are the lock stack's
23939 +        * synchronization object to use with the standard linux/wait.h
23940 +        * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
23941 +        * usage details. */
23942 +       wait_queue_head_t wait;
23943 +       atomic_t wakeup;
23944 +#if REISER4_DEBUG
23945 +       int nr_locks;           /* number of lock handles in the above list */
23946 +#endif
23947 +};
23948 +
23949 +/*
23950 +  User-visible znode locking functions
23951 +*/
23952 +
23953 +extern int longterm_lock_znode(lock_handle * handle,
23954 +                              znode * node,
23955 +                              znode_lock_mode mode,
23956 +                              znode_lock_request request);
23957 +
23958 +extern void longterm_unlock_znode(lock_handle * handle);
23959 +
23960 +extern int reiser4_check_deadlock(void);
23961 +
23962 +extern lock_stack *get_current_lock_stack(void);
23963 +
23964 +extern void init_lock_stack(lock_stack * owner);
23965 +extern void reiser4_init_lock(zlock * lock);
23966 +
23967 +static inline void init_lh(lock_handle *lh)
23968 +{
23969 +#if REISER4_DEBUG
23970 +       memset(lh, 0, sizeof *lh);
23971 +       INIT_LIST_HEAD(&lh->locks_link);
23972 +       INIT_LIST_HEAD(&lh->owners_link);
23973 +#else
23974 +       lh->node = NULL;
23975 +#endif
23976 +}
23977 +
23978 +static inline  void done_lh(lock_handle *lh)
23979 +{
23980 +       assert("zam-342", lh != NULL);
23981 +       if (lh->node != NULL)
23982 +               longterm_unlock_znode(lh);
23983 +}
23984 +
23985 +extern void move_lh(lock_handle * new, lock_handle * old);
23986 +extern void copy_lh(lock_handle * new, lock_handle * old);
23987 +
23988 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
23989 +extern void reiser4_go_to_sleep(lock_stack * owner);
23990 +extern void __reiser4_wake_up(lock_stack * owner);
23991 +
23992 +extern int lock_stack_isclean(lock_stack * owner);
23993 +
23994 +/* zlock object state check macros: only used in assertions.  Both forms imply that the
23995 +   lock is held by the current thread. */
23996 +extern int znode_is_write_locked(const znode *);
23997 +extern void reiser4_invalidate_lock(lock_handle *);
23998 +
23999 +/* lock ordering: zlock spin lock first, then lock stack spin lock (@stack unused) */
24000 +#define spin_ordering_pred_stack(stack)                        \
24001 +       (LOCK_CNT_NIL(spin_locked_stack) &&             \
24002 +        LOCK_CNT_NIL(spin_locked_txnmgr) &&            \
24003 +        LOCK_CNT_NIL(spin_locked_inode) &&             \
24004 +        LOCK_CNT_NIL(rw_locked_cbk_cache) &&           \
24005 +        LOCK_CNT_NIL(spin_locked_super_eflush) )
24006 +
24007 +static inline void spin_lock_stack(lock_stack *stack)
24008 +{
24009 +       assert("", spin_ordering_pred_stack(stack));
24010 +       spin_lock(&stack->sguard);
24011 +       LOCK_CNT_INC(spin_locked);      /* counters checked in spin_unlock_stack() */
24012 +       LOCK_CNT_INC(spin_locked_stack);
24013 +}
24014 +
24015 +static inline void spin_unlock_stack(lock_stack *stack)
24016 +{
24017 +       assert_spin_locked(&stack->sguard);
24018 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24019 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24020 +       LOCK_CNT_DEC(spin_locked);      /* undo spin_lock_stack() accounting */
24021 +       LOCK_CNT_DEC(spin_locked_stack);
24022 +       spin_unlock(&stack->sguard);
24023 +}
24024 +
24025 +static inline void reiser4_wake_up(lock_stack * owner)
24026 +{
24027 +       spin_lock_stack(owner); /* take owner->sguard around the wake up */
24028 +       __reiser4_wake_up(owner);
24029 +       spin_unlock_stack(owner);
24030 +}
24031 +
24032 +const char *lock_mode_name(znode_lock_mode lock);
24033 +
24034 +#if REISER4_DEBUG
24035 +extern void check_lock_data(void);
24036 +extern void check_lock_node_data(znode * node);
24037 +#else /* !REISER4_DEBUG: checks compile away; arities match the prototypes */
24038 +#define check_lock_data() noop
24039 +#define check_lock_node_data(node) noop
24040 +#endif
24041 +
24042 +/* __LOCK_H__ */
24043 +#endif
24044 +
24045 +/* Make Linus happy.
24046 +   Local variables:
24047 +   c-indentation-style: "K&R"
24048 +   mode-name: "LC"
24049 +   c-basic-offset: 8
24050 +   tab-width: 8
24051 +   fill-column: 120
24052 +   End:
24053 +*/
24054 diff -urN linux-2.6.27.orig/fs/reiser4/Makefile linux-2.6.27/fs/reiser4/Makefile
24055 --- linux-2.6.27.orig/fs/reiser4/Makefile       1970-01-01 03:00:00.000000000 +0300
24056 +++ linux-2.6.27/fs/reiser4/Makefile    2008-10-12 18:20:00.000000000 +0400
24057 @@ -0,0 +1,98 @@
24058 +#
24059 +# reiser4/Makefile: objects linked into reiser4.o for CONFIG_REISER4_FS
24060 +#
24061 +
24062 +obj-$(CONFIG_REISER4_FS) += reiser4.o
24063 +
24064 +reiser4-y := \
24065 +                  debug.o \
24066 +                  jnode.o \
24067 +                  znode.o \
24068 +                  key.o \
24069 +                  pool.o \
24070 +                  tree_mod.o \
24071 +                  estimate.o \
24072 +                  carry.o \
24073 +                  carry_ops.o \
24074 +                  lock.o \
24075 +                  tree.o \
24076 +                  context.o \
24077 +                  tap.o \
24078 +                  coord.o \
24079 +                  block_alloc.o \
24080 +                  txnmgr.o \
24081 +                  kassign.o \
24082 +                  flush.o \
24083 +                  wander.o \
24084 +                  eottl.o \
24085 +                  search.o \
24086 +                  page_cache.o \
24087 +                  seal.o \
24088 +                  dscale.o \
24089 +                  flush_queue.o \
24090 +                  ktxnmgrd.o \
24091 +                  blocknrset.o \
24092 +                  super.o \
24093 +                  super_ops.o \
24094 +                  fsdata.o \
24095 +                  export_ops.o \
24096 +                  oid.o \
24097 +                  tree_walk.o \
24098 +                  inode.o \
24099 +                  vfs_ops.o \
24100 +                  as_ops.o \
24101 +                  entd.o\
24102 +                  readahead.o \
24103 +                  status_flags.o \
24104 +                  init_super.o \
24105 +                  safe_link.o \
24106 +           \
24107 +                  plugin/plugin.o \
24108 +                  plugin/plugin_set.o \
24109 +                  plugin/node/node.o \
24110 +                  plugin/object.o \
24111 +                  plugin/cluster.o \
24112 +                  plugin/inode_ops.o \
24113 +                  plugin/inode_ops_rename.o \
24114 +                  plugin/file_ops.o \
24115 +                  plugin/file_ops_readdir.o \
24116 +                  plugin/file_plugin_common.o \
24117 +                  plugin/file/file.o \
24118 +                  plugin/file/tail_conversion.o \
24119 +                  plugin/file/file_conversion.o \
24120 +                  plugin/file/symlink.o \
24121 +                  plugin/file/cryptcompress.o \
24122 +                  plugin/dir_plugin_common.o \
24123 +                  plugin/dir/hashed_dir.o \
24124 +                  plugin/dir/seekable_dir.o \
24125 +                  plugin/node/node40.o \
24126 +           \
24127 +                  plugin/crypto/cipher.o \
24128 +                  plugin/crypto/digest.o \
24129 +           \
24130 +                  plugin/compress/compress.o \
24131 +                  plugin/compress/compress_mode.o \
24132 +           \
24133 +                  plugin/item/static_stat.o \
24134 +                  plugin/item/sde.o \
24135 +                  plugin/item/cde.o \
24136 +                  plugin/item/blackbox.o \
24137 +                  plugin/item/internal.o \
24138 +                  plugin/item/tail.o \
24139 +                  plugin/item/ctail.o \
24140 +                  plugin/item/extent.o \
24141 +                  plugin/item/extent_item_ops.o \
24142 +                  plugin/item/extent_file_ops.o \
24143 +                  plugin/item/extent_flush_ops.o \
24144 +           \
24145 +                  plugin/hash.o \
24146 +                  plugin/fibration.o \
24147 +                  plugin/tail_policy.o \
24148 +                  plugin/item/item.o \
24149 +           \
24150 +                  plugin/security/perm.o \
24151 +                  plugin/space/bitmap.o \
24152 +           \
24153 +                  plugin/disk_format/disk_format40.o \
24154 +                  plugin/disk_format/disk_format.o
24155 +
24156 diff -urN linux-2.6.27.orig/fs/reiser4/oid.c linux-2.6.27/fs/reiser4/oid.c
24157 --- linux-2.6.27.orig/fs/reiser4/oid.c  1970-01-01 03:00:00.000000000 +0300
24158 +++ linux-2.6.27/fs/reiser4/oid.c       2008-10-12 18:20:00.000000000 +0400
24159 @@ -0,0 +1,141 @@
24160 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24161 +
24162 +#include "debug.h"
24163 +#include "super.h"
24164 +#include "txnmgr.h"
24165 +
24166 +/* We used to have an oid allocation plugin. It was removed because it
24167 +   was recognized as providing an unneeded level of abstraction. If anyone
24168 +   ever finds it useful, look at yet_unneeded_abstractions/oid
24169 +*/
24170 +
24171 +/*
24172 + * Initialize in-memory oid allocator state of @super. @nr_files (oids in use)
24173 + * and @next (next oid to hand out) are provided by the disk format plugin,
24174 + * which reads them from the disk during mount. Always returns 0.
24175 + */
24176 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24177 +{
24178 +       reiser4_super_info_data *info = get_super_private(super);
24179 +
24180 +       /* seed both counters from the values supplied by the caller */
24181 +       info->oids_in_use = nr_files;
24182 +       info->next_to_use = next;
24183 +
24184 +       return 0;
24185 +}
24186 +
24187 +/*
24188 + * Allocate the next oid under the super block spin lock and return it.
24189 + * ABSOLUTE_MAX_OID is returned when allocator runs out of oids.
24190 + */
24191 +oid_t oid_allocate(struct super_block *super)
24192 +{
24193 +       reiser4_super_info_data *info = get_super_private(super);
24194 +       oid_t result = ABSOLUTE_MAX_OID;
24195 +
24196 +       spin_lock_reiser4_super(info);
24197 +       if (info->next_to_use != ABSOLUTE_MAX_OID) {
24198 +               /* hand out the current value, then advance the allocator */
24199 +               result = info->next_to_use;
24200 +               info->next_to_use++;
24201 +               info->oids_in_use++;
24202 +       }
24203 +       spin_unlock_reiser4_super(info);
24204 +
24205 +       return result;
24206 +}
24207 +
24208 +/*
24209 + * Tell oid allocator that an oid is now free. Always returns 0.
24210 + */
24211 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24212 +{
24213 +       reiser4_super_info_data *info = get_super_private(super);
24214 +
24215 +       /* @oid itself is ignored: only the count of used oids is updated */
24216 +       spin_lock_reiser4_super(info);
24217 +       info->oids_in_use -= 1;
24218 +       spin_unlock_reiser4_super(info);
24219 +
24220 +       return 0;
24221 +}
24222 +
24223 +/*
24224 + * Peek at the oid that the next call to oid_allocate() would return, without
24225 + * actually allocating it. This is used by disk format plugin to save oid
24226 + * allocator state on the disk.
24227 + */
24228 +oid_t oid_next(const struct super_block *super)
24229 +{
24230 +       reiser4_super_info_data *info = get_super_private(super);
24231 +       oid_t next;
24232 +
24233 +       /* sample the counter under the super block spin lock */
24234 +       spin_lock_reiser4_super(info);
24235 +       next = info->next_to_use;
24236 +       spin_unlock_reiser4_super(info);
24237 +
24238 +       return next;
24239 +}
24240 +
24241 +/*
24242 + * Returns number of currently used oids, or -1 if it does not fit into long.
24243 + * This is used by statfs(2) to report number of "inodes" and by disk format
24244 + * plugin to save oid allocator state on the disk.
24245 + */
24246 +long oids_used(const struct super_block *super)
24247 +{
24248 +       reiser4_super_info_data *sbinfo = get_super_private(super);
24249 +       oid_t used;
24250 +
24251 +       spin_lock_reiser4_super(sbinfo);
24252 +       used = sbinfo->oids_in_use;
24253 +       spin_unlock_reiser4_super(sbinfo);
24254 +
24255 +       /* ~0UL >> 1 is the largest long; shifting before widening keeps the
24256 +        * limit right where long is narrower than 64 bits */
24257 +       if (used < (__u64) (~0UL >> 1))
24258 +               return (long)used;
24259 +       return (long)-1;
24260 +}
24261 +
24262 +/*
24263 + * Account one created object in the current atom. This is done after call to
24264 + * oid_allocate(), at the point when we are irrevocably committed to creation
24265 + * of the new file (i.e., when oid allocation can no longer be rolled back due
24266 + * to some error).
24267 + */
24268 +void oid_count_allocated(void)
24269 +{
24270 +       txn_atom *cur = get_current_atom_locked();
24271 +
24272 +       /* the atom is unlocked below, after its counter has been bumped */
24273 +       cur->nr_objects_created += 1;
24274 +       spin_unlock_atom(cur);
24275 +}
24276 +
24277 +/*
24278 + * Account one deleted object in the current atom. This is done after call to
24279 + * oid_release(), once we are irrevocably committed to the deletion of the
24280 + * file (i.e., when oid release can no longer be rolled back due to an error).
24281 + */
24282 +void oid_count_released(void)
24283 +{
24284 +       txn_atom *cur = get_current_atom_locked();
24285 +
24286 +       /* the atom is unlocked below, after its counter has been bumped */
24287 +       cur->nr_objects_deleted += 1;
24288 +       spin_unlock_atom(cur);
24289 +}
24290 +
24291 +/*
24292 +   Local variables:
24293 +   c-indentation-style: "K&R"
24294 +   mode-name: "LC"
24295 +   c-basic-offset: 8
24296 +   tab-width: 8
24297 +   fill-column: 120
24298 +   scroll-step: 1
24299 +   End:
24300 +*/
24301 diff -urN linux-2.6.27.orig/fs/reiser4/page_cache.c linux-2.6.27/fs/reiser4/page_cache.c
24302 --- linux-2.6.27.orig/fs/reiser4/page_cache.c   1970-01-01 03:00:00.000000000 +0300
24303 +++ linux-2.6.27/fs/reiser4/page_cache.c        2008-10-13 11:49:28.000000000 +0400
24304 @@ -0,0 +1,714 @@
24305 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24306 + * reiser4/README */
24307 +
24308 +/* Memory pressure hooks. Fake inodes handling. */
24309 +
24310 +/*   GLOSSARY
24311 +
24312 +   . Formatted and unformatted nodes.
24313 +     Elements of reiser4 balanced tree to store data and metadata.
24314 +     Unformatted nodes are pointed to by extent pointers. Such nodes
24315 +     are used to store data of large objects. Unlike unformatted nodes,
24316 +     formatted ones have associated format described by node4X plugin.
24317 +
24318 +   . Jnode (or journal node)
24319 +     The in-memory header which is used to track formatted and unformatted
24320 +     nodes, bitmap nodes, etc. In particular, jnodes are used to track
24321 +     transactional information associated with each block(see reiser4/jnode.c
24322 +     for details).
24323 +
24324 +   . Znode
24325 +     The in-memory header which is used to track formatted nodes. Contains
24326 +     embedded jnode (see reiser4/znode.c for details).
24327 +*/
24328 +
24329 +/* We store all file system meta data (and data, of course) in the page cache.
24330 +
24331 +   What does this mean? Instead of using bread/brelse we create a special
24332 +   "fake" inode (one per super block) and store content of formatted nodes
24333 +   into pages bound to this inode in the page cache. In newer kernels bread()
24334 +   already uses inode attached to block device (bd_inode). Advantage of having
24335 +   our own fake inode is that we can install appropriate methods in its
24336 +   address_space operations. Such methods are called by VM on memory pressure
24337 +   (or during background page flushing) and we can use them to react
24338 +   appropriately.
24339 +
24340 +   In initial version we only support one block per page. Support for multiple
24341 +   blocks per page is complicated by relocation.
24342 +
24343 +   To each page, used by reiser4, jnode is attached. jnode is analogous to
24344 +   buffer head. Difference is that jnode is bound to the page permanently:
24345 +   jnode cannot be removed from memory until its backing page is.
24346 +
24347 +   jnode contain pointer to page (->pg field) and page contain pointer to
24348 +   jnode in ->private field. Pointer from jnode to page is protected by
24349 +   jnode's spinlock and pointer from page to jnode is protected by page lock
24350 +   (PG_locked bit). Lock ordering is: first take page lock, then jnode spin
24351 +   lock. To go into reverse direction use jnode_lock_page() function that uses
24352 +   standard try-lock-and-release device.
24353 +
24354 +   Properties:
24355 +
24356 +   1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24357 +   reference counter is increased.
24358 +
24359 +   2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24360 +   reference counter is decreased.
24361 +
24362 +   3. on jload() reference counter on jnode page is increased, page is
24363 +   kmapped and `referenced'.
24364 +
24365 +   4. on jrelse() inverse operations are performed.
24366 +
24367 +   5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24368 +
24369 +   DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24370 +   historically.]
24371 +
24372 +   [In the following discussion, `lock' invariably means long term lock on
24373 +   znode.] (What about page locks?)
24374 +
24375 +   There is some special class of deadlock possibilities related to memory
24376 +   pressure. Locks acquired by other reiser4 threads are accounted for in
24377 +   deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24378 +   invoked additional hidden arc is added to the locking graph: thread that
24379 +   tries to allocate memory waits for ->vm_writeback() to finish. If this
24380 +   thread keeps lock and ->vm_writeback() tries to acquire this lock, deadlock
24381 +   prevention is useless.
24382 +
24383 +   Another related problem is possibility for ->vm_writeback() to run out of
24384 +   memory itself. This is not a problem for ext2 and friends, because their
24385 +   ->vm_writeback() don't allocate much memory, but reiser4 flush is
24386 +   definitely able to allocate huge amounts of memory.
24387 +
24388 +   It seems that there is no reliable way to cope with the problems above.
24389 +   Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24390 +   context) wouldn't perform any flushing itself, but rather should just wake
24391 +   up some auxiliary thread dedicated for this purpose (or, the same thread
24392 +   that does periodic commit of old atoms (ktxnmgrd.c)).
24393 +
24394 +   Details:
24395 +
24396 +   1. Page is called `reclaimable' against particular reiser4 mount F if this
24397 +   page can be ultimately released by try_to_free_pages() under presumptions
24398 +   that:
24399 +
24400 +    a. ->vm_writeback() for F is no-op, and
24401 +
24402 +    b. none of the threads accessing F are making any progress, and
24403 +
24404 +    c. other reiser4 mounts obey the same memory reservation protocol as F
24405 +    (described below).
24406 +
24407 +   For example, clean un-pinned page, or page occupied by ext2 data are
24408 +   reclaimable against any reiser4 mount.
24409 +
24410 +   When there is more than one reiser4 mount in a system, condition (c) makes
24411 +   reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24412 +
24413 +   THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24414 +
24415 +   Fake inode is used to bind formatted nodes and each node is indexed within
24416 +   fake inode by its block number. If block size is smaller than page size, it
24417 +   may so happen that block mapped to the page with formatted node is occupied
24418 +   by unformatted node or is unallocated. This leads to some complications,
24419 +   because flushing whole page can lead to an incorrect overwrite of
24420 +   unformatted node that, moreover, can be cached in some other place as
24421 +   part of the file body. To avoid this, buffers for unformatted nodes are
24422 +   never marked dirty. Also pages in the fake inode are never marked dirty.
24423 +   This rules out usage of ->writepage() as memory pressure hook. Instead
24424 +   ->releasepage() is used.
24425 +
24426 +   Josh is concerned that page->buffer is going to die. This should not pose
24427 +   significant problem though, because we need to add some data structures to
24428 +   the page anyway (jnode) and all necessary book keeping can be put there.
24429 +
24430 +*/
24431 +
24432 +/* Life cycle of pages/nodes.
24433 +
24434 +   jnode contains reference to page and page contains reference back to
24435 +   jnode. This reference is counted in page ->count. Thus, page bound to jnode
24436 +   cannot be released back into free pool.
24437 +
24438 +    1. Formatted nodes.
24439 +
24440 +      1. formatted node is represented by znode. When new znode is created its
24441 +      ->pg pointer is NULL initially.
24442 +
24443 +      2. when node content is loaded into znode (by call to zload()) for the
24444 +      first time following happens (in call to ->read_node() or
24445 +      ->allocate_node()):
24446 +
24447 +        1. new page is added to the page cache.
24448 +
24449 +        2. this page is attached to znode and its ->count is increased.
24450 +
24451 +        3. page is kmapped.
24452 +
24453 +      3. if more calls to zload() follow (without corresponding zrelses), page
24454 +      counter is left intact and in its stead ->d_count is increased in znode.
24455 +
24456 +      4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24457 +      ->release_node() is called and page is kunmapped as result.
24458 +
24459 +      5. at some moment node can be captured by a transaction. Its ->x_count
24460 +      is then increased by transaction manager.
24461 +
24462 +      6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24463 +      bit set) following will happen (also see comment at the top of znode.c):
24464 +
24465 +        1. when last lock is released, node will be uncaptured from
24466 +        transaction. This releases the reference that transaction manager
24467 +        acquired at step 5.
24468 +
24469 +        2. when last reference is released, zput() detects that node is
24470 +        actually deleted and calls ->delete_node()
24471 +        operation. page_cache_delete_node() implementation detaches jnode from
24472 +        page and releases page.
24473 +
24474 +      7. otherwise (node wasn't removed from the tree), last reference to
24475 +      znode will be released after transaction manager committed transaction
24476 +      node was in. This implies squallocing of this node (see
24477 +      flush.c). Nothing special happens at this point. Znode is still in the
24478 +      hash table and page is still attached to it.
24479 +
24480 +      8. znode is actually removed from the memory because of the memory
24481 +      pressure, or during umount (znodes_tree_done()). Anyway, znode is
24482 +      removed by the call to zdrop(). At this moment, page is detached from
24483 +      znode and removed from the inode address space.
24484 +
24485 +*/
24486 +
24487 +#include "debug.h"
24488 +#include "dformat.h"
24489 +#include "key.h"
24490 +#include "txnmgr.h"
24491 +#include "jnode.h"
24492 +#include "znode.h"
24493 +#include "block_alloc.h"
24494 +#include "tree.h"
24495 +#include "vfs_ops.h"
24496 +#include "inode.h"
24497 +#include "super.h"
24498 +#include "entd.h"
24499 +#include "page_cache.h"
24500 +#include "ktxnmgrd.h"
24501 +
24502 +#include <linux/types.h>
24503 +#include <linux/fs.h>
24504 +#include <linux/mm.h>          /* for struct page */
24505 +#include <linux/swap.h>                /* for struct page */
24506 +#include <linux/pagemap.h>
24507 +#include <linux/bio.h>
24508 +#include <linux/writeback.h>
24509 +#include <linux/blkdev.h>
24510 +#include <linux/task_io_accounting_ops.h>
24511 +
24512 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24513 +
24514 +static struct address_space_operations formatted_fake_as_ops;
24515 +
24516 +static const oid_t fake_ino = 0x1;
24517 +static const oid_t bitmap_ino = 0x2;
24518 +static const oid_t cc_ino = 0x3;
24519 +
24520 +/* set up @fake, which must still be I_NEW, and publish it through @pfake */
24521 +static void init_fake_inode(struct super_block *super, struct inode *fake,
24522 +                           struct inode **pfake)
24523 +{
24524 +       assert("nikita-2168", fake->i_state & I_NEW);
24525 +       *pfake = fake;
24526 +       fake->i_mapping->a_ops = &formatted_fake_as_ops;
24527 +       /* NOTE-NIKITA something else? */
24528 +       unlock_new_inode(fake);
24529 +}
24530 +
24531 +/**
24532 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24533 + * @super: super block to init fake inodes for
24534 + *
24535 + * Sets up the fake inode to which formatted nodes are bound in the page cache,
24536 + * the inode for bitmaps and the cc inode. Returns 0, or RETERR(-ENOMEM).
24537 + */
24538 +int reiser4_init_formatted_fake(struct super_block *super)
24539 +{
24540 +       reiser4_super_info_data *sinfo;
24541 +       struct inode *inode;
24542 +
24543 +       assert("nikita-1703", super != NULL);
24544 +
24545 +       sinfo = get_super_private_nocheck(super);
24546 +       inode = iget_locked(super, oid_to_ino(fake_ino));
24547 +       if (inode == NULL)
24548 +               goto fail;
24549 +       init_fake_inode(super, inode, &sinfo->fake);
24550 +
24551 +       inode = iget_locked(super, oid_to_ino(bitmap_ino));
24552 +       if (inode == NULL)
24553 +               goto drop_fake;
24554 +       init_fake_inode(super, inode, &sinfo->bitmap);
24555 +
24556 +       inode = iget_locked(super, oid_to_ino(cc_ino));
24557 +       if (inode == NULL)
24558 +               goto drop_both;
24559 +       init_fake_inode(super, inode, &sinfo->cc);
24560 +       return 0;
24561 +
24562 +drop_both:
24563 +       iput(sinfo->fake);
24564 +       iput(sinfo->bitmap);
24565 +       sinfo->fake = NULL;
24566 +       sinfo->bitmap = NULL;
24567 +       goto fail;
24568 +drop_fake:
24569 +       iput(sinfo->fake);
24570 +       sinfo->fake = NULL;
24571 +fail:
24572 +       return RETERR(-ENOMEM);
24573 +}
24574 +
24575 +/**
24576 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24577 + * @super: super block whose fake inodes are released
24578 + *
24579 + * Releases inodes which were used as address spaces of bitmap and formatted
24580 + * nodes, and the cc inode.
24581 + */
24582 +void reiser4_done_formatted_fake(struct super_block *super)
24583 +{
24584 +       reiser4_super_info_data *sinfo = get_super_private_nocheck(super);
24585 +
24586 +       /* Every slot is reset to NULL right after iput(), so no inode is put
24587 +        * twice even if this function runs again. Slots that are NULL are
24588 +        * skipped. */
24589 +       if (sinfo->fake) {
24590 +               iput(sinfo->fake);
24591 +               sinfo->fake = NULL;
24592 +       }
24593 +
24594 +       if (sinfo->bitmap) {
24595 +               iput(sinfo->bitmap);
24596 +               sinfo->bitmap = NULL;
24597 +       }
24598 +
24599 +       if (sinfo->cc) {
24600 +               iput(sinfo->cc);
24601 +               sinfo->cc = NULL;
24602 +       }
24603 +}
24604 +
24605 +void reiser4_wait_page_writeback(struct page *page)
24606 +{
24607 +       assert("zam-783", PageLocked(page));
24608 +       /* wait with the page unlocked; recheck once the lock is held again */
24609 +       do {
24610 +               unlock_page(page);
24611 +               wait_on_page_writeback(page);
24612 +               lock_page(page);
24613 +       } while (PageWriteback(page));
24614 +}
24615 +
24616 +/* return the tree of the super block that the host inode of @page belongs to */
24617 +reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24618 +{
24619 +       assert("nikita-2461", page != NULL);
24620 +       return &get_super_private(page->mapping->host->i_sb)->tree;
24621 +}
24622 +
24623 +/* completion handler for single page bio-based read.
24624 +
24625 +   mpage_end_io_read() would also do. But it's static.
24626 +
24627 +*/
24628 +static void end_bio_single_page_read(struct bio *bio, int err UNUSED_ARG)
24629 +{
24630 +       struct page *pg = bio->bi_io_vec[0].bv_page;
24631 +
24632 +       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
24633 +               /* read failed: do not leave the page marked up to date */
24634 +               ClearPageUptodate(pg);
24635 +               SetPageError(pg);
24636 +       } else {
24637 +               SetPageUptodate(pg);
24638 +       }
24639 +
24640 +       /* reiser4_page_io() leaves the page locked for reads: unlock here */
24641 +       unlock_page(pg);
24642 +       bio_put(bio);
24643 +}
24644 +
24645 +/* completion handler for single page bio-based write.
24646 +
24647 +   mpage_end_io_write() would also do. But it's static.
24648 +
24649 +*/
24650 +static void end_bio_single_page_write(struct bio *bio, int err UNUSED_ARG)
24651 +{
24652 +       struct page *pg = bio->bi_io_vec[0].bv_page;
24653 +       int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
24654 +
24655 +       if (!uptodate)
24656 +               SetPageError(pg);       /* write failed: flag it on the page */
24657 +
24658 +       /* pairs with set_page_writeback() in reiser4_page_io() */
24659 +       end_page_writeback(pg);
24660 +       bio_put(bio);
24661 +}
24662 +
24663 +/* ->readpage() method for formatted nodes: reads @page via reiser4_page_io() */
24664 +static int formatted_readpage(struct file *f UNUSED_ARG,
24665 +                             struct page *page /* page to read */ )
24666 +{
24667 +       assert("nikita-2412", PagePrivate(page) && jprivate(page));
24668 +       return reiser4_page_io(page, jprivate(page), READ,
24669 +                              reiser4_ctx_gfp_mask_get());
24670 +}
24671 +
24672 +/**
24673 + * reiser4_page_io - submit single-page bio request
24674 + * @page: locked page to perform io for
24675 + * @node: jnode of page
24676 + * @rw: READ or WRITE
24677 + * @gfp: gfp mask for bio allocation
24678 + *
24679 + * Submits single page read or write. Returns 0 or the error from page_bio().
24680 + */
24681 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
24682 +{
24683 +       struct bio *bio;
24684 +
24685 +       assert("nikita-2094", page != NULL);
24686 +       assert("nikita-2226", PageLocked(page));
24687 +       assert("nikita-2634", node != NULL);
24688 +       assert("nikita-2893", rw == READ || rw == WRITE);
24689 +
24690 +       /* on a read-only super block non-reads succeed without doing io */
24691 +       if (rw && unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
24692 +               unlock_page(page);
24693 +               return 0;
24694 +       }
24695 +
24696 +       bio = page_bio(page, node, rw, gfp);
24697 +       if (IS_ERR(bio)) {
24698 +               /* no bio, no completion handler: unlock the page here */
24699 +               unlock_page(page);
24700 +               return PTR_ERR(bio);
24701 +       }
24702 +
24703 +       /* a page under write is unlocked at once, marked as in writeback;
24704 +        * a page under read stays locked until the read completes */
24705 +       if (rw == WRITE) {
24706 +               set_page_writeback(page);
24707 +               unlock_page(page);
24708 +       }
24709 +       reiser4_submit_bio(rw, bio);
24710 +
24711 +       return 0;
24712 +}
24713 +
24714 +/* helper function to construct bio for page; returns bio or ERR_PTR() */
24715 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
24716 +{
24717 +       struct bio *bio;
24718 +       int blksz;
24719 +       struct super_block *super;
24720 +       reiser4_block_nr blocknr;
24721 +
24722 +       assert("nikita-2092", page != NULL);
24723 +       assert("nikita-2633", node != NULL);
24724 +
24725 +       /* Simple implementation in the assumption that blocksize == pagesize.
24726 +
24727 +          We only have to submit one block, but submit_bh() will allocate bio
24728 +          anyway, so lets use all the bells-and-whistles of bio code.
24729 +        */
24730 +
24731 +       bio = bio_alloc(gfp, 1);
24732 +       if (bio == NULL)
24733 +               return ERR_PTR(RETERR(-ENOMEM));
24734 +
24735 +       super = page->mapping->host->i_sb;
24736 +       assert("nikita-2029", super != NULL);
24737 +       blksz = super->s_blocksize;
24738 +       assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
24739 +
24740 +       spin_lock_jnode(node);
24741 +       blocknr = *jnode_get_io_block(node);
24742 +       spin_unlock_jnode(node);
24743 +
24744 +       assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
24745 +       assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
24746 +
24747 +       bio->bi_bdev = super->s_bdev;
24748 +       /* fill bio->bi_sector before calling bio_add_page(), because
24749 +        * q->merge_bvec_fn may want to inspect it (see
24750 +        * drivers/md/linear.c:linear_mergeable_bvec() for example). */
24751 +       bio->bi_sector = blocknr * (blksz >> 9);
24752 +
24753 +       if (!bio_add_page(bio, page, blksz, 0)) {
24754 +               warning("nikita-3452",
24755 +                       "Single page bio cannot be constructed");
24756 +               bio_put(bio);   /* do not leak the unsubmitted bio */
24757 +               return ERR_PTR(RETERR(-EINVAL));
24758 +       }
24759 +
24760 +       /* bio -> bi_idx is filled by bio_init() */
24761 +       bio->bi_end_io = (rw == READ) ?
24762 +           end_bio_single_page_read : end_bio_single_page_write;
24763 +       return bio;
24764 +}
24765 +
24766 +/* mark @page dirty; this is internally called by jnode_make_dirty() */
24767 +void reiser4_set_page_dirty_internal(struct page *page)
24768 +{
24769 +       struct address_space *mapping = page->mapping;
24770 +
24771 +       assert("edward-1557", mapping != NULL);
24772 +       /* nothing to do when the dirty bit was set already */
24773 +       if (TestSetPageDirty(page))
24774 +               return;
24775 +
24776 +       WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
24777 +       if (mapping_cap_account_dirty(mapping)) {
24778 +               __inc_zone_page_state(page, NR_FILE_DIRTY);
24779 +               __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
24780 +               task_io_account_write(PAGE_CACHE_SIZE);
24781 +       }
24782 +       assert("edward-1558", mapping->host != NULL);
24783 +       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
24784 +}
24785 +
24786 +#if 0 /* disabled: only the commented-out assertion in reiser4_writepage() uses it */
24787 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
24788 +{
24789 +       if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
24790 +               return 1;
24791 +       if (ctx->super != s)
24792 +               return 1;
24793 +       if (get_super_private(s)->entd.tsk == current)
24794 +               return 0;
24795 +       if (!lock_stack_isclean(&ctx->stack))
24796 +               return 0;
24797 +       if (ctx->trans->atom != NULL)
24798 +               return 0;
24799 +       return 1;
24800 +}
24801 +#endif
24802 +
24803 +/**
24804 + * reiser4_writepage - writepage of struct address_space_operations
24805 + * @page: page to write; must be locked by the caller
24806 + * @wbc: writeback control, handed unchanged to write_page_by_ent()
24807 + *
24808 + * Delegates the write to write_page_by_ent() and returns its result.
24809 + */
24810 +/* Common memory pressure notification. */
24811 +int reiser4_writepage(struct page *page,
24812 +                     struct writeback_control *wbc)
24813 +{
24814 +       struct super_block *s;
24815 +       reiser4_context *ctx;
24816 +
24817 +       assert("vs-828", PageLocked(page));
24818 +
24819 +       s = page->mapping->host->i_sb;
24820 +       ctx = get_current_context_check();
24821 +
24822 +       //assert("", can_hit_entd(ctx, s));
24823 +       return write_page_by_ent(page, wbc);
24824 +}
24825 +
24826 +/* ->set_page_dirty() method of formatted address_space */
24827 +static int formatted_set_page_dirty(struct page *page)
24828 +{
24829 +       assert("nikita-2173", page != NULL);
24830 +       BUG();
24831 +       return __set_page_dirty_nobuffers(page);
24832 +}
24833 +
24834 +/* writepages method of address space operations in reiser4 is used to involve
24835 +   into transactions pages which are dirtied via mmap. Only regular files can
24836 +   have such pages. Fake inode is used to access formatted nodes via page
24837 +   cache. As formatted nodes can never be mmaped, fake inode's writepages has
24838 +   nothing to do */
24839 +static int
24840 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
24841 +{
24842 +       return 0;
24843 +}
24844 +
24845 +/* address space operations for the fake inode */
24846 +static struct address_space_operations formatted_fake_as_ops = {
24847 +       /* Perform a writeback of a single page as a memory-freeing
24848 +        * operation. */
24849 +       .writepage = reiser4_writepage,
24850 +       /* this is called to read a formatted node */
24851 +       .readpage = formatted_readpage,
24852 +       /* ->sync_page() method of fake inode address space operations. Called
24853 +          from wait_on_page() and lock_page().
24854 +
24855 +          This is the most annoyingly misnamed method. It is actually called
24856 +          from wait_on_page_bit() and lock_page(), and its purpose is to
24857 +          actually start io by jabbing device drivers.
24858 +        */
24859 +       .sync_page = block_sync_page,
24860 +       /* Write back some dirty pages from this mapping. Called during
24861 +          sync (pdflush). */
24862 +       .writepages = writepages_fake,
24863 +       /* Set a page dirty */
24864 +       .set_page_dirty = formatted_set_page_dirty,
24865 +       /* used for read-ahead. Not applicable */
24866 +       .readpages = NULL,
24867 +       .prepare_write = NULL,
24868 +       .commit_write = NULL,
24869 +       .bmap = NULL,
24870 +       /* called just before page is being detached from inode mapping and
24871 +          removed from memory. Called on truncate, cut/squeeze, and
24872 +          umount. */
24873 +       .invalidatepage = reiser4_invalidatepage,
24874 +       /* this is called by shrink_cache() so that the file system can try to
24875 +          release objects (jnodes, buffers, journal heads) attached to the
24876 +          page and, maybe, make the page itself freeable.
24877 +        */
24878 +       .releasepage = reiser4_releasepage,
24879 +       .direct_IO = NULL
24880 +};
24881 +
24882 +/* called just before page is released (no longer used by reiser4). Callers:
24883 +   jdelete() and extent2tail(). */
24884 +void reiser4_drop_page(struct page *page)
24885 +{
24886 +       assert("nikita-2181", PageLocked(page));
24887 +       clear_page_dirty_for_io(page);
24888 +       ClearPageUptodate(page);
24889 +#if defined(PG_skipped)
24890 +       ClearPageSkipped(page);
24891 +#endif
24892 +       unlock_page(page);
24893 +}
24894 +
24895 +#define JNODE_GANG_SIZE (16)
24896 +
24897 +/* invalidate jnodes of @inode in [from, from + count); returns their number */
24898 +static int
24899 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
24900 +{
24901 +       reiser4_inode *info;
24902 +       int truncated_jnodes;
24903 +       reiser4_tree *tree;
24904 +       unsigned long index;
24905 +       unsigned long end;
24906 +
24907 +       if (inode_file_plugin(inode) ==
24908 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
24909 +               /*
24910 +                * No need to get rid of jnodes here: if the single jnode of
24911 +                * page cluster did not have page, then it was found and killed
24912 +                * before in
24913 +                * truncate_complete_page_cluster()->jput()->jput_final(),
24914 +                * otherwise it will be dropped by reiser4_invalidatepage()
24915 +                */
24916 +               return 0;
24917 +       truncated_jnodes = 0;
24918 +
24919 +       info = reiser4_inode_data(inode);
24920 +       tree = reiser4_tree_by_inode(inode);
24921 +
24922 +       index = from;
24923 +       end = from + count;
24924 +
24925 +       while (1) {
24926 +               jnode *gang[JNODE_GANG_SIZE];
24927 +               int taken;
24928 +               int i;
24929 +               jnode *node;
24930 +
24931 +               assert("nikita-3466", index <= end);
24932 +
24933 +               read_lock_tree(tree);
24934 +               taken =
24935 +                   radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
24936 +                                          (void **)gang, index,
24937 +                                          JNODE_GANG_SIZE);
24938 +               for (i = 0; i < taken; ++i) {
24939 +                       node = gang[i];
24940 +                       if (index_jnode(node) < end)
24941 +                               jref(node);
24942 +                       else
24943 +                               gang[i] = NULL;
24944 +               }
24945 +               read_unlock_tree(tree);
24946 +
24947 +               for (i = 0; i < taken; ++i) {
24948 +                       node = gang[i];
24949 +                       if (node != NULL) {
24950 +                               index = max(index, index_jnode(node));
24951 +                               spin_lock_jnode(node);
24952 +                               assert("edward-1457", node->pg == NULL);
24953 +                               /* this is always called after
24954 +                                  truncate_inode_pages_range(). Therefore, here
24955 +                                  jnode can not have page. New pages can not be
24956 +                                  created because truncate_jnodes_range runs
24957 +                                  with exclusive access to the file held,
24958 +                                  whereas new page creation requires
24959 +                                  non-exclusive access to be held */
24960 +                               JF_SET(node, JNODE_HEARD_BANSHEE);
24961 +                               reiser4_uncapture_jnode(node);
24962 +                               unhash_unformatted_jnode(node);
24963 +                               truncated_jnodes++;
24964 +                               jput(node);
24965 +                       } else
24966 +                               break;
24967 +               }
24968 +               if (i != taken || taken == 0)
24969 +                       break;
24970 +       }
24971 +       return truncated_jnodes;
24972 +}
24973 +
24974 +/* Truncating files in reiser4: problems and solutions.
24975 +
24976 +   VFS calls fs's truncate after it has called truncate_inode_pages()
24977 +   to get rid of pages corresponding to part of file being truncated.
24978 +   In reiser4 it may cause existence of unallocated extents which do
24979 +   not have jnodes. Flush code does not expect that. Solution of this
24980 +   problem is straightforward. As vfs's truncate is implemented using
24981 +   setattr operation, it seems reasonable to have ->setattr() that
24982 +   will cut file body. However, flush code also does not expect dirty
24983 +   pages without parent items, so it is impossible to cut all items,
24984 +   then truncate all pages in two steps. We resolve this problem by
24985 +   cutting items one-by-one. Each such fine-grained step, performed
24986 +   under a long-term znode lock, ends by calling the ->kill_hook() method
24987 +   of the killed item to remove its bound pages and jnodes.
24988 +
24989 +   The following function is the common part of the mentioned kill hooks.
24990 +   It is also called before tail-to-extent conversion (to avoid managing
24991 +   several copies of the data).
24992 +*/
24993 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
24994 +                             unsigned long count, int even_cows)
24995 +{
24996 +       loff_t from_bytes, count_bytes;
24997 +
24998 +       if (count == 0)
24999 +               return;
25000 +       from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25001 +       count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25002 +
25003 +       unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25004 +       truncate_inode_pages_range(mapping, from_bytes,
25005 +                                  from_bytes + count_bytes - 1);
25006 +       truncate_jnodes_range(mapping->host, from, count);
25007 +}
25008 +
25009 +/*
25010 + * Local variables:
25011 + * c-indentation-style: "K&R"
25012 + * mode-name: "LC"
25013 + * c-basic-offset: 8
25014 + * tab-width: 8
25015 + * fill-column: 120
25016 + * scroll-step: 1
25017 + * End:
25018 + */
25019 diff -urN linux-2.6.27.orig/fs/reiser4/page_cache.h linux-2.6.27/fs/reiser4/page_cache.h
25020 --- linux-2.6.27.orig/fs/reiser4/page_cache.h   1970-01-01 03:00:00.000000000 +0300
25021 +++ linux-2.6.27/fs/reiser4/page_cache.h        2008-10-13 11:45:13.000000000 +0400
25022 @@ -0,0 +1,68 @@
25023 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25024 + * reiser4/README */
25025 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25026 +
25027 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25028 +#define __REISER4_PAGE_CACHE_H__
25029 +
25030 +#include "forward.h"
25031 +#include "context.h"            /* for reiser4_ctx_gfp_mask_get() */
25032 +
25033 +#include <linux/fs.h>          /* for struct super_block, address_space  */
25034 +#include <linux/mm.h>          /* for struct page  */
25035 +#include <linux/pagemap.h>     /* for lock_page()  */
25036 +#include <linux/vmalloc.h>     /* for __vmalloc()  */
25037 +
25038 +extern int reiser4_init_formatted_fake(struct super_block *);
25039 +extern void reiser4_done_formatted_fake(struct super_block *);
25040 +
25041 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25042 +
25043 +extern void reiser4_set_page_dirty_internal(struct page *);
25044 +
25045 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25046 +
25047 +extern void reiser4_wait_page_writeback(struct page *);
25048 +static inline void lock_and_wait_page_writeback(struct page *page)
25049 +{
25050 +       lock_page(page);
25051 +       if (unlikely(PageWriteback(page)))
25052 +               reiser4_wait_page_writeback(page);
25053 +}
25054 +
25055 +#define jprivate(page) ((jnode *)page_private(page))
25056 +
25057 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25058 +extern void reiser4_drop_page(struct page *);
25059 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25060 +                                    unsigned long count, int even_cows);
25061 +extern void capture_reiser4_inodes(struct super_block *,
25062 +                                  struct writeback_control *);
25063 +static inline void * reiser4_vmalloc (unsigned long size)
25064 +{
25065 +       return __vmalloc(size,
25066 +                        reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25067 +                        PAGE_KERNEL);
25068 +}
25069 +
25070 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25071 +
25072 +#if REISER4_DEBUG
25073 +extern void print_page(const char *prefix, struct page *page);
25074 +#else
25075 +#define print_page(prf, p) noop
25076 +#endif
25077 +
25078 +/* __REISER4_PAGE_CACHE_H__ */
25079 +#endif
25080 +
25081 +/* Make Linus happy.
25082 +   Local variables:
25083 +   c-indentation-style: "K&R"
25084 +   mode-name: "LC"
25085 +   c-basic-offset: 8
25086 +   tab-width: 8
25087 +   fill-column: 120
25088 +   scroll-step: 1
25089 +   End:
25090 +*/
25091 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/cluster.c linux-2.6.27/fs/reiser4/plugin/cluster.c
25092 --- linux-2.6.27.orig/fs/reiser4/plugin/cluster.c       1970-01-01 03:00:00.000000000 +0300
25093 +++ linux-2.6.27/fs/reiser4/plugin/cluster.c    2008-10-12 18:20:00.000000000 +0400
25094 @@ -0,0 +1,71 @@
25095 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25096 +
25097 +/* Contains reiser4 cluster plugins (see
25098 +   http://www.namesys.com/cryptcompress_design.html
25099 +   "Concepts of clustering" for details). */
25100 +
25101 +#include "plugin_header.h"
25102 +#include "plugin.h"
25103 +#include "../inode.h"
25104 +
25105 +/* ->change() of cluster plugins: make @plugin the cluster plugin of @inode */
25106 +static int change_cluster(struct inode *inode, reiser4_plugin * plugin,
25107 +                         pset_member memb)
25108 +{
25109 +       assert("edward-1324", inode != NULL);
25110 +       assert("edward-1325", plugin != NULL);
25111 +       assert("edward-1326", is_reiser4_inode(inode));
25112 +       assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25113 +
25114 +       /* Can't change the cluster plugin for already existent regular files. */
25115 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25116 +               return RETERR(-EINVAL);
25117 +
25118 +       /* If the current cluster plugin already matches, nothing to change. */
25119 +       if (inode_cluster_plugin(inode) != NULL &&
25120 +           inode_cluster_plugin(inode)->h.id == plugin->h.id)
25121 +               return 0;
25122 +
25123 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25124 +                              PSET_CLUSTER, plugin);
25125 +}
25126 +
25127 +static reiser4_plugin_ops cluster_plugin_ops = {
25128 +       .init = NULL,
25129 +       .load = NULL,
25130 +       .save_len = NULL,
25131 +       .save = NULL,
25132 +       .change = &change_cluster
25133 +};
25134 +
25135 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC)                        \
25136 +       [CLUSTER_ ## ID ## _ID] = {                             \
25137 +               .h = {                                          \
25138 +                       .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25139 +                       .id = CLUSTER_ ## ID ## _ID,            \
25140 +                       .pops = &cluster_plugin_ops,            \
25141 +                       .label = LABEL,                         \
25142 +                       .desc = DESC,                           \
25143 +                       .linkage = {NULL, NULL}                 \
25144 +               },                                              \
25145 +               .shift = SHIFT                                  \
25146 +       }
25147 +
25148 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25149 +       SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25150 +       SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25151 +       SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25152 +       SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25153 +       SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25154 +};
25155 +
25156 +/*
25157 +  Local variables:
25158 +  c-indentation-style: "K&R"
25159 +  mode-name: "LC"
25160 +  c-basic-offset: 8
25161 +  tab-width: 8
25162 +  fill-column: 120
25163 +  scroll-step: 1
25164 +  End:
25165 +*/
25166 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/cluster.h linux-2.6.27/fs/reiser4/plugin/cluster.h
25167 --- linux-2.6.27.orig/fs/reiser4/plugin/cluster.h       1970-01-01 03:00:00.000000000 +0300
25168 +++ linux-2.6.27/fs/reiser4/plugin/cluster.h    2008-10-12 18:20:00.000000000 +0400
25169 @@ -0,0 +1,409 @@
25170 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25171 +
25172 +/* This file contains size/offset translators, modulators
25173 +   and other helper functions. */
25174 +
25175 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25176 +#define __FS_REISER4_CLUSTER_H__
25177 +
25178 +#include "../inode.h"
25179 +
25180 +static inline int inode_cluster_shift(struct inode *inode)
25181 +{
25182 +       assert("edward-92", inode != NULL);
25183 +       assert("edward-93", reiser4_inode_data(inode) != NULL);
25184 +
25185 +       return inode_cluster_plugin(inode)->shift;
25186 +}
25187 +
25188 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25189 +{
25190 +       return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25191 +}
25192 +
25193 +/* cluster size in page units */
25194 +static inline unsigned cluster_nrpages(struct inode *inode)
25195 +{
25196 +       return 1U << cluster_nrpages_shift(inode);
25197 +}
25198 +
25199 +static inline size_t inode_cluster_size(struct inode *inode)
25200 +{
25201 +       assert("edward-96", inode != NULL);
25202 +
25203 +       return 1U << inode_cluster_shift(inode);
25204 +}
25205 +
25206 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25207 +{
25208 +       return idx >> cluster_nrpages_shift(inode);
25209 +}
25210 +
25211 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25212 +{
25213 +       return idx << cluster_nrpages_shift(inode);
25214 +}
25215 +
25216 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25217 +{
25218 +       return clust_to_pg(pg_to_clust(idx, inode), inode);
25219 +}
25220 +
25221 +static inline pgoff_t off_to_pg(loff_t off)
25222 +{
25223 +       return (off >> PAGE_CACHE_SHIFT);
25224 +}
25225 +
25226 +static inline loff_t pg_to_off(pgoff_t idx)
25227 +{
25228 +       return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25229 +}
25230 +
25231 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25232 +{
25233 +       return off >> inode_cluster_shift(inode);
25234 +}
25235 +
25236 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25237 +{
25238 +       return (loff_t) idx << inode_cluster_shift(inode);
25239 +}
25240 +
25241 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25242 +{
25243 +       return clust_to_off(off_to_clust(off, inode), inode);
25244 +}
25245 +
25246 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25247 +{
25248 +       return clust_to_pg(off_to_clust(off, inode), inode);
25249 +}
25250 +
25251 +static inline unsigned off_to_pgoff(loff_t off)
25252 +{
25253 +       return off & (PAGE_CACHE_SIZE - 1);
25254 +}
25255 +
25256 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25257 +{
25258 +       return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25259 +}
25260 +
25261 +static inline  pgoff_t offset_in_clust(struct page * page)
25262 +{
25263 +       assert("edward-1488", page != NULL);
25264 +       assert("edward-1489", page->mapping != NULL);
25265 +
25266 +       return page_index(page) & ((cluster_nrpages(page->mapping->host)) - 1);
25267 +}
25268 +
25269 +static inline int first_page_in_cluster(struct page * page)
25270 +{
25271 +       return offset_in_clust(page) == 0;
25272 +}
25273 +
25274 +static inline int last_page_in_cluster(struct page * page)
25275 +{
25276 +       return offset_in_clust(page) ==
25277 +               cluster_nrpages(page->mapping->host) - 1;
25278 +}
25279 +
25280 +static inline unsigned
25281 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25282 +{
25283 +       return off_to_cloff(pg_to_off(idx), inode);
25284 +}
25285 +
25286 +/*********************** Size translators **************************/
25287 +
25288 +/* Translate linear size.
25289 + * New units are (1 << @blkbits) times larger than the old ones.
25290 + * In other words, calculate the number of logical blocks occupied
25291 + * by @count elements (rounded up).
25292 + */
25293 +static inline unsigned long size_in_blocks(loff_t count, unsigned blkbits)
25294 +{
25295 +       return (count + (1UL << blkbits) - 1) >> blkbits;
25296 +}
25297 +
25298 +/* size in pages */
25299 +static inline pgoff_t size_in_pages(loff_t size)
25300 +{
25301 +       return size_in_blocks(size, PAGE_CACHE_SHIFT);
25302 +}
25303 +
25304 +/* size in logical clusters */
25305 +static inline cloff_t size_in_lc(loff_t size, struct inode *inode)
25306 +{
25307 +       return size_in_blocks(size, inode_cluster_shift(inode));
25308 +}
25309 +
25310 +/* size in pages to the size in page clusters */
25311 +static inline cloff_t sp_to_spcl(pgoff_t size, struct inode *inode)
25312 +{
25313 +       return size_in_blocks(size, cluster_nrpages_shift(inode));
25314 +}
25315 +
25316 +/*********************** Size modulators ***************************/
25317 +
25318 +/*
25319 +  Modulate linear size by nominated block size and offset.
25320 +
25321 +  Returns how many of the first @size bytes fall into the block of
25322 +  index @pos when blocks are (1 << @blkbits) bytes long: a full block
25323 +  below the last block, the remainder in it, and zero beyond it.
25324 +  (The height at @pos of the figure of area @size drawn below.)
25325 +
25326 +  ******
25327 +  *******
25328 +  *******
25329 +  *******
25330 +  ----------> pos
25331 +*/
25332 +static inline unsigned __mbb(loff_t size, unsigned long pos, int blkbits)
25333 +{
25334 +       unsigned long end = size >> blkbits;    /* as wide as @pos */
25335 +       if (pos < end)
25336 +               return 1U << blkbits;
25337 +       if (unlikely(pos > end))
25338 +               return 0;
25339 +       return size & ~(~0ull << blkbits);
25340 +}
25341 +
25342 +/* the same as above, but block size is page size */
25343 +static inline unsigned __mbp(loff_t size, pgoff_t pos)
25344 +{
25345 +       return __mbb(size, pos, PAGE_CACHE_SHIFT);
25346 +}
25347 +
25348 +/* number of file's bytes in the nominated logical cluster */
25349 +static inline unsigned lbytes(cloff_t index, struct inode * inode)
25350 +{
25351 +       return __mbb(i_size_read(inode), index, inode_cluster_shift(inode));
25352 +}
25353 +
25354 +/* number of file's bytes in the nominated page */
25355 +static inline unsigned pbytes(pgoff_t index, struct inode * inode)
25356 +{
25357 +       return __mbp(i_size_read(inode), index);
25358 +}
25359 +
25360 +/**
25361 + * number of pages occupied by @win->count bytes starting from
25362 + * @win->off in the logical cluster defined by @win. This is exactly
25363 + * the number of pages to be modified and dirtied in any cluster operation.
25364 + */
25365 +static inline pgoff_t win_count_to_nrpages(struct reiser4_slide * win)
25366 +{
25367 +       return ((win->off + win->count +
25368 +                (1UL << PAGE_CACHE_SHIFT) - 1) >> PAGE_CACHE_SHIFT) -
25369 +               off_to_pg(win->off);
25370 +}
25371 +
25372 +/* return true, if logical cluster is not occupied by the file */
25373 +static inline int new_logical_cluster(struct cluster_handle * clust,
25374 +                                     struct inode *inode)
25375 +{
25376 +       return clust_to_off(clust->index, inode) >= i_size_read(inode);
25377 +}
25378 +
25379 +/* return true, if pages @p1 and @p2 are of the same page cluster */
25380 +static inline int same_page_cluster(struct page * p1, struct page * p2)
25381 +{
25382 +       assert("edward-1490", p1 != NULL);
25383 +       assert("edward-1491", p2 != NULL);
25384 +       assert("edward-1492", p1->mapping != NULL);
25385 +       assert("edward-1493", p2->mapping != NULL);
25386 +
25387 +       return (pg_to_clust(page_index(p1), p1->mapping->host) ==
25388 +               pg_to_clust(page_index(p2), p2->mapping->host));
25389 +}
25390 +
25391 +static inline int cluster_is_complete(struct cluster_handle * clust,
25392 +                                     struct inode * inode)
25393 +{
25394 +       return clust->tc.lsize == inode_cluster_size(inode);
25395 +}
25396 +
25397 +static inline void reiser4_slide_init(struct reiser4_slide * win)
25398 +{
25399 +       assert("edward-1084", win != NULL);
25400 +       memset(win, 0, sizeof *win);
25401 +}
25402 +
25403 +static inline tfm_action
25404 +cluster_get_tfm_act(struct tfm_cluster * tc)
25405 +{
25406 +       assert("edward-1356", tc != NULL);
25407 +       return tc->act;
25408 +}
25409 +
25410 +static inline void
25411 +cluster_set_tfm_act(struct tfm_cluster * tc, tfm_action act)
25412 +{
25413 +       assert("edward-1356", tc != NULL);
25414 +       tc->act = act;
25415 +}
25416 +
25417 +static inline void cluster_init_act(struct cluster_handle * clust,
25418 +                                   tfm_action act,
25419 +                                   struct reiser4_slide * window)
25420 +{
25421 +       assert("edward-84", clust != NULL);
25422 +       memset(clust, 0, sizeof *clust);
25423 +       cluster_set_tfm_act(&clust->tc, act);
25424 +       clust->dstat = INVAL_DISK_CLUSTER;
25425 +       clust->win = window;
25426 +}
25427 +
25428 +static inline void cluster_init_read(struct cluster_handle * clust,
25429 +                                    struct reiser4_slide * window)
25430 +{
25431 +       cluster_init_act (clust, TFMA_READ, window);
25432 +}
25433 +
25434 +static inline void cluster_init_write(struct cluster_handle * clust,
25435 +                                     struct reiser4_slide * window)
25436 +{
25437 +       cluster_init_act (clust, TFMA_WRITE, window);
25438 +}
25439 +
25440 +/* true if @p1 and @p2 are items of the same disk cluster */
25441 +static inline int same_disk_cluster(const coord_t * p1, const coord_t * p2)
25442 +{
25443 +       /* drop this if you have other items to aggregate */
25444 +       assert("edward-1494", item_id_by_coord(p1) == CTAIL_ID);
25445 +
25446 +       return item_plugin_by_coord(p1)->b.mergeable(p1, p2);
25447 +}
25448 +
25449 +static inline int dclust_get_extension_dsize(hint_t * hint)
25450 +{
25451 +       return hint->ext_coord.extension.ctail.dsize;
25452 +}
25453 +
25454 +static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25455 +{
25456 +       hint->ext_coord.extension.ctail.dsize = dsize;
25457 +}
25458 +
25459 +static inline int dclust_get_extension_shift(hint_t * hint)
25460 +{
25461 +       return hint->ext_coord.extension.ctail.shift;
25462 +}
25463 +
25464 +static inline int dclust_get_extension_ncount(hint_t * hint)
25465 +{
25466 +       return hint->ext_coord.extension.ctail.ncount;
25467 +}
25468 +
25469 +static inline void dclust_inc_extension_ncount(hint_t * hint)
25470 +{
25471 +       hint->ext_coord.extension.ctail.ncount ++;
25472 +}
25473 +
25474 +static inline void dclust_init_extension(hint_t * hint)
25475 +{
25476 +       memset(&hint->ext_coord.extension.ctail, 0,
25477 +              sizeof(hint->ext_coord.extension.ctail));
25478 +}
25479 +
25480 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25481 +{
25482 +       assert("edward-1451", hint_is_valid(hint));
25483 +       return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25484 +}
25485 +
25486 +static inline void coord_set_between_clusters(coord_t * coord)
25487 +{
25488 +#if REISER4_DEBUG
25489 +       int result;
25490 +       result = zload(coord->node);
25491 +       assert("edward-1296", !result);
25492 +#endif
25493 +       if (!coord_is_between_items(coord)) {
25494 +               coord->between = AFTER_ITEM;
25495 +               coord->unit_pos = 0;
25496 +       }
25497 +#if REISER4_DEBUG
25498 +       zrelse(coord->node);
25499 +#endif
25500 +}
25501 +
25502 +int reiser4_inflate_cluster(struct cluster_handle *, struct inode *);
25503 +int find_disk_cluster(struct cluster_handle *, struct inode *, int read,
25504 +                     znode_lock_mode mode);
25505 +int checkout_logical_cluster(struct cluster_handle *, jnode *, struct inode *);
25506 +int reiser4_deflate_cluster(struct cluster_handle *, struct inode *);
25507 +void truncate_complete_page_cluster(struct inode *inode, cloff_t start,
25508 +                                        int even_cows);
25509 +void invalidate_hint_cluster(struct cluster_handle * clust);
25510 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode * inode,
25511 +                           znode_lock_mode lock_mode);
25512 +void reset_cluster_params(struct cluster_handle * clust);
25513 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
25514 +                       int count);
25515 +int prepare_page_cluster(struct inode *inode, struct cluster_handle * clust,
25516 +                        rw_op rw);
25517 +void __put_page_cluster(int from, int count,
25518 +                       struct page ** pages, struct inode  * inode);
25519 +void put_page_cluster(struct cluster_handle * clust,
25520 +                     struct inode  * inode, rw_op rw);
25521 +void put_cluster_handle(struct cluster_handle * clust);
25522 +int grab_tfm_stream(struct inode *inode, struct tfm_cluster * tc, tfm_stream_id id);
25523 +int tfm_cluster_is_uptodate(struct tfm_cluster * tc);
25524 +void tfm_cluster_set_uptodate(struct tfm_cluster * tc);
25525 +void tfm_cluster_clr_uptodate(struct tfm_cluster * tc);
25526 +
25527 +/* move cluster handle to the target position
25528 +   specified by the page of index @pgidx */
25529 +static inline void move_cluster_forward(struct cluster_handle * clust,
25530 +                                       struct inode *inode,
25531 +                                       pgoff_t pgidx)
25532 +{
25533 +       assert("edward-1297", clust != NULL);
25534 +       assert("edward-1298", inode != NULL);
25535 +
25536 +       reset_cluster_params(clust);
25537 +       if (clust->index_valid &&
25538 +           /* Hole in the indices. Hint became invalid and can not be
25539 +              used by find_cluster_item() even if seal/node versions
25540 +              will coincide */
25541 +           pg_to_clust(pgidx, inode) != clust->index + 1) {
25542 +               reiser4_unset_hint(clust->hint);
25543 +               invalidate_hint_cluster(clust);
25544 +       }
25545 +       clust->index = pg_to_clust(pgidx, inode);
25546 +       clust->index_valid = 1;
25547 +}
25548 +
25549 +/* allocate clust->pages: one page pointer per page of @inode's page cluster
25550 +   (not one per byte of the logical cluster). Returns 0 or -ENOMEM. */
25551 +static inline int alloc_clust_pages(struct cluster_handle * clust,
25552 +                                   struct inode *inode)
25553 +{
25554 +       assert("edward-791", clust != NULL);
25555 +       assert("edward-792", inode != NULL);
25556 +       clust->pages =
25557 +               kmalloc(sizeof(*clust->pages) << cluster_nrpages_shift(inode),
25558 +                       reiser4_ctx_gfp_mask_get());
25559 +       return clust->pages ? 0 : -ENOMEM;
25560 +}
25561 +
25562 +static inline void free_clust_pages(struct cluster_handle * clust)
25563 +{
25564 +       kfree(clust->pages);
25565 +}
25566 +
25567 +#endif                         /* __FS_REISER4_CLUSTER_H__ */
25568 +
25569 +/* Make Linus happy.
25570 +   Local variables:
25571 +   c-indentation-style: "K&R"
25572 +   mode-name: "LC"
25573 +   c-basic-offset: 8
25574 +   tab-width: 8
25575 +   fill-column: 120
25576 +   scroll-step: 1
25577 +   End:
25578 +*/
25579 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/compress/compress.c linux-2.6.27/fs/reiser4/plugin/compress/compress.c
25580 --- linux-2.6.27.orig/fs/reiser4/plugin/compress/compress.c     1970-01-01 03:00:00.000000000 +0300
25581 +++ linux-2.6.27/fs/reiser4/plugin/compress/compress.c  2008-10-12 18:20:00.000000000 +0400
25582 @@ -0,0 +1,355 @@
25583 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25584 +/* reiser4 compression transform plugins */
25585 +
25586 +#include "../../debug.h"
25587 +#include "../../inode.h"
25588 +#include "../plugin.h"
25589 +
25590 +#include <linux/lzo.h>
25591 +#include <linux/zlib.h>
25592 +#include <linux/types.h>
25593 +#include <linux/hardirq.h>
25594 +
25595 +/* ->change(): set @plugin as the compression plugin of a directory @inode */
25596 +static int change_compression(struct inode *inode, reiser4_plugin * plugin,
25597 +                             pset_member memb)
25598 +{
25599 +       assert("edward-1316", inode != NULL);
25600 +       assert("edward-1317", plugin != NULL);
25601 +       assert("edward-1318", is_reiser4_inode(inode));
25602 +       assert("edward-1319",
25603 +              plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25604 +
25605 +       /* cannot change compression plugin of already existing regular object */
25606 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25607 +               return RETERR(-EINVAL);
25608 +
25609 +       /* Same compression plugin already installed: nothing to change. */
25610 +       if (inode_compression_plugin(inode) != NULL &&
25611 +           inode_compression_plugin(inode)->h.id == plugin->h.id)
25612 +               return 0;
25613 +
25614 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25615 +                              PSET_COMPRESSION, plugin);
25616 +}
25617 +
25618 +static reiser4_plugin_ops compression_plugin_ops = {
25619 +       .init = NULL,
25620 +       .load = NULL,
25621 +       .save_len = NULL,
25622 +       .save = NULL,
25623 +       .change = &change_compression   /* the only operation provided */
25624 +};
25625 +
25626 +/******************************************************************************/
25627 +/*                         gzip1 compression                                  */
25628 +/******************************************************************************/
25629 +
25630 +#define GZIP1_DEF_LEVEL                        Z_BEST_SPEED
25631 +#define GZIP1_DEF_WINBITS              15
25632 +#define GZIP1_DEF_MEMLEVEL             MAX_MEM_LEVEL
25633 +
25634 +static int gzip1_init(void)
25635 +{
25636 +       return 0;       /* no setup is performed for gzip1 */
25637 +}
25638 +
25639 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
25640 +{
25641 +       return 0;       /* constant: @src_len does not affect the result */
25642 +}
25643 +
25644 +static coa_t gzip1_alloc(tfm_action act)
25645 +{
25646 +       /*
25647 +        * Returns the zlib workspace for @act, allocated with
25648 +        * reiser4_vmalloc(), or ERR_PTR(-ENOMEM) when that allocation fails.
25649 +        */
25650 +       coa_t workspace = NULL;
25651 +
25652 +       /* the workspace size depends on the transform direction */
25653 +       switch (act) {
25654 +       case TFMA_WRITE:        /* compress */
25655 +               workspace = reiser4_vmalloc(zlib_deflate_workspacesize());
25656 +               break;
25657 +       case TFMA_READ:         /* decompress */
25658 +               workspace = reiser4_vmalloc(zlib_inflate_workspacesize());
25659 +               break;
25660 +       default:
25661 +               impossible("edward-767",
25662 +                          "trying to alloc workspace for unknown tfm action");
25663 +               /* no allocation was attempted: hand back NULL */
25664 +               return workspace;
25665 +       }
25666 +       if (workspace != NULL)
25667 +               return workspace;
25668 +
25669 +       /* allocation failed: complain and encode -ENOMEM in the pointer */
25670 +       warning("edward-768",
25671 +               "alloc workspace for gzip1 (tfm action = %d) failed\n",
25672 +               act);
25673 +       return ERR_PTR(-ENOMEM);
25674 +}
25675 +
25676 +static void gzip1_free(coa_t coa, tfm_action act)
25677 +{
25678 +       /* counterpart of gzip1_alloc(): release the workspace @coa */
25679 +       assert("edward-769", coa != NULL);
25680 +
25681 +       switch (act) {
25682 +       case TFMA_WRITE:        /* compress */
25683 +       case TFMA_READ:         /* decompress */
25684 +               /* either direction: the workspace is released by vfree() */
25685 +               vfree(coa);
25686 +               break;
25687 +       default:
25688 +               impossible("edward-770", "unknown tfm action");
25689 +               break;
25690 +       }
25691 +}
25692 +
25693 +static int gzip1_min_size_deflate(void)
25694 +{
25695 +       return 64;      /* NOTE(review): presumably a byte threshold - confirm */
25696 +}
25697 +
25698 +static void
25699 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25700 +              __u8 * dst_first, unsigned *dst_len)
25701 +{
25702 +       int ret = 0;
25703 +       struct z_stream_s stream;
25704 +       /* in: *dst_len is the room at @dst_first; out: deflated size */
25705 +       assert("edward-842", coa != NULL);
25706 +       assert("edward-875", src_len != 0);
25707 +       /* @coa is the zlib workspace; windowBits is passed negated */
25708 +       stream.workspace = coa;
25709 +       ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
25710 +                               -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
25711 +                               Z_DEFAULT_STRATEGY);
25712 +       if (ret != Z_OK) {
25713 +               warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
25714 +               goto rollback;
25715 +       }
25716 +       ret = zlib_deflateReset(&stream);
25717 +       if (ret != Z_OK) {
25718 +               warning("edward-772", "zlib_deflateReset returned %d\n", ret);
25719 +               goto rollback;
25720 +       }
25721 +       stream.next_in = src_first;
25722 +       stream.avail_in = src_len;
25723 +       stream.next_out = dst_first;
25724 +       stream.avail_out = *dst_len;
25725 +       /* whole input in one call: anything but Z_STREAM_END is a rollback */
25726 +       ret = zlib_deflate(&stream, Z_FINISH);
25727 +       if (ret != Z_STREAM_END) {
25728 +               if (ret != Z_OK)
25729 +                       warning("edward-773",
25730 +                               "zlib_deflate returned %d\n", ret);
25731 +               goto rollback;
25732 +       }
25733 +       *dst_len = stream.total_out;
25734 +       return;
25735 +      rollback:
25736 +       *dst_len = src_len;     /* any failure: report the input length */
25737 +       return;
25738 +}
25739 +
25740 +static void
25741 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25742 +                __u8 * dst_first, unsigned *dst_len)
25743 +{
25744 +       int ret = 0;
25745 +       struct z_stream_s stream;
25746 +       /* in: *dst_len is the room at @dst_first; out: inflated size */
25747 +       assert("edward-843", coa != NULL);
25748 +       assert("edward-876", src_len != 0);
25749 +       /* @coa is the zlib workspace; windowBits is passed negated */
25750 +       stream.workspace = coa;
25751 +       ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
25752 +       if (ret != Z_OK) {
25753 +               warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
25754 +               return;
25755 +       }
25756 +       ret = zlib_inflateReset(&stream);
25757 +       if (ret != Z_OK) {
25758 +               warning("edward-775", "zlib_inflateReset returned %d\n", ret);
25759 +               return;
25760 +       }
25761 +       /* NOTE: every failure path returns with *dst_len left unchanged */
25762 +       stream.next_in = src_first;
25763 +       stream.avail_in = src_len;
25764 +       stream.next_out = dst_first;
25765 +       stream.avail_out = *dst_len;
25766 +
25767 +       ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
25768 +       /*
25769 +        * Work around a bug in zlib, which sometimes wants to taste an extra
25770 +        * byte when being used in the (undocumented) raw deflate mode.
25771 +        * (From USAGI).
25772 +        */
25773 +       if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
25774 +               u8 zerostuff = 0;
25775 +               stream.next_in = &zerostuff;
25776 +               stream.avail_in = 1;
25777 +               ret = zlib_inflate(&stream, Z_FINISH);
25778 +       }
25779 +       if (ret != Z_STREAM_END) {
25780 +               warning("edward-776", "zlib_inflate returned %d\n", ret);
25781 +               return;
25782 +       }
25783 +       *dst_len = stream.total_out;
25784 +       return;
25785 +}
25786 +
25787 +/******************************************************************************/
25788 +/*                            lzo1 compression                                */
25789 +/******************************************************************************/
25790 +
25791 +static int lzo1_init(void)
25792 +{
25793 +       return 0;       /* no setup is performed for lzo1 */
25794 +}
25795 +
25796 +static int lzo1_overrun(unsigned in_len)
25797 +{
25798 +       return in_len / 64 + 16 + 3; /* NOTE(review): cf. lzo1x_worst_compress() */
25799 +}
25800 +
25801 +static coa_t lzo1_alloc(tfm_action act)
25802 +{
25803 +       /*
25804 +        * Only compression gets a workspace (LZO1X_1_MEM_COMPRESS bytes).
25805 +        * Returns it, NULL for decompression, ERR_PTR(-ENOMEM) on failure.
25806 +        */
25807 +       coa_t coa = NULL;
25808 +
25809 +       switch (act) {
25810 +       case TFMA_WRITE:        /* compress */
25811 +               coa = reiser4_vmalloc(LZO1X_1_MEM_COMPRESS);
25812 +               if (!coa) {
25813 +                       warning("edward-878",
25814 +                               "alloc workspace for lzo1 (tfm action = %d) failed\n",
25815 +                               act);
25816 +                       return ERR_PTR(-ENOMEM);
25817 +               }
25818 +               break;          /* explicit: do not fall into TFMA_READ */
25819 +       case TFMA_READ:         /* decompress: nothing to allocate */
25820 +               break;
25821 +       default:
25822 +               impossible("edward-877",
25823 +                          "trying to alloc workspace for unknown tfm action");
25824 +       }
25825 +       return coa;
25826 +}
25827 +
25828 +static void lzo1_free(coa_t coa, tfm_action act)
25829 +{
25830 +       /* release the workspace @coa; lzo1_alloc() makes one for TFMA_WRITE only */
25831 +       assert("edward-879", coa != NULL);
25832 +       switch (act) {
25833 +       case TFMA_WRITE:        /* compress */
25834 +               vfree(coa);
25835 +               break;
25836 +       case TFMA_READ:         /* decompress: there is no workspace to free */
25837 +               impossible("edward-1304",
25838 +                          "trying to free non-allocated workspace");
25839 +               break;          /* do not run into the "unknown action" report */
25840 +       default:
25841 +               impossible("edward-880", "unknown tfm action");
25842 +       }
25843 +}
25844 +
25845 +static int lzo1_min_size_deflate(void)
25846 +{
25847 +       return 256;     /* NOTE(review): presumably a byte threshold - confirm */
25848 +}
25849 +
25850 +static void
25851 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
25852 +             __u8 * dst_first, unsigned *dst_len)
25853 +{
25854 +       int result;
25855 +       size_t out_len = *dst_len; /* lzo takes size_t *, not unsigned * */
25856 +
25857 +       assert("edward-846", coa != NULL);
25858 +       assert("edward-847", src_len != 0);
25859 +
25860 +       result = lzo1x_1_compress(src_first, src_len, dst_first, &out_len, coa);
25861 +       if (unlikely(result != LZO_E_OK)) {
25862 +               warning("edward-849", "lzo1x_1_compress failed\n");
25863 +               goto out;
25864 +       }
25865 +       if (out_len >= src_len)
25866 +               /* no gain (incompressible data): report it like a failure */
25867 +               goto out;
25868 +       *dst_len = out_len;
25869 +       return;
25870 +      out:
25871 +       *dst_len = src_len;
25872 +}
25873 +
25874 +static void
25875 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
25876 +               __u8 * dst_first, unsigned *dst_len)
25877 +{
25878 +       int result;
25879 +       size_t out_len = *dst_len; /* lzo wants size_t *; in: room, out: size */
25880 +
25881 +       assert("edward-851", coa == NULL);
25882 +       assert("edward-852", src_len != 0);
25883 +       result = lzo1x_decompress_safe(src_first, src_len, dst_first, &out_len);
25884 +       if (result != LZO_E_OK)
25885 +               warning("edward-853", "lzo1x_1_decompress failed\n");
25886 +       *dst_len = out_len;
25887 +}
25888 +
25889 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
25890 +       [LZO1_COMPRESSION_ID] = {
25891 +               .h = {
25892 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25893 +                       .id = LZO1_COMPRESSION_ID,
25894 +                       .pops = &compression_plugin_ops,
25895 +                       .label = "lzo1",
25896 +                       .desc = "lzo1 compression transform",
25897 +                       .linkage = {NULL, NULL}
25898 +               },
25899 +               .init = lzo1_init,
25900 +               .overrun = lzo1_overrun,
25901 +               .alloc = lzo1_alloc,
25902 +               .free = lzo1_free,
25903 +               .min_size_deflate = lzo1_min_size_deflate,
25904 +               .checksum = reiser4_adler32,    /* same checksum as gzip1 */
25905 +               .compress = lzo1_compress,
25906 +               .decompress = lzo1_decompress
25907 +       },
25908 +       [GZIP1_COMPRESSION_ID] = {
25909 +               .h = {
25910 +                       .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
25911 +                       .id = GZIP1_COMPRESSION_ID,
25912 +                       .pops = &compression_plugin_ops,
25913 +                       .label = "gzip1",
25914 +                       .desc = "gzip1 compression transform",
25915 +                       .linkage = {NULL, NULL}
25916 +               },
25917 +               .init = gzip1_init,
25918 +               .overrun = gzip1_overrun,
25919 +               .alloc = gzip1_alloc,
25920 +               .free = gzip1_free,
25921 +               .min_size_deflate = gzip1_min_size_deflate,
25922 +               .checksum = reiser4_adler32,    /* same checksum as lzo1 */
25923 +               .compress = gzip1_compress,
25924 +               .decompress = gzip1_decompress
25925 +       }
25926 +};
25927 +
25928 +/*
25929 +  Local variables:
25930 +  c-indentation-style: "K&R"
25931 +  mode-name: "LC"
25932 +  c-basic-offset: 8
25933 +  tab-width: 8
25934 +  fill-column: 120
25935 +  scroll-step: 1
25936 +  End:
25937 +*/
25938 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/compress/compress.h linux-2.6.27/fs/reiser4/plugin/compress/compress.h
25939 --- linux-2.6.27.orig/fs/reiser4/plugin/compress/compress.h     1970-01-01 03:00:00.000000000 +0300
25940 +++ linux-2.6.27/fs/reiser4/plugin/compress/compress.h  2008-10-12 18:20:00.000000000 +0400
25941 @@ -0,0 +1,43 @@
25942 +#if !defined( __FS_REISER4_COMPRESS_H__ )
25943 +#define __FS_REISER4_COMPRESS_H__
25944 +
25945 +#include <linux/types.h>
25946 +#include <linux/string.h>
25947 +
25948 +/* transform direction */
25949 +typedef enum {
25950 +       TFMA_READ,   /* decrypt, decompress */
25951 +       TFMA_WRITE,  /* encrypt, compress */
25952 +       TFMA_LAST    /* number of directions; second dimension of coa_set */
25953 +} tfm_action;
25954 +
25955 +/* supported compression algorithms */
25956 +typedef enum {
25957 +       LZO1_COMPRESSION_ID,
25958 +       GZIP1_COMPRESSION_ID,
25959 +       LAST_COMPRESSION_ID,    /* number of algorithms; first dimension of coa_set */
25960 +} reiser4_compression_id;
25961 +
25962 +/* the same as pgoff, but units are page clusters */
25963 +typedef unsigned long cloff_t;
25964 +
25965 +/* working data of a (de)compression algorithm */
25966 +typedef void *coa_t;
25967 +
25968 +/* table for all supported (de)compression algorithms */
25969 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
25970 +
25971 +__u32 reiser4_adler32(char *data, __u32 len);
25972 +
25973 +#endif                         /* __FS_REISER4_COMPRESS_H__ */
25974 +
25975 +/* Make Linus happy.
25976 +   Local variables:
25977 +   c-indentation-style: "K&R"
25978 +   mode-name: "LC"
25979 +   c-basic-offset: 8
25980 +   tab-width: 8
25981 +   fill-column: 120
25982 +   scroll-step: 1
25983 +   End:
25984 +*/
25985 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/compress/compress_mode.c linux-2.6.27/fs/reiser4/plugin/compress/compress_mode.c
25986 --- linux-2.6.27.orig/fs/reiser4/plugin/compress/compress_mode.c        1970-01-01 03:00:00.000000000 +0300
25987 +++ linux-2.6.27/fs/reiser4/plugin/compress/compress_mode.c     2008-10-12 18:20:00.000000000 +0400
25988 @@ -0,0 +1,162 @@
25989 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25990 +/* This file contains Reiser4 compression mode plugins.
25991 +
25992 +   Compression mode plugin is a set of handlers called by compressor
25993 +   at flush time and represent some heuristics including the ones
25994 +   which are to avoid compression of incompressible data, see
25995 +   http://www.namesys.com/cryptcompress_design.html for more details.
25996 +*/
25997 +#include "../../inode.h"
25998 +#include "../plugin.h"
25999 +
26000 +static int should_deflate_none(struct inode * inode, cloff_t index)
26001 +{
26002 +       return 0;       /* never deflate, whatever @inode and @index are */
26003 +}
26004 +
26005 +static int should_deflate_common(struct inode * inode, cloff_t index)
26006 +{
26007 +       return compression_is_on(cryptcompress_inode_data(inode)); /* @index unused */
26008 +}
26009 +
26010 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26011 +{
26012 +       turn_off_compression(cryptcompress_inode_data(inode));
26013 +       return 0;       /* always 0; @index is not used */
26014 +}
26015 +
26016 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26017 +{
26018 +       struct cryptcompress_info * info = cryptcompress_inode_data(inode);
26019 +       /* on entry the lattice factor must lie within [MIN, MAX] */
26020 +       assert("edward-1462",
26021 +              get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26022 +              get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26023 +       /* switch compression off; double the factor unless it is at the cap */
26024 +       turn_off_compression(info);
26025 +       if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26026 +               set_lattice_factor(info, get_lattice_factor(info) << 1);
26027 +       return 0;
26028 +}
26029 +
26030 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26031 +{
26032 +       turn_on_compression(cryptcompress_inode_data(inode));
26033 +       set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26034 +       return 0;       /* always 0; factor is back at its minimum */
26035 +}
26036 +
26037 +/* Check on dynamic lattice: the adaptive compression mode which
26038 +   defines the following behavior:
26039 +
26040 +   Compression is on: try to compress everything and turn
26041 +   it off, whenever cluster is incompressible.
26042 +
26043 +   Compression is off: try to compress clusters of indexes
26044 +   k * FACTOR (k = 0, 1, 2, ...) and turn it on, if some of
26045 +   them is compressible. If incompressible, then increase FACTOR */
26046 +
26047 +/* check if @index belongs to the one-dimensional lattice
26048 +   of sparse factor @factor; with a zero factor only index 0 is on it */
26049 +static int is_on_lattice(cloff_t index, int factor)
26050 +{
26051 +       return (factor ? index % factor == 0: index == 0);
26052 +}
26053 +
26054 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26055 +{
26056 +       if (should_deflate_common(inode, index))
26057 +               return 1;       /* compression is currently on */
26058 +       /* compression is off: still try the clusters lying on the lattice */
26059 +       return is_on_lattice(index, get_lattice_factor(cryptcompress_inode_data(inode)));
26060 +}
26061 +
26062 +/* compression mode_plugins */
26063 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26064 +       [NONE_COMPRESSION_MODE_ID] = {
26065 +               .h = {
26066 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26067 +                       .id = NONE_COMPRESSION_MODE_ID,
26068 +                       .pops = NULL,
26069 +                       .label = "none",
26070 +                       .desc = "Compress nothing",
26071 +                       .linkage = {NULL, NULL}
26072 +               },
26073 +               .should_deflate = should_deflate_none,
26074 +               .accept_hook = NULL,
26075 +               .discard_hook = NULL
26076 +       },
26077 +       /* Check-on-dynamic-lattice adaptive compression mode */
26078 +       [LATTD_COMPRESSION_MODE_ID] = {
26079 +               .h = {
26080 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26081 +                       .id = LATTD_COMPRESSION_MODE_ID,
26082 +                       .pops = NULL,
26083 +                       .label = "lattd",
26084 +                       .desc = "Check on dynamic lattice",
26085 +                       .linkage = {NULL, NULL}
26086 +               },
26087 +               .should_deflate = should_deflate_lattd,
26088 +               .accept_hook = accept_hook_lattd,
26089 +               .discard_hook = discard_hook_lattd
26090 +       },
26091 +       /* Check-ultimately compression mode:
26092 +          Turn off compression forever as soon as we meet
26093 +          incompressible data */
26094 +       [ULTIM_COMPRESSION_MODE_ID] = {
26095 +               .h = {
26096 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26097 +                       .id = ULTIM_COMPRESSION_MODE_ID,
26098 +                       .pops = NULL,
26099 +                       .label = "ultim",
26100 +                       .desc = "Check ultimately",
26101 +                       .linkage = {NULL, NULL}
26102 +               },
26103 +               .should_deflate = should_deflate_common,
26104 +               .accept_hook = NULL,
26105 +               .discard_hook = discard_hook_ultim
26106 +       },
26107 +       /* Force-to-compress-everything compression mode */
26108 +       [FORCE_COMPRESSION_MODE_ID] = {
26109 +               .h = {
26110 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26111 +                       .id = FORCE_COMPRESSION_MODE_ID,
26112 +                       .pops = NULL,
26113 +                       .label = "force",
26114 +                       .desc = "Force to compress everything",
26115 +                       .linkage = {NULL, NULL}
26116 +               },
26117 +               .should_deflate = NULL, /* NOTE(review): presumably "always" - confirm */
26118 +               .accept_hook = NULL,
26119 +               .discard_hook = NULL
26120 +       },
26121 +       /* Convert-to-extent compression mode.
26122 +          In this mode items will be converted to extents and management
26123 +          will be passed to (classic) unix file plugin as soon as ->write()
26124 +          detects that the first complete logical cluster (of index #0) is
26125 +          incompressible. */
26126 +       [CONVX_COMPRESSION_MODE_ID] = {
26127 +               .h = {
26128 +                       .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26129 +                       .id = CONVX_COMPRESSION_MODE_ID,
26130 +                       .pops = NULL,
26131 +                       .label = "conv",
26132 +                       .desc = "Convert to extent",
26133 +                       .linkage = {NULL, NULL}
26134 +               },
26135 +               .should_deflate = should_deflate_common,
26136 +               .accept_hook = NULL,
26137 +               .discard_hook = NULL
26138 +       }
26139 +};
26140 +
26141 +/*
26142 +  Local variables:
26143 +  c-indentation-style: "K&R"
26144 +  mode-name: "LC"
26145 +  c-basic-offset: 8
26146 +  tab-width: 8
26147 +  fill-column: 120
26148 +  scroll-step: 1
26149 +  End:
26150 +*/
26151 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/compress/Makefile linux-2.6.27/fs/reiser4/plugin/compress/Makefile
26152 --- linux-2.6.27.orig/fs/reiser4/plugin/compress/Makefile       1970-01-01 03:00:00.000000000 +0300
26153 +++ linux-2.6.27/fs/reiser4/plugin/compress/Makefile    2008-10-12 18:20:00.000000000 +0400
26154 @@ -0,0 +1,5 @@
26155 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
26156 +
26157 +compress_plugins-objs :=       \
26158 +       compress.o              \
26159 +       compress_mode.o
26160 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/crypto/cipher.c linux-2.6.27/fs/reiser4/plugin/crypto/cipher.c
26161 --- linux-2.6.27.orig/fs/reiser4/plugin/crypto/cipher.c 1970-01-01 03:00:00.000000000 +0300
26162 +++ linux-2.6.27/fs/reiser4/plugin/crypto/cipher.c      2008-10-12 18:20:00.000000000 +0400
26163 @@ -0,0 +1,37 @@
26164 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
26165 +   licensing governed by reiser4/README */
26166 +/* Reiser4 cipher transform plugins */
26167 +
26168 +#include "../../debug.h"
26169 +#include "../plugin.h"
26170 +
26171 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
26172 +       [NONE_CIPHER_ID] = {    /* the only entry here; all its methods are NULL */
26173 +               .h = {
26174 +                       .type_id = REISER4_CIPHER_PLUGIN_TYPE,
26175 +                       .id = NONE_CIPHER_ID,
26176 +                       .pops = NULL,
26177 +                       .label = "none",
26178 +                       .desc = "no cipher transform",
26179 +                       .linkage = {NULL, NULL}
26180 +               },
26181 +               .alloc = NULL,
26182 +               .free = NULL,
26183 +               .scale = NULL,
26184 +               .align_stream = NULL,
26185 +               .setkey = NULL,
26186 +               .encrypt = NULL,
26187 +               .decrypt = NULL
26188 +       }
26189 +};
26190 +
26191 +/* Make Linus happy.
26192 +   Local variables:
26193 +   c-indentation-style: "K&R"
26194 +   mode-name: "LC"
26195 +   c-basic-offset: 8
26196 +   tab-width: 8
26197 +   fill-column: 120
26198 +   scroll-step: 1
26199 +   End:
26200 +*/
26201 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/crypto/cipher.h linux-2.6.27/fs/reiser4/plugin/crypto/cipher.h
26202 --- linux-2.6.27.orig/fs/reiser4/plugin/crypto/cipher.h 1970-01-01 03:00:00.000000000 +0300
26203 +++ linux-2.6.27/fs/reiser4/plugin/crypto/cipher.h      2008-10-12 18:20:00.000000000 +0400
26204 @@ -0,0 +1,55 @@
26205 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26206 +/* This file contains definitions for the objects operated
26207 +   by reiser4 key manager, which is something like keyring
26208 +   wrapped by appropriate reiser4 plugin */
26209 +
26210 +#if !defined( __FS_REISER4_CRYPT_H__ )
26211 +#define __FS_REISER4_CRYPT_H__
26212 +
26213 +#include <linux/crypto.h>
26214 +
26215 +/* key info imported from user space */
26216 +struct reiser4_crypto_data {
26217 +       int keysize;    /* size of the uninstantiated key */
26218 +       __u8 * key;     /* uninstantiated key */
26219 +       int keyid_size; /* size of the passphrase */
26220 +       __u8 * keyid;   /* passphrase */
26221 +};
26222 +
26223 +/* This object contains all needed infrastructure to implement
26224 +   cipher transform. This is operated (allocating, inheriting,
26225 +   validating, binding to host inode, etc..) by reiser4 key manager.
26226 +
26227 +   This info can be allocated in two cases:
26228 +   1. importing a key from user space.
26229 +   2. reading inode from disk */
26230 +struct reiser4_crypto_info {
26231 +       struct inode * host;
26232 +       struct crypto_hash      * digest;
26233 +       struct crypto_blkcipher * cipher;
26234 +#if 0
26235 +       cipher_key_plugin * kplug; /* key manager */
26236 +#endif
26237 +       __u8 * keyid;              /* key fingerprint, created by digest plugin
26238 +                                     from the uninstantiated key and passphrase;
26239 +                                     supposed to be stored in disk stat-data */
26240 +       int inst;                  /* this indicates if the cipher key is
26241 +                                     instantiated (case 1 above) */
26242 +       int keysize;               /* uninstantiated key size (bytes), supposed
26243 +                                     to be stored in disk stat-data */
26244 +       int keyload_count;         /* number of objects which have this
26245 +                                     crypto-stat attached */
26246 +};
26247 +
26248 +#endif /* __FS_REISER4_CRYPT_H__ */
26249 +
26250 +/*
26251 +   Local variables:
26252 +   c-indentation-style: "K&R"
26253 +   mode-name: "LC"
26254 +   c-basic-offset: 8
26255 +   tab-width: 8
26256 +   fill-column: 120
26257 +   scroll-step: 1
26258 +   End:
26259 +*/
26260 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/crypto/digest.c linux-2.6.27/fs/reiser4/plugin/crypto/digest.c
26261 --- linux-2.6.27.orig/fs/reiser4/plugin/crypto/digest.c 1970-01-01 03:00:00.000000000 +0300
26262 +++ linux-2.6.27/fs/reiser4/plugin/crypto/digest.c      2008-10-12 18:20:00.000000000 +0400
26263 @@ -0,0 +1,58 @@
26264 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26265 +
26266 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
26267 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
26268 +#include "../../debug.h"
26269 +#include "../plugin_header.h"
26270 +#include "../plugin.h"
26271 +#include "../file/cryptcompress.h"
26272 +
26273 +#include <linux/types.h>
26274 +
26275 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
26276 +
26277 +static struct crypto_hash * alloc_sha256 (void)
26278 +{
26279 +#if REISER4_SHA256
26280 +       return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
26281 +#else
26282 +       warning("edward-1418", "sha256 unsupported");
26283 +       return ERR_PTR(-EINVAL);        /* an error pointer, not NULL */
26284 +#endif
26285 +}
26286 +
26287 +static void free_sha256 (struct crypto_hash * tfm)
26288 +{
26289 +#if REISER4_SHA256
26290 +       crypto_free_hash(tfm);
26291 +#endif
26292 +       return;         /* a no-op when REISER4_SHA256 is off */
26293 +}
26294 +
26295 +/* digest plugins */
26296 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
26297 +       [SHA256_32_DIGEST_ID] = {
26298 +               .h = {
26299 +                       .type_id = REISER4_DIGEST_PLUGIN_TYPE,
26300 +                       .id = SHA256_32_DIGEST_ID,
26301 +                       .pops = NULL,
26302 +                       .label = "sha256_32",
26303 +                       .desc = "sha256_32 digest transform",
26304 +                       .linkage = {NULL, NULL}
26305 +               },
26306 +               .fipsize = sizeof(__u32),       /* presumably fingerprint size - confirm */
26307 +               .alloc = alloc_sha256,
26308 +               .free = free_sha256
26309 +       }
26310 +};
26311 +
26312 +/*
26313 +  Local variables:
26314 +  c-indentation-style: "K&R"
26315 +  mode-name: "LC"
26316 +  c-basic-offset: 8
26317 +  tab-width: 8
26318 +  fill-column: 120
26319 +  scroll-step: 1
26320 +  End:
26321 +*/
26322 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/dir/dir.h linux-2.6.27/fs/reiser4/plugin/dir/dir.h
26323 --- linux-2.6.27.orig/fs/reiser4/plugin/dir/dir.h       1970-01-01 03:00:00.000000000 +0300
26324 +++ linux-2.6.27/fs/reiser4/plugin/dir/dir.h    2008-10-12 18:20:00.000000000 +0400
26325 @@ -0,0 +1,36 @@
26326 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26327 + * reiser4/README */
26328 +
26329 +/* this file contains declarations of methods implementing directory plugins */
26330 +
26331 +#if !defined( __REISER4_DIR_H__ )
26332 +#define __REISER4_DIR_H__
26333 +
26334 +/*#include "../../key.h"
26335 +
26336 +#include <linux/fs.h>*/
26337 +
26338 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
26339 +
26340 +/* "hashed" directory methods of dir plugin */
26341 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
26342 +                           reiser4_key *);
26343 +
26344 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
26345 +
26346 +/* "seekable" directory methods of dir plugin */
26347 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
26348 +                             reiser4_key *);
26349 +
26350 +/* __REISER4_DIR_H__ */
26351 +#endif
26352 +
26353 +/*
26354 +   Local variables:
26355 +   c-indentation-style: "K&R"
26356 +   mode-name: "LC"
26357 +   c-basic-offset: 8
26358 +   tab-width: 8
26359 +   fill-column: 120
26360 +   End:
26361 +*/
26362 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/dir/hashed_dir.c linux-2.6.27/fs/reiser4/plugin/dir/hashed_dir.c
26363 --- linux-2.6.27.orig/fs/reiser4/plugin/dir/hashed_dir.c        1970-01-01 03:00:00.000000000 +0300
26364 +++ linux-2.6.27/fs/reiser4/plugin/dir/hashed_dir.c     2008-10-12 18:20:00.000000000 +0400
26365 @@ -0,0 +1,81 @@
26366 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
26367 + * reiser4/README */
26368 +
26369 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
26370 +   names to the files. */
26371 +
26372 +/*
26373 + * Hashed directory logically consists of persistent directory
26374 + * entries. Directory entry is a pair of a file name and a key of stat-data of
26375 + * a file that has this name in the given directory.
26376 + *
26377 + * Directory entries are stored in the tree in the form of directory
26378 + * items. Directory item should implement dir_entry_ops portion of item plugin
26379 + * interface (see plugin/item/item.h). Hashed directory interacts with
26380 + * directory item plugin exclusively through dir_entry_ops operations.
26381 + *
26382 + * Currently there are two implementations of directory items: "simple
26383 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
26384 + * (plugin/item/cde.[ch]) with the latter being the default.
26385 + *
26386 + * There is, however, one delicate way in which directory code interferes
26387 + * with item plugin: key assignment policy. A key for a directory item is
26388 + * chosen by directory code, and as described in kassign.c, this key contains
26389 + * a portion of file name. Directory item uses this knowledge to avoid storing
26390 + * this portion of file name twice: in the key and in the directory item body.
26391 + *
26392 + */
26393 +
26394 +#include "../../inode.h"
26395 +
26396 +void complete_entry_key(const struct inode *, const char *name,
26397 +                       int len, reiser4_key * result);
26398 +
26399 +/* Implementation of the build_entry_key method of the dir plugin for
26400 +   HASHED_DIR_PLUGIN_ID: builds in @result the key of the entry @qname of @dir.
26401 + */
26402 +void build_entry_key_hashed(const struct inode *dir,   /* directory where entry is
26403 +                                                        * (or will be) in.*/
26404 +                           const struct qstr *qname,   /* name of file referenced
26405 +                                                        * by this entry */
26406 +                           reiser4_key * result        /* resulting key of directory
26407 +                                                        * entry */ )
26408 +{
26409 +       const char *name;
26410 +       int len;
26411 +
26412 +       assert("nikita-1139", dir != NULL);
26413 +       assert("nikita-1140", qname != NULL);
26414 +       assert("nikita-1141", qname->name != NULL);
26415 +       assert("nikita-1142", result != NULL);
26416 +
26417 +       name = qname->name;
26418 +       len = qname->len;
26419 +
26420 +       assert("nikita-2867", strlen(name) == len);
26421 +
26422 +       reiser4_key_init(result);
26423 +       /* locality of directory entry's key is objectid of parent
26424 +          directory */
26425 +       set_key_locality(result, get_inode_oid(dir));
26426 +       /* minor packing locality is constant */
26427 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26428 +       /* dot is a special case---we always want it to be the first entry
26429 +          in a directory, so its key gets no name-dependent part: it keeps
26430 +          whatever reiser4_key_init() put there.
26431 +        */
26432 +       if (len == 1 && name[0] == '.')
26433 +               return;
26434 +
26435 +       /* initialize part of entry key which depends on file name */
26436 +       complete_entry_key(dir, name, len, result);
26437 +}
26438 +
26439 +/* Local variables:
26440 +   c-indentation-style: "K&R"
26441 +   mode-name: "LC"
26442 +   c-basic-offset: 8
26443 +   tab-width: 8
26444 +   fill-column: 120
26445 +   End:
26446 +*/
26447 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/dir/Makefile linux-2.6.27/fs/reiser4/plugin/dir/Makefile
26448 --- linux-2.6.27.orig/fs/reiser4/plugin/dir/Makefile    1970-01-01 03:00:00.000000000 +0300
26449 +++ linux-2.6.27/fs/reiser4/plugin/dir/Makefile 2008-10-12 18:20:00.000000000 +0400
26450 @@ -0,0 +1,5 @@
26451 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
26452 +
26453 +dir_plugins-objs :=    \
26454 +       hashed_dir.o    \
26455 +       seekable_dir.o
26456 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/dir/seekable_dir.c linux-2.6.27/fs/reiser4/plugin/dir/seekable_dir.c
26457 --- linux-2.6.27.orig/fs/reiser4/plugin/dir/seekable_dir.c      1970-01-01 03:00:00.000000000 +0300
26458 +++ linux-2.6.27/fs/reiser4/plugin/dir/seekable_dir.c   2008-10-12 18:20:00.000000000 +0400
26459 @@ -0,0 +1,46 @@
26460 +/* Copyright 2005 by Hans Reiser, licensing governed by
26461 + * reiser4/README */
26462 +
26463 +#include "../../inode.h"
26464 +
26465 +/* Implementation of the build_entry_key method of the dir plugin for
26466 +   SEEKABLE_HASHED_DIR_PLUGIN_ID.
26467 +   This is for directories where we want repeatable and restartable readdir()
26468 +   even with a 32-bit user-level struct dirent (readdir(3)).
26469 +*/
26470 +void
26471 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
26472 +                        reiser4_key * result)
26473 +{
26474 +       oid_t objectid;
26475 +
26476 +       assert("nikita-2283", dir != NULL);
26477 +       assert("nikita-2284", name != NULL);
26478 +       assert("nikita-2285", name->name != NULL);
26479 +       assert("nikita-2286", result != NULL);
26480 +
26481 +       reiser4_key_init(result);
26482 +       /* locality of directory entry's key is objectid of parent
26483 +          directory */
26484 +       set_key_locality(result, get_inode_oid(dir));
26485 +       /* minor packing locality is constant */
26486 +       set_key_type(result, KEY_FILE_NAME_MINOR);
26487 +       /* dot is a special case---we always want it to be the first entry
26488 +          in a directory, so its key gets neither objectid nor offset set
26489 +          here: it keeps whatever reiser4_key_init() put there.
26490 +        */
26491 +       if ((name->len == 1) && (name->name[0] == '.'))
26492 +               return;
26493 +
26494 +       /* objectid of key is 31 lowest bits of hash. */
26495 +       objectid =
26496 +           inode_hash_plugin(dir)->hash(name->name,
26497 +                                        (int)name->len) & 0x7fffffff;
26498 +
26499 +       assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
26500 +       set_key_objectid(result, objectid);
26501 +
26502 +       /* offset is always 0. */
26503 +       set_key_offset(result, (__u64) 0);
26504 +       return;
26505 +}
26506 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/dir_plugin_common.c linux-2.6.27/fs/reiser4/plugin/dir_plugin_common.c
26507 --- linux-2.6.27.orig/fs/reiser4/plugin/dir_plugin_common.c     1970-01-01 03:00:00.000000000 +0300
26508 +++ linux-2.6.27/fs/reiser4/plugin/dir_plugin_common.c  2008-10-12 18:20:00.000000000 +0400
26509 @@ -0,0 +1,872 @@
26510 +/* Copyright 2005 by Hans Reiser, licensing governed by
26511 +   reiser4/README */
26512 +
26513 +/* this file contains typical implementations for most of methods of
26514 +   directory plugin
26515 +*/
26516 +
26517 +#include "../inode.h"
26518 +
26519 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
26520 +              lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
26521 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
26522 +void check_light_weight(struct inode *inode, struct inode *parent);
26523 +
26524 +/* this is common implementation of get_parent method of dir plugin
26525 +   this is used by NFS kernel server to "climb" up directory tree to
26526 +   check permissions. Returns dentry of parent of @child or an ERR_PTR().
26527 + */
26528 +struct dentry *get_parent_common(struct inode *child)
26529 +{
26530 +       struct super_block *s;
26531 +       struct inode *parent;
26532 +       struct dentry dotdot;
26533 +       struct dentry *dentry;
26534 +       reiser4_key key;
26535 +       int result;
26536 +
26537 +       /* lookup dotdot entry through an on-stack dentry. */
26538 +       s = child->i_sb;
26539 +       memset(&dotdot, 0, sizeof(dotdot));
26540 +       dotdot.d_name.name = "..";
26541 +       dotdot.d_name.len = 2;
26542 +       dotdot.d_op = &get_super_private(s)->ops.dentry;
26543 +
26544 +       result = reiser4_lookup_name(child, &dotdot, &key);
26545 +       /* reiser4_lookup_name() obtained fsdata for @dotdot; drop it here, as
26546 +          every other user of an on-stack dentry in this file does. */
26547 +       reiser4_free_dentry_fsdata(&dotdot);
26548 +       if (result != 0)
26549 +               return ERR_PTR(result);
26550 +
26551 +       parent = reiser4_iget(s, &key, 1);
26552 +       if (!IS_ERR(parent)) {
26553 +               /*
26554 +                * FIXME-NIKITA dubious: attributes are inherited from @child
26555 +                * to @parent. But:
26556 +                *
26557 +                *     (*) this is the only thing we can do
26558 +                *
26559 +                *     (*) attributes of light-weight object are inherited
26560 +                *     from a parent through which object was looked up first,
26561 +                *     so it is ambiguous anyway.
26562 +                *
26563 +                */
26564 +               check_light_weight(parent, child);
26565 +               reiser4_iget_complete(parent);
26566 +               dentry = d_alloc_anon(parent);
26567 +               if (dentry == NULL) {
26568 +                       iput(parent);
26569 +                       dentry = ERR_PTR(RETERR(-ENOMEM));
26570 +               } else
26571 +                       dentry->d_op = &get_super_private(s)->ops.dentry;
26572 +       } else if (PTR_ERR(parent) == -ENOENT)
26573 +               dentry = ERR_PTR(RETERR(-ESTALE));
26574 +       else
26575 +               dentry = (void *)parent;
26576 +       return dentry;
26577 +}
26578 +
26579 +/* this is common implementation of is_name_acceptable method of dir
26580 +   plugin: non-zero iff @len does not exceed reiser4_max_filename_len(@inode)
26581 + */
26582 +int is_name_acceptable_common(const struct inode *inode,       /* directory to check */
26583 +                             const char *name UNUSED_ARG,      /* name to check */
26584 +                             int len /* @name's length */ )
26585 +{
26586 +       assert("nikita-733", inode != NULL);
26587 +       assert("nikita-734", name != NULL);
26588 +       assert("nikita-735", len > 0);
26589 +
26590 +       return len <= reiser4_max_filename_len(inode);
26591 +}
26592 +
26593 +/* there is no common implementation of build_entry_key method of dir
26594 +   plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
26595 +   plugin/dir/seekable_dir.c:build_entry_key_seekable() for example
26596 +*/
26597 +
26598 +/* this is common implementation of build_readdir_key method of dir
26599 +   plugin: builds @result from the readdir position kept in @dir's fsdata,
26600 +   see reiser4_readdir_common for more details
26601 +*/
26602 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
26603 +                            reiser4_key * result /* where to store key */ )
26604 +{
26605 +       reiser4_file_fsdata *fdata;
26606 +       struct inode *inode;
26607 +
26608 +       assert("nikita-1361", dir != NULL);
26609 +       assert("nikita-1362", result != NULL);
26610 +       assert("nikita-1363", dir->f_dentry != NULL);
26611 +       inode = dir->f_dentry->d_inode;
26612 +       assert("nikita-1373", inode != NULL);
26613 +
26614 +       fdata = reiser4_get_file_fsdata(dir);
26615 +       if (IS_ERR(fdata))
26616 +               return PTR_ERR(fdata);
26617 +       assert("nikita-1364", fdata != NULL);
26618 +       return extract_key_from_de_id(get_inode_oid(inode),
26619 +                                     &fdata->dir.readdir.position.
26620 +                                     dir_entry_key, result);
26621 +
26622 +}
26623 +
26624 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
26625 +                            int adj);
26626 +
26627 +/* this is common implementation of add_entry method of dir plugin;
26628 +   returns 0 on success, -EEXIST if the name already exists, or an error */
26629 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
26630 +                                                   * in */
26631 +                            struct dentry *where,      /* new name */
26632 +                            reiser4_object_create_data * data, /* parameters of
26633 +                                                               *  new object; unused */
26634 +                            reiser4_dir_entry_desc * entry /* parameters of
26635 +                                                            * new directory
26636 +                                                            * entry */)
26637 +{
26638 +       int result;
26639 +       coord_t *coord;
26640 +       lock_handle lh;
26641 +       struct reiser4_dentry_fsdata *fsdata;
26642 +       reiser4_block_nr reserve;
26643 +
26644 +       assert("nikita-1114", object != NULL);
26645 +       assert("nikita-1250", where != NULL);
26646 +
26647 +       fsdata = reiser4_get_dentry_fsdata(where);
26648 +       if (unlikely(IS_ERR(fsdata)))
26649 +               return PTR_ERR(fsdata);
26650 +
26651 +       reserve = inode_dir_plugin(object)->estimate.add_entry(object);
26652 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26653 +               return RETERR(-ENOSPC);
26654 +
26655 +       init_lh(&lh);
26656 +       coord = &fsdata->dec.entry_coord;
26657 +       coord_clear_iplug(coord);
26658 +
26659 +       /* check for this entry in a directory. This is plugin method. */
26660 +       result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
26661 +                                   entry);
26662 +       if (likely(result == -ENOENT)) {
26663 +               /* name is not there yet: add new entry. Just pass control
26664 +                  to the directory item plugin. */
26665 +               assert("nikita-1709", inode_dir_item_plugin(object));
26666 +               assert("nikita-2230", coord->node == lh.node);
26667 +               reiser4_seal_done(&fsdata->dec.entry_seal);
26668 +               result =
26669 +                   inode_dir_item_plugin(object)->s.dir.add_entry(object,
26670 +                                                                  coord, &lh,
26671 +                                                                  where,
26672 +                                                                  entry);
26673 +               if (result == 0) {
26674 +                       reiser4_adjust_dir_file(object, where,
26675 +                                               fsdata->dec.pos + 1, +1);
26676 +                       INODE_INC_FIELD(object, i_size);
26677 +               }
26678 +       } else if (result == 0) {
26679 +               assert("nikita-2232", coord->node == lh.node);
26680 +               result = RETERR(-EEXIST);
26681 +       }
26682 +       done_lh(&lh);
26683 +
26684 +       return result;
26685 +}
26686 +
26687 +/**
26688 + * rem_entry - remove entry from directory item
26689 + * @dir: directory the entry is removed from
26690 + * @dentry: dentry naming the entry; its d_inode must not be NULL
26691 + * @entry: entry description, handed on to the item plugin
26692 + * @coord: coordinate handed to the item plugin's extract_key and rem_entry
26693 + * @lh: lock handle, handed on to the item plugin
26694 + *
26695 + * When REISER4_DEBUG is set, checks that the key extracted at @coord belongs
26696 + * to dentry->d_inode; then calls item plugin method to cut entry.
26697 + */
26698 +static int
26699 +rem_entry(struct inode *dir, struct dentry *dentry,
26700 +         reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
26701 +{
26702 +       item_plugin *iplug;
26703 +       struct inode *child;
26704 +
26705 +       iplug = inode_dir_item_plugin(dir);
26706 +       child = dentry->d_inode;
26707 +       assert("nikita-3399", child != NULL);
26708 +
26709 +       /* check that we are really destroying an entry for @child */
26710 +       if (REISER4_DEBUG) {
26711 +               int result;
26712 +               reiser4_key key;
26713 +
26714 +               result = iplug->s.dir.extract_key(coord, &key);
26715 +               if (result != 0)
26716 +                       return result;
26717 +               if (get_key_objectid(&key) != get_inode_oid(child)) {
26718 +                       warning("nikita-3397",
26719 +                               "rem_entry: %#llx != %#llx\n",
26720 +                               get_key_objectid(&key),
26721 +                               (unsigned long long)get_inode_oid(child));
26722 +                       return RETERR(-EIO);
26723 +               }
26724 +       }
26725 +       return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
26726 +}
26727 +
26728 +/**
26729 + * reiser4_rem_entry_common - remove entry from a directory
26730 + * @dir: directory to remove entry from
26731 + * @dentry: name that is being removed
26732 + * @entry: description of entry being removed
26733 + *
26734 + * This is common implementation of rem_entry method of dir plugin.
26735 + */
26736 +int reiser4_rem_entry_common(struct inode *dir,
26737 +                            struct dentry *dentry,
26738 +                            reiser4_dir_entry_desc *entry)
26739 +{
26740 +       int result;
26741 +       coord_t *coord;
26742 +       lock_handle lh;
26743 +       struct reiser4_dentry_fsdata *fsdata;
26744 +       __u64 tograb;
26745 +
26746 +       assert("nikita-1124", dir != NULL);
26747 +       assert("nikita-1125", dentry != NULL);
26748 +
26749 +       tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
26750 +       result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
26751 +       if (result != 0)
26752 +               return RETERR(-ENOSPC);
26753 +
26754 +       init_lh(&lh);
26755 +
26756 +       /* check for this entry in a directory. This is plugin method. */
26757 +       result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
26758 +       fsdata = reiser4_get_dentry_fsdata(dentry);
26759 +       if (IS_ERR(fsdata)) {
26760 +               done_lh(&lh);
26761 +               return PTR_ERR(fsdata);
26762 +       }
26763 +
26764 +       coord = &fsdata->dec.entry_coord;
26765 +
26766 +       assert("nikita-3404",
26767 +              get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
26768 +              dir->i_size <= 1);
26769 +
26770 +       coord_clear_iplug(coord);
26771 +       if (result == 0) {
26772 +               /* entry was found: remove it. Just pass control to the
26773 +                  directory item plugin. */
26774 +               assert("vs-542", inode_dir_item_plugin(dir));
26775 +               reiser4_seal_done(&fsdata->dec.entry_seal);
26776 +               reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
26777 +               result =
26778 +                   WITH_COORD(coord,
26779 +                              rem_entry(dir, dentry, entry, coord, &lh));
26780 +               if (result == 0) {
26781 +                       if (dir->i_size >= 1)
26782 +                               INODE_DEC_FIELD(dir, i_size);
26783 +                       else {
26784 +                               warning("nikita-2509", "Dir %llu is runt",
26785 +                                       (unsigned long long)
26786 +                                       get_inode_oid(dir));
26787 +                               result = RETERR(-EIO);
26788 +                       }
26789 +
26790 +                       assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
26791 +                              dentry->d_inode->i_size != 2 ||
26792 +                              inode_dir_plugin(dentry->d_inode) == NULL);
26793 +               }
26794 +       }
26795 +       done_lh(&lh);
26796 +
26797 +       return result;
26798 +}
26799 +
26800 +static reiser4_block_nr estimate_init(struct inode *parent,
26801 +                                     struct inode *object);
26802 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
26803 +
26804 +/* this is common implementation of init method of dir plugin:
26805 +   grab space for, then create, the "." and ".." entries of @object
26806 +*/
26807 +int reiser4_dir_init_common(struct inode *object,      /* new directory */
26808 +                           struct inode *parent,       /* parent directory */
26809 +                           reiser4_object_create_data * data /* info passed
26810 +                                                              * to us, this
26811 +                                                              * is filled by
26812 +                                                              * reiser4()
26813 +                                                              * syscall in
26814 +                                                              * particular */)
26815 +{
26816 +       reiser4_block_nr reserve;
26817 +
26818 +       assert("nikita-680", object != NULL);
26819 +       assert("nikita-681", S_ISDIR(object->i_mode));
26820 +       assert("nikita-682", parent != NULL);
26821 +       assert("nikita-684", data != NULL);
26822 +       assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
26823 +       assert("nikita-687", object->i_mode & S_IFDIR);
26824 +
26825 +       reserve = estimate_init(parent, object);
26826 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
26827 +               return RETERR(-ENOSPC);
26828 +
26829 +       return create_dot_dotdot(object, parent);
26830 +}
26831 +
26832 +/* this is common implementation of done method of dir plugin
26833 +   remove "." entry; a removal failure is only warned about, not returned
26834 +*/
26835 +int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
26836 +{
26837 +       int result;
26838 +       reiser4_block_nr reserve;
26839 +       struct dentry goodby_dots;
26840 +       reiser4_dir_entry_desc entry;
26841 +
26842 +       assert("nikita-1449", object != NULL);
26843 +
26844 +       if (reiser4_inode_get_flag(object, REISER4_NO_SD))
26845 +               return 0;
26846 +
26847 +       /* of course, this can be rewritten to sweep everything in one
26848 +          reiser4_cut_tree(). */
26849 +       memset(&entry, 0, sizeof entry);
26850 +
26851 +       /* FIXME: this done method is called from reiser4_delete_dir_common which
26852 +        * reserved space already */
26853 +       reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
26854 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
26855 +               return RETERR(-ENOSPC);
26856 +
26857 +       memset(&goodby_dots, 0, sizeof goodby_dots);
26858 +       entry.obj = goodby_dots.d_inode = object;
26859 +       goodby_dots.d_name.name = ".";
26860 +       goodby_dots.d_name.len = 1;
26861 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26862 +       reiser4_free_dentry_fsdata(&goodby_dots);
26863 +       if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
26864 +               /* only worth a warning
26865 +
26866 +                  "values of \ eB\ f will give rise to dom!\n"
26867 +                  -- v6src/s2/mv.c:89
26868 +                */
26869 +               warning("nikita-2252", "Cannot remove dot of %lli: %i",
26870 +                       (unsigned long long)get_inode_oid(object), result);
26871 +       return 0;
26872 +}
26873 +
26874 +/* this is common implementation of attach method of dir plugin:
26875 +   nothing to do beyond argument checks, always returns 0 */
26876 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
26877 +                         struct inode *parent UNUSED_ARG)
26878 +{
26879 +       assert("nikita-2647", child != NULL);
26880 +       assert("nikita-2648", parent != NULL);
26881 +
26882 +       return 0;
26883 +}
26884 +
26885 +/* this is common implementation of detach method of dir plugin
26886 +   remove ".." from @object and, if that succeeded, drop nlink of @parent
26887 +*/
26888 +int reiser4_detach_common(struct inode *object, struct inode *parent)
26889 +{
26890 +       int result;
26891 +       struct dentry goodby_dots;
26892 +       reiser4_dir_entry_desc entry;
26893 +
26894 +       assert("nikita-2885", object != NULL);
26895 +       assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
26896 +
26897 +       memset(&entry, 0, sizeof entry);
26898 +
26899 +       /* NOTE-NIKITA this only works if @parent is -the- parent of
26900 +          @object, viz. object whose key is stored in dotdot
26901 +          entry. Wouldn't work with hard-links on directories. */
26902 +       memset(&goodby_dots, 0, sizeof goodby_dots);
26903 +       entry.obj = goodby_dots.d_inode = parent;
26904 +       goodby_dots.d_name.name = "..";
26905 +       goodby_dots.d_name.len = 2;
26906 +       result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
26907 +       reiser4_free_dentry_fsdata(&goodby_dots);
26908 +       if (result == 0) {
26909 +               /* the dot should be the only entry remaining at this time... */
26910 +               assert("nikita-3400",
26911 +                      object->i_size == 1 && object->i_nlink <= 2);
26912 +#if 0
26913 +               /* and, together with the only name directory can have, they
26914 +                * provides for the last 2 remaining references. If we get
26915 +                * here as part of error handling during mkdir, @object
26916 +                * possibly has no name yet, so its nlink == 1. If we get here
26917 +                * from rename (targeting empty directory), it has no name
26918 +                * already, so its nlink == 1. */
26919 +               assert("nikita-3401",
26920 +                      object->i_nlink == 2 || object->i_nlink == 1);
26921 +#endif
26922 +
26923 +               /* decrement nlink of directory removed ".." pointed
26924 +                  to */
26925 +               reiser4_del_nlink(parent, NULL, 0);
26926 +       }
26927 +       return result;
26928 +}
26929 +
26930 +/* this is common implementation of estimate.add_entry method of
26931 +   dir plugin
26932 +   the estimate assumes that adding an entry inserts one unit into an
26933 +   existing item
26934 +*/
26935 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
26936 +{
26937 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
26938 +}
26939 +
26940 +/* this is common implementation of estimate.rem_entry method of dir
26941 +   plugin: the estimate is that of removing one item from @inode's tree
26942 +*/
26943 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
26944 +{
26945 +       return estimate_one_item_removal(reiser4_tree_by_inode(inode));
26946 +}
26947 +
26948 +/* this is common implementation of estimate.unlink method of dir
26949 +   plugin: one rem_entry estimate of @object plus two updates of @parent
26950 +*/
26951 +reiser4_block_nr
26952 +dir_estimate_unlink_common(const struct inode * parent,
26953 +                          const struct inode * object)
26954 +{
26955 +       reiser4_block_nr res;
26956 +
26957 +       /* hashed_rem_entry(object) */
26958 +       res = inode_dir_plugin(object)->estimate.rem_entry(object);
26959 +       /* del_nlink(parent) */
26960 +       res += 2 * inode_file_plugin(parent)->estimate.update(parent);
26961 +
26962 +       return res;
26963 +}
26964 +
26965 +/*
26966 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
26967 + * methods: if @inode is a light-weight file, set up its credentials (uid
26968 + * and gid, taken from @parent) that are not stored in the stat-data
26969 + */
26970 +void check_light_weight(struct inode *inode, struct inode *parent)
26971 +{
26972 +       if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
26973 +               inode->i_uid = parent->i_uid;
26974 +               inode->i_gid = parent->i_gid;
26975 +               /* clear light-weight flag. If inode would be read by any
26976 +                  other name, [ug]id wouldn't change. */
26977 +               reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
26978 +       }
26979 +}
26980 +
26981 +/* looks for name specified in @dentry in directory @parent and if name is
26982 +   found - key of the object the found entry points to is stored in @key */
26983 +int reiser4_lookup_name(struct inode *parent,  /* inode of directory to lookup for
26984 +                                        * name in */
26985 +               struct dentry *dentry,  /* name to look for */
26986 +               reiser4_key * key /* place to store key */ )
26987 +{
26988 +       int result;
26989 +       coord_t *coord;
26990 +       lock_handle lh;
26991 +       const char *name;
26992 +       int len;
26993 +       reiser4_dir_entry_desc entry;
26994 +       struct reiser4_dentry_fsdata *fsdata;
26995 +
26996 +       assert("nikita-1247", parent != NULL);
26997 +       assert("nikita-1248", dentry != NULL);
26998 +       assert("nikita-1123", dentry->d_name.name != NULL);
26999 +       assert("vs-1486",
27000 +              dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
27001 +
27002 +       name = dentry->d_name.name;
27003 +       len = dentry->d_name.len;
27004 +
27005 +       if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
27006 +               /* some arbitrary error code to return */
27007 +               return RETERR(-ENAMETOOLONG);
27008 +
27009 +       fsdata = reiser4_get_dentry_fsdata(dentry);
27010 +       if (IS_ERR(fsdata))
27011 +               return PTR_ERR(fsdata);
27012 +
27013 +       coord = &fsdata->dec.entry_coord;
27014 +       coord_clear_iplug(coord);
27015 +       init_lh(&lh);
27016 +
27017 +       /* find entry in a directory. This is plugin method. */
27018 +       result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
27019 +                                   &entry);
27020 +       if (result == 0) {
27021 +               /* entry was found, extract object key from it. */
27022 +               result =
27023 +                   WITH_COORD(coord,
27024 +                              item_plugin_by_coord(coord)->s.dir.
27025 +                              extract_key(coord, key));
27026 +       }
27027 +       done_lh(&lh);
27028 +       return result;
27029 +
27030 +}
27031 +
27032 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
27033 +static reiser4_block_nr
27034 +estimate_init(struct inode *parent, struct inode *object)
27035 +{
27036 +       reiser4_block_nr res = 0;
27037 +
27038 +       assert("vpf-321", parent != NULL);
27039 +       assert("vpf-322", object != NULL);
27040 +
27041 +       /* hashed_add_entry(object) */
27042 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
27043 +       /* reiser4_add_nlink(object) */
27044 +       res += inode_file_plugin(object)->estimate.update(object);
27045 +       /* hashed_add_entry(object) */
27046 +       res += inode_dir_plugin(object)->estimate.add_entry(object);
27047 +       /* reiser4_add_nlink(parent) */
27048 +       res += inode_file_plugin(parent)->estimate.update(parent);
27049 +
27050 +       return res;     /* the accumulated estimate, not a constant 0 */
27051 +}
27052 +
27053 +/* helper for reiser4_dir_init_common(): add "." and ".." entries to @object */
27054 +static int create_dot_dotdot(struct inode *object /* object to create dot and
27055 +                                                  * dotdot for */ ,
27056 +                            struct inode *parent /* parent of @object */)
27057 +{
27058 +       int result;
27059 +       struct dentry dots_entry;
27060 +       reiser4_dir_entry_desc entry;
27061 +
27062 +       assert("nikita-688", object != NULL);
27063 +       assert("nikita-689", S_ISDIR(object->i_mode));
27064 +       assert("nikita-691", parent != NULL);
27065 +
27066 +       /* We store dot and dotdot as normal directory entries. This is
27067 +          not necessary, because almost all information stored in them
27068 +          is already in the stat-data of directory, the only thing
27069 +          being missed is objectid of grand-parent directory that can
27070 +          easily be added there as extension.
27071 +
27072 +          But it is done the way it is done, because not storing dot
27073 +          and dotdot will lead to the following complications:
27074 +
27075 +          . special case handling in ->lookup().
27076 +          . addition of another extension to the sd.
27077 +          . dependency on key allocation policy for stat data.
27078 +
27079 +        */
27080 +
27081 +       memset(&entry, 0, sizeof entry);
27082 +       memset(&dots_entry, 0, sizeof dots_entry);
27083 +       entry.obj = dots_entry.d_inode = object;
27084 +       dots_entry.d_name.name = ".";
27085 +       dots_entry.d_name.len = 1;
27086 +       result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
27087 +       reiser4_free_dentry_fsdata(&dots_entry);
27088 +
27089 +       if (result == 0) {
27090 +               result = reiser4_add_nlink(object, object, 0);
27091 +               if (result == 0) {
27092 +                       entry.obj = dots_entry.d_inode = parent;
27093 +                       dots_entry.d_name.name = "..";
27094 +                       dots_entry.d_name.len = 2;
27095 +                       result = reiser4_add_entry_common(object,
27096 +                                                 &dots_entry, NULL, &entry);
27097 +                       reiser4_free_dentry_fsdata(&dots_entry);
27098 +                       /* if creation of ".." failed, iput() will delete
27099 +                          object with ".". */
27100 +                       if (result == 0) {
27101 +                               result = reiser4_add_nlink(parent, object, 0);
27102 +                               if (result != 0)
27103 +                                       /*
27104 +                                        * if we failed to bump i_nlink, try
27105 +                                        * to remove ".."
27106 +                                        */
27107 +                                       reiser4_detach_common(object, parent);
27108 +                       }
27109 +               }
27110 +       }
27111 +
27112 +       if (result != 0) {
27113 +               /*
27114 +                * in the case of error, at least update stat-data so that
27115 +                * ->i_nlink updates are not lingering.
27116 +                */
27117 +               reiser4_update_sd(object);
27118 +               reiser4_update_sd(parent);
27119 +       }
27120 +
27121 +       return result;
27122 +}
27123 +
27124 +/* check_item - compare the directory entry at @coord with @name.
27125 + * return 0 iff @coord contains a directory entry for the file with the name
27126 + * @name, 1 if the stored name differs, and -EIO (through RETERR) when the
27127 + * item at @coord has no plugin or is not of @dir's directory item type. */
27128 +static int
27129 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
27130 +{
27131 +       item_plugin *iplug;
27132 +       char buf[DE_NAME_BUF_LEN];      /* scratch space handed to ->extract_name() */
27133 +
27134 +       iplug = item_plugin_by_coord(coord);
27135 +       if (iplug == NULL) {
27136 +               warning("nikita-1135", "Cannot get item plugin");
27137 +               print_coord("coord", coord, 1);
27138 +               return RETERR(-EIO);
27139 +       } else if (item_id_by_coord(coord) !=
27140 +                  item_id_by_plugin(inode_dir_item_plugin(dir))) {
27141 +               /* item id of the current item does not match the id of the
27142 +                  items this directory is built of */
27143 +               warning("nikita-1136", "Wrong item plugin");
27144 +               print_coord("coord", coord, 1);
27145 +               return RETERR(-EIO);
27146 +       }
27147 +       assert("nikita-1137", iplug->s.dir.extract_name);
27148 +
27149 +       /* Compare name stored in this entry with name we are looking for.
27150 +          "!!" folds strcmp()'s result into 0 (same name) or 1 (different).
27151 +          NOTE-NIKITA Here should go code for support of something like
27152 +          unicode, code tables, etc.
27153 +        */
27154 +       return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
27155 +}
27156 +
27157 +static int
27158 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
27159 +{       /* same test as check_item(), taking the name as a qstr */
27160 +       return WITH_COORD(coord, check_item(dir, coord, name->name));
27161 +}       /* NOTE(review): WITH_COORD() presumably keeps coord's node loaded around the call -- confirm */
27162 +
27163 +/*
27164 + * argument package used by entry_actor to scan entries with identical keys.
27165 + * It is filled in by reiser4_find_entry() before the scan starts. */
27166 +struct entry_actor_args {
27167 +       /* name we are looking for */
27168 +       const char *name;
27169 +       /* key of directory entry. entry_actor() scans through sequence of
27170 +        * items/units having the same key */
27171 +       reiser4_key *key;
27172 +       /* how many entries with duplicate key were scanned so far; bumped on every entry_actor() call. */
27173 +       int non_uniq;
27174 +#if REISER4_USE_COLLISION_LIMIT
27175 +       /* scan limit: entry_actor() fails with -EBUSY once non_uniq exceeds it */
27176 +       int max_non_uniq;
27177 +#endif
27178 +       /* return parameter: set to true, if ->name wasn't found */
27179 +       int not_found;
27180 +       /* what type of lock to take when moving to the next node during
27181 +        * scan */
27182 +       znode_lock_mode mode;
27183 +
27184 +       /* last coord that was visited during scan */
27185 +       coord_t last_coord;
27186 +       /* last node locked during scan */
27187 +       lock_handle last_lh;
27188 +       /* inode of directory */
27189 +       const struct inode *inode;
27190 +};
27191 +
27192 +/* Function called by reiser4_find_entry(), as the reiser4_iterate_tree()
27193 +   actor, to look for given name in the directory. */
27194 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
27195 +                      coord_t * coord /* current coord */ ,
27196 +                      lock_handle * lh /* current lock handle */ ,
27197 +                      void *entry_actor_arg /* argument to scan: struct entry_actor_args */ )
27198 +{
27199 +       reiser4_key unit_key;
27200 +       struct entry_actor_args *args;
27201 +
27202 +       assert("nikita-1131", tree != NULL);
27203 +       assert("nikita-1132", coord != NULL);
27204 +       assert("nikita-1133", entry_actor_arg != NULL);
27205 +
27206 +       args = entry_actor_arg;
27207 +       ++args->non_uniq;       /* counts this visit, whether or not the name matches */
27208 +#if REISER4_USE_COLLISION_LIMIT
27209 +       if (args->non_uniq > args->max_non_uniq) {
27210 +               args->not_found = 1;
27211 +               /* hash collision overflow. */
27212 +               return RETERR(-EBUSY);
27213 +       }
27214 +#endif
27215 +
27216 +       /*
27217 +        * did we just reach the end of the sequence of items/units with
27218 +        * identical keys?
27219 +        */
27220 +       if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
27221 +               assert("nikita-1791",
27222 +                      keylt(args->key, unit_key_by_coord(coord, &unit_key)));
27223 +               args->not_found = 1;
27224 +               args->last_coord.between = AFTER_UNIT;
27225 +               return 0;       /* 0 with ->not_found set: the scanned key no longer matches */
27226 +       }
27227 +
27228 +       coord_dup(&args->last_coord, coord);    /* remember the last coord whose key matched */
27229 +       /*
27230 +        * did the scan just move to the next node?
27231 +        */
27232 +       if (args->last_lh.node != lh->node) {
27233 +               int lock_result;
27234 +
27235 +               /*
27236 +                * if so, lock new node with the mode requested by the caller
27237 +                */
27238 +               done_lh(&args->last_lh);
27239 +               assert("nikita-1896", znode_is_any_locked(lh->node));
27240 +               lock_result = longterm_lock_znode(&args->last_lh, lh->node,
27241 +                                                 args->mode, ZNODE_LOCK_HIPRI);
27242 +               if (lock_result != 0)
27243 +                       return lock_result;
27244 +       }
27245 +       return check_item(args->inode, coord, args->name);      /* 0: name found; 1: mismatch (presumably continues the scan -- confirm); <0: error */
27246 +}
27247 +
27248 +/* Look for given @name within directory @dir.
27249 +   Returns 0 if the entry was found and -ENOENT if the same-key scan ran out; other lookup/scan results are passed through.
27250 +   This is called during lookup, creation and removal of directory
27251 +   entries and on reiser4_rename_common
27252 +   On return the dentry's fsdata holds the coord, and a fresh seal when the entry was found.
27253 +   First calculate key that directory entry for @name would have. Search
27254 +   for this key in the tree. If such key is found, scan all items with
27255 +   the same key, checking name in each directory entry along the way.
27256 +*/
27257 +int reiser4_find_entry(struct inode *dir,      /* directory to scan */
27258 +                      struct dentry *de,       /* name to search for */
27259 +                      lock_handle * lh,        /* resulting lock handle */
27260 +                      znode_lock_mode mode,    /* required lock mode */
27261 +                      reiser4_dir_entry_desc * entry   /* parameters of found
27262 +                                                          directory entry */)
27263 +{
27264 +       const struct qstr *name;
27265 +       seal_t *seal;
27266 +       coord_t *coord;
27267 +       int result;
27268 +       __u32 flags;
27269 +       struct de_location *dec;
27270 +       struct reiser4_dentry_fsdata *fsdata;
27271 +
27272 +       assert("nikita-1130", lh != NULL);
27273 +       assert("nikita-1128", dir != NULL);
27274 +
27275 +       name = &de->d_name;
27276 +       assert("nikita-1129", name != NULL);
27277 +
27278 +       /* dentry private data don't require lock, because dentry
27279 +          manipulations are protected by i_mutex on parent.
27280 +
27281 +          This is not so for inodes, because there is no -the- parent in
27282 +          inode case.
27283 +        */
27284 +       fsdata = reiser4_get_dentry_fsdata(de);
27285 +       if (IS_ERR(fsdata))
27286 +               return PTR_ERR(fsdata);
27287 +       dec = &fsdata->dec;
27288 +
27289 +       coord = &dec->entry_coord;      /* coord and seal live in the dentry's fsdata */
27290 +       coord_clear_iplug(coord);
27291 +       seal = &dec->entry_seal;
27292 +       /* compose key of directory entry for @name */
27293 +       inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
27294 +
27295 +       if (reiser4_seal_is_set(seal)) {
27296 +               /* check seal */
27297 +               result = reiser4_seal_validate(seal, coord, &entry->key,
27298 +                                              lh, mode, ZNODE_LOCK_LOPRI);
27299 +               if (result == 0) {
27300 +                       /* key was found. Check that it is really item we are
27301 +                          looking for. */
27302 +                       result = check_entry(dir, coord, name);
27303 +                       if (result == 0)
27304 +                               return 0;
27305 +               }
27306 +       }       /* seal unset, stale, or pointing at another name: fall back to a tree lookup */
27307 +       flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
27308 +       /*
27309 +        * find place in the tree where directory item should be located.
27310 +        */
27311 +       result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
27312 +                                      FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
27313 +                                      flags, NULL /*ra_info */ );
27314 +       if (result == CBK_COORD_FOUND) {
27315 +               struct entry_actor_args arg;
27316 +
27317 +               /* fast path: no hash collisions */
27318 +               result = check_entry(dir, coord, name);
27319 +               if (result == 0) {
27320 +                       reiser4_seal_init(seal, coord, &entry->key);
27321 +                       dec->pos = 0;
27322 +               } else if (result > 0) {        /* same key, different name; a negative result is returned as is */
27323 +                       /* Iterate through all units with the same keys. */
27324 +                       arg.name = name->name;
27325 +                       arg.key = &entry->key;
27326 +                       arg.not_found = 0;
27327 +                       arg.non_uniq = 0;
27328 +#if REISER4_USE_COLLISION_LIMIT
27329 +                       arg.max_non_uniq = max_hash_collisions(dir);
27330 +                       assert("nikita-2851", arg.max_non_uniq > 1);
27331 +#endif
27332 +                       arg.mode = mode;
27333 +                       arg.inode = dir;
27334 +                       coord_init_zero(&arg.last_coord);
27335 +                       init_lh(&arg.last_lh);
27336 +
27337 +                       result = reiser4_iterate_tree
27338 +                               (reiser4_tree_by_inode(dir),
27339 +                                coord, lh,
27340 +                                entry_actor, &arg, mode, 1);
27341 +                       /* if end of the tree or extent was reached during
27342 +                          scanning. */
27343 +                       if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
27344 +                               /* step back */
27345 +                               done_lh(lh);
27346 +
27347 +                               result = zload(arg.last_coord.node);
27348 +                               if (result == 0) {
27349 +                                       coord_clear_iplug(&arg.last_coord);
27350 +                                       coord_dup(coord, &arg.last_coord);
27351 +                                       move_lh(lh, &arg.last_lh);      /* caller gets the lock taken by entry_actor() */
27352 +                                       result = RETERR(-ENOENT);
27353 +                                       zrelse(arg.last_coord.node);
27354 +                                       --arg.non_uniq;
27355 +                               }
27356 +                       }
27357 +
27358 +                       done_lh(&arg.last_lh);
27359 +                       if (result == 0)
27360 +                               reiser4_seal_init(seal, coord, &entry->key);
27361 +
27362 +                       if (result == 0 || result == -ENOENT) {
27363 +                               assert("nikita-2580", arg.non_uniq > 0);
27364 +                               dec->pos = arg.non_uniq - 1;    /* position within the run of same-key entries */
27365 +                       }
27366 +               }
27367 +       } else  /* the lookup did not return CBK_COORD_FOUND */
27368 +               dec->pos = -1;
27369 +       return result;
27370 +}
27371 +
27372 +/*
27373 +   Local variables:
27374 +   c-indentation-style: "K&R"
27375 +   mode-name: "LC"
27376 +   c-basic-offset: 8
27377 +   tab-width: 8
27378 +   fill-column: 120
27379 +   scroll-step: 1
27380 +   End:
27381 +*/
27382 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format40.c linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format40.c
27383 --- linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format40.c     1970-01-01 03:00:00.000000000 +0300
27384 +++ linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format40.c  2008-10-12 18:20:00.000000000 +0400
27385 @@ -0,0 +1,655 @@
27386 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
27387 +
27388 +#include "../../debug.h"
27389 +#include "../../dformat.h"
27390 +#include "../../key.h"
27391 +#include "../node/node.h"
27392 +#include "../space/space_allocator.h"
27393 +#include "disk_format40.h"
27394 +#include "../plugin.h"
27395 +#include "../../txnmgr.h"
27396 +#include "../../jnode.h"
27397 +#include "../../tree.h"
27398 +#include "../../super.h"
27399 +#include "../../wander.h"
27400 +#include "../../inode.h"
27401 +#include "../../ktxnmgrd.h"
27402 +#include "../../status_flags.h"
27403 +
27404 +#include <linux/types.h>       /* for __u??  */
27405 +#include <linux/fs.h>          /* for struct super_block  */
27406 +#include <linux/buffer_head.h>
27407 +
27408 +/* reiser 4.0 default disk layout */
27409 +
27410 +/* Amount of free blocks needed to perform release_format40 when fs gets
27411 +   mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
27412 +   & tx record. */
27413 +#define RELEASE_RESERVED 4
27414 +
27415 +/* The greatest supported format40 version number */
27416 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
27417 +
27418 +/* This flag indicates that backup should be updated
27419 +   (the update is performed by fsck) */
27420 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
27421 +
27422 +/* functions to access fields of format40_disk_super_block */
27423 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
27424 +{       /* ->block_count, read unaligned and converted from little-endian */
27425 +       return le64_to_cpu(get_unaligned(&sb->block_count));
27426 +}
27427 +
27428 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
27429 +{       /* ->free_blocks, read unaligned and converted from little-endian */
27430 +       return le64_to_cpu(get_unaligned(&sb->free_blocks));
27431 +}
27432 +
27433 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
27434 +{       /* ->root_block, read unaligned and converted from little-endian */
27435 +       return le64_to_cpu(get_unaligned(&sb->root_block));
27436 +}
27437 +
27438 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
27439 +{       /* ->tree_height is a 16-bit little-endian field, read unaligned */
27440 +       return le16_to_cpu(get_unaligned(&sb->tree_height));
27441 +}
27442 +
27443 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
27444 +{       /* ->file_count, read unaligned and converted from little-endian */
27445 +       return le64_to_cpu(get_unaligned(&sb->file_count));
27446 +}
27447 +
27448 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
27449 +{       /* ->oid, read unaligned and converted from little-endian */
27450 +       return le64_to_cpu(get_unaligned(&sb->oid));
27451 +}
27452 +
27453 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
27454 +{       /* ->mkfs_id is a 32-bit little-endian field, read unaligned */
27455 +       return le32_to_cpu(get_unaligned(&sb->mkfs_id));
27456 +}
27457 +
27458 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
27459 +{       /* ->flags bit mask, read unaligned and converted from little-endian */
27460 +       return le64_to_cpu(get_unaligned(&sb->flags));
27461 +}
27462 +
27463 +static __u32 get_format40_version(const format40_disk_super_block * sb)
27464 +{       /* ->version with the FORMAT40_UPDATE_BACKUP flag bit masked off */
27465 +       return le32_to_cpu(get_unaligned(&sb->version)) &
27466 +               ~FORMAT40_UPDATE_BACKUP;
27467 +}
27468 +
27469 +static int update_backup_version(const format40_disk_super_block * sb)
27470 +{       /* non-zero iff the FORMAT40_UPDATE_BACKUP bit is set in ->version */
27471 +       return (le32_to_cpu(get_unaligned(&sb->version)) &
27472 +               FORMAT40_UPDATE_BACKUP);
27473 +}
27474 +
27475 +static int update_disk_version(const format40_disk_super_block * sb)
27476 +{       /* true iff the on-disk version is older than FORMAT40_VERSION */
27477 +       return (get_format40_version(sb) < FORMAT40_VERSION);
27478 +}
27479 +
27480 +static int incomplete_compatibility(const format40_disk_super_block * sb)
27481 +{       /* true iff the on-disk version is newer than FORMAT40_VERSION */
27482 +       return (get_format40_version(sb) > FORMAT40_VERSION);
27483 +}
27484 +
27485 +static format40_super_info *get_sb_info(struct super_block *super)
27486 +{       /* the u.format40 member of @super's reiser4 private data */
27487 +       return &get_super_private(super)->u.format40;
27488 +}
27489 +
27490 +static int consult_diskmap(struct super_block *s)
27491 +{       /* record where the journal header/footer and super block live; always returns 0 */
27492 +       format40_super_info *info;
27493 +       journal_location *jloc;
27494 +
27495 +       info = get_sb_info(s);
27496 +       jloc = &get_super_private(s)->jloc;
27497 +       /* Default format-specific locations, if there is nothing in
27498 +        * diskmap */
27499 +       jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
27500 +       jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
27501 +       info->loc.super = FORMAT40_OFFSET / s->s_blocksize;     /* byte offset converted to a block number */
27502 +#ifdef CONFIG_REISER4_BADBLOCKS
27503 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
27504 +                                 &jloc->footer);
27505 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
27506 +                                 &jloc->header);
27507 +       reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
27508 +                                 &info->loc.super);    /* return values of these calls are ignored; presumably the defaults stay on a miss -- confirm */
27509 +#endif
27510 +       return 0;
27511 +}
27512 +
27513 +/* find any valid super block of disk_format40 (even if the first
27514 +   super block is destroyed), will change block numbers of actual journal header/footer (jf/jh)
27515 +   if needed. NOTE(review): as written, only the block at info->loc.super is read and jf/jh are left alone. */
27516 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
27517 +                                                           *s)
27518 +{
27519 +       struct buffer_head *super_bh;
27520 +       format40_disk_super_block *disk_sb;
27521 +       format40_super_info *info;
27522 +
27523 +       assert("umka-487", s != NULL);
27524 +
27525 +       info = get_sb_info(s);
27526 +
27527 +       super_bh = sb_bread(s, info->loc.super);
27528 +       if (super_bh == NULL)
27529 +               return ERR_PTR(RETERR(-EIO));   /* the block could not be read */
27530 +
27531 +       disk_sb = (format40_disk_super_block *) super_bh->b_data;
27532 +       if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
27533 +               brelse(super_bh);
27534 +               return ERR_PTR(RETERR(-EINVAL));        /* magic mismatch: not a format40 super block */
27535 +       }
27536 +       /* seed the in-memory block counters from the on-disk super block */
27537 +       reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
27538 +       reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
27539 +                               le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27540 +       reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
27541 +
27542 +       return super_bh;        /* still referenced; the callers in this file brelse() it */
27543 +}
27544 +
27545 +/* find the most recent version of super block. This is called after journal is
27546 +   replayed. Returns whatever find_a_disk_format40_super_block() returns. */
27547 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
27548 +{       /* @s is used below despite the UNUSED_ARG annotation */
27549 +       /* Here the most recent superblock copy has to be read. However, as
27550 +          journal replay isn't complete, we are using
27551 +          find_a_disk_format40_super_block() function. */
27552 +       return find_a_disk_format40_super_block(s);
27553 +}
27554 +
27555 +static int get_super_jnode(struct super_block *s)
27556 +{       /* allocate, load and pin the jnode of the super block at loc.super; 0 or jload()'s error */
27557 +       reiser4_super_info_data *sbinfo = get_super_private(s);
27558 +       jnode *sb_jnode;
27559 +       int ret;
27560 +
27561 +       sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
27562 +
27563 +       ret = jload(sb_jnode);
27564 +
27565 +       if (ret) {      /* load failed: give the jnode back and report the error */
27566 +               reiser4_drop_io_head(sb_jnode);
27567 +               return ret;
27568 +       }
27569 +
27570 +       pin_jnode_data(sb_jnode);       /* pinned before the jrelse() below drops the load reference */
27571 +       jrelse(sb_jnode);
27572 +
27573 +       sbinfo->u.format40.sb_jnode = sb_jnode; /* unpinned and dropped by done_super_jnode() */
27574 +
27575 +       return 0;
27576 +}
27577 +
27578 +static void done_super_jnode(struct super_block *s)
27579 +{       /* counterpart of get_super_jnode(): unpin and drop the super block jnode */
27580 +       jnode *node = get_super_private(s)->u.format40.sb_jnode;
27581 +
27582 +       if (node == NULL)       /* no jnode recorded, nothing to undo */
27583 +               return;
27584 +       unpin_jnode_data(node);
27585 +       reiser4_drop_io_head(node);
27586 +}
27587 +
27588 +typedef enum format40_init_stage {      /* last step completed by try_init_format40(), in execution order */
27589 +       NONE_DONE = 0,
27590 +       CONSULT_DISKMAP,
27591 +       FIND_A_SUPER,
27592 +       INIT_JOURNAL_INFO,
27593 +       INIT_STATUS,
27594 +       JOURNAL_REPLAY,
27595 +       READ_SUPER,
27596 +       KEY_CHECK,
27597 +       INIT_OID,
27598 +       INIT_TREE,
27599 +       JOURNAL_RECOVER,
27600 +       INIT_SA,
27601 +       INIT_JNODE,     /* never assigned by try_init_format40(): it goes from INIT_SA to ALL_DONE */
27602 +       ALL_DONE
27603 +} format40_init_stage; /* init_format_format40() switches on this to undo a failed init */
27604 +
27605 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
27606 +{       /* kmalloc()ed duplicate of the super block in @super_bh, or ERR_PTR(-ENOMEM) */
27607 +       format40_disk_super_block *dup;
27608 +       const size_t len = sizeof(format40_disk_super_block);
27609 +
27610 +       dup = kmalloc(len, reiser4_ctx_gfp_mask_get());
27611 +       if (dup != NULL) {
27612 +               memcpy(dup, super_bh->b_data, len);
27613 +               return dup;     /* try_init_format40() kfree()s it */
27614 +       }
27615 +       return ERR_PTR(RETERR(-ENOMEM));        /* allocation failed */
27616 +}
27617 +
27618 +static int check_key_format(const format40_disk_super_block *sb_copy)
27619 +{       /* 0 if the on-disk FORMAT40_LARGE_KEYS flag agrees with REISER4_LARGE_KEY, else -EINVAL */
27620 +       if (!equi(REISER4_LARGE_KEY,
27621 +                 get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {  /* equi(): presumably logical equivalence -- confirm */
27622 +               warning("nikita-3228", "Key format mismatch. "
27623 +                       "Only %s keys are supported.",
27624 +                       REISER4_LARGE_KEY ? "large" : "small");
27625 +               return RETERR(-EINVAL);
27626 +       }
27627 +       return 0;
27628 +}
27629 +
27630 +/**
27631 + * try_init_format40
27632 + * @super:
27633 + * @stage:
27634 + *
27635 + */
27636 +static int try_init_format40(struct super_block *super,
27637 +                            format40_init_stage *stage)
27638 +{
27639 +       int result;
27640 +       struct buffer_head *super_bh;
27641 +       reiser4_super_info_data *sbinfo;
27642 +       format40_disk_super_block *sb_copy;
27643 +       tree_level height;
27644 +       reiser4_block_nr root_block;
27645 +       node_plugin *nplug;
27646 +
27647 +       assert("vs-475", super != NULL);
27648 +       assert("vs-474", get_super_private(super));
27649 +
27650 +       *stage = NONE_DONE;
27651 +
27652 +       result = consult_diskmap(super);
27653 +       if (result)
27654 +               return result;
27655 +       *stage = CONSULT_DISKMAP;
27656 +
27657 +       super_bh = find_a_disk_format40_super_block(super);
27658 +       if (IS_ERR(super_bh))
27659 +               return PTR_ERR(super_bh);
27660 +       brelse(super_bh);
27661 +       *stage = FIND_A_SUPER;
27662 +
27663 +       /* ok, we are sure that filesystem format is a format40 format */
27664 +
27665 +       /* map jnodes for journal control blocks (header, footer) to disk  */
27666 +       result = reiser4_init_journal_info(super);
27667 +       if (result)
27668 +               return result;
27669 +       *stage = INIT_JOURNAL_INFO;
27670 +
27671 +       /* ok, we are sure that filesystem format is a format40 format */
27672 +       /* Now check it's state */
27673 +       result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
27674 +       if (result != 0 && result != -EINVAL)
27675 +               /* -EINVAL means there is no magic, so probably just old
27676 +                * fs. */
27677 +               return result;
27678 +       *stage = INIT_STATUS;
27679 +
27680 +       result = reiser4_status_query(NULL, NULL);
27681 +       if (result == REISER4_STATUS_MOUNT_WARN)
27682 +               notice("vpf-1363", "Warning: mounting %s with errors.",
27683 +                      super->s_id);
27684 +       if (result == REISER4_STATUS_MOUNT_RO)
27685 +               notice("vpf-1364", "Warning: mounting %s with fatal errors,"
27686 +                      " forcing read-only mount.", super->s_id);
27687 +       result = reiser4_journal_replay(super);
27688 +       if (result)
27689 +               return result;
27690 +       *stage = JOURNAL_REPLAY;
27691 +
27692 +       super_bh = read_super_block(super);
27693 +       if (IS_ERR(super_bh))
27694 +               return PTR_ERR(super_bh);
27695 +       *stage = READ_SUPER;
27696 +
27697 +       /* allocate and make a copy of format40_disk_super_block */
27698 +       sb_copy = copy_sb(super_bh);
27699 +       brelse(super_bh);
27700 +
27701 +       if (IS_ERR(sb_copy))
27702 +               return PTR_ERR(sb_copy);
27703 +       printk("reiser4: %s: found disk format 4.0.%u.\n",
27704 +              super->s_id,
27705 +              get_format40_version(sb_copy));
27706 +       if (incomplete_compatibility(sb_copy))
27707 +               printk("reiser4: Warning: The last completely supported "
27708 +                      "version of disk format40 is %u. Some objects of "
27709 +                      "the semantic tree can be unaccessible.\n",
27710 +                      FORMAT40_VERSION);
27711 +       /* make sure that key format of kernel and filesystem match */
27712 +       result = check_key_format(sb_copy);
27713 +       if (result) {
27714 +               kfree(sb_copy);
27715 +               return result;
27716 +       }
27717 +       *stage = KEY_CHECK;
27718 +
27719 +       result = oid_init_allocator(super, get_format40_file_count(sb_copy),
27720 +                                   get_format40_oid(sb_copy));
27721 +       if (result) {
27722 +               kfree(sb_copy);
27723 +               return result;
27724 +       }
27725 +       *stage = INIT_OID;
27726 +
27727 +       /* get things necessary to init reiser4_tree */
27728 +       root_block = get_format40_root_block(sb_copy);
27729 +       height = get_format40_tree_height(sb_copy);
27730 +       nplug = node_plugin_by_id(NODE40_ID);
27731 +
27732 +       /* initialize reiser4_super_info_data */
27733 +       sbinfo = get_super_private(super);
27734 +       assert("", sbinfo->tree.super == super);
27735 +       /* init reiser4_tree for the filesystem */
27736 +       result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
27737 +       if (result) {
27738 +               kfree(sb_copy);
27739 +               return result;
27740 +       }
27741 +       *stage = INIT_TREE;
27742 +
27743 +       /*
27744 +        * initialize reiser4_super_info_data with data from format40 super
27745 +        * block
27746 +        */
27747 +       sbinfo->default_uid = 0;
27748 +       sbinfo->default_gid = 0;
27749 +       sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
27750 +       /* number of blocks in filesystem and reserved space */
27751 +       reiser4_set_block_count(super, get_format40_block_count(sb_copy));
27752 +       sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
27753 +       sbinfo->version = get_format40_version(sb_copy);
27754 +       kfree(sb_copy);
27755 +
27756 +       if (update_backup_version(sb_copy))
27757 +               printk("reiser4: Warning: metadata backup is not updated. "
27758 +                      "Please run 'fsck.reiser4 --fix' on %s.\n",
27759 +                      super->s_id);
27760 +
27761 +       sbinfo->fsuid = 0;
27762 +       sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
27763 +                                                * are not supported */
27764 +       sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN);     /* all nodes in
27765 +                                                                * layout 40 are
27766 +                                                                * of one
27767 +                                                                * plugin */
27768 +       /* sbinfo->tmgr is initialized already */
27769 +
27770 +       /* recover sb data which were logged separately from sb block */
27771 +
27772 +       /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
27773 +        * oid_init_allocator() and reiser4_set_free_blocks() with new
27774 +        * data. What's the reason to call them above? */
27775 +       result = reiser4_journal_recover_sb_data(super);
27776 +       if (result != 0)
27777 +               return result;
27778 +       *stage = JOURNAL_RECOVER;
27779 +
27780 +       /*
27781 +        * Set number of used blocks.  The number of used blocks is not stored
27782 +        * neither in on-disk super block nor in the journal footer blocks.  At
27783 +        * this moment actual values of total blocks and free block counters
27784 +        * are set in the reiser4 super block (in-memory structure) and we can
27785 +        * calculate number of used blocks from them.
27786 +        */
27787 +       reiser4_set_data_blocks(super,
27788 +                               reiser4_block_count(super) -
27789 +                               reiser4_free_blocks(super));
27790 +
27791 +#if REISER4_DEBUG
27792 +       sbinfo->min_blocks_used = 16 /* reserved area */  +
27793 +               2 /* super blocks */  +
27794 +               2 /* journal footer and header */ ;
27795 +#endif
27796 +
27797 +       /* init disk space allocator */
27798 +       result = sa_init_allocator(reiser4_get_space_allocator(super),
27799 +                                  super, NULL);
27800 +       if (result)
27801 +               return result;
27802 +       *stage = INIT_SA;
27803 +
27804 +       result = get_super_jnode(super);
27805 +       if (result == 0)
27806 +               *stage = ALL_DONE;
27807 +       return result;
27808 +}
27809 +
27810 +/* plugin->u.format.get_ready: run try_init_format40() and undo the completed steps if it stopped early */
27811 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
27812 +{
27813 +       int result;
27814 +       format40_init_stage stage;
27815 +
27816 +       result = try_init_format40(s, &stage);
27817 +       switch (stage) {        /* cases cascade downwards on purpose: later stages undo earlier ones too */
27818 +       case ALL_DONE:
27819 +               assert("nikita-3458", result == 0);
27820 +               break;
27821 +       case INIT_JNODE:        /* NOTE(review): try_init_format40() never reports this stage */
27822 +               done_super_jnode(s);    /* fall through */
27823 +       case INIT_SA:
27824 +               sa_destroy_allocator(reiser4_get_space_allocator(s), s);        /* fall through */
27825 +       case JOURNAL_RECOVER:
27826 +       case INIT_TREE:
27827 +               reiser4_done_tree(&get_super_private(s)->tree); /* fall through */
27828 +       case INIT_OID:
27829 +       case KEY_CHECK:
27830 +       case READ_SUPER:
27831 +       case JOURNAL_REPLAY:
27832 +       case INIT_STATUS:
27833 +               reiser4_status_finish();        /* fall through */
27834 +       case INIT_JOURNAL_INFO:
27835 +               reiser4_done_journal_info(s);   /* fall through */
27836 +       case FIND_A_SUPER:
27837 +       case CONSULT_DISKMAP:
27838 +       case NONE_DONE:
27839 +               break;
27840 +       default:
27841 +               impossible("nikita-3457", "init stage: %i", stage);
27842 +       }
27843 +       /* NOTE(review): this check also runs after a failed init, where it can replace @result with -ENOSPC; after a successful init it returns without undoing anything -- confirm intended */
27844 +       if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
27845 +               return RETERR(-ENOSPC);
27846 +
27847 +       return result;
27848 +}
27849 +
27850 +static void pack_format40_super(const struct super_block *s, char *data)
27851 +{       /* refresh the mutable fields of the on-disk super block image at @data from in-memory state */
27852 +       format40_disk_super_block *super_data =
27853 +           (format40_disk_super_block *) data;
27854 +
27855 +       reiser4_super_info_data *sbinfo = get_super_private(s);
27856 +
27857 +       assert("zam-591", data != NULL);
27858 +       /* each field is stored little-endian through put_unaligned() */
27859 +       put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
27860 +                     &super_data->free_blocks);
27861 +
27862 +       put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
27863 +                     &super_data->root_block);
27864 +
27865 +       put_unaligned(cpu_to_le64(oid_next(s)),
27866 +                     &super_data->oid);
27867 +
27868 +       put_unaligned(cpu_to_le64(oids_used(s)),
27869 +                     &super_data->file_count);
27870 +
27871 +       put_unaligned(cpu_to_le16(sbinfo->tree.height),
27872 +                     &super_data->tree_height);
27873 +
27874 +       if (update_disk_version(super_data)) {  /* image is older than FORMAT40_VERSION: upgrade it */
27875 +               __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;      /* flag reported as "metadata backup is not updated" at mount */
27876 +
27877 +               put_unaligned(cpu_to_le32(version), &super_data->version);
27878 +       }
27879 +}
27880 +
27881 +/* plugin->u.format.log_super
27882 +   Refresh the super block image held by the format40 super block jnode and
27883 +   return that jnode, which should be added to the transaction being logged */
27884 +jnode *log_super_format40(struct super_block *s)
27885 +{
27886 +       jnode *sb_jnode;
27887 +
27888 +       sb_jnode = get_super_private(s)->u.format40.sb_jnode;
27889 +
27890 +       jload(sb_jnode);        /* NOTE(review): if jload() can fail, that is ignored here - confirm */
27891 +
27892 +       pack_format40_super(s, jdata(sb_jnode));
27893 +
27894 +       jrelse(sb_jnode);
27895 +
27896 +       return sb_jnode;
27897 +}
27898 +
27899 +/* plugin->u.format.release. Failures are only logged; 0 is always returned. */
27900 +int release_format40(struct super_block *s)
27901 +{
27902 +       int ret;
27903 +       reiser4_super_info_data *sbinfo;
27904 +
27905 +       sbinfo = get_super_private(s);
27906 +       assert("zam-579", sbinfo != NULL);
27907 +
27908 +       if (!rofs_super(s)) {
27909 +               ret = reiser4_capture_super_block(s);
27910 +               if (ret != 0)
27911 +                       warning("vs-898",
27912 +                               "reiser4_capture_super_block failed: %d",
27913 +                               ret);
27914 +
27915 +               ret = txnmgr_force_commit_all(s, 1);
27916 +               if (ret != 0)
27917 +                       warning("jmacd-74438", "txn_force failed: %d", ret);
27918 +
27919 +               all_grabbed2free();
27920 +       }
27921 +
27922 +       sa_destroy_allocator(&sbinfo->space_allocator, s);
27923 +       reiser4_done_journal_info(s);
27924 +       done_super_jnode(s);
27925 +
27926 +       rcu_barrier();
27927 +       reiser4_done_tree(&sbinfo->tree);
27928 +       /* call rcu_barrier() once more, because some znodes were "released"
27929 +        * in reiser4_done_tree(). */
27930 +       rcu_barrier();
27931 +
27932 +       return 0;
27933 +}
27934 +
27935 +#define FORMAT40_ROOT_LOCALITY 41      /* locality used in the root directory key */
27936 +#define FORMAT40_ROOT_OBJECTID 42      /* objectid used in the root directory key */
27937 +
27938 +/* plugin->u.format.root_dir_key: returns the fixed root directory key */
27939 +const reiser4_key *root_dir_key_format40(const struct super_block *super
27940 +                                        UNUSED_ARG)
27941 +{
27942 +       static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
27943 +               .el = {
27944 +                       __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
27945 +#if REISER4_LARGE_KEY
27946 +                       ON_LARGE_KEY(0ull,)
27947 +#endif
27948 +                       __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
27949 +                       0ull
27950 +               }
27951 +       };
27952 +
27953 +       return &FORMAT40_ROOT_DIR_KEY;
27954 +}
27955 +
27956 +/* plugin->u.format.check_open.
27957 +   Validate an opened object: neither its oid nor its locality may exceed
27958 +   oid_next() - 1, which the warnings below call the max used oid. Returns 0
27959 +   when both are in range and -EIO (after a warning) otherwise. */
27960 +int check_open_format40(const struct inode *object)
27961 +{
27962 +       const oid_t last_used = oid_next(object->i_sb) - 1;
27963 +       oid_t id;
27964 +
27965 +       /* First the object id itself. */
27966 +       id = get_inode_oid(object);
27967 +       if (id > last_used) {
27968 +               warning("vpf-1360", "The object with the oid %llu "
27969 +                       "greater then the max used oid %llu found.",
27970 +                       (unsigned long long)id,
27971 +                       (unsigned long long)last_used);
27972 +
27973 +               return RETERR(-EIO);
27974 +       }
27975 +
27976 +       /* Then the locality recorded in the reiser4 inode data. */
27977 +       id = reiser4_inode_data(object)->locality_id;
27978 +       if (id <= last_used)
27979 +               return 0;
27980 +
27981 +       warning("vpf-1361", "The object with the locality %llu "
27982 +               "greater then the max used oid %llu found.",
27983 +               (unsigned long long)id,
27984 +               (unsigned long long)last_used);
27985 +
27986 +       return RETERR(-EIO);
27987 +}
27988 +
27989 +/* plugin->u.format.version_update.
27990 +   Bring the on-disk format40_disk_super_block.version up to FORMAT40_VERSION.
27991 +   Nothing is done for read-only mounts or when the version is not older.
27992 + */
27993 +int version_update_format40(struct super_block *super) {
27994 +       txn_handle * trans;
27995 +       lock_handle lh;
27996 +       txn_atom *atom;
27997 +       int ret;
27998 +
27999 +       /* Nothing to do if RO mount or the on-disk version is not less. */
28000 +       if (super->s_flags & MS_RDONLY)
28001 +               return 0;
28002 +
28003 +       if (get_super_private(super)->version >= FORMAT40_VERSION)
28004 +               return 0;
28005 +
28006 +       printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
28007 +              "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
28008 +              "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
28009 +
28010 +       /* Mark the uber znode dirty to call log_super on write_logs. */
28011 +       init_lh(&lh);
28012 +       ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
28013 +                            ZNODE_LOCK_HIPRI, &lh);
28014 +       if (ret != 0)
28015 +               return ret;
28016 +
28017 +       znode_make_dirty(lh.node);
28018 +       done_lh(&lh);
28019 +
28020 +       /* NOTE(review): the backup blocks are not updated here (see printk). */
28021 +
28022 +       /* Force write_logs immediately. */
28023 +       trans = get_current_context()->trans;
28024 +       atom = get_current_atom_locked();
28025 +       assert("vpf-1906", atom != NULL);
28026 +
28027 +       spin_lock_txnh(trans);  /* presumably dropped inside force_commit_atom() - confirm */
28028 +       return force_commit_atom(trans);
28029 +}
28030 +
28031 +/* Make Linus happy.
28032 +   Local variables:
28033 +   c-indentation-style: "K&R"
28034 +   mode-name: "LC"
28035 +   c-basic-offset: 8
28036 +   tab-width: 8
28037 +   fill-column: 120
28038 +   scroll-step: 1
28039 +   End:
28040 +*/
28041 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format40.h linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format40.h
28042 --- linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format40.h     1970-01-01 03:00:00.000000000 +0300
28043 +++ linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format40.h  2008-10-12 18:20:00.000000000 +0400
28044 @@ -0,0 +1,109 @@
28045 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28046 +
28047 +/* this file contains:
28048 +   - definition of ondisk super block of standard disk layout for
28049 +     reiser 4.0 (layout 40)
28050 +   - definition of layout 40 specific portion of in-core super block
28051 +   - declarations of functions implementing methods of layout plugin
28052 +     for layout 40
28053 +   - declarations of functions used to get/set fields in layout 40 super block
28054 +*/
28055 +
28056 +#ifndef __DISK_FORMAT40_H__
28057 +#define __DISK_FORMAT40_H__
28058 +
28059 +/* magic for default reiser4 layout (see format40_disk_super_block.magic) */
28060 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
28061 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)      /* one page after REISER4_MASTER_OFFSET */
28062 +
28063 +#include "../../dformat.h"
28064 +
28065 +#include <linux/fs.h>          /* for struct super_block  */
28066 +
28067 +typedef enum {
28068 +       FORMAT40_LARGE_KEYS     /* presumably a bit number in format40_disk_super_block.flags - confirm */
28069 +} format40_flags;
28070 +
28071 +/* ondisk super block for format 40. It is 512 bytes long */
28072 +typedef struct format40_disk_super_block {
28073 +       /*   0 */ d64 block_count;
28074 +       /* number of blocks in a filesystem */
28075 +       /*   8 */ d64 free_blocks;
28076 +       /* number of free blocks */
28077 +       /*  16 */ d64 root_block;
28078 +       /* filesystem tree root block */
28079 +       /*  24 */ d64 oid;
28080 +       /* smallest free objectid */
28081 +       /*  32 */ d64 file_count;
28082 +       /* number of files in a filesystem */
28083 +       /*  40 */ d64 flushes;
28084 +       /* number of times super block was
28085 +          flushed. Needed if format 40
28086 +          will have several super blocks */
28087 +       /*  48 */ d32 mkfs_id;
28088 +       /* unique identifier of fs */
28089 +       /*  52 */ char magic[16];
28090 +       /* magic string ReIsEr40FoRmAt */
28091 +       /*  68 */ d16 tree_height;
28092 +       /* height of filesystem tree */
28093 +       /*  70 */ d16 formatting_policy;
28094 +       /* not used anymore */
28095 +       /*  72 */ d64 flags;
28096 +       /*  80 */ d32 version;
28097 +       /* on-disk format version number
28098 +          initially assigned by mkfs as the greatest format40
28099 +          version number supported by reiser4progs and updated
28100 +          at mount time in accordance with the greatest format40
28101 +          version number supported by the kernel.
28102 +          It is used by fsck to catch possible corruption and
28103 +          for various compatibility issues */
28104 +       /*  84 */ char not_used[428];
28105 +} format40_disk_super_block;
28106 +
28107 +/* format 40 specific part of reiser4_super_info_data */
28108 +typedef struct format40_super_info {
28109 +/*     format40_disk_super_block actual_sb; */
28110 +       jnode *sb_jnode;        /* jnode holding the format40 super block */
28111 +       struct {
28112 +               reiser4_block_nr super; /* presumably the block number of the super block - confirm */
28113 +       } loc;
28114 +} format40_super_info;
28115 +
28116 +/* Block numbers of the journal header, journal footer and status block. */
28117 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
28118 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
28119 +
28120 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
28121 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
28122 +
28123 +#define FORMAT40_STATUS_BLOCKNR \
28124 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
28125 +
28126 +/* Diskmap declarations: plugin id (type in the high 16 bits) and labels */
28127 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
28128 +#define FORMAT40_SUPER 1       /* presumably the label of the super block - confirm */
28129 +#define FORMAT40_JH 2          /* presumably the label of the journal header - confirm */
28130 +#define FORMAT40_JF 3          /* presumably the label of the journal footer - confirm */
28131 +
28132 +/* declarations of functions implementing methods of layout plugin for
28133 +   format 40. The functions themselves are in disk_format40.c */
28134 +extern int init_format_format40(struct super_block *, void *data);
28135 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
28136 +extern int release_format40(struct super_block *s);
28137 +extern jnode *log_super_format40(struct super_block *s);
28138 +extern int check_open_format40(const struct inode *object);
28139 +extern int version_update_format40(struct super_block *super);
28140 +
28141 +/* __DISK_FORMAT40_H__ */
28142 +#endif
28143 +
28144 +/* Make Linus happy.
28145 +   Local variables:
28146 +   c-indentation-style: "K&R"
28147 +   mode-name: "LC"
28148 +   c-basic-offset: 8
28149 +   tab-width: 8
28150 +   fill-column: 120
28151 +   scroll-step: 1
28152 +   End:
28153 +*/
28154 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format.c linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format.c
28155 --- linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format.c       1970-01-01 03:00:00.000000000 +0300
28156 +++ linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format.c    2008-10-12 18:20:01.000000000 +0400
28157 @@ -0,0 +1,38 @@
28158 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28159 +
28160 +#include "../../debug.h"
28161 +#include "../plugin_header.h"
28162 +#include "disk_format40.h"
28163 +#include "disk_format.h"
28164 +#include "../plugin.h"
28165 +
28166 +/* table of disk layout plugins, indexed by disk_format_id */
28167 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
28168 +       [FORMAT40_ID] = {
28169 +               .h = {
28170 +                       .type_id = REISER4_FORMAT_PLUGIN_TYPE,
28171 +                       .id = FORMAT40_ID,
28172 +                       .pops = NULL,
28173 +                       .label = "reiser40",
28174 +                       .desc = "standard disk layout for reiser40",
28175 +                       .linkage = {NULL, NULL}
28176 +               },
28177 +               .init_format = init_format_format40,
28178 +               .root_dir_key = root_dir_key_format40,
28179 +               .release = release_format40,
28180 +               .log_super = log_super_format40,
28181 +               .check_open = check_open_format40,
28182 +               .version_update = version_update_format40
28183 +       }
28184 +};
28185 +
28186 +/* Make Linus happy.
28187 +   Local variables:
28188 +   c-indentation-style: "K&R"
28189 +   mode-name: "LC"
28190 +   c-basic-offset: 8
28191 +   tab-width: 8
28192 +   fill-column: 120
28193 +   scroll-step: 1
28194 +   End:
28195 +*/
28196 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format.h linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format.h
28197 --- linux-2.6.27.orig/fs/reiser4/plugin/disk_format/disk_format.h       1970-01-01 03:00:00.000000000 +0300
28198 +++ linux-2.6.27/fs/reiser4/plugin/disk_format/disk_format.h    2008-10-12 18:20:01.000000000 +0400
28199 @@ -0,0 +1,27 @@
28200 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28201 +
28202 +/* identifiers for disk layouts, they are also used as indexes in array of disk
28203 +   plugins */
28204 +
28205 +#if !defined( __REISER4_DISK_FORMAT_H__ )
28206 +#define __REISER4_DISK_FORMAT_H__
28207 +
28208 +typedef enum {
28209 +       /* standard reiser4 disk layout plugin id */
28210 +       FORMAT40_ID,
28211 +       LAST_FORMAT_ID  /* number of disk layout ids; keep it last */
28212 +} disk_format_id;
28213 +
28214 +/* __REISER4_DISK_FORMAT_H__ */
28215 +#endif
28216 +
28217 +/* Make Linus happy.
28218 +   Local variables:
28219 +   c-indentation-style: "K&R"
28220 +   mode-name: "LC"
28221 +   c-basic-offset: 8
28222 +   tab-width: 8
28223 +   fill-column: 120
28224 +   scroll-step: 1
28225 +   End:
28226 +*/
28227 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/disk_format/Makefile linux-2.6.27/fs/reiser4/plugin/disk_format/Makefile
28228 --- linux-2.6.27.orig/fs/reiser4/plugin/disk_format/Makefile    1970-01-01 03:00:00.000000000 +0300
28229 +++ linux-2.6.27/fs/reiser4/plugin/disk_format/Makefile 2008-10-12 18:20:01.000000000 +0400
28230 @@ -0,0 +1,5 @@
28231 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
28232 +# df_plugins.o is a composite object linked from the objects listed below
28233 +df_plugins-objs :=     \
28234 +       disk_format40.o \
28235 +       disk_format.o
28236 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/fibration.c linux-2.6.27/fs/reiser4/plugin/fibration.c
28237 --- linux-2.6.27.orig/fs/reiser4/plugin/fibration.c     1970-01-01 03:00:00.000000000 +0300
28238 +++ linux-2.6.27/fs/reiser4/plugin/fibration.c  2008-10-12 18:20:01.000000000 +0400
28239 @@ -0,0 +1,175 @@
28240 +/* Copyright 2004 by Hans Reiser, licensing governed by
28241 + * reiser4/README */
28242 +
28243 +/* Directory fibrations */
28244 +
28245 +/*
28246 + * Suppose we have a directory tree with sources of some project. During
28247 + * compilation .o files are created within this tree. This makes access
28248 + * to the original source files less efficient, because source files are
28249 + * now "diluted" by object files: default directory plugin uses prefix
28250 + * of a file name as a part of the key for directory entry (and this
28251 + * part is also inherited by the key of file body). This means that
28252 + * foo.o will be located close to foo.c and foo.h in the tree.
28253 + *
28254 + * To avoid this effect the directory plugin fills the highest 7 (unused
28255 + * originally) bits of the second component of the directory entry key
28256 + * by bit-pattern depending on the file name (see
28257 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
28258 + * "fibre". Fibre of the file name key is inherited by key of stat data
28259 + * and keys of file body (in the case of REISER4_LARGE_KEY).
28260 + *
28261 + * Fibre for a given file is chosen by per-directory fibration
28262 + * plugin. Names within given fibre are ordered lexicographically.
28263 + */
28264 +
28265 +#include "../debug.h"
28266 +#include "plugin_header.h"
28267 +#include "plugin.h"
28268 +#include "../super.h"
28269 +#include "../inode.h"
28270 +
28271 +#include <linux/types.h>
28272 +
28273 +static const int fibre_shift = 57;     /* fibre lives in the 7 bits from here up */
28274 +
28275 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)      /* fibre number @n moved into position */
28276 +
28277 +/*
28278 + * Trivial fibration: every name gets fibre 0, so all files of a directory
28279 + * are just ordered lexicographically. @dir, @name and @len are not used.
28280 + */
28281 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
28282 +{
28283 +       return FIBRE_NO(0);
28284 +}
28285 +
28286 +/*
28287 + * dot-o fibration: names ending in ".o" get fibre 1, all others fibre 0.
28288 + */
28289 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
28290 +{
28291 +       /* at least one character has to precede the ".o" suffix */
28292 +       if (len <= 2)
28293 +               return FIBRE_NO(0);
28294 +
28295 +       return FIBRE_NO(name[len - 2] == '.' && name[len - 1] == 'o' ? 1 : 0);
28296 +}
28297 +
28298 +/*
28299 + * ext.1 fibration: a name with a one-character extension ("foo.h") goes
28300 + * into the fibre selected by that character; only its low 7 bits survive
28301 + * the shift done by FIBRE_NO(). All other names share fibre 0.
28302 + */
28303 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
28304 +{
28305 +       if (len <= 2 || name[len - 2] != '.')
28306 +               return FIBRE_NO(0);     /* no one-character extension */
28307 +
28308 +       return FIBRE_NO(name[len - 1]);
28309 +}
28310 +
28311 +/*
28312 + * ext.3 fibration: a name with a 3-character extension gets the fibre
28313 + * selected by the sum of those characters; all other names get fibre 0.
28314 + */
28315 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
28316 +{
28317 +       if (len <= 4 || name[len - 4] != '.')
28318 +               return FIBRE_NO(0);     /* no three-character extension */
28319 +
28320 +       return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
28321 +}
28322 +
28323 +static int change_fibration(struct inode *inode,
28324 +                           reiser4_plugin * plugin,
28325 +                           pset_member memb)
28326 +{
28327 +       /* Set @plugin in the PSET_FIBRATION slot of @inode's plugin set.
28328 +          @memb is not used here. */
28329 +       assert("nikita-3503", inode != NULL);
28330 +       assert("nikita-3504", plugin != NULL);
28331 +
28332 +       assert("nikita-3505", is_reiser4_inode(inode));
28333 +       assert("nikita-3506", inode_dir_plugin(inode) != NULL);
28334 +       assert("nikita-3507",
28335 +              plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
28336 +
28337 +       /* nothing to change: a plugin with the same id is already set */
28338 +       if (inode_fibration_plugin(inode) != NULL &&
28339 +           inode_fibration_plugin(inode)->h.id == plugin->h.id)
28340 +               return 0;
28341 +
28342 +       /* the plugin is replaced only when is_dir_empty() returns 0 */
28343 +       if (is_dir_empty(inode) != 0)
28344 +               return RETERR(-ENOTEMPTY);
28345 +
28346 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
28347 +                              PSET_FIBRATION, plugin);
28348 +}
28349 +
28350 +static reiser4_plugin_ops fibration_plugin_ops = {     /* shared by all fibration plugins */
28351 +       .init = NULL,
28352 +       .load = NULL,
28353 +       .save_len = NULL,
28354 +       .save = NULL,
28355 +       .change = change_fibration      /* the only operation provided */
28356 +};
28357 +
28358 +/* fibration plugins, indexed by reiser4_fibration_id */
28359 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
28360 +       [FIBRATION_LEXICOGRAPHIC] = {
28361 +               .h = {
28362 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28363 +                       .id = FIBRATION_LEXICOGRAPHIC,
28364 +                       .pops = &fibration_plugin_ops,
28365 +                       .label = "lexicographic",
28366 +                       .desc = "no fibration",
28367 +                       .linkage = {NULL, NULL}
28368 +               },
28369 +               .fibre = fibre_trivial
28370 +       },
28371 +       [FIBRATION_DOT_O] = {
28372 +               .h = {
28373 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28374 +                       .id = FIBRATION_DOT_O,
28375 +                       .pops = &fibration_plugin_ops,
28376 +                       .label = "dot-o",
28377 +                       .desc = "fibrate .o files separately",
28378 +                       .linkage = {NULL, NULL}
28379 +               },
28380 +               .fibre = fibre_dot_o
28381 +       },
28382 +       [FIBRATION_EXT_1] = {
28383 +               .h = {
28384 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28385 +                       .id = FIBRATION_EXT_1,
28386 +                       .pops = &fibration_plugin_ops,
28387 +                       .label = "ext-1",
28388 +                       .desc = "fibrate file by single character extension",
28389 +                       .linkage = {NULL, NULL}
28390 +               },
28391 +               .fibre = fibre_ext_1
28392 +       },
28393 +       [FIBRATION_EXT_3] = {
28394 +               .h = {
28395 +                       .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
28396 +                       .id = FIBRATION_EXT_3,
28397 +                       .pops = &fibration_plugin_ops,
28398 +                       .label = "ext-3",
28399 +                       .desc = "fibrate file by three character extension",
28400 +                       .linkage = {NULL, NULL}
28401 +               },
28402 +               .fibre = fibre_ext_3
28403 +       }
28404 +};
28405 +
28406 +/*
28407 + * Local variables:
28408 + * c-indentation-style: "K&R"
28409 + * mode-name: "LC"
28410 + * c-basic-offset: 8
28411 + * tab-width: 8
28412 + * fill-column: 79
28413 + * End:
28414 + */
28415 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/fibration.h linux-2.6.27/fs/reiser4/plugin/fibration.h
28416 --- linux-2.6.27.orig/fs/reiser4/plugin/fibration.h     1970-01-01 03:00:00.000000000 +0300
28417 +++ linux-2.6.27/fs/reiser4/plugin/fibration.h  2008-10-12 18:20:01.000000000 +0400
28418 @@ -0,0 +1,37 @@
28419 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
28420 +
28421 +/* Fibration plugin used by hashed directory plugin to segment content
28422 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
28423 +
28424 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
28425 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
28426 +
28427 +#include "plugin_header.h"
28428 +
28429 +typedef struct fibration_plugin {
28430 +       /* generic fields */
28431 +       plugin_header h;
28432 +       /* returns the fibre bits for @name (@len characters) in @dir */
28433 +        __u64(*fibre) (const struct inode * dir, const char *name, int len);
28434 +} fibration_plugin;
28435 +
28436 +typedef enum {
28437 +       FIBRATION_LEXICOGRAPHIC,        /* no fibration */
28438 +       FIBRATION_DOT_O,                /* .o files are fibrated separately */
28439 +       FIBRATION_EXT_1,                /* by single character extension */
28440 +       FIBRATION_EXT_3,                /* by three character extension */
28441 +       LAST_FIBRATION_ID               /* number of fibration ids; keep it last */
28442 +} reiser4_fibration_id;
28443 +
28444 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
28445 +#endif
28446 +
28447 +/* Make Linus happy.
28448 +   Local variables:
28449 +   c-indentation-style: "K&R"
28450 +   mode-name: "LC"
28451 +   c-basic-offset: 8
28452 +   tab-width: 8
28453 +   fill-column: 120
28454 +   End:
28455 +*/
28456 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/cryptcompress.c linux-2.6.27/fs/reiser4/plugin/file/cryptcompress.c
28457 --- linux-2.6.27.orig/fs/reiser4/plugin/file/cryptcompress.c    1970-01-01 03:00:00.000000000 +0300
28458 +++ linux-2.6.27/fs/reiser4/plugin/file/cryptcompress.c 2008-10-12 18:20:01.000000000 +0400
28459 @@ -0,0 +1,3775 @@
28460 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
28461 +   reiser4/README */
28462 +/*
28463 + * Written by Edward Shishkin.
28464 + *
28465 + * Implementations of inode/file/address_space operations
28466 + * specific for cryptcompress file plugin which manages
28467 + * regular files built of compressed and(or) encrypted bodies.
28468 + * See http://dev.namesys.com/CryptcompressPlugin for details.
28469 + */
28470 +
28471 +#include "../../inode.h"
28472 +#include "../cluster.h"
28473 +#include "../object.h"
28474 +#include "../../tree_walk.h"
28475 +#include "cryptcompress.h"
28476 +
28477 +#include <linux/pagevec.h>
28478 +#include <asm/uaccess.h>
28479 +#include <linux/swap.h>
28480 +#include <linux/writeback.h>
28481 +#include <linux/random.h>
28482 +#include <linux/scatterlist.h>
28483 +
28484 +/*
28485 +               Managing primary and secondary caches by Reiser4
28486 +               cryptcompress file plugin. Synchronization scheme.
28487 +
28488 +
28489 +                                             +------------------+
28490 +                        +------------------->|    tfm stream    |
28491 +                        |                    | (compressed data)|
28492 +                  flush |                    +------------------+
28493 +                        +-----------------+           |
28494 +                        |(->)longterm lock|           V
28495 +--+        writepages() |                 |        +-***-+  reiser4        +---+
28496 +  |                     |                 +--+     | *** |  storage tree   |   |
28497 +  |                     |                    |     +-***-+  (primary cache)|   |
28498 +u | write()   (secondary| cache)             V    /   |   \                |   |
28499 +s | ---->  +----+ +----+ +----+ +----+     +-***** ******* **----+  ---->  | d |
28500 +e |        |    | |page cluster |    |     | **disk cluster**    |         | i |
28501 +r | <----  +----+ +----+ +----+ +----+     +-***** **********----+  <----  | s |
28502 +  | read()              ^                      ^      |                    | k |
28503 +  |                     |     (->)longterm lock|      |           page_io()|   |
28504 +  |                     |                      +------+                    |   |
28505 +--+         readpages() |                             |                    +---+
28506 +                        |                             V
28507 +                        |                    +------------------+
28508 +                        +--------------------|    tfm stream    |
28509 +                                             |   (plain text)   |
28510 +                                             +------------------+
28511 +*/
28512 +
28513 +/* return the cryptcompress specific part of the reiser4 inode data of @inode */
28514 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *inode)
28515 +{
28516 +       return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
28517 +}
28518 +
28519 +/* plugin->u.file.init_inode_data: reset the cryptcompress part of @inode */
28520 +void init_inode_data_cryptcompress(struct inode *inode,
28521 +                                  reiser4_object_create_data * crd,
28522 +                                  int create)
28523 +{
28524 +       struct cryptcompress_info *data;
28525 +
28526 +       data = cryptcompress_inode_data(inode);
28527 +       assert("edward-685", data != NULL);
28528 +
28529 +       memset(data, 0, sizeof(*data)); /* start from an all-zero structure */
28530 +
28531 +       mutex_init(&data->checkin_mutex);
28532 +       data->trunc_index = ULONG_MAX;
28533 +       turn_on_compression(data);
28534 +       set_lattice_factor(data, MIN_LATTICE_FACTOR);
28535 +       init_inode_ordering(inode, crd, create);
28536 +}
28537 +
28538 +/* The following is a part of reiser4 cipher key manager
28539 +   which is called when opening/creating a cryptcompress file */
28540 +
28541 +/* get/set the cipher key info pointer kept in the cryptcompress inode data */
28542 +struct reiser4_crypto_info * inode_crypto_info (struct inode * inode)
28543 +{
28544 +       assert("edward-90", inode != NULL);
28545 +       assert("edward-91", reiser4_inode_data(inode) != NULL);
28546 +       return cryptcompress_inode_data(inode)->crypt;
28547 +}
28548 +
28549 +static void set_inode_crypto_info (struct inode * inode,
28550 +                                  struct reiser4_crypto_info * info)
28551 +{
28552 +       cryptcompress_inode_data(inode)->crypt = info; /* the old pointer is simply overwritten */
28553 +}
28554 +
28555 +/* allocate a zeroed cipher key info for @inode plus its keyid buffer */
28556 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode)
28557 +{
28558 +       struct reiser4_crypto_info *cinfo;
28559 +       int keyid_len;
28560 +
28561 +       cinfo = kzalloc(sizeof(*cinfo), reiser4_ctx_gfp_mask_get());
28562 +       if (cinfo == NULL)
28563 +               return ERR_PTR(-ENOMEM);
28564 +
28565 +       keyid_len = inode_digest_plugin(inode)->fipsize;
28566 +       cinfo->keyid = kmalloc(keyid_len, reiser4_ctx_gfp_mask_get());
28567 +       if (cinfo->keyid != NULL) {
28568 +               cinfo->host = inode;
28569 +               return cinfo;
28570 +       }
28571 +       kfree(cinfo);
28572 +       return ERR_PTR(-ENOMEM);
28573 +}
28574 +
28575 +#if 0
28576 +/* allocate/free low-level info for cipher and digest
28577 +   transforms; the allocating half is compiled out by this #if 0 */
28578 +static int alloc_crypto_tfms(struct reiser4_crypto_info * info)
28579 +{
28580 +       struct crypto_blkcipher * ctfm = NULL;
28581 +       struct crypto_hash      * dtfm = NULL;
28582 +       cipher_plugin * cplug = inode_cipher_plugin(info->host);
28583 +       digest_plugin * dplug = inode_digest_plugin(info->host);
28584 +
28585 +       if (cplug->alloc) {
28586 +               ctfm = cplug->alloc();
28587 +               if (IS_ERR(ctfm)) {
28588 +                       warning("edward-1364",
28589 +                               "Can not allocate info for %s\n",
28590 +                               cplug->h.desc);
28591 +                       return RETERR(PTR_ERR(ctfm));
28592 +               }
28593 +       }
28594 +       info_set_cipher(info, ctfm);
28595 +       if (dplug->alloc) {
28596 +               dtfm = dplug->alloc();
28597 +               if (IS_ERR(dtfm)) {
28598 +                       warning("edward-1365",
28599 +                               "Can not allocate info for %s\n",
28600 +                               dplug->h.desc);
28601 +                       goto unhappy_with_digest;
28602 +               }
28603 +       }
28604 +       info_set_digest(info, dtfm);
28605 +       return 0;
28606 + unhappy_with_digest:  /* undo the cipher half before reporting the digest error */
28607 +       if (cplug->free) {
28608 +               cplug->free(ctfm);
28609 +               info_set_cipher(info, NULL);
28610 +       }
28611 +       return RETERR(PTR_ERR(dtfm));
28612 +}
28613 +#endif
28614 +
28615 +static void
28616 +free_crypto_tfms(struct reiser4_crypto_info * info)
28617 +{
28618 +       assert("edward-1366", info != NULL);
28619 +       if (!info_get_cipher(info)) {   /* no cipher set: nothing is freed */
28620 +               assert("edward-1601", !info_get_digest(info));
28621 +               return;
28622 +       }
28623 +       inode_cipher_plugin(info->host)->free(info_get_cipher(info));
28624 +       info_set_cipher(info, NULL);
28625 +       inode_digest_plugin(info->host)->free(info_get_digest(info));   /* NOTE(review): digest is not checked for NULL here */
28626 +       info_set_digest(info, NULL);
28627 +       return;
28628 +}
28629 +
28630 +#if 0
28631 +/* create a key fingerprint for disk stat-data: digest of the encrypted keyid */
28632 +static int create_keyid (struct reiser4_crypto_info * info,
28633 +                        struct reiser4_crypto_data * data)
28634 +{
28635 +       int ret = -ENOMEM;
28636 +       size_t blk, pad;
28637 +       __u8 * dmem;
28638 +       __u8 * cmem;
28639 +       struct hash_desc      ddesc;
28640 +       struct blkcipher_desc cdesc;
28641 +       struct scatterlist sg;
28642 +
28643 +       assert("edward-1367", info != NULL);
28644 +       assert("edward-1368", info->keyid != NULL);
28645 +
28646 +       ddesc.tfm = info_get_digest(info);
28647 +       ddesc.flags = 0;
28648 +       cdesc.tfm = info_get_cipher(info);
28649 +       cdesc.flags = 0;
28650 +
28651 +       dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
28652 +                      reiser4_ctx_gfp_mask_get());
28653 +       if (!dmem)
28654 +               goto exit1;
28655 +
28656 +       blk = crypto_blkcipher_blocksize(cdesc.tfm);
28657 +
28658 +       pad = data->keyid_size % blk;
28659 +       pad = (pad ? blk - pad : 0);    /* bytes needed to reach a multiple of blk */
28660 +
28661 +       cmem = kmalloc((size_t)data->keyid_size + pad,
28662 +                      reiser4_ctx_gfp_mask_get());
28663 +       if (!cmem)
28664 +               goto exit2;
28665 +       memcpy(cmem, data->keyid, data->keyid_size);
28666 +       memset(cmem + data->keyid_size, 0, pad);
28667 +
28668 +       sg_init_one(&sg, cmem, data->keyid_size + pad);
28669 +
28670 +       ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
28671 +                                      data->keyid_size + pad);
28672 +       if (ret) {
28673 +               warning("edward-1369",
28674 +                       "encryption failed flags=%x\n", cdesc.flags);
28675 +               goto exit3;
28676 +       }
28677 +       ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
28678 +       if (ret) {
28679 +               warning("edward-1602",
28680 +                       "digest failed flags=%x\n", ddesc.flags);
28681 +               goto exit3;
28682 +       }
28683 +       memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
28684 + exit3:
28685 +       kfree(cmem);
28686 + exit2:
28687 +       kfree(dmem);
28688 + exit1:
28689 +       return ret;
28690 +}
28691 +#endif
28692 +
28693 +static void destroy_keyid(struct reiser4_crypto_info * info)
28694 +{      /* Free the key fingerprint buffer of @info with kfree(). */
28695 +       assert("edward-1370", info != NULL);
28696 +       assert("edward-1371", info->keyid != NULL);
28697 +       kfree(info->keyid);     /* info->keyid itself is not reset to NULL here */
28698 +       return;
28699 +}
28700 +
28701 +static void __free_crypto_info (struct inode * inode)
28702 +{      /* Tear down the crypto info of @inode: its transforms, its key fingerprint, then the structure itself. */
28703 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
28704 +       assert("edward-1372", info != NULL);
28705 +
28706 +       free_crypto_tfms(info);
28707 +       destroy_keyid(info);
28708 +       kfree(info);    /* the inode's pointer to the info is not cleared by this function */
28709 +}
28710 +
28711 +#if 0
28712 +static void instantiate_crypto_info(struct reiser4_crypto_info * info)
28713 +{      /* Mark @info as instantiated by setting info->inst to 1. */
28714 +       assert("edward-1373", info != NULL);
28715 +       assert("edward-1374", info->inst == 0); /* must not be marked already */
28716 +       info->inst = 1;
28717 +}
28718 +#endif
28719 +
28720 +static void uninstantiate_crypto_info(struct reiser4_crypto_info * info)
28721 +{      /* Clear the "instantiated" mark of @info (info->inst = 0). */
28722 +       assert("edward-1375", info != NULL);
28723 +       info->inst = 0;
28724 +}
28725 +
28726 +#if 0
28727 +static int is_crypto_info_instantiated(struct reiser4_crypto_info * info)
28728 +{      /* Return info->inst; @info is dereferenced without a NULL check. */
28729 +       return info->inst;
28730 +}
28731 +
28732 +static int inode_has_cipher_key(struct inode * inode)
28733 +{      /* Non-zero iff @inode has a crypto info attached and that info is marked instantiated. */
28734 +       assert("edward-1376", inode != NULL);
28735 +       return inode_crypto_info(inode) &&      /* short-circuit keeps the second call from seeing NULL */
28736 +               is_crypto_info_instantiated(inode_crypto_info(inode));
28737 +}
28738 +#endif
28739 +
28740 +static void free_crypto_info (struct inode * inode)
28741 +{      /* Clear the instantiated mark of @inode's crypto info, then free that info. */
28742 +       uninstantiate_crypto_info(inode_crypto_info(inode));
28743 +       __free_crypto_info(inode);
28744 +}
28745 +
28746 +static int need_cipher(struct inode * inode)
28747 +{      /* Non-zero iff the cipher plugin of @inode is not the plugin registered as NONE_CIPHER_ID. */
28748 +       return inode_cipher_plugin(inode) !=
28749 +               cipher_plugin_by_id(NONE_CIPHER_ID);
28750 +}
28751 +
28752 +/* Parse @data which contains an (uninstantiated) cipher key imported
28753 +   from user space, create a low-level cipher info and attach it to
28754 +   the @object. On success, the returned info contains an instantiated key */
28755 +#if 0
28756 +struct reiser4_crypto_info * create_crypto_info(struct inode * object,
28757 +                                 struct reiser4_crypto_data * data)
28758 +{      /* Returns the new crypto info, or an ERR_PTR()-encoded error. */
28759 +       int ret;
28760 +       struct reiser4_crypto_info * info;
28761 +
28762 +       assert("edward-1377", data != NULL);
28763 +       assert("edward-1378", need_cipher(object));
28764 +
28765 +       if (inode_file_plugin(object) !=        /* only objects using the directory file plugin are accepted */
28766 +           file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
28767 +               return ERR_PTR(-EINVAL);
28768 +
28769 +       info = reiser4_alloc_crypto_info(object);
28770 +       if (IS_ERR(info))
28771 +               return info;    /* allocation error is passed through unchanged */
28772 +       ret = alloc_crypto_tfms(info);
28773 +       if (ret)
28774 +               goto err;
28775 +       /* instantiating a key */
28776 +       ret = crypto_blkcipher_setkey(info_get_cipher(info),
28777 +                                     data->key,
28778 +                                     data->keysize);
28779 +       if (ret) {
28780 +               warning("edward-1379",
28781 +                       "setkey failed flags=%x",
28782 +                       crypto_blkcipher_get_flags(info_get_cipher(info)));
28783 +               goto err;
28784 +       }
28785 +       info->keysize = data->keysize;
28786 +       ret = create_keyid(info, data); /* compute the fingerprint stored in info->keyid */
28787 +       if (ret)
28788 +               goto err;
28789 +       instantiate_crypto_info(info);
28790 +       return info;
28791 + err:
28792 +       __free_crypto_info(object);     /* NOTE(review): frees inode_crypto_info(object), not @info directly - confirm the allocator attached @info to @object */
28793 +       return ERR_PTR(ret);
28794 +}
28795 +#endif
28796 +
28797 +/* increment/decrement a load counter when
28798 +   attaching/detaching the crypto-stat to any object */
28799 +static void load_crypto_info(struct reiser4_crypto_info * info)
28800 +{      /* Account for one more user of @info through inc_keyload_count(). */
28801 +       assert("edward-1380", info != NULL);
28802 +       inc_keyload_count(info);
28803 +}
28804 +
28805 +static void unload_crypto_info(struct inode * inode)
28806 +{      /* Drop one load reference on @inode's crypto info and free the info once the count reaches zero. */
28807 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
28808 +       assert("edward-1381", info->keyload_count > 0);  /* @info is dereferenced here: callers must have one attached */
28809 +
28810 +       dec_keyload_count(inode_crypto_info(inode));
28811 +       if (info->keyload_count == 0)
28812 +               /* final release */
28813 +               free_crypto_info(inode);
28814 +}
28815 +
28816 +/* attach/detach an existing crypto-stat */
28817 +void reiser4_attach_crypto_info(struct inode * inode,
28818 +                               struct reiser4_crypto_info * info)
28819 +{      /* Bind @info to @inode and count the inode as one more user of @info. */
28820 +       assert("edward-1382", inode != NULL);
28821 +       assert("edward-1383", info != NULL);
28822 +       assert("edward-1384", inode_crypto_info(inode) == NULL);        /* @inode must not have a crypto info yet */
28823 +
28824 +       set_inode_crypto_info(inode, info);
28825 +       load_crypto_info(info);
28826 +}
28827 +
28828 +/* returns true, if crypto stat can be attached to the @host */
28829 +#if REISER4_DEBUG
28830 +static int host_allows_crypto_info(struct inode * host)
28831 +{      /* Returns 1 only when the file plugin of @host is CRYPTCOMPRESS_FILE_PLUGIN_ID, 0 otherwise. */
28832 +       int ret;
28833 +       file_plugin * fplug = inode_file_plugin(host);
28834 +
28835 +       switch (fplug->h.id) {
28836 +       case CRYPTCOMPRESS_FILE_PLUGIN_ID:
28837 +               ret = 1;
28838 +               break;
28839 +       default:        /* every other file plugin is rejected */
28840 +               ret = 0;
28841 +       }
28842 +       return ret;
28843 +}
28844 +#endif  /*  REISER4_DEBUG  */
28845 +
28846 +static void reiser4_detach_crypto_info(struct inode * inode)
28847 +{      /* Drop @inode's reference on its crypto info (if any) and reset the inode's crypto info pointer to NULL. */
28848 +       assert("edward-1385", inode != NULL);
28849 +       assert("edward-1386", host_allows_crypto_info(inode));  /* helper exists only under REISER4_DEBUG; presumably assert() expands to nothing otherwise - confirm */
28850 +
28851 +       if (inode_crypto_info(inode))
28852 +               unload_crypto_info(inode);      /* may free the info when this was the last user */
28853 +       set_inode_crypto_info(inode, NULL);
28854 +}
28855 +
28856 +#if 0
28857 +
28858 +/* compare fingerprints of @child and @parent; returns non-zero when they are byte-wise equal */
28859 +static int keyid_eq(struct reiser4_crypto_info * child,
28860 +                   struct reiser4_crypto_info * parent)
28861 +{
28862 +       return !memcmp(child->keyid,
28863 +                      parent->keyid,
28864 +                      info_digest_plugin(parent)->fipsize);    /* length comes from @parent's digest plugin */
28865 +}
28866 +
28867 +/* check if a crypto-stat (which is bound to @parent) can be inherited; returns non-zero if it can */
28868 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
28869 +{
28870 +       if (!need_cipher(child))        /* child is not to be encrypted: nothing to inherit */
28871 +               return 0;
28872 +       /* the child is created */
28873 +       if (!inode_crypto_info(child))
28874 +               return 1;
28875 +       /* the child is looked up */
28876 +       if (!inode_crypto_info(parent))
28877 +               return 0;
28878 +       return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&    /* same cipher, same digest, ... */
28879 +               inode_digest_plugin(child) == inode_digest_plugin(parent) &&
28880 +               inode_crypto_info(child)->keysize ==    /* ... same key size, ... */
28881 +               inode_crypto_info(parent)->keysize &&
28882 +               keyid_eq(inode_crypto_info(child), inode_crypto_info(parent))); /* ... and same key fingerprint */
28883 +}
28884 +#endif
28885 +
28886 +/* helper functions for ->create() method of the cryptcompress plugin */
28887 +static int inode_set_crypto(struct inode * object)
28888 +{      /* Flag CRYPTO_STAT in the inode's extmask when a crypto info is attached; -EINVAL if a cipher is required but no info exists. */
28889 +       reiser4_inode * info;
28890 +       if (!inode_crypto_info(object)) {
28891 +               if (need_cipher(object))        /* cipher plugin set, but no crypto info attached */
28892 +                       return RETERR(-EINVAL);
28893 +               /* the file is not to be encrypted */
28894 +               return 0;
28895 +       }
28896 +       info = reiser4_inode_data(object);
28897 +       info->extmask |= (1 << CRYPTO_STAT);
28898 +       return 0;
28899 +}
28900 +
28901 +static int inode_init_compression(struct inode * object)
28902 +{      /* Call the optional ->init() of @object's compression plugin and return its result (0 when there is no ->init). */
28903 +       int result = 0;
28904 +       assert("edward-1461", object != NULL);
28905 +       if (inode_compression_plugin(object)->init)
28906 +               result = inode_compression_plugin(object)->init();
28907 +       return result;
28908 +}
28909 +
28910 +static int inode_check_cluster(struct inode * object)
28911 +{      /* Validate the logical cluster size of @object: returns 0 when supported, RETERR(-EINVAL) otherwise. */
28912 +       assert("edward-696", object != NULL);
28913 +
28914 +       if (unlikely(inode_cluster_size(object) < PAGE_CACHE_SIZE)) {    /* clusters smaller than a page are rejected */
28915 +               warning("edward-1320", "Can not support '%s' "
28916 +                       "logical clusters (less then page size)",
28917 +                       inode_cluster_plugin(object)->h.label);
28918 +               return RETERR(-EINVAL);
28919 +       }
28920 +       if (unlikely(inode_cluster_shift(object) >= BITS_PER_BYTE*sizeof(int))) {        /* unlikely() must wrap the whole comparison: it yields a 0/1 truth value */
28921 +               warning("edward-1463", "Can not support '%s' "
28922 +                       "logical clusters (too big for transform)",
28923 +                       inode_cluster_plugin(object)->h.label);
28924 +               return RETERR(-EINVAL); /* cluster shift does not fit the bit width of an int */
28925 +       }
28926 +       return 0;
28927 +}
28928 +
28929 +/* plugin->destroy_inode(): detach the crypto info from @inode */
28930 +void destroy_inode_cryptcompress(struct inode * inode)
28931 +{
28932 +       assert("edward-1464", INODE_PGCOUNT(inode) == 0);       /* INODE_PGCOUNT(inode) is expected to be zero by now */
28933 +       reiser4_detach_crypto_info(inode);
28934 +       return;
28935 +}
28936 +
28937 +/* plugin->create_object():
28938 +. install plugins
28939 +. attach crypto info if specified
28940 +. attach compression info if specified
28941 +. attach cluster info
28942 +*/
28943 +int create_object_cryptcompress(struct inode *object, struct inode *parent,
28944 +                               reiser4_object_create_data * data)
28945 +{      /* Returns 0 on success; on any failure the crypto info is detached from @object and the error is returned. */
28946 +       int result;
28947 +       reiser4_inode *info;
28948 +
28949 +       assert("edward-23", object != NULL);
28950 +       assert("edward-24", parent != NULL);    /* @parent is only checked here, it is not used below */
28951 +       assert("edward-30", data != NULL);
28952 +       assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
28953 +       assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
28954 +
28955 +       info = reiser4_inode_data(object);
28956 +
28957 +       assert("edward-29", info != NULL);
28958 +
28959 +       /* set file bit */
28960 +       info->plugin_mask |= (1 << PSET_FILE);
28961 +
28962 +       /* set crypto */
28963 +       result = inode_set_crypto(object);
28964 +       if (result)
28965 +               goto error;
28966 +       /* set compression */
28967 +       result = inode_init_compression(object);
28968 +       if (result)
28969 +               goto error;
28970 +       /* set cluster */
28971 +       result = inode_check_cluster(object);
28972 +       if (result)
28973 +               goto error;
28974 +
28975 +       /* save everything in disk stat-data */
28976 +       result = write_sd_by_inode_common(object);
28977 +       if (!result)
28978 +               return 0;
28979 + error:        /* also reached by fall-through when writing the stat-data failed */
28980 +       reiser4_detach_crypto_info(object);
28981 +       return result;
28982 +}
28983 +
28984 +/* plugin->open(): nothing to do for cryptcompress files, always returns 0 */
28985 +int open_cryptcompress(struct inode * inode, struct file * file)
28986 +{
28987 +       return 0;
28988 +}
28989 +
28990 +/* returns a blocksize, the attribute of a cipher algorithm */
28991 +static unsigned int
28992 +cipher_blocksize(struct inode * inode)
28993 +{      /* Valid only for inodes that need a cipher and already have a crypto info attached (both asserted). */
28994 +       assert("edward-758", need_cipher(inode));
28995 +       assert("edward-1400", inode_crypto_info(inode) != NULL);
28996 +       return crypto_blkcipher_blocksize
28997 +               (info_get_cipher(inode_crypto_info(inode)));
28998 +}
28999 +
29000 +/* returns offset translated by scale factor of the crypto-algorithm */
29001 +static loff_t inode_scaled_offset (struct inode * inode,
29002 +                                  const loff_t src_off /* input offset */)
29003 +{
29004 +       assert("edward-97", inode != NULL);
29005 +
29006 +       if (!need_cipher(inode) ||      /* no cipher: offsets are not scaled */
29007 +           src_off == get_key_offset(reiser4_min_key()) ||     /* the minimal and maximal key offsets ... */
29008 +           src_off == get_key_offset(reiser4_max_key()))       /* ... are returned unchanged */
29009 +               return src_off;
29010 +
29011 +       return inode_cipher_plugin(inode)->scale(inode,         /* scaling itself is delegated to the cipher plugin */
29012 +                                                cipher_blocksize(inode),
29013 +                                                src_off);
29014 +}
29015 +
29016 +/* returns disk cluster size, i.e. the logical cluster size of @inode passed through inode_scaled_offset() */
29017 +size_t inode_scaled_cluster_size(struct inode * inode)
29018 +{
29019 +       assert("edward-110", inode != NULL);
29020 +
29021 +       return inode_scaled_offset(inode, inode_cluster_size(inode));   /* loff_t result is converted to size_t */
29022 +}
29023 +
29024 +/* set number of cluster pages: fills clust->old_nrpages and clust->nr_pages */
29025 +static void set_cluster_nrpages(struct cluster_handle * clust,
29026 +                               struct inode *inode)
29027 +{
29028 +       struct reiser4_slide * win;
29029 +
29030 +       assert("edward-180", clust != NULL);
29031 +       assert("edward-1040", inode != NULL);
29032 +
29033 +       clust->old_nrpages = size_in_pages(lbytes(clust->index, inode));
29034 +       win = clust->win;
29035 +       if (!win) {     /* no window attached: nr_pages gets the same value as old_nrpages */
29036 +               clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
29037 +               return;
29038 +       }
29039 +       assert("edward-1176", clust->op != LC_INVAL);
29040 +       assert("edward-1064", win->off + win->count + win->delta != 0);
29041 +
29042 +       if (win->stat == HOLE_WINDOW &&
29043 +           win->off == 0 && win->count == inode_cluster_size(inode)) { /* hole window spanning the whole logical cluster */
29044 +               /* special case: writing a "fake" logical cluster */
29045 +               clust->nr_pages = 0;
29046 +               return;
29047 +       }
29048 +       clust->nr_pages = size_in_pages(max(win->off + win->count + win->delta, /* the larger of the window end ... */
29049 +                                           lbytes(clust->index, inode)));      /* ... and the lbytes() value used above */
29050 +       return;
29051 +}
29052 +
29053 +/* plugin->key_by_inode()
29054 +   build key of a disk cluster; always returns 0 */
29055 +int key_by_inode_cryptcompress(struct inode *inode, loff_t off,
29056 +                              reiser4_key * key)
29057 +{
29058 +       assert("edward-64", inode != 0);
29059 +
29060 +       if (likely(off != get_key_offset(reiser4_max_key())))   /* the maximal key offset is left as is */
29061 +               off = off_to_clust_to_off(off, inode);
29062 +       if (inode_crypto_info(inode))   /* scale only when a crypto info is attached */
29063 +               off = inode_scaled_offset(inode, off);
29064 +
29065 +       key_by_inode_and_offset_common(inode, 0, key);  /* key is built for offset 0 first ... */
29066 +       set_key_offset(key, (__u64)off);                /* ... then the computed offset is stored in it */
29067 +       return 0;
29068 +}
29069 +
29070 +/* plugin->flow_by_inode() */
29071 +/* flow is used to read/write disk clusters; returns the result of key_by_inode_cryptcompress() */
29072 +int flow_by_inode_cryptcompress(struct inode *inode, const char __user * buf,
29073 +                               int user,       /* 1: @buf is of user space,
29074 +                                                  0: kernel space */
29075 +                               loff_t size,    /* @buf size */
29076 +                               loff_t off,     /* offset to start io from */
29077 +                               rw_op op,       /* READ or WRITE */
29078 +                               flow_t * f      /* resulting flow */)
29079 +{
29080 +       assert("edward-436", f != NULL);
29081 +       assert("edward-149", inode != NULL);
29082 +       assert("edward-150", inode_file_plugin(inode) != NULL);
29083 +       assert("edward-1465", user == 0); /* we use flow to read/write
29084 +                                           disk clusters located in
29085 +                                           kernel space */
29086 +       f->length = size;
29087 +       memcpy(&f->data, &buf, sizeof(buf));    /* copies the pointer value itself, not the data it points to */
29088 +       f->user = user;
29089 +       f->op = op;
29090 +
29091 +       return key_by_inode_cryptcompress(inode, off, &f->key);
29092 +}
29093 +
29094 +static int      /* 0-like result of reiser4_seal_validate(), or -E_REPEAT when the hint cannot be used for @key */
29095 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
29096 +                           znode_lock_mode lock_mode)
29097 +{
29098 +       coord_t *coord; /* NOTE(review): assigned below but never read in this function */
29099 +
29100 +       assert("edward-704", hint != NULL);
29101 +       assert("edward-1089", !hint_is_valid(hint));
29102 +       assert("edward-706", hint->lh.owner == NULL);
29103 +
29104 +       coord = &hint->ext_coord.coord;
29105 +
29106 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)     /* NOTE(review): !hint is tested after hint was already used above */
29107 +               /* hint either not set or set by different operation */
29108 +               return RETERR(-E_REPEAT);
29109 +
29110 +       if (get_key_offset(key) != hint->offset)
29111 +               /* hint is set for different key */
29112 +               return RETERR(-E_REPEAT);
29113 +
29114 +       assert("edward-707", reiser4_schedulable());
29115 +
29116 +       return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,       /* the final verdict is delegated to the seal */
29117 +                                    key, &hint->lh, lock_mode,
29118 +                                    ZNODE_LOCK_LOPRI);
29119 +}
29120 +
29121 +/* reserve disk space when writing a logical cluster; returns 0 or the error of reiser4_grab_space_force() */
29122 +static int reserve4cluster(struct inode *inode, struct cluster_handle *clust)
29123 +{
29124 +       int result = 0;
29125 +
29126 +       assert("edward-965", reiser4_schedulable());
29127 +       assert("edward-439", inode != NULL);
29128 +       assert("edward-440", clust != NULL);
29129 +       assert("edward-441", clust->pages != NULL);
29130 +
29131 +       if (clust->nr_pages == 0) {
29132 +               assert("edward-1152", clust->win != NULL);
29133 +               assert("edward-1153", clust->win->stat == HOLE_WINDOW);
29134 +               /* don't reserve disk space for fake logical cluster */
29135 +               return 0;
29136 +       }
29137 +       assert("edward-442", jprivate(clust->pages[0]) != NULL);
29138 +
29139 +       result = reiser4_grab_space_force(estimate_insert_cluster(inode) +      /* grab room for inserting plus updating a cluster */
29140 +                                         estimate_update_cluster(inode),
29141 +                                         BA_CAN_COMMIT);
29142 +       if (result)
29143 +               return result;
29144 +       clust->reserved = 1;
29145 +       grabbed2cluster_reserved(estimate_insert_cluster(inode) +       /* same amount as grabbed above */
29146 +                                estimate_update_cluster(inode));
29147 +#if REISER4_DEBUG
29148 +       clust->reserved_prepped = estimate_update_cluster(inode);       /* debug-only bookkeeping of the two parts */
29149 +       clust->reserved_unprepped = estimate_insert_cluster(inode);
29150 +#endif
29151 +       /* there can be space grabbed by txnmgr_force_commit_all */
29152 +       return 0;
29153 +}
29154 +
29155 +/* free reserved disk space if writing a logical cluster fails; @inode is not used */
29156 +static void free_reserved4cluster(struct inode *inode,
29157 +                                 struct cluster_handle *ch, int count)
29158 +{
29159 +       assert("edward-967", ch->reserved == 1);        /* only valid for a handle that holds a reservation */
29160 +
29161 +       cluster_reserved2free(count);
29162 +       ch->reserved = 0;
29163 +}
29164 +
29165 +/* The core search procedure of the cryptcompress plugin.
29166 +   If returned value is not cbk_errored, then current znode is locked */
29167 +static int find_cluster_item(hint_t * hint,
29168 +                            const reiser4_key * key, /* key of the item we are
29169 +                                                        looking for */
29170 +                            znode_lock_mode lock_mode /* which lock */ ,
29171 +                            ra_info_t * ra_info, lookup_bias bias, __u32 flags)
29172 +{
29173 +       int result;
29174 +       reiser4_key ikey;
29175 +       int went_right = 0;     /* set when the lookup moved to the right neighbor node */
29176 +       coord_t *coord = &hint->ext_coord.coord;
29177 +       coord_t orig = *coord;  /* saved to roll back on "not found"; NOTE(review): reads through @hint before the NULL assert below */
29178 +
29179 +       assert("edward-152", hint != NULL);
29180 +
29181 +       if (!hint_is_valid(hint)) {
29182 +               result = cryptcompress_hint_validate(hint, key, lock_mode);
29183 +               if (result == -E_REPEAT)        /* hint unusable: fall back to a full tree lookup */
29184 +                       goto traverse_tree;
29185 +               else if (result) {
29186 +                       assert("edward-1216", 0);
29187 +                       return result;
29188 +               }
29189 +               hint_set_valid(hint);
29190 +       }
29191 +       assert("edward-709", znode_is_any_locked(coord->node));
29192 +
29193 +       /* In-place lookup: we only need to check whether the
29194 +          item following @coord matches @key */
29195 +
29196 +       if (equal_to_rdk(coord->node, key)) {   /* when this holds, the search continues in the right neighbor */
29197 +               result = goto_right_neighbor(coord, &hint->lh);
29198 +               if (result == -E_NO_NEIGHBOR) {
29199 +                       assert("edward-1217", 0);
29200 +                       return RETERR(-EIO);
29201 +               }
29202 +               if (result)
29203 +                       return result;
29204 +               assert("edward-1218", equal_to_ldk(coord->node, key));
29205 +               went_right = 1;
29206 +       } else {        /* otherwise step to the next item of the same node */
29207 +               coord->item_pos++;
29208 +               coord->unit_pos = 0;
29209 +               coord->between = AT_UNIT;
29210 +       }
29211 +       result = zload(coord->node);
29212 +       if (result)
29213 +               return result;
29214 +       assert("edward-1219", !node_is_empty(coord->node));
29215 +
29216 +       if (!coord_is_existing_item(coord)) {
29217 +               zrelse(coord->node);
29218 +               goto not_found;
29219 +       }
29220 +       item_key_by_coord(coord, &ikey);
29221 +       zrelse(coord->node);    /* the item key has been copied out, the node data can be released */
29222 +       if (!keyeq(key, &ikey))
29223 +               goto not_found;
29224 +       /* Ok, item is found, update node counts */
29225 +       if (went_right)
29226 +               dclust_inc_extension_ncount(hint);
29227 +       return CBK_COORD_FOUND;
29228 +
29229 + not_found:
29230 +       assert("edward-1220", coord->item_pos > 0);
29231 +       //coord->item_pos--;
29232 +       /* roll back */
29233 +       *coord = orig;
29234 +       ON_DEBUG(coord_update_v(coord));
29235 +       return CBK_COORD_NOTFOUND;
29236 +
29237 + traverse_tree:        /* slow path: reset the hint and search from scratch with coord_by_key() */
29238 +       assert("edward-713", hint->lh.owner == NULL);
29239 +       assert("edward-714", reiser4_schedulable());
29240 +
29241 +       reiser4_unset_hint(hint);
29242 +       dclust_init_extension(hint);
29243 +       coord_init_zero(coord);
29244 +       result = coord_by_key(current_tree, key, coord, &hint->lh,
29245 +                             lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
29246 +                             CBK_UNIQUE | flags, ra_info);
29247 +       if (cbk_errored(result))
29248 +               return result;
29249 +       if(result == CBK_COORD_FOUND)
29250 +               dclust_inc_extension_ncount(hint);
29251 +       hint_set_valid(hint);   /* hint is marked valid for both "found" and "not found" results */
29252 +       return result;
29253 +}
29254 +
29255 +/* This function is called by deflate[inflate] manager when
29256 +   creating a transformed/plain stream to check if we should
29257 +   create/cut some overhead. If this returns true, then @oh
29258 +   contains the size of this overhead.
29259 + */
29260 +static int need_cut_or_align(struct inode * inode,
29261 +                            struct cluster_handle * ch, rw_op rw, int * oh)
29262 +{
29263 +       struct tfm_cluster * tc = &ch->tc;
29264 +       switch (rw) {
29265 +       case WRITE_OP: /* estimate align */
29266 +               *oh = tc->len % cipher_blocksize(inode);        /* bytes past the last whole cipher block */
29267 +               if (*oh != 0)
29268 +                       return 1;
29269 +               break;
29270 +       case READ_OP:  /* estimate cut */
29271 +               *oh = *(tfm_output_data(ch) + tc->len - 1);     /* last byte of the output stream holds the overhead size */
29272 +               break;
29273 +       default:
29274 +               impossible("edward-1401", "bad option");
29275 +       }
29276 +       return (tc->len != tc->lsize);  /* reached for READ_OP and for an already block-aligned WRITE_OP */
29277 +}
29278 +
29279 +/* create/cut an overhead of transformed/plain stream; adjusts ch->tc.len accordingly */
29280 +static void align_or_cut_overhead(struct inode * inode,
29281 +                                 struct cluster_handle * ch, rw_op rw)
29282 +{
29283 +       int oh;         /* overhead size reported by need_cut_or_align() */
29284 +       cipher_plugin * cplug = inode_cipher_plugin(inode);
29285 +
29286 +       assert("edward-1402", need_cipher(inode));
29287 +
29288 +       if (!need_cut_or_align(inode, ch, rw, &oh))
29289 +               return;
29290 +       switch (rw) {
29291 +       case WRITE_OP: /* do align */
29292 +               ch->tc.len +=
29293 +                       cplug->align_stream(tfm_input_data(ch) +        /* padding is written at the end of the input stream */
29294 +                                           ch->tc.len, ch->tc.len,
29295 +                                           cipher_blocksize(inode));
29296 +               *(tfm_input_data(ch) + ch->tc.len - 1) =        /* the last byte records the padding size */
29297 +                       cipher_blocksize(inode) - oh;
29298 +               break;
29299 +       case READ_OP: /* do cut */
29300 +               assert("edward-1403", oh <= cipher_blocksize(inode));
29301 +               ch->tc.len -= oh;
29302 +               break;
29303 +       default:
29304 +               impossible("edward-1404", "bad option");
29305 +       }
29306 +       return;
29307 +}
29308 +
29309 +static unsigned max_cipher_overhead(struct inode * inode)
29310 +{      /* Returns the cipher block size, or 0 when @inode needs no cipher or its cipher plugin has no ->align_stream. */
29311 +       if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
29312 +               return 0;
29313 +       return cipher_blocksize(inode);
29314 +}
29315 +
29316 +static int deflate_overhead(struct inode *inode)
29317 +{      /* DC_CHECKSUM_SIZE when the compression plugin of @inode provides ->checksum, otherwise 0. */
29318 +       return (inode_compression_plugin(inode)->
29319 +               checksum ? DC_CHECKSUM_SIZE : 0);
29320 +}
29321 +
29322 +static unsigned deflate_overrun(struct inode * inode, int ilen)
29323 +{      /* Thin wrapper: coa_overrun() for the compression plugin of @inode and input length @ilen. */
29324 +       return coa_overrun(inode_compression_plugin(inode), ilen);
29325 +}
29326 +
29327 +/* Estimating compressibility of a logical cluster by various
29328 +   policies represented by compression mode plugin.
29329 +   If this returns false, then compressor won't be called for
29330 +   the cluster of index @index.
29331 +*/
29332 +static int should_compress(struct tfm_cluster * tc, cloff_t index,
29333 +                          struct inode *inode)
29334 +{
29335 +       compression_plugin *cplug = inode_compression_plugin(inode);
29336 +       compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
29337 +
29338 +       assert("edward-1321", tc->len != 0);
29339 +       assert("edward-1322", cplug != NULL);
29340 +       assert("edward-1323", mplug != NULL);
29341 +
29342 +       return /* estimate by size */
29343 +               (cplug->min_size_deflate ?
29344 +                tc->len >= cplug->min_size_deflate() :
29345 +                1) &&  /* a missing ->min_size_deflate counts as "yes" */
29346 +               /* estimate by compression mode plugin */
29347 +               (mplug->should_deflate ?
29348 +                mplug->should_deflate(inode, index) :
29349 +                1);    /* a missing ->should_deflate counts as "yes" */
29350 +}
29351 +
29352 +/* Evaluating results of compression transform.
29353 +   Returns true, if we need to accept this results */
29354 +static int save_compressed(int size_before, int size_after, struct inode *inode)
29355 +{      /* Accept only if compressed size plus checksum and maximal cipher overhead is strictly below @size_before. */
29356 +       return (size_after + deflate_overhead(inode) +
29357 +               max_cipher_overhead(inode) < size_before);
29358 +}
29359 +
29360 +/* Guess result of the evaluation above */
29361 +static int need_inflate(struct cluster_handle * ch, struct inode * inode,
29362 +                       int encrypted /* is cluster encrypted */ )
29363 +{      /* True when tc->len is smaller than the (scaled, if @encrypted) value of tc->lsize. */
29364 +       struct tfm_cluster * tc = &ch->tc;
29365 +
29366 +       assert("edward-142", tc != 0);
29367 +       assert("edward-143", inode != NULL);
29368 +
29369 +       return tc->len <
29370 +           (encrypted ?
29371 +            inode_scaled_offset(inode, tc->lsize) :    /* encrypted: compare with the scaled size */
29372 +            tc->lsize);
29373 +}
29374 +
29375 +/* If results of compression were accepted, then we add
29376 +   a checksum to catch possible disk cluster corruption.
29377 +   The following is a format of the data stored in disk clusters:
29378 +
29379 +                  data                   This is (transformed) logical cluster.
29380 +                  cipher_overhead        This is created by ->align() method
29381 +                                          of cipher plugin. May be absent.
29382 +                  checksum          (4)  This is created by ->checksum method
29383 +                                          of compression plugin to check
29384 +                                          integrity. May be absent.
29385 +
29386 +                  Crypto overhead format:
29387 +
29388 +                  data
29389 +                  control_byte      (1)   contains aligned overhead size:
29390 +                                          1 <= overhead <= cipher_blksize
29391 +*/
29392 +/* Append a checksum at the end of a transformed stream and grow tc->len by DC_CHECKSUM_SIZE */
29393 +static void dc_set_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29394 +{
29395 +       __u32 checksum;
29396 +
29397 +       assert("edward-1309", tc != NULL);
29398 +       assert("edward-1310", tc->len > 0);
29399 +       assert("edward-1311", cplug->checksum != NULL);
29400 +
29401 +       checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);        /* computed over the first tc->len bytes of the output stream */
29402 +       put_unaligned(cpu_to_le32(checksum),    /* stored little-endian right after the data; NOTE(review): assumes the stream has room for it - confirm */
29403 +                (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
29404 +       tc->len += (int)DC_CHECKSUM_SIZE;
29405 +}
29406 +
29407 +/* Check a disk cluster checksum.
29408 +   Returns 0 if checksum is correct, otherwise returns 1 */
29409 +static int dc_check_checksum(compression_plugin * cplug, struct tfm_cluster * tc)
29410 +{
29411 +       assert("edward-1312", tc != NULL);
29412 +       assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
29413 +       assert("edward-1314", cplug->checksum != NULL);
29414 +
29415 +       if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),  /* checksum recomputed over the data part ... */
29416 +                           tc->len - (int)DC_CHECKSUM_SIZE) !=
29417 +           le32_to_cpu(get_unaligned((d32 *)                   /* ... versus the little-endian value stored after it */
29418 +                                     (tfm_stream_data(tc, INPUT_STREAM)
29419 +                                      + tc->len - (int)DC_CHECKSUM_SIZE)))) {
29420 +               warning("edward-156",
29421 +                       "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
29422 +                       (int)le32_to_cpu
29423 +                       (get_unaligned((d32 *)
29424 +                                      (tfm_stream_data(tc, INPUT_STREAM) +
29425 +                                       tc->len - (int)DC_CHECKSUM_SIZE))),
29426 +                       (int)cplug->checksum
29427 +                       (tfm_stream_data(tc, INPUT_STREAM),
29428 +                        tc->len - (int)DC_CHECKSUM_SIZE));
29429 +               return 1;       /* tc->len is left untouched on mismatch */
29430 +       }
29431 +       tc->len -= (int)DC_CHECKSUM_SIZE;       /* strip the verified checksum from the stream length */
29432 +       return 0;
29433 +}
29434 +
29435 +/* get input/output stream for some transform action; returns 0 or the result of (re)allocating the stream */
29436 +int grab_tfm_stream(struct inode * inode, struct tfm_cluster * tc,
29437 +                   tfm_stream_id id)
29438 +{
29439 +       size_t size = inode_scaled_cluster_size(inode); /* minimal size the stream must have */
29440 +
29441 +       assert("edward-901", tc != NULL);
29442 +       assert("edward-1027", inode_compression_plugin(inode) != NULL);
29443 +
29444 +       if (cluster_get_tfm_act(tc) == TFMA_WRITE)
29445 +               size += deflate_overrun(inode, inode_cluster_size(inode));      /* writes get extra room for the compressor's overrun */
29446 +
29447 +       if (!get_tfm_stream(tc, id) && id == INPUT_STREAM)
29448 +               alternate_streams(tc);  /* a missing input stream: try after alternate_streams() */
29449 +       if (!get_tfm_stream(tc, id))
29450 +               return alloc_tfm_stream(tc, size, id);
29451 +
29452 +       assert("edward-902", tfm_stream_is_set(tc, id));
29453 +
29454 +       if (tfm_stream_size(tc, id) < size)     /* existing stream is too small */
29455 +               return realloc_tfm_stream(tc, size, id);
29456 +       return 0;
29457 +}
29458 +
29459 +/* Common deflate manager: optional compression, then optional encryption */
29460 +int reiser4_deflate_cluster(struct cluster_handle * clust, struct inode * inode)
29461 +{
29462 +       int result = 0;
29463 +       int compressed = 0;
29464 +       int encrypted = 0;
29465 +       struct tfm_cluster * tc = &clust->tc;
29466 +       compression_plugin * coplug;
29467 +
29468 +       assert("edward-401", inode != NULL);
29469 +       assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
29470 +       assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
29471 +       assert("edward-498", !tfm_cluster_is_uptodate(tc));
29472 +
29473 +       coplug = inode_compression_plugin(inode);
29474 +       if (should_compress(tc, clust->index, inode)) {
29475 +               /* try to compress, discard bad results */
29476 +               __u32 dst_len;
29477 +               compression_mode_plugin * mplug =
29478 +                       inode_compression_mode_plugin(inode);
29479 +               assert("edward-602", coplug != NULL);
29480 +               assert("edward-1423", coplug->compress != NULL);
29481 +               /* failing to get resources below only skips the compression */
29482 +               result = grab_coa(tc, coplug);
29483 +               if (result) {
29484 +                   warning("edward-1424",
29485 +                           "alloc_coa failed with ret=%d, skipped compression",
29486 +                           result);
29487 +                   goto cipher; /* NOTE(review): result stays nonzero */
29488 +               }
29489 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29490 +               if (result) {
29491 +                   warning("edward-1425",
29492 +                        "alloc stream failed with ret=%d, skipped compression",
29493 +                           result);
29494 +                   goto cipher; /* NOTE(review): result stays nonzero */
29495 +               }
29496 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
29497 +               coplug->compress(get_coa(tc, coplug->h.id, tc->act),
29498 +                                tfm_input_data(clust), tc->len,
29499 +                                tfm_output_data(clust), &dst_len);
29500 +               /* make sure we didn't overwrite extra bytes */
29501 +               assert("edward-603",
29502 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
29503 +
29504 +               /* evaluate results of compression transform */
29505 +               if (save_compressed(tc->len, dst_len, inode)) {
29506 +                       /* good result, accept */
29507 +                       tc->len = dst_len;
29508 +                       if (mplug->accept_hook != NULL) {
29509 +                              result = mplug->accept_hook(inode, clust->index);
29510 +                              if (result)
29511 +                                      warning("edward-1426",
29512 +                                              "accept_hook failed with ret=%d",
29513 +                                              result);
29514 +                       }
29515 +                       compressed = 1;
29516 +               }
29517 +               else {
29518 +                       /* bad result, discard */
29519 +#if 0
29520 +                       if (cluster_is_complete(clust, inode))
29521 +                             warning("edward-1496",
29522 +                                     "incompressible cluster %lu (inode %llu)",
29523 +                                     clust->index,
29524 +                                     (unsigned long long)get_inode_oid(inode));
29525 +#endif
29526 +                       if (mplug->discard_hook != NULL &&
29527 +                           cluster_is_complete(clust, inode)) {
29528 +                               result = mplug->discard_hook(inode,
29529 +                                                            clust->index);
29530 +                               if (result)
29531 +                                     warning("edward-1427",
29532 +                                             "discard_hook failed with ret=%d",
29533 +                                             result);
29534 +                       }
29535 +               }
29536 +       }
29537 + cipher:
29538 +       if (need_cipher(inode)) {
29539 +               cipher_plugin * ciplug;
29540 +               struct blkcipher_desc desc;
29541 +               struct scatterlist src;
29542 +               struct scatterlist dst;
29543 +
29544 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): unused */
29545 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
29546 +               desc.flags = 0;
29547 +               if (compressed)
29548 +                       alternate_streams(tc);
29549 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29550 +               if (result)
29551 +                       return result;
29552 +
29553 +               align_or_cut_overhead(inode, clust, WRITE_OP);
29554 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
29555 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
29556 +               /* src is the input stream, dst is the output stream */
29557 +               result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
29558 +               if (result) {
29559 +                       warning("edward-1405",
29560 +                               "encryption failed flags=%x\n", desc.flags);
29561 +                       return result;
29562 +               }
29563 +               encrypted = 1;
29564 +       }
29565 +       if (compressed && coplug->checksum != NULL)
29566 +               dc_set_checksum(coplug, tc);
29567 +       if (!compressed && !encrypted) /* nothing has been transformed */
29568 +               alternate_streams(tc);
29569 +       return result;
29570 +}
29571 +
29572 +/* Common inflate manager: check checksum, decrypt, decompress (as needed) */
29573 +int reiser4_inflate_cluster(struct cluster_handle * clust, struct inode * inode)
29574 +{
29575 +       int result = 0;
29576 +       int transformed = 0;
29577 +       struct tfm_cluster * tc = &clust->tc;
29578 +       compression_plugin * coplug;
29579 +
29580 +       assert("edward-905", inode != NULL);
29581 +       assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
29582 +       assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
29583 +       assert("edward-1349", tc->act == TFMA_READ);
29584 +       assert("edward-907", !tfm_cluster_is_uptodate(tc));
29585 +
29586 +       /* Handle a checksum (if any); a bad checksum is reported as -EIO */
29587 +       coplug = inode_compression_plugin(inode);
29588 +       if (need_inflate(clust, inode, need_cipher(inode)) &&
29589 +           coplug->checksum != NULL) {
29590 +               result = dc_check_checksum(coplug, tc);
29591 +               if (unlikely(result)) {
29592 +                       warning("edward-1460",
29593 +                               "Inode %llu: disk cluster %lu looks corrupted",
29594 +                               (unsigned long long)get_inode_oid(inode),
29595 +                               clust->index);
29596 +                       return RETERR(-EIO);
29597 +               }
29598 +       }
29599 +       if (need_cipher(inode)) {
29600 +               cipher_plugin * ciplug;
29601 +               struct blkcipher_desc desc;
29602 +               struct scatterlist src;
29603 +               struct scatterlist dst;
29604 +
29605 +               ciplug = inode_cipher_plugin(inode); /* NOTE(review): unused */
29606 +               desc.tfm = info_get_cipher(inode_crypto_info(inode));
29607 +               desc.flags = 0;
29608 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29609 +               if (result)
29610 +                       return result;
29611 +               assert("edward-909", tfm_cluster_is_set(tc));
29612 +               /* src is the input stream, dst is the output stream */
29613 +               sg_init_one(&src, tfm_input_data(clust), tc->len);
29614 +               sg_init_one(&dst, tfm_output_data(clust), tc->len);
29615 +
29616 +               result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
29617 +               if (result) {
29618 +                       warning("edward-1600", "decrypt failed flags=%x\n",
29619 +                               desc.flags);
29620 +                       return result;
29621 +               }
29622 +               align_or_cut_overhead(inode, clust, READ_OP);
29623 +               transformed = 1;
29624 +       }
29625 +       if (need_inflate(clust, inode, 0)) {
29626 +               unsigned dst_len = inode_cluster_size(inode);
29627 +               if(transformed)
29628 +                       alternate_streams(tc);
29629 +
29630 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
29631 +               if (result)
29632 +                       return result;
29633 +               assert("edward-1305", coplug->decompress != NULL);
29634 +               assert("edward-910", tfm_cluster_is_set(tc));
29635 +
29636 +               coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
29637 +                                  tfm_input_data(clust), tc->len,
29638 +                                  tfm_output_data(clust), &dst_len);
29639 +               /* check length (NOTE(review): by assertion only) */
29640 +               tc->len = dst_len;
29641 +               assert("edward-157", dst_len == tc->lsize);
29642 +               transformed = 1;
29643 +       }
29644 +       if (!transformed) /* nothing has been transformed */
29645 +               alternate_streams(tc);
29646 +       return result;
29647 +}
29648 +
29649 +/* Implementation of the readpage method of struct address_space_operations
29650 +   for cryptcompress plugin; the read itself is done by the CTAIL item plugin */
29651 +int readpage_cryptcompress(struct file *file, struct page *page)
29652 +{
29653 +       reiser4_context *ctx;
29654 +       struct cluster_handle clust;
29655 +       item_plugin *iplug;
29656 +       int result;
29657 +
29658 +       assert("edward-88", PageLocked(page));
29659 +       assert("vs-976", !PageUptodate(page));
29660 +       assert("edward-89", page->mapping && page->mapping->host);
29661 +       /* every early exit from this function unlocks @page first */
29662 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
29663 +       if (IS_ERR(ctx)) {
29664 +               unlock_page(page);
29665 +               return PTR_ERR(ctx);
29666 +       }
29667 +       assert("edward-113",
29668 +              ergo(file != NULL,
29669 +                   page->mapping == file->f_dentry->d_inode->i_mapping));
29670 +
29671 +       if (PageUptodate(page)) {
29672 +               warning("edward-1338", "page is already uptodate\n");
29673 +               unlock_page(page);
29674 +               reiser4_exit_context(ctx);
29675 +               return 0;
29676 +       }
29677 +       cluster_init_read(&clust, NULL);
29678 +       clust.file = file;
29679 +       iplug = item_plugin_by_id(CTAIL_ID);
29680 +       if (!iplug->s.file.readpage) {
29681 +               unlock_page(page);
29682 +               put_cluster_handle(&clust);
29683 +               reiser4_exit_context(ctx);
29684 +               return -EINVAL;
29685 +       }
29686 +       result = iplug->s.file.readpage(&clust, page);
29687 +       /* NOTE(review): presumably @page is unlocked by the callee - confirm */
29688 +       put_cluster_handle(&clust);
29689 +       reiser4_txn_restart(ctx);
29690 +       reiser4_exit_context(ctx);
29691 +       return result;
29692 +}
29693 +
29694 +/* How many pages of the page cluster get checked in, depending on clust->op */
29695 +static int get_new_nrpages(struct cluster_handle * clust)
29696 +{
29697 +       if (clust->op == LC_APPOV)
29698 +               /* append/overwrite: all pages of the handle */
29699 +               return clust->nr_pages;
29700 +       if (clust->op == LC_TRUNC) {
29701 +               assert("edward-1179", clust->win != NULL);
29702 +               /* truncate: pages covering the cluster up to the window end */
29703 +               return size_in_pages(clust->win->off + clust->win->count);
29704 +       }
29705 +       impossible("edward-1180", "bad page cluster option");
29706 +       return 0;
29707 +}
29708 +/* Dirty every page that is going to be checked in (see get_new_nrpages) */
29709 +static void set_cluster_pages_dirty(struct cluster_handle * clust,
29710 +                                   struct inode * inode)
29711 +{
29712 +       int i;
29713 +       struct page *pg;
29714 +       int nrpages = get_new_nrpages(clust);
29715 +
29716 +       for (i = 0; i < nrpages; i++) {
29717 +               /* the page is dirtied under its lock; it must be uptodate */
29718 +               pg = clust->pages[i];
29719 +               assert("edward-968", pg != NULL);
29720 +               lock_page(pg);
29721 +               assert("edward-1065", PageUptodate(pg));
29722 +               reiser4_set_page_dirty_internal(pg);
29723 +               unlock_page(pg);
29724 +               mark_page_accessed(pg);
29725 +       }
29726 +}
29727 +
29728 +/* Grab a page cluster for read/write operations.
29729 +   Attach a jnode for write operations (when preparing for modifications, which
29730 +   are supposed to be committed).
29731 +
29732 +   We allocate only one jnode per page cluster; this jnode is bound to the
29733 +   first page of this cluster, so we have an extra reference (that will be put
29734 +   as soon as jnode is evicted from memory), other references will be cleaned
29735 +   up in flush time (assume that check in page cluster was successful).
29736 +   Returns 0, or an error code after all grabbed pages have been put back. */
29737 +int grab_page_cluster(struct inode * inode,
29738 +                     struct cluster_handle * clust, rw_op rw)
29739 +{
29740 +       int i;
29741 +       int result = 0;
29742 +       jnode *node = NULL;
29743 +
29744 +       assert("edward-182", clust != NULL);
29745 +       assert("edward-183", clust->pages != NULL);
29746 +       assert("edward-1466", clust->node == NULL);
29747 +       assert("edward-1428", inode != NULL);
29748 +       assert("edward-1429", inode->i_mapping != NULL);
29749 +       assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
29750 +
29751 +       if (clust->nr_pages == 0)
29752 +               return 0;
29753 +
29754 +       for (i = 0; i < clust->nr_pages; i++) {
29755 +               assert("edward-1044", clust->pages[i] == NULL);
29756 +               clust->pages[i] =
29757 +                      find_or_create_page(inode->i_mapping,
29758 +                                          clust_to_pg(clust->index, inode) + i,
29759 +                                          reiser4_ctx_gfp_mask_get());
29760 +               if (!clust->pages[i]) {
29761 +                       result = RETERR(-ENOMEM);
29762 +                       break;
29763 +               }
29764 +               if (i == 0 && rw == WRITE_OP) {
29765 +                       node = jnode_of_page(clust->pages[i]);
29766 +                       if (IS_ERR(node)) {
29767 +                               result = PTR_ERR(node);
29768 +                               unlock_page(clust->pages[i]);
29769 +                               /* the rollback below starts under this page */
29770 +                               put_cluster_page(clust->pages[i]);
29771 +                               break;
29772 +                       }
29773 +                       JF_SET(node, JNODE_CLUSTER_PAGE);
29774 +                       assert("edward-920", jprivate(clust->pages[0]));
29775 +               }
29776 +               INODE_PGCOUNT_INC(inode);
29777 +               unlock_page(clust->pages[i]);
29778 +       }
29779 +       if (unlikely(result)) {
29780 +               while (i) {
29781 +                       put_cluster_page(clust->pages[--i]);
29782 +                       INODE_PGCOUNT_DEC(inode);
29783 +               }
29784 +               if (node && !IS_ERR(node))
29785 +                       jput(node);
29786 +               return result;
29787 +       }
29788 +       clust->node = node;
29789 +       return 0;
29790 +}
29791 +/* Invalidate @count pages of logical cluster @index, starting at page @from */
29792 +static void truncate_page_cluster_range(struct inode * inode,
29793 +                                       struct page ** pages, /* not used */
29794 +                                       cloff_t index,
29795 +                                       int from, int count,
29796 +                                       int even_cows)
29797 +{
29798 +       assert("edward-1467", count > 0);
29799 +       reiser4_invalidate_pages(inode->i_mapping,
29800 +                                clust_to_pg(index, inode) + from,
29801 +                                count, even_cows);
29802 +}
29803 +
29804 +/* Put @count pages of array @pages, starting from its element @from */
29805 +void __put_page_cluster(int from, int count,
29806 +                       struct page ** pages, struct inode  * inode)
29807 +{
29808 +       int i;
29809 +       assert("edward-1468", pages != NULL);
29810 +       assert("edward-1469", inode != NULL);
29811 +       assert("edward-1470", from >= 0 && count >= 0);
29812 +       /* the pages must be present and have consecutive page indexes */
29813 +       for (i = 0; i < count; i++) {
29814 +               assert("edward-1471", pages[from + i] != NULL);
29815 +               assert("edward-1472",
29816 +                      pages[from + i]->index == pages[from]->index + i);
29817 +               /* each put is paired with a decrement of inode's page count */
29818 +               put_cluster_page(pages[from + i]);
29819 +               INODE_PGCOUNT_DEC(inode);
29820 +       }
29821 +}
29822 +
29823 +/*
29824 + * This is dual to grab_page_cluster,
29825 + * however if @rw == WRITE_OP, then we call this function
29826 + * only if something has failed before the page cluster was checked in.
29827 + */
29828 +void put_page_cluster(struct cluster_handle * clust,
29829 +                     struct inode * inode, rw_op rw)
29830 +{
29831 +       assert("edward-445", clust != NULL);
29832 +       assert("edward-922", clust->pages != NULL);
29833 +       assert("edward-446",
29834 +              ergo(clust->nr_pages != 0, clust->pages[0] != NULL));
29835 +       /* put all the pages, then (for writes) the jnode reference, if any */
29836 +       __put_page_cluster(0, clust->nr_pages, clust->pages, inode);
29837 +       if (rw == WRITE_OP) {
29838 +               if (unlikely(clust->node)) {
29839 +                       assert("edward-447",
29840 +                              clust->node == jprivate(clust->pages[0]));
29841 +                       jput(clust->node);
29842 +                       clust->node = NULL;
29843 +               }
29844 +       }
29845 +}
29846 +/* Consistency checks, compiled in only when REISER4_DEBUG is set */
29847 +#if REISER4_DEBUG
29848 +int cryptcompress_inode_ok(struct inode *inode)
29849 +{
29850 +       if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
29851 +               return 0; /* PSET_FILE bit is not set in the plugin mask */
29852 +       if (!cluster_shift_ok(inode_cluster_shift(inode)))
29853 +               return 0; /* cluster shift is rejected by cluster_shift_ok() */
29854 +       return 1;
29855 +}
29856 +/* Returns 1 if window @win fits into one logical cluster of @inode */
29857 +static int window_ok(struct reiser4_slide * win, struct inode *inode)
29858 +{
29859 +       assert("edward-1115", win != NULL);
29860 +       assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
29861 +       /* must start before, and end (with delta) within, the cluster size */
29862 +       return (win->off != inode_cluster_size(inode)) &&
29863 +           (win->off + win->count + win->delta <= inode_cluster_size(inode));
29864 +}
29865 +/* Returns 1 if @clust has a page array and its window (if any) is ok */
29866 +static int cluster_ok(struct cluster_handle * clust, struct inode *inode)
29867 +{
29868 +       assert("edward-279", clust != NULL);
29869 +
29870 +       if (!clust->pages)
29871 +               return 0;
29872 +       return (clust->win ? window_ok(clust->win, inode) : 1);
29873 +}
29874 +#if 0
29875 +static int pages_truncate_ok(struct inode *inode, pgoff_t start)
29876 +{
29877 +       int found;
29878 +       struct page * page;
29879 +
29880 +       found = find_get_pages(inode->i_mapping, start, 1, &page);
29881 +       if (found)
29882 +               put_cluster_page(page);
29883 +       return !found;
29884 +}
29885 +#else
29886 +#define pages_truncate_ok(inode, start) 1
29887 +#endif
29888 +/* Returns 1 if jlookup() finds no jnode for the first page of the cluster */
29889 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
29890 +{
29891 +       jnode *node;
29892 +       node = jlookup(current_tree, get_inode_oid(inode),
29893 +                      clust_to_pg(index, inode));
29894 +       if (likely(!node))
29895 +               return 1;
29896 +       jput(node); /* presumably drops the reference taken by jlookup() */
29897 +       return 0;
29898 +}
29899 +
29900 +static int find_fake_appended(struct inode *inode, cloff_t * index);
29901 +/* Returns 1 if find_fake_appended() succeeds and reports index @aidx */
29902 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
29903 +{
29904 +       int result;
29905 +       cloff_t raidx;
29906 +
29907 +       result = find_fake_appended(inode, &raidx);
29908 +       return !result && (aidx == raidx);
29909 +}
29910 +#endif
29911 +
29912 +/* guess next window stat: a hole follows only a hole that has zero delta */
29913 +static inline window_stat next_window_stat(struct reiser4_slide * win)
29914 +{
29915 +       assert("edward-1130", win != NULL);
29916 +       return (win->stat != HOLE_WINDOW || win->delta != 0) ?
29917 +               DATA_WINDOW : HOLE_WINDOW;
29918 +}
29919 +
29920 +/* guess and set next cluster index and window params; no-op without window */
29921 +static void move_update_window(struct inode * inode,
29922 +                              struct cluster_handle * clust,
29923 +                              loff_t file_off, loff_t to_file)
29924 +{
29925 +       struct reiser4_slide * win;
29926 +
29927 +       assert("edward-185", clust != NULL);
29928 +       assert("edward-438", clust->pages != NULL);
29929 +       assert("edward-281", cluster_ok(clust, inode));
29930 +
29931 +       win = clust->win;
29932 +       if (!win)
29933 +               return;
29934 +       /* the current window state decides how the window moves */
29935 +       switch (win->stat) {
29936 +       case DATA_WINDOW:
29937 +               /* increment: go to the start of the next logical cluster */
29938 +               clust->index++;
29939 +               win->stat = DATA_WINDOW;
29940 +               win->off = 0;
29941 +               win->count = min((loff_t)inode_cluster_size(inode), to_file);
29942 +               break;
29943 +       case HOLE_WINDOW:
29944 +               switch (next_window_stat(win)) {
29945 +               case HOLE_WINDOW:
29946 +                       /* skip: jump to the cluster that contains @file_off */
29947 +                       clust->index = off_to_clust(file_off, inode);
29948 +                       win->stat = HOLE_WINDOW;
29949 +                       win->off = 0;
29950 +                       win->count = off_to_cloff(file_off, inode);
29951 +                       win->delta = min((loff_t)(inode_cluster_size(inode) -
29952 +                                                 win->count), to_file);
29953 +                       break;
29954 +               case DATA_WINDOW:
29955 +                       /* stay: same cluster, the delta becomes the window */
29956 +                       win->stat = DATA_WINDOW;
29957 +                       /* off+count+delta=inv */
29958 +                       win->off = win->off + win->count;
29959 +                       win->count = win->delta;
29960 +                       win->delta = 0;
29961 +                       break;
29962 +               default:
29963 +                       impossible("edward-282", "wrong next window state");
29964 +               }
29965 +               break;
29966 +       default:
29967 +               impossible("edward-283", "wrong current window state");
29968 +       }
29969 +       assert("edward-1068", cluster_ok(clust, inode));
29970 +}
29971 +/* Grab space, set ctime and mtime of @inode to now, update its stat data */
29972 +static int update_sd_cryptcompress(struct inode *inode)
29973 +{
29974 +       int result = 0;
29975 +
29976 +       assert("edward-978", reiser4_schedulable());
29977 +
29978 +       result = reiser4_grab_space_force(/* one for stat data update */
29979 +                                         estimate_update_common(inode),
29980 +                                         BA_CAN_COMMIT);
29981 +       if (result)
29982 +               return result; /* the times are left untouched */
29983 +       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
29984 +       result = reiser4_update_sd(inode);
29985 +
29986 +       return result;
29987 +}
29988 +/* Uncapture @node from its atom (if any); the caller holds the jnode lock */
29989 +static void uncapture_cluster_jnode(jnode * node)
29990 +{
29991 +       txn_atom *atom;
29992 +
29993 +       assert_spin_locked(&(node->guard));
29994 +
29995 +       atom = jnode_get_atom(node);
29996 +       if (atom == NULL) {
29997 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
29998 +               spin_unlock_jnode(node);
29999 +               return;
30000 +       }
30001 +       reiser4_uncapture_block(node); /* presumably unlocks the jnode */
30002 +       spin_unlock_atom(atom);
30003 +       jput(node);
30004 +}
30005 +/* Put each of the first @nr pages of @pages (none of them may be NULL) */
30006 +static void put_found_pages(struct page **pages, int nr)
30007 +{
30008 +       int i;
30009 +       for (i = 0; i < nr; i++) {
30010 +               assert("edward-1045", pages[i] != NULL);
30011 +               put_cluster_page(pages[i]);
30012 +       }
30013 +}
30014 +
30015 +/*             Lifecycle of a logical cluster in the system.
30016 + *
30017 + *
30018 + * Logical cluster of a cryptcompress file is represented in the system by
30019 + * . page cluster (in memory, primary cache, contains plain text);
30020 + * . disk cluster (in memory, secondary cache, contains transformed text).
30021 + * Primary cache is to reduce number of transform operations (compression,
30022 + * encryption), i.e. to implement transform-caching strategy.
30023 + * Secondary cache is to reduce number of I/O operations, i.e. for usual
30024 + * write-caching strategy. Page cluster is a set of pages, i.e. mapping of
30025 + * a logical cluster to the primary cache. Disk cluster is a set of items
30026 + * of the same type defined by some reiser4 item plugin id.
30027 + *
30028 + *              1. Performing modifications
30029 + *
30030 + * Every modification of a cryptcompress file is considered as a set of
30031 + * operations performed on file's logical clusters. Every such "atomic"
30032 + * modification is truncate, append and(or) overwrite some bytes of a
30033 + * logical cluster performed in the primary cache with the following
30034 + * synchronization with the secondary cache (in flush time). Disk clusters,
30035 + * which live in the secondary cache, are supposed to be synchronized with
30036 + * disk. The mechanism of synchronization of primary and secondary caches
30037 + * includes so-called checkin/checkout technique described below.
30038 + *
30039 + *              2. Submitting modifications
30040 + *
30041 + * Each page cluster has associated jnode (a special in-memory header to
30042 + * keep a track of transactions in reiser4), which is attached to its first
30043 + * page when grabbing page cluster for modifications (see grab_page_cluster).
30044 + * Submitting modifications (see checkin_logical_cluster) is going per logical
30045 + * cluster and includes:
30046 + * . checkin_cluster_size;
30047 + * . checkin_page_cluster.
30048 + * checkin_cluster_size() is resolved to file size update, which completely
30049 + * defines new size of logical cluster (number of file's bytes in a logical
30050 + * cluster).
30051 + * checkin_page_cluster() captures jnode of a page cluster and installs
30052 + * jnode's dirty flag (if needed) to indicate that modifications are
30053 + * successfully checked in.
30054 + *
30055 + *              3. Checking out modifications
30056 + *
30057 + * Is going per logical cluster in flush time (see checkout_logical_cluster).
30058 + * This is the time of synchronizing primary and secondary caches.
30059 + * checkout_logical_cluster() includes:
30060 + * . checkout_page_cluster (retrieving checked in pages).
30061 + * . uncapture jnode (including clear dirty flag and unlock)
30062 + *
30063 + *              4. Committing modifications
30064 + *
30065 + * Proceeds with synchronization of primary and secondary caches. When checking
30066 + * out page cluster (the phase above) pages are locked/flushed/unlocked
30067 + * one-by-one in ascending order of their indexes to contiguous stream, which
30068 + * is supposed to be transformed (compressed, encrypted), chopped up into items
30069 + * and committed to disk as a disk cluster.
30070 + *
30071 + *              5. Managing page references
30072 + *
30073 + * Every checked in page has a special additional "control" reference,
30074 + * which is dropped at checkout. We need this to avoid unexpected evicting
30075 + * pages from memory before checkout. Control references are managed so
30076 + * they are not accumulated with every checkin:
30077 + *
30078 + *            0
30079 + * checkin -> 1
30080 + *            0 -> checkout
30081 + * checkin -> 1
30082 + * checkin -> 1
30083 + * checkin -> 1
30084 + *            0 -> checkout
30085 + *           ...
30086 + *
30087 + * Every page cluster has its own unique "cluster lock". Update/drop
30088 + * references are serialized via this lock. Number of checked in cluster
30089 + * pages is calculated by i_size under cluster lock. File size is updated
30090 + * at every checkin action also under cluster lock (except cases of
30091 + * appending/truncating fake logical clusters).
30092 + *
30093 + * Proof of correctness:
30094 + *
30095 + * Since we update file size under cluster lock, in the case of non-fake
30096 + * logical cluster with its lock held we do have expected number of checked
30097 + * in pages. On the other hand, append/truncate of fake logical clusters
30098 + * doesn't change number of checked in pages of any cluster.
30099 + *
30100 + * NOTE-EDWARD: As cluster lock we use guard (spinlock_t) of its jnode.
30101 + * Currently, I don't see any reason to create a special lock for those
30102 + * needs.
30103 + */
30104 +/* Take the cluster lock, i.e. the spin lock of the cluster's jnode */
30105 +static inline void lock_cluster(jnode * node)
30106 +{
30107 +       spin_lock_jnode(node);
30108 +}
30109 +/* Release the cluster lock, i.e. the spin lock of the cluster's jnode */
30110 +static inline void unlock_cluster(jnode * node)
30111 +{
30112 +       spin_unlock_jnode(node);
30113 +}
30114 +/* Uncapture the cluster's jnode; uncapture_cluster_jnode() needs it locked */
30115 +static inline void unlock_cluster_uncapture(jnode * node)
30116 +{
30117 +       uncapture_cluster_jnode(node);
30118 +}
30119 +
30120 +/* Set new file size by window. Cluster lock is required. */
30121 +static void checkin_file_size(struct cluster_handle * clust,
30122 +                             struct inode * inode)
30123 +{
30124 +       loff_t new_size;
30125 +       struct reiser4_slide * win;
30126 +
30127 +       assert("edward-1181", clust != NULL);
30128 +       assert("edward-1182", inode != NULL);
30129 +       assert("edward-1473", clust->pages != NULL);
30130 +       assert("edward-1474", clust->pages[0] != NULL);
30131 +       assert("edward-1475", jprivate(clust->pages[0]) != NULL);
30132 +       assert_spin_locked(&(jprivate(clust->pages[0])->guard));
30133 +
30134 +       /* the lock checked above is the guard of the first page's jnode */
30135 +       win = clust->win;
30136 +       assert("edward-1183", win != NULL);
30137 +       /* start from the file offset of the beginning of the window */
30138 +       new_size = clust_to_off(clust->index, inode) + win->off;
30139 +
30140 +       switch (clust->op) {
30141 +       case LC_APPOV:
30142 +               if (new_size + win->count <= i_size_read(inode))
30143 +                       /* overwrite only */
30144 +                       return;
30145 +               new_size += win->count; /* file ends where the window ends */
30146 +               break;
30147 +       case LC_TRUNC:
30148 +               break; /* file ends where the window starts */
30149 +       default:
30150 +               impossible("edward-1184", "bad page cluster option");
30151 +               break;
30152 +       }
30153 +       inode_check_scale_nolock(inode, i_size_read(inode), new_size);
30154 +       i_size_write(inode, new_size);
30155 +       return;
30156 +}
30157 +/* Check in the file size, which is done only for clusters with a window */
30158 +static inline void checkin_cluster_size(struct cluster_handle * clust,
30159 +                                       struct inode * inode)
30160 +{
30161 +       if (clust->win)
30162 +               checkin_file_size(clust, inode);
30163 +}
30164 +/* Check in a page cluster; the cluster lock is dropped here on every path */
30165 +static int checkin_page_cluster(struct cluster_handle * clust,
30166 +                               struct inode * inode)
30167 +{
30168 +       int result;
30169 +       jnode * node;
30170 +       int old_nrpages = clust->old_nrpages;
30171 +       int new_nrpages = get_new_nrpages(clust);
30172 +
30173 +       node = clust->node;
30174 +
30175 +       assert("edward-221", node != NULL);
30176 +       assert("edward-971", clust->reserved == 1);
30177 +       assert("edward-1263",
30178 +              clust->reserved_prepped == estimate_update_cluster(inode));
30179 +       assert("edward-1264", clust->reserved_unprepped == 0);
30180 +
30181 +       if (JF_ISSET(node, JNODE_DIRTY)) {
30182 +               /*
30183 +                * page cluster was checked in, but not yet
30184 +                * checked out, so release related resources
30185 +                */
30186 +               free_reserved4cluster(inode, clust,
30187 +                                     estimate_update_cluster(inode));
30188 +               __put_page_cluster(0, clust->old_nrpages,
30189 +                                  clust->pages, inode);
30190 +       } else {
30191 +               result = capture_cluster_jnode(node);
30192 +               if (unlikely(result)) {
30193 +                       unlock_cluster(node);
30194 +                       return result;
30195 +               }
30196 +               jnode_make_dirty_locked(node);
30197 +               clust->reserved = 0;
30198 +       }
30199 +       unlock_cluster(node);
30200 +       /* pages beyond the new page count are put and then invalidated */
30201 +       if (new_nrpages < old_nrpages) {
30202 +               /* truncate >= 1 complete pages */
30203 +               __put_page_cluster(new_nrpages,
30204 +                                  old_nrpages - new_nrpages,
30205 +                                  clust->pages, inode);
30206 +               truncate_page_cluster_range(inode,
30207 +                                           clust->pages, clust->index,
30208 +                                           new_nrpages,
30209 +                                           old_nrpages - new_nrpages,
30210 +                                           0);
30211 +       }
30212 +#if REISER4_DEBUG
30213 +       clust->reserved_prepped -= estimate_update_cluster(inode);
30214 +#endif
30215 +       return 0;
30216 +}
30217 +
30218 +/* Check in (submit) modifications of a logical cluster and put its jnode */
30219 +static int checkin_logical_cluster(struct cluster_handle * clust,
30220 +                                  struct inode *inode)
30221 +{
30222 +       int result = 0;
30223 +       jnode * node;
30224 +
30225 +       assert("edward-1029", clust != NULL); /* check before first deref */
30226 +
30227 +       node = clust->node;
30228 +       assert("edward-1035", node != NULL);
30229 +       assert("edward-1030", clust->reserved == 1);
30230 +       assert("edward-1031", clust->nr_pages != 0);
30231 +       assert("edward-1032", clust->pages != NULL);
30232 +       assert("edward-1033", clust->pages[0] != NULL);
30233 +       assert("edward-1446", jnode_is_cluster_page(node));
30234 +       assert("edward-1476", node == jprivate(clust->pages[0]));
30235 +
30236 +       lock_cluster(node);
30237 +       checkin_cluster_size(clust, inode);
30238 +       /* this will unlock cluster on success and on failure */
30239 +       result = checkin_page_cluster(clust, inode);
30240 +       jput(node);
30241 +       clust->node = NULL; /* the handle no longer refers to the jnode */
30242 +       return result;
30243 +}
30244 +
30245 +/*
30246 + * Set tc->len to the current lbytes() of the logical cluster, i.e. the
30247 + * size checked in at the latest modifying session (cluster lock is required)
30248 + */
30249 +static inline void checkout_cluster_size(struct cluster_handle * clust,
30250 +                                        struct inode * inode)
30251 +{
30252 +       struct tfm_cluster *tc = &clust->tc;
30253 +
30254 +       tc->len = lbytes(clust->index, inode);
30255 +       assert("edward-1478", tc->len != 0);
30256 +}
30257 +
30258 +/*
30259 + * Retrieve a page cluster with the latest submitted modifications and copy
30260 + * its pages into the INPUT_STREAM of clust->tc. Unlocks (uncaptures) @node.
30261 + */
30262 +static void checkout_page_cluster(struct cluster_handle * clust,
30263 +                                 jnode * node, struct inode * inode)
30264 +{
30265 +       int i;
30266 +       int found;
30267 +       int to_put;
30268 +       struct tfm_cluster *tc = &clust->tc;
30269 +
30270 +       /* find and put checked in pages: the cluster is locked, so
30271 +        * exactly the expected number (to_put) of pages must be found
30272 +        */
30273 +       to_put = size_in_pages(lbytes(clust->index, inode));
30274 +       found = find_get_pages(inode->i_mapping,
30275 +                              clust_to_pg(clust->index, inode),
30276 +                              to_put, clust->pages);
30277 +       BUG_ON(found != to_put);
30278 +
30279 +       __put_page_cluster(0, to_put, clust->pages, inode);
30280 +       unlock_cluster_uncapture(node);
30281 +
30282 +       /* Flush found pages.
30283 +        *
30284 +        * Note that modifications are not disabled while flushing;
30285 +        * moreover, some of the found pages can be truncated, as the
30286 +        * cluster lock has been released.
30287 +        */
30288 +       for (i = 0; i < found; i++) {
30289 +               int in_page;
30290 +               char * data;
30291 +               assert("edward-1479",
30292 +                      clust->pages[i]->index == clust->pages[0]->index + i);
30293 +
30294 +               lock_page(clust->pages[i]);
30295 +               if (!PageUptodate(clust->pages[i])) {
30296 +                       /* page was truncated */
30297 +                       assert("edward-1480",
30298 +                              i_size_read(inode) <= page_offset(clust->pages[i]));
30299 +                       assert("edward-1481",
30300 +                              clust->pages[i]->mapping != inode->i_mapping);
30301 +                       unlock_page(clust->pages[i]);
30302 +                       break; /* stop at the first truncated page */
30303 +               }
30304 +               /* Update the number of bytes in the logical cluster,
30305 +                * as it could be partially truncated. Note that only
30306 +                * partial truncate is possible (complete truncate can
30307 +                * not get here, as it is performed via ->kill_hook()
30308 +                * called by cut_file_items(), and the latter must
30309 +                * wait for znode locked with parent coord).
30310 +                */
30311 +               checkout_cluster_size(clust, inode);
30312 +
30313 +               /* this can be zero, as new file size is
30314 +                  checked in before truncating pages */
30315 +               in_page = __mbp(tc->len, i);
30316 +
30317 +               data = kmap(clust->pages[i]);
30318 +               memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
30319 +                      data, in_page);
30320 +               kunmap(clust->pages[i]);
30321 +
30322 +               if (PageDirty(clust->pages[i]))
30323 +                       cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
30324 +
30325 +               unlock_page(clust->pages[i]);
30326 +
30327 +               if (in_page < PAGE_CACHE_SIZE)
30328 +                       /* end of the file */
30329 +                       break;
30330 +       }
30331 +       put_found_pages(clust->pages, found); /* refs from find_get_pages() */
30332 +       tc->lsize = tc->len; /* NOTE(review): tc->len unset if loop broke at i == 0 */
30333 +       return;
30334 +}
30335 +
30336 +/* Check out modifications of a logical cluster; 0 or RETERR(-E_REPEAT) */
30337 +int checkout_logical_cluster(struct cluster_handle * clust,
30338 +                            jnode * node, struct inode *inode)
30339 +{
30340 +       int result;
30341 +       struct tfm_cluster *tc = &clust->tc;
30342 +
30343 +       assert("edward-980", node != NULL);
30344 +       assert("edward-236", inode != NULL);
30345 +       assert("edward-237", clust != NULL);
30346 +       assert("edward-240", !clust->win);
30347 +       assert("edward-241", reiser4_schedulable());
30348 +       assert("edward-718", cryptcompress_inode_ok(inode));
30349 +
30350 +       result = grab_tfm_stream(inode, tc, INPUT_STREAM);
30351 +       if (result) {
30352 +               warning("edward-1430", "alloc stream failed with ret=%d",
30353 +                       result);
30354 +               return RETERR(-E_REPEAT); /* real error is only logged */
30355 +       }
30356 +       lock_cluster(node);
30357 +
30358 +       if (unlikely(!JF_ISSET(node, JNODE_DIRTY))) {
30359 +               /* not dirty any more: race with another flush */
30360 +               warning("edward-982",
30361 +                       "checking out logical cluster %lu of inode %llu: "
30362 +                       "jnode is not dirty", clust->index,
30363 +                       (unsigned long long)get_inode_oid(inode));
30364 +               unlock_cluster(node);
30365 +               return RETERR(-E_REPEAT);
30366 +       }
30367 +       cluster_reserved2grabbed(estimate_update_cluster(inode));
30368 +
30369 +       /* this will unlock cluster */
30370 +       checkout_page_cluster(clust, node, inode);
30371 +       return 0;
30372 +}
30373 +
30374 +/* Set @hint for the cluster @index: init its seal, key offset and lock mode */
30375 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
30376 +                            cloff_t index, znode_lock_mode mode)
30377 +{
30378 +       reiser4_key key;
30379 +       assert("edward-722", cryptcompress_inode_ok(inode));
30380 +       assert("edward-723",
30381 +              inode_file_plugin(inode) ==
30382 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
30383 +
30384 +       inode_file_plugin(inode)->key_by_inode(inode,
30385 +                                              clust_to_off(index, inode),
30386 +                                              &key);
30387 +
30388 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
30389 +       hint->offset = get_key_offset(&key);
30390 +       hint->mode = mode;
30391 +}
30392 +
30393 +void invalidate_hint_cluster(struct cluster_handle * clust)
30394 +{
30395 +       assert("edward-1291", clust != NULL);
30396 +       assert("edward-1292", clust->hint != NULL);
30397 +
30398 +       done_lh(&clust->hint->lh); /* finish with the hint's lock handle */
30399 +       hint_clr_valid(clust->hint); /* mark the hint as not valid */
30400 +}
30401 +
30402 +static void put_hint_cluster(struct cluster_handle * clust,
30403 +                            struct inode *inode, znode_lock_mode mode)
30404 +{
30405 +       assert("edward-1286", clust != NULL);
30406 +       assert("edward-1287", clust->hint != NULL);
30407 +
30408 +       set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
30409 +       invalidate_hint_cluster(clust); /* hint now aims at the next cluster */
30410 +}
30411 +
30412 +static int balance_dirty_page_cluster(struct cluster_handle * clust,
30413 +                                     struct inode *inode, loff_t off,
30414 +                                     loff_t to_file,
30415 +                                     int nr_dirtied)
30416 +{
30417 +       int result;
30418 +       struct cryptcompress_info * info;
30419 +
30420 +       assert("edward-724", inode != NULL);
30421 +       assert("edward-725", cryptcompress_inode_ok(inode));
30422 +       assert("edward-1547",
30423 +              nr_dirtied != 0 && nr_dirtied <= cluster_nrpages(inode));
30424 +
30425 +       /* set next window params before balancing dirty pages */
30426 +       move_update_window(inode, clust, off, to_file);
30427 +
30428 +       result = update_sd_cryptcompress(inode);
30429 +       if (result)
30430 +               return result; /* checkin_mutex is not touched on this path */
30431 +       assert("edward-726", clust->hint->lh.owner == NULL);
30432 +       info = cryptcompress_inode_data(inode);
30433 +       /* drop checkin_mutex around txn restart and dirty page balancing */
30434 +       mutex_unlock(&info->checkin_mutex);
30435 +       reiser4_txn_restart_current();
30436 +       balance_dirty_pages_ratelimited_nr(inode->i_mapping, nr_dirtied);
30437 +       mutex_lock(&info->checkin_mutex); /* return with it held again */
30438 +       return 0;
30439 +}
30440 +
30441 +/* Zero the hole window inside the page cluster, dirty its pages and, if the
30442 +   window carries only zeroes (no delta), check in the logical cluster */
30443 +static int write_hole(struct inode *inode, struct cluster_handle * clust,
30444 +                     loff_t file_off, loff_t to_file)
30445 +{
30446 +       int result = 0;
30447 +       unsigned cl_off, cl_count = 0;
30448 +       unsigned to_pg, pg_off;
30449 +       struct reiser4_slide * win;
30450 +
30451 +       assert("edward-190", clust != NULL);
30452 +       assert("edward-1069", clust->win != NULL);
30453 +       assert("edward-191", inode != NULL);
30454 +       assert("edward-727", cryptcompress_inode_ok(inode));
30455 +       assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
30456 +       assert("edward-1154",
30457 +              ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
30458 +
30459 +       win = clust->win;
30460 +
30461 +       assert("edward-1070", win != NULL);
30462 +       assert("edward-201", win->stat == HOLE_WINDOW);
30463 +       assert("edward-192", cluster_ok(clust, inode));
30464 +
30465 +       if (win->off == 0 && win->count == inode_cluster_size(inode)) {
30466 +               /* The hole covers the whole logical cluster: it will be
30467 +                * represented by a "fake" logical cluster, i.e. one which
30468 +                * has no disk cluster until someone modifies this logical
30469 +                * cluster and makes it dirty.
30470 +                * So just move the window forward here.
30471 +                */
30472 +               move_update_window(inode, clust, file_off, to_file);
30473 +               return 0;
30474 +       }
30475 +       cl_count = win->count;  /* number of zeroes to write */
30476 +       cl_off = win->off;      /* where the zeroes start in the cluster */
30477 +       pg_off = off_to_pgoff(win->off);
30478 +
30479 +       while (cl_count) {
30480 +               struct page *page;
30481 +               page = clust->pages[off_to_pg(cl_off)];
30482 +
30483 +               assert("edward-284", page != NULL);
30484 +
30485 +               to_pg = min((typeof(pg_off))PAGE_CACHE_SIZE - pg_off, cl_count);
30486 +               lock_page(page);
30487 +               zero_user(page, pg_off, to_pg);
30488 +               SetPageUptodate(page);
30489 +               reiser4_set_page_dirty_internal(page);
30490 +               mark_page_accessed(page);
30491 +               unlock_page(page);
30492 +
30493 +               cl_off += to_pg;
30494 +               cl_count -= to_pg;
30495 +               pg_off = 0; /* following pages are zeroed from their start */
30496 +       }
30497 +       if (!win->delta) {
30498 +               /* only zeroes in this window: check the cluster in now
30499 +                */
30500 +               result = checkin_logical_cluster(clust, inode);
30501 +               if (result)
30502 +                       return result;
30503 +               put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
30504 +               result = balance_dirty_page_cluster(clust,
30505 +                                                   inode, file_off, to_file,
30506 +                                                   win_count_to_nrpages(win));
30507 +       } else
30508 +               move_update_window(inode, clust, file_off, to_file);
30509 +       return result;
30510 +}
30511 +
30512 +/*
30513 +  The main disk search procedure for cryptcompress plugin, which
30514 +  . scans all items of disk cluster with the lock mode @mode
30515 +  . maybe reads each one into the INPUT_STREAM of clust->tc (if @read)
30516 +  . maybe makes its znode dirty (if write lock mode was specified)
30517 +  . sets clust->dstat on success; blocks grabbed here are freed on return
30518 +  NOTE-EDWARD: Callers should handle the case when disk cluster
30519 +  is incomplete (-EIO)
30520 +*/
30521 +int find_disk_cluster(struct cluster_handle * clust,
30522 +                     struct inode *inode, int read, znode_lock_mode mode)
30523 +{
30524 +       flow_t f;
30525 +       hint_t *hint;
30526 +       int result = 0;
30527 +       int was_grabbed;
30528 +       ra_info_t ra_info;
30529 +       file_plugin *fplug;
30530 +       item_plugin *iplug;
30531 +       struct tfm_cluster *tc;
30532 +       struct cryptcompress_info * info;
30533 +
30534 +       assert("edward-138", clust != NULL);
30535 +       assert("edward-728", clust->hint != NULL);
30536 +       assert("edward-226", reiser4_schedulable());
30537 +       assert("edward-137", inode != NULL);
30538 +       assert("edward-729", cryptcompress_inode_ok(inode));
30539 +
30540 +       hint = clust->hint;
30541 +       fplug = inode_file_plugin(inode);
30542 +       was_grabbed = get_current_context()->grabbed_blocks;
30543 +       info = cryptcompress_inode_data(inode);
30544 +       tc = &clust->tc;
30545 +
30546 +       assert("edward-462", !tfm_cluster_is_uptodate(tc));
30547 +       assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
30548 +
30549 +       dclust_init_extension(hint);
30550 +
30551 +       /* set key of the first disk cluster item */
30552 +       fplug->flow_by_inode(inode,
30553 +                            (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
30554 +                            0 /* kernel space */ ,
30555 +                            inode_scaled_cluster_size(inode),
30556 +                            clust_to_off(clust->index, inode), READ_OP, &f);
30557 +       if (mode == ZNODE_WRITE_LOCK) {
30558 +               /* reserve for flush to make dirty all the leaf nodes
30559 +                  which contain disk cluster */
30560 +               result =
30561 +                   reiser4_grab_space_force(estimate_dirty_cluster(inode),
30562 +                                            BA_CAN_COMMIT);
30563 +               if (result)
30564 +                       goto out;
30565 +       }
30566 +
30567 +       ra_info.key_to_stop = f.key;
30568 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30569 +
30570 +       while (f.length) {
30571 +               result = find_cluster_item(hint, &f.key, mode,
30572 +                                          NULL, FIND_EXACT,
30573 +                                          (mode == ZNODE_WRITE_LOCK ?
30574 +                                           CBK_FOR_INSERT : 0));
30575 +               switch (result) {
30576 +               case CBK_COORD_NOTFOUND:
30577 +                       result = 0;
30578 +                       if (inode_scaled_offset
30579 +                           (inode, clust_to_off(clust->index, inode)) ==
30580 +                           get_key_offset(&f.key)) {
30581 +                               /* first item not found: this is treated
30582 +                                  as an absent disk cluster */
30583 +                               clust->dstat = FAKE_DISK_CLUSTER;
30584 +                               goto out;
30585 +                       }
30586 +                       /* we are outside the cluster, stop search here */
30587 +                       assert("edward-146",
30588 +                              f.length != inode_scaled_cluster_size(inode));
30589 +                       goto ok;
30590 +               case CBK_COORD_FOUND:
30591 +                       assert("edward-148",
30592 +                              hint->ext_coord.coord.between == AT_UNIT);
30593 +                       assert("edward-460",
30594 +                              hint->ext_coord.coord.unit_pos == 0);
30595 +
30596 +                       coord_clear_iplug(&hint->ext_coord.coord);
30597 +                       result = zload_ra(hint->ext_coord.coord.node, &ra_info);
30598 +                       if (unlikely(result))
30599 +                               goto out;
30600 +                       iplug = item_plugin_by_coord(&hint->ext_coord.coord);
30601 +                       assert("edward-147",
30602 +                              item_id_by_coord(&hint->ext_coord.coord) ==
30603 +                              CTAIL_ID);
30604 +
30605 +                       result = iplug->s.file.read(NULL, &f, hint);
30606 +                       if (result) {
30607 +                               zrelse(hint->ext_coord.coord.node);
30608 +                               goto out;
30609 +                       }
30610 +                       if (mode == ZNODE_WRITE_LOCK) {
30611 +                               /* Don't make dirty more nodes than it was
30612 +                                  estimated (see comments before
30613 +                                  estimate_dirty_cluster). Missed nodes will be
30614 +                                  read up in flush time if they are evicted from
30615 +                                  memory */
30616 +                               if (dclust_get_extension_ncount(hint) <=
30617 +                                   estimate_dirty_cluster(inode))
30618 +                                  znode_make_dirty(hint->ext_coord.coord.node);
30619 +
30620 +                               znode_set_convertible(hint->ext_coord.coord.
30621 +                                                     node);
30622 +                       }
30623 +                       zrelse(hint->ext_coord.coord.node);
30624 +                       break;
30625 +               default:
30626 +                       goto out; /* lookup error, returned as is */
30627 +               }
30628 +       }
30629 + ok:
30630 +       /* at least one item was found  */
30631 +       /* NOTE-EDWARD: Callers should handle the case
30632 +          when disk cluster is incomplete (-EIO) */
30633 +       tc->len = inode_scaled_cluster_size(inode) - f.length;
30634 +       tc->lsize = lbytes(clust->index, inode);
30635 +       assert("edward-1196", tc->len > 0);
30636 +       assert("edward-1406", tc->lsize > 0);
30637 +
30638 +       if (hint_is_unprepped_dclust(clust->hint)) {
30639 +               clust->dstat = UNPR_DISK_CLUSTER;
30640 +       } else if (clust->index == info->trunc_index) {
30641 +               clust->dstat = TRNC_DISK_CLUSTER;
30642 +       } else {
30643 +               clust->dstat = PREP_DISK_CLUSTER;
30644 +               dclust_set_extension_dsize(clust->hint, tc->len);
30645 +       }
30646 + out:
30647 +       assert("edward-1339",
30648 +              get_current_context()->grabbed_blocks >= was_grabbed);
30649 +       grabbed2free(get_current_context(), /* return blocks grabbed here */
30650 +                    get_current_super_private(),
30651 +                    get_current_context()->grabbed_blocks - was_grabbed);
30652 +       return result;
30653 +}
30654 +
30655 +int get_disk_cluster_locked(struct cluster_handle * clust, struct inode *inode,
30656 +                           znode_lock_mode lock_mode)
30657 +{
30658 +       reiser4_key key;
30659 +       ra_info_t ra_info;
30660 +
30661 +       assert("edward-730", reiser4_schedulable());
30662 +       assert("edward-731", clust != NULL);
30663 +       assert("edward-732", inode != NULL);
30664 +
30665 +       if (hint_is_valid(clust->hint)) {
30666 +               assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
30667 +               assert("edward-1294",
30668 +                      znode_is_write_locked(clust->hint->lh.node));
30669 +               /* already have a valid locked position: answer from dstat */
30670 +               return (clust->dstat ==
30671 +                       FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
30672 +                       CBK_COORD_FOUND);
30673 +       }
30674 +       key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
30675 +                                  &key);
30676 +       ra_info.key_to_stop = key; /* NOTE(review): ra_info is never used */
30677 +       set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
30678 +       /* no valid hint: look up the key of the cluster's start offset */
30679 +       return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
30680 +                                CBK_FOR_INSERT);
30681 +}
30682 +
30683 +/* Read the cluster pages that are needed before modifying them.
30684 +   On success, @clust->hint contains a locked position in the tree.
30685 +   Also:
30686 +   . find and set disk cluster state
30687 +   . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
30688 +*/
30689 +static int read_some_cluster_pages(struct inode * inode,
30690 +                                  struct cluster_handle * clust)
30691 +{
30692 +       int i;
30693 +       int result = 0;
30694 +       item_plugin *iplug;
30695 +       struct reiser4_slide * win = clust->win;
30696 +       znode_lock_mode mode = ZNODE_WRITE_LOCK;
30697 +
30698 +       iplug = item_plugin_by_id(CTAIL_ID); /* NOTE(review): never used below */
30699 +
30700 +       assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
30701 +
30702 +#if REISER4_DEBUG
30703 +       if (clust->nr_pages == 0) {
30704 +               /* start write hole from fake disk cluster */
30705 +               assert("edward-1117", win != NULL);
30706 +               assert("edward-1118", win->stat == HOLE_WINDOW);
30707 +               assert("edward-1119", new_logical_cluster(clust, inode));
30708 +       }
30709 +#endif
30710 +       if (new_logical_cluster(clust, inode)) {
30711 +               /*
30712 +                  new page cluster is about to be written, nothing to read,
30713 +                */
30714 +               assert("edward-734", reiser4_schedulable());
30715 +               assert("edward-735", clust->hint->lh.owner == NULL);
30716 +
30717 +               if (clust->nr_pages) {
30718 +                       int off; /* NOTE(review): win deref'd w/o NULL check */
30719 +                       struct page * pg;
30720 +                       assert("edward-1419", clust->pages != NULL);
30721 +                       pg = clust->pages[clust->nr_pages - 1];
30722 +                       assert("edward-1420", pg != NULL);
30723 +                       off = off_to_pgoff(win->off+win->count+win->delta);
30724 +                       if (off) {
30725 +                               lock_page(pg);
30726 +                               zero_user_segment(pg, off, PAGE_CACHE_SIZE);
30727 +                               unlock_page(pg);
30728 +                       }
30729 +               }
30730 +               clust->dstat = FAKE_DISK_CLUSTER;
30731 +               return 0;
30732 +       }
30733 +       /*
30734 +          Here we should search for disk cluster to figure out its real state.
30735 +          Also there is one more important reason to do disk search: we need
30736 +          to make disk cluster _dirty_ if it exists
30737 +        */
30738 +
30739 +       /* if a window is specified, read only the pages
30740 +          that will be modified partially */
30741 +
30742 +       for (i = 0; i < clust->nr_pages; i++) {
30743 +               struct page *pg = clust->pages[i];
30744 +
30745 +               lock_page(pg);
30746 +               if (PageUptodate(pg)) {
30747 +                       unlock_page(pg);
30748 +                       continue;
30749 +               }
30750 +               unlock_page(pg);
30751 +
30752 +               if (win &&
30753 +                   i >= size_in_pages(win->off) &&
30754 +                   i < off_to_pg(win->off + win->count + win->delta))
30755 +                       /* page will be completely overwritten */
30756 +                       continue;
30757 +
30758 +               if (win && (i == clust->nr_pages - 1) &&
30759 +                   /* the last page is
30760 +                      partially modified,
30761 +                      not uptodate .. */
30762 +                   (size_in_pages(i_size_read(inode)) <= pg->index)) {
30763 +                       /* .. and appended,
30764 +                          so set zeroes to the rest */
30765 +                       int offset;
30766 +                       lock_page(pg);
30767 +                       assert("edward-1260",
30768 +                              size_in_pages(win->off + win->count +
30769 +                                            win->delta) - 1 == i);
30770 +
30771 +                       offset =
30772 +                           off_to_pgoff(win->off + win->count + win->delta);
30773 +                       zero_user_segment(pg, offset, PAGE_CACHE_SIZE);
30774 +                       unlock_page(pg);
30775 +                       /* still not uptodate */
30776 +                       break;
30777 +               }
30778 +               lock_page(pg);
30779 +               result = do_readpage_ctail(inode, clust, pg, mode);
30780 +
30781 +               assert("edward-1526", ergo(!result, PageUptodate(pg)));
30782 +               unlock_page(pg);
30783 +               if (result) {
30784 +                       warning("edward-219", "do_readpage_ctail failed");
30785 +                       goto out;
30786 +               }
30787 +       }
30788 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
30789 +               /* disk cluster unclaimed, but we need to make its znodes dirty
30790 +                * so that flush will convert its content
30791 +                */
30792 +               result = find_disk_cluster(clust, inode,
30793 +                                          0 /* do not read items */,
30794 +                                          mode);
30795 +       }
30796 + out:
30797 +       tfm_cluster_clr_uptodate(&clust->tc);
30798 +       return result;
30799 +}
30800 +
30801 +static int should_create_unprepped_cluster(struct cluster_handle * clust,
30802 +                                          struct inode * inode)
30803 +{
30804 +       assert("edward-737", clust != NULL);
30805 +       /* Returns 1 only for a FAKE disk cluster that is not a pageless hole */
30806 +       switch (clust->dstat) {
30807 +       case PREP_DISK_CLUSTER:
30808 +       case UNPR_DISK_CLUSTER:
30809 +               return 0; /* disk cluster items already exist */
30810 +       case FAKE_DISK_CLUSTER:
30811 +               if (clust->win &&
30812 +                   clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
30813 +                       assert("edward-1172",
30814 +                              new_logical_cluster(clust, inode));
30815 +                       return 0; /* hole window without pages: stays fake */
30816 +               }
30817 +               return 1;
30818 +       default:
30819 +               impossible("edward-1173", "bad disk cluster state");
30820 +               return 0;
30821 +       }
30822 +}
30823 +
30824 +static int cryptcompress_make_unprepped_cluster(struct cluster_handle * clust,
30825 +                                               struct inode *inode)
30826 +{
30827 +       int result;
30828 +       /* Insert an unprepped disk cluster if one is needed; 0 or error. */
30829 +       assert("edward-1123", reiser4_schedulable());
30830 +       assert("edward-737", clust != NULL); /* NOTE(review): reused label */
30831 +       assert("edward-738", inode != NULL);
30832 +       assert("edward-739", cryptcompress_inode_ok(inode));
30833 +       assert("edward-1053", clust->hint != NULL);
30834 +
30835 +       if (!should_create_unprepped_cluster(clust, inode)) {
30836 +               if (clust->reserved) { /* insert reservation is not needed */
30837 +                       cluster_reserved2free(estimate_insert_cluster(inode));
30838 +#if REISER4_DEBUG
30839 +                       assert("edward-1267",
30840 +                              clust->reserved_unprepped ==
30841 +                              estimate_insert_cluster(inode));
30842 +                       clust->reserved_unprepped -=
30843 +                               estimate_insert_cluster(inode);
30844 +#endif /* REISER4_DEBUG */
30845 +               }
30846 +               return 0;
30847 +       }
30848 +       assert("edward-1268", clust->reserved);
30849 +       cluster_reserved2grabbed(estimate_insert_cluster(inode));
30850 +#if REISER4_DEBUG
30851 +       assert("edward-1441",
30852 +              clust->reserved_unprepped == estimate_insert_cluster(inode));
30853 +       clust->reserved_unprepped -= estimate_insert_cluster(inode);
30854 +#endif /* REISER4_DEBUG */
30855 +       result = ctail_insert_unprepped_cluster(clust, inode);
30856 +       if (result)
30857 +               return result;
30858 +       /* account a whole cluster of bytes to the inode */
30859 +       inode_add_bytes(inode, inode_cluster_size(inode));
30860 +
30861 +       assert("edward-743", cryptcompress_inode_ok(inode));
30862 +       assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
30863 +
30864 +       clust->dstat = UNPR_DISK_CLUSTER;
30865 +       return 0;
30866 +}
30867 +
30868 +/* . Grab page cluster for read, write, setattr, etc. operations;
30869 + * . Truncate its complete pages, if needed (done by callees, if at all);
30870 + */
30871 +int prepare_page_cluster(struct inode * inode, struct cluster_handle * clust,
30872 +                        rw_op rw)
30873 +{
30874 +       assert("edward-177", inode != NULL);
30875 +       assert("edward-741", cryptcompress_inode_ok(inode));
30876 +       assert("edward-740", clust->pages != NULL);
30877 +
30878 +       set_cluster_nrpages(clust, inode);
30879 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
30880 +       return grab_page_cluster(inode, clust, rw); /* result passed through */
30881 +}
30882 +
30883 +/* Truncate complete page cluster of index @index (@even_cows is passed on to
30884 + * truncate_page_cluster_range()). This is called by ->kill_hook() method of
30885 + * item plugin when deleting a disk cluster of such index. Every reference
30886 + * taken here (jlookup(), find_get_pages()) is dropped before returning. */
30887 +void truncate_complete_page_cluster(struct inode *inode, cloff_t index,
30888 +                                   int even_cows)
30889 +{
30890 +       int found;
30891 +       int nr_pages;
30892 +       jnode *node;
30893 +       struct page *pages[MAX_CLUSTER_NRPAGES];
30894 +
30895 +       node = jlookup(current_tree, get_inode_oid(inode),
30896 +                      clust_to_pg(index, inode));
30897 +       nr_pages = size_in_pages(lbytes(index, inode));
30898 +       assert("edward-1483", nr_pages != 0);
30899 +       if (!node)
30900 +               goto truncate;
30901 +       found = find_get_pages(inode->i_mapping,
30902 +                              clust_to_pg(index, inode),
30903 +                              cluster_nrpages(inode), pages);
30904 +       if (!found) {
30905 +               assert("edward-1484", jnode_truncate_ok(inode, index));
30906 +               jput(node);     /* jlookup: do not leak the reference */
30907 +               return;
30908 +       }
30909 +       lock_cluster(node);
30910 +
30911 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
30912 +           && index == 0)
30913 +               /* converting to unix_file is in progress */
30914 +               JF_CLR(node, JNODE_CLUSTER_PAGE);
30915 +       if (JF_ISSET(node, JNODE_DIRTY)) {
30916 +               /*
30917 +                * @nr_pages were checked in, but not yet checked out -
30918 +                * we need to release them. (also there can be pages
30919 +                * attached to page cache by read(), etc. - don't take
30920 +                * them into account).
30921 +                */
30922 +               assert("edward-1198", found >= nr_pages);
30923 +
30924 +               /* free disk space grabbed for disk cluster converting */
30925 +               cluster_reserved2grabbed(estimate_update_cluster(inode));
30926 +               grabbed2free(get_current_context(),
30927 +                            get_current_super_private(),
30928 +                            estimate_update_cluster(inode));
30929 +               __put_page_cluster(0, nr_pages, pages, inode);
30930 +
30931 +               /* This will clear dirty bit, uncapture and unlock jnode */
30932 +               unlock_cluster_uncapture(node);
30933 +       } else
30934 +               unlock_cluster(node);
30935 +       jput(node);                         /* jlookup */
30936 +       put_found_pages(pages, found); /* find_get_pages */
30937 + truncate:
30938 +       if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
30939 +           index == 0)
30940 +               return;
30941 +       truncate_page_cluster_range(inode, pages, index, 0,
30942 +                                   cluster_nrpages(inode),
30943 +                                   even_cows);
30944 +       assert("edward-1201",
30945 +              ergo(!reiser4_inode_get_flag(inode,
30946 +                                           REISER4_FILE_CONV_IN_PROGRESS),
30947 +                   jnode_truncate_ok(inode, index)));
30948 +}
30949 +
30950 +/*
30951 + * Set cluster handle @clust of a logical cluster before
30952 + * modifications which are supposed to be committed:
30953 + * . grab cluster pages;
30954 + * . reserve disk space;
30955 + * . maybe read pages from disk and set the disk cluster dirty;
30956 + * . maybe write hole and check in (partially zeroed) logical cluster;
30957 + * . create 'unprepped' disk cluster for new or fake logical one.
30958 + * Returns 0 or an error; once pages are grabbed, failures put them back.
30959 + */
30960 +static int prepare_logical_cluster(struct inode *inode,
30961 +                                  loff_t file_off, /* write position
30962 +                                                      in the file */
30963 +                                  loff_t to_file, /* bytes of users data
30964 +                                                     to write to the file */
30965 +                                  struct cluster_handle * clust,
30966 +                                  logical_cluster_op op)
30967 +{
30968 +       int result = 0;
30969 +       struct reiser4_slide * win = clust->win;
30970 +
30971 +       reset_cluster_params(clust);
30972 +       cluster_set_tfm_act(&clust->tc, TFMA_READ);
30973 +#if REISER4_DEBUG
30974 +       clust->ctx = get_current_context();
30975 +#endif
30976 +       assert("edward-1190", op != LC_INVAL);
30977 +
30978 +       clust->op = op;
30979 +
30980 +       result = prepare_page_cluster(inode, clust, WRITE_OP);
30981 +       if (result)
30982 +               return result;
30983 +       assert("edward-1447",
30984 +              ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
30985 +       assert("edward-1448",
30986 +              ergo(clust->nr_pages != 0,
30987 +                   jnode_is_cluster_page(jprivate(clust->pages[0]))));
30988 +
30989 +       result = reserve4cluster(inode, clust);
30990 +       if (result)
30991 +               goto err1;
30992 +       result = read_some_cluster_pages(inode, clust);
30993 +       if (result) {
30994 +               free_reserved4cluster(inode,
30995 +                                     clust,
30996 +                                     estimate_update_cluster(inode) +
30997 +                                     estimate_insert_cluster(inode));
30998 +               goto err1;
30999 +       }
31000 +       assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
31001 +
31002 +       result = cryptcompress_make_unprepped_cluster(clust, inode);
31003 +       if (result)
31004 +               goto err2;
31005 +       if (win && win->stat == HOLE_WINDOW) {
31006 +               result = write_hole(inode, clust, file_off, to_file);
31007 +               if (result)
31008 +                       goto err2;
31009 +       }
31010 +       return 0;
31011 + err2:
31012 +       free_reserved4cluster(inode, clust,
31013 +                             estimate_update_cluster(inode));
31014 + err1:
31015 +       put_page_cluster(clust, inode, WRITE_OP);
31016 +       assert("edward-1125", result == -ENOSPC);
31017 +       return result;
31018 +}
31019 +
31020 +/* Set @win (and @clust->index) to the part of [@o1, @o2) inside @o1's cluster */
31021 +static void set_window(struct cluster_handle * clust,
31022 +                      struct reiser4_slide * win, struct inode *inode,
31023 +                      loff_t o1, loff_t o2)
31024 +{
31025 +       assert("edward-295", clust != NULL);
31026 +       assert("edward-296", inode != NULL);
31027 +       assert("edward-1071", win != NULL);
31028 +       assert("edward-297", o1 <= o2);
31029 +
31030 +       clust->index = off_to_clust(o1, inode);
31031 +
31032 +       win->off = off_to_cloff(o1, inode);
31033 +       win->count = min((loff_t)(inode_cluster_size(inode) - win->off),
31034 +                        o2 - o1);
31035 +       win->delta = 0;
31036 +
31037 +       clust->win = win;
31038 +}
31039 +/* Allocate the page set and set @win: HOLE_WINDOW if @file_off is past i_size, else DATA_WINDOW */
31040 +static int set_cluster_by_window(struct inode *inode,
31041 +                                struct cluster_handle * clust,
31042 +                                struct reiser4_slide * win, size_t length,
31043 +                                loff_t file_off)
31044 +{
31045 +       int result;
31046 +
31047 +       assert("edward-197", clust != NULL);
31048 +       assert("edward-1072", win != NULL);
31049 +       assert("edward-198", inode != NULL);
31050 +
31051 +       result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
31052 +       if (result)
31053 +               return result;
31054 +
31055 +       if (file_off > i_size_read(inode)) {
31056 +               /* write starts past EOF: the window covers the hole first */
31057 +               loff_t hole_size;
31058 +               hole_size = file_off - inode->i_size;
31059 +
31060 +               set_window(clust, win, inode, inode->i_size, file_off);
31061 +               win->stat = HOLE_WINDOW;
31062 +               if (win->off + hole_size < inode_cluster_size(inode))
31063 +                       /* there is also user's data to append to the hole */
31064 +                       win->delta = min(inode_cluster_size(inode) -
31065 +                                        (win->off + win->count), length);
31066 +               return 0;
31067 +       }
31068 +       set_window(clust, win, inode, file_off, file_off + length);
31069 +       win->stat = DATA_WINDOW;
31070 +       return 0;
31071 +}
31072 +/* Reset (or, if absent, allocate) the page set of @clust for @count and set its index from @page */
31073 +int set_cluster_by_page(struct cluster_handle * clust, struct page * page,
31074 +                       int count)
31075 +{
31076 +       int result = 0;
31077 +       int (*setting_actor)(struct cluster_handle * clust, int count);
31078 +
31079 +       assert("edward-1358", clust != NULL);
31080 +       assert("edward-1359", page != NULL);
31081 +       assert("edward-1360", page->mapping != NULL);
31082 +       assert("edward-1361", page->mapping->host != NULL);
31083 +
31084 +       setting_actor =
31085 +               (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
31086 +       result = setting_actor(clust, count);
31087 +       clust->index = pg_to_clust(page->index, page->mapping->host);
31088 +       return result;
31089 +}
31090 +
31091 +/* Reset @clust state: dstat to INVAL_DISK_CLUSTER, tc.uptodate and tc.len to 0 */
31092 +void reset_cluster_params(struct cluster_handle * clust)
31093 +{
31094 +       assert("edward-197", clust != NULL);
31095 +
31096 +       clust->dstat = INVAL_DISK_CLUSTER;
31097 +       clust->tc.uptodate = 0;
31098 +       clust->tc.len = 0;
31099 +}
31100 +
31101 +/* Heart of write_cryptcompress(): returns bytes written or, if none, result */
31102 +static loff_t do_write_cryptcompress(struct file *file, struct inode *inode,
31103 +                                    const char __user *buf, size_t to_write,
31104 +                                    loff_t pos, struct psched_context *cont)
31105 +{
31106 +       int i;
31107 +       hint_t *hint;
31108 +       int result = 0;
31109 +       size_t count;
31110 +       struct reiser4_slide win;
31111 +       struct cluster_handle clust;
31112 +       struct cryptcompress_info * info;
31113 +
31114 +       assert("edward-154", buf != NULL);
31115 +       assert("edward-161", reiser4_schedulable());
31116 +       assert("edward-748", cryptcompress_inode_ok(inode));
31117 +       assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
31118 +       assert("edward-1274", get_current_context()->grabbed_blocks == 0);
31119 +
31120 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31121 +       if (hint == NULL)
31122 +               return RETERR(-ENOMEM);
31123 +
31124 +       result = load_file_hint(file, hint);
31125 +       if (result) {
31126 +               kfree(hint);
31127 +               return result;
31128 +       }
31129 +       count = to_write;
31130 +
31131 +       reiser4_slide_init(&win);
31132 +       cluster_init_read(&clust, &win);
31133 +       clust.hint = hint;
31134 +       info = cryptcompress_inode_data(inode);
31135 +
31136 +       mutex_lock(&info->checkin_mutex);
31137 +
31138 +       result = set_cluster_by_window(inode, &clust, &win, to_write, pos);
31139 +       if (result)
31140 +               goto out;
31141 +
31142 +       if (next_window_stat(&win) == HOLE_WINDOW) {
31143 +               /* write hole in this iteration
31144 +                  separated from the loop below */
31145 +               result = write_pschedule_hook(file, inode,
31146 +                                             pos,
31147 +                                             &clust,
31148 +                                             cont);
31149 +               if (result)
31150 +                       goto out;
31151 +               result = prepare_logical_cluster(inode, pos, count, &clust,
31152 +                                                LC_APPOV);
31153 +               if (result)
31154 +                       goto out;
31155 +       }
31156 +       do {
31157 +               const char __user * src;
31158 +               unsigned page_off, to_page;
31159 +
31160 +               assert("edward-750", reiser4_schedulable());
31161 +
31162 +               result = write_pschedule_hook(file, inode,
31163 +                                             pos + to_write - count,
31164 +                                             &clust,
31165 +                                             cont);
31166 +               if (result)
31167 +                       goto out;
31168 +               if (cont->state == PSCHED_ASSIGNED_NEW)
31169 +                       /* done_lh was called in write_pschedule_hook */
31170 +                       goto out_no_longterm_lock;
31171 +
31172 +               result = prepare_logical_cluster(inode, pos, count, &clust,
31173 +                                                LC_APPOV);
31174 +               if (result)
31175 +                       goto out;
31176 +
31177 +               assert("edward-751", cryptcompress_inode_ok(inode));
31178 +               assert("edward-204", win.stat == DATA_WINDOW);
31179 +               assert("edward-1288", hint_is_valid(clust.hint));
31180 +               assert("edward-752",
31181 +                      znode_is_write_locked(hint->ext_coord.coord.node));
31182 +               put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
31183 +
31184 +               /* set write position in page */
31185 +               page_off = off_to_pgoff(win.off);
31186 +
31187 +               /* copy user's data to cluster pages */
31188 +               for (i = off_to_pg(win.off), src = buf;
31189 +                    i < size_in_pages(win.off + win.count);
31190 +                    i++, src += to_page) {
31191 +                       to_page = __mbp(win.off + win.count, i) - page_off;
31192 +                       assert("edward-1039",
31193 +                              page_off + to_page <= PAGE_CACHE_SIZE);
31194 +                       assert("edward-287", clust.pages[i] != NULL);
31195 +
31196 +                       fault_in_pages_readable(src, to_page);
31197 +
31198 +                       lock_page(clust.pages[i]);
31199 +                       result =
31200 +                           __copy_from_user((char *)kmap(clust.pages[i]) +
31201 +                                            page_off, src, to_page);
31202 +                       kunmap(clust.pages[i]);
31203 +                       if (unlikely(result)) {
31204 +                               unlock_page(clust.pages[i]);
31205 +                               result = -EFAULT;
31206 +                               goto err2;
31207 +                       }
31208 +                       SetPageUptodate(clust.pages[i]);
31209 +                       reiser4_set_page_dirty_internal(clust.pages[i]);
31210 +                       flush_dcache_page(clust.pages[i]);
31211 +                       mark_page_accessed(clust.pages[i]);
31212 +                       unlock_page(clust.pages[i]);
31213 +                       page_off = 0;
31214 +               }
31215 +               assert("edward-753", cryptcompress_inode_ok(inode));
31216 +
31217 +               result = checkin_logical_cluster(&clust, inode);
31218 +               if (result)
31219 +                       goto err2;
31220 +
31221 +               buf   += win.count;
31222 +               count -= win.count;
31223 +
31224 +               result = balance_dirty_page_cluster(&clust, inode, 0, count,
31225 +                                                   win_count_to_nrpages(&win));
31226 +               if (result)
31227 +                       goto err1;
31228 +               assert("edward-755", hint->lh.owner == NULL);
31229 +               reset_cluster_params(&clust);
31230 +               continue;
31231 +       err2:
31232 +               put_page_cluster(&clust, inode, WRITE_OP);
31233 +       err1:
31234 +               if (clust.reserved)
31235 +                       free_reserved4cluster(inode,
31236 +                                             &clust,
31237 +                                             estimate_update_cluster(inode));
31238 +               break;
31239 +       } while (count);
31240 + out:
31241 +       done_lh(&hint->lh);
31242 +       save_file_hint(file, hint);
31243 + out_no_longterm_lock:
31244 +       mutex_unlock(&info->checkin_mutex);
31245 +       kfree(hint);
31246 +       put_cluster_handle(&clust);
31247 +       assert("edward-195",
31248 +              ergo((to_write == count),
31249 +                   (result < 0 || cont->state == PSCHED_ASSIGNED_NEW)));
31250 +       return (to_write - count) ? (to_write - count) : result;
31251 +}
31252 +
31253 +/**
31254 + * plugin->write()
31255 + * @file: file to write to
31256 + * @buf: address of user-space buffer
31257 + * @count: number of bytes to write
31258 + * @off: position in file to write to; advanced by the number of bytes written
31259 + * @cont: struct psched_context, asserted to be PSCHED_INVAL_STATE on entry */
31260 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
31261 +                           size_t count, loff_t *off,
31262 +                           struct psched_context *cont)
31263 +{
31264 +       ssize_t result;
31265 +       struct inode *inode;
31266 +       reiser4_context *ctx;
31267 +       loff_t pos = *off;
31268 +       struct cryptcompress_info *info;
31269 +
31270 +       assert("edward-1449", cont->state == PSCHED_INVAL_STATE);
31271 +
31272 +       inode = file->f_dentry->d_inode;
31273 +       assert("edward-196", cryptcompress_inode_ok(inode));
31274 +
31275 +       info = cryptcompress_inode_data(inode);
31276 +       ctx = get_current_context();
31277 +
31278 +       result = generic_write_checks(file, &pos, &count, 0);
31279 +       if (unlikely(result != 0)) {
31280 +               context_set_commit_async(ctx);
31281 +               return result;
31282 +       }
31283 +       if (unlikely(count == 0))
31284 +               return 0;
31285 +       result = file_remove_suid(file);
31286 +       if (unlikely(result != 0)) {
31287 +               context_set_commit_async(ctx);
31288 +               return result;
31289 +       }
31290 +       /* remove_suid might create a transaction */
31291 +       reiser4_txn_restart(ctx);
31292 +
31293 +       result = do_write_cryptcompress(file, inode, buf, count, pos, cont);
31294 +
31295 +       if (unlikely(result < 0)) {
31296 +               context_set_commit_async(ctx);
31297 +               return result;
31298 +       }
31299 +       /* update position in a file */
31300 +       *off = pos + result;
31301 +       return result;
31302 +}
31303 +
31304 +/* plugin->readpages: read via readpages_ctail(); on any error put @pages. @nr_pages is unused */
31305 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
31306 +                           struct list_head *pages, unsigned nr_pages)
31307 +{
31308 +       reiser4_context * ctx;
31309 +       int ret;
31310 +
31311 +       ctx = reiser4_init_context(mapping->host->i_sb);
31312 +       if (IS_ERR(ctx)) {
31313 +               ret = PTR_ERR(ctx);
31314 +               goto err;
31315 +       }
31316 +       /* cryptcompress file can be built of ctail items only */
31317 +       ret = readpages_ctail(file, mapping, pages);
31318 +       reiser4_txn_restart(ctx);
31319 +       reiser4_exit_context(ctx);
31320 +       if (ret) {
31321 +err:
31322 +               put_pages_list(pages);
31323 +       }
31324 +       return ret;
31325 +}
31326 +
31327 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
31328 +{
31329 +       /* space to reserve for a read: the estimate to update stat data item */
31330 +       assert("edward-1193",
31331 +              inode_file_plugin(inode)->estimate.update ==
31332 +              estimate_update_common);
31333 +       return estimate_update_common(inode);
31334 +}
31335 +
31336 +/**
31337 + * plugin->read: returns do_sync_read() result, or a context/space setup error
31338 + * @file: file to read from
31339 + * @buf: address of user-space buffer
31340 + * @size: number of bytes to read
31341 + * @off: position in file to read from
31342 + */
31343 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
31344 +                          loff_t * off)
31345 +{
31346 +       ssize_t result;
31347 +       struct inode *inode;
31348 +       reiser4_context *ctx;
31349 +       struct cryptcompress_info *info;
31350 +       reiser4_block_nr needed;
31351 +
31352 +       inode = file->f_dentry->d_inode;
31353 +       assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
31354 +
31355 +       ctx = reiser4_init_context(inode->i_sb);
31356 +       if (IS_ERR(ctx))
31357 +               return PTR_ERR(ctx);
31358 +
31359 +       info = cryptcompress_inode_data(inode);
31360 +       needed = cryptcompress_estimate_read(inode);
31361 +
31362 +       result = reiser4_grab_space(needed, BA_CAN_COMMIT);
31363 +       if (result != 0) {
31364 +               reiser4_exit_context(ctx);
31365 +               return result;
31366 +       }
31367 +       result = do_sync_read(file, buf, size, off);
31368 +
31369 +       context_set_commit_async(ctx);
31370 +       reiser4_exit_context(ctx);
31371 +
31372 +       return result;
31373 +}
31374 +
31375 +/* Look for a disk cluster and store the lookup result in @found.
31376 + * If @index > 0, look for the disk cluster of index (@index - 1);
31377 + * if @index == 0, look for the rightmost disk cluster.
31378 + * On success *@found is the index of the found disk cluster plus one;
31379 + * *@found == 0 means that no disk cluster was found (for @index == 0
31380 + * this means the file has no disk clusters). Returns 0, -ENOMEM or a
31381 + * lookup/zload error; on error *@found is not modified. */
31382 +static int lookup_disk_cluster(struct inode *inode, cloff_t * found,
31383 +                              cloff_t index)
31384 +{
31385 +       int result;
31386 +       reiser4_key key;
31387 +       loff_t offset;
31388 +       hint_t *hint;
31389 +       lock_handle *lh;
31390 +       lookup_bias bias;
31391 +       coord_t *coord;
31392 +       item_plugin *iplug;
31393 +
31394 +       assert("edward-1131", inode != NULL);
31395 +       assert("edward-95", cryptcompress_inode_ok(inode));
31396 +
31397 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31398 +       if (hint == NULL)
31399 +               return RETERR(-ENOMEM);
31400 +       hint_init_zero(hint);
31401 +       lh = &hint->lh;
31402 +
31403 +       bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
31404 +       offset =
31405 +           (index ? clust_to_off(index, inode) -
31406 +            1 : get_key_offset(reiser4_max_key()));
31407 +
31408 +       key_by_inode_cryptcompress(inode, offset, &key);
31409 +
31410 +       /* find the item at @offset (the last item of the object if !@index) */
31411 +       result =
31412 +           find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
31413 +                             bias, 0);
31414 +       if (cbk_errored(result)) {
31415 +               done_lh(lh);
31416 +               kfree(hint);
31417 +               return result;
31418 +       }
31419 +       if (result == CBK_COORD_NOTFOUND) {
31420 +               /* no real disk clusters */
31421 +               done_lh(lh);
31422 +               kfree(hint);
31423 +               *found = 0;
31424 +               return 0;
31425 +       }
31426 +       /* disk cluster is found */
31427 +       coord = &hint->ext_coord.coord;
31428 +       coord_clear_iplug(coord);
31429 +       result = zload(coord->node);
31430 +       if (unlikely(result)) {
31431 +               done_lh(lh);
31432 +               kfree(hint);
31433 +               return result;
31434 +       }
31435 +       iplug = item_plugin_by_coord(coord);
31436 +       assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
31437 +       assert("edward-1202", ctail_ok(coord));
31438 +
31439 +       item_key_by_coord(coord, &key);
31440 +       *found = off_to_clust(get_key_offset(&key), inode) + 1;
31441 +
31442 +       assert("edward-1132", ergo(index, index == *found));
31443 +
31444 +       zrelse(coord->node);
31445 +       done_lh(lh);
31446 +       kfree(hint);
31447 +       return 0;
31448 +}
31449 +/* Store in *@index the result of lookup_disk_cluster() for the rightmost disk cluster */
31450 +static int find_fake_appended(struct inode *inode, cloff_t * index)
31451 +{
31452 +       return lookup_disk_cluster(inode, index,
31453 +                                  0 /* find last real one */ );
31454 +}
31455 +
31456 +/* Adjust @left_coord when the unit was not found by node lookup:
31457 +   AFTER_UNIT becomes AFTER_ITEM, since there can be holes in a sequence
31458 +   of disk clusters; anything but AFTER_ITEM/BEFORE_UNIT is impossible */
31459 +
31460 +static void adjust_left_coord(coord_t * left_coord)
31461 +{
31462 +       switch (left_coord->between) {
31463 +       case AFTER_UNIT:
31464 +               left_coord->between = AFTER_ITEM;       /* fall through */
31465 +       case AFTER_ITEM:
31466 +       case BEFORE_UNIT:
31467 +               break;
31468 +       default:
31469 +               impossible("edward-1204", "bad left coord to cut");
31470 +       }
31471 +       return;
31472 +}
31473 +
31474 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
31475 +
31476 +/* plugin->cut_tree_worker: kill items of [@from_key, @to_key] node by node, moving left; *@progress counts nodes cut */
31477 +int cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
31478 +                                 const reiser4_key * to_key,
31479 +                                 reiser4_key * smallest_removed,
31480 +                                 struct inode *object, int truncate,
31481 +                                 int *progress)
31482 +{
31483 +       lock_handle next_node_lock;
31484 +       coord_t left_coord;
31485 +       int result;
31486 +
31487 +       assert("edward-1158", tap->coord->node != NULL);
31488 +       assert("edward-1159", znode_is_write_locked(tap->coord->node));
31489 +       assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
31490 +
31491 +       *progress = 0;
31492 +       init_lh(&next_node_lock);
31493 +
31494 +       while (1) {
31495 +               znode *node;    /* node from which items are cut */
31496 +               node_plugin *nplug;     /* node plugin for @node */
31497 +
31498 +               node = tap->coord->node;
31499 +
31500 +               /* Move next_node_lock to the next node on the left. */
31501 +               result =
31502 +                   reiser4_get_left_neighbor(&next_node_lock, node,
31503 +                                             ZNODE_WRITE_LOCK,
31504 +                                             GN_CAN_USE_UPPER_LEVELS);
31505 +               if (result != 0 && result != -E_NO_NEIGHBOR)
31506 +                       break;
31507 +               /* FIXME-EDWARD: Check can we delete the node as a whole. */
31508 +               result = reiser4_tap_load(tap);
31509 +               if (result)
31510 +                       break;  /* next_node_lock is released below */
31511 +
31512 +               /* Prepare the second (right) point for cut_node() */
31513 +               if (*progress)
31514 +                       coord_init_last_unit(tap->coord, node);
31515 +
31516 +               else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
31517 +                       /* set rightmost unit for the items without lookup method */
31518 +                       tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
31519 +
31520 +               nplug = node->nplug;
31521 +
31522 +               assert("edward-1161", nplug);
31523 +               assert("edward-1162", nplug->lookup);
31524 +
31525 +               /* left_coord is leftmost unit cut from @node */
31526 +               result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
31527 +
31528 +               if (IS_CBKERR(result))
31529 +                       break;
31530 +
31531 +               if (result == CBK_COORD_NOTFOUND)
31532 +                       adjust_left_coord(&left_coord);
31533 +
31534 +               /* adjust coordinates so that they are set to existing units */
31535 +               if (coord_set_to_right(&left_coord)
31536 +                   || coord_set_to_left(tap->coord)) {
31537 +                       result = 0;
31538 +                       break;
31539 +               }
31540 +
31541 +               if (coord_compare(&left_coord, tap->coord) ==
31542 +                   COORD_CMP_ON_RIGHT) {
31543 +                       /* keys from @from_key to @to_key are not in the tree */
31544 +                       result = 0;
31545 +                       break;
31546 +               }
31547 +
31548 +               /* cut data from one node */
31549 +               *smallest_removed = *reiser4_min_key();
31550 +               result = kill_node_content(&left_coord,
31551 +                                          tap->coord,
31552 +                                          from_key,
31553 +                                          to_key,
31554 +                                          smallest_removed,
31555 +                                          next_node_lock.node,
31556 +                                          object, truncate);
31557 +               reiser4_tap_relse(tap);
31558 +
31559 +               if (result)
31560 +                       break;
31561 +
31562 +               ++(*progress);
31563 +
31564 +               /* Check whether all items with keys >= from_key were removed
31565 +                * from the tree. */
31566 +               if (keyle(smallest_removed, from_key))
31567 +                       /* result = 0; */
31568 +                       break;
31569 +
31570 +               if (next_node_lock.node == NULL)
31571 +                       break;
31572 +
31573 +               result = reiser4_tap_move(tap, &next_node_lock);
31574 +               done_lh(&next_node_lock);
31575 +               if (result)
31576 +                       break;
31577 +
31578 +               /* Break long cut_tree operation (deletion of a large file) if
31579 +                * atom requires commit. */
31580 +               if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
31581 +                   && current_atom_should_commit()) {
31582 +                       result = -E_REPEAT;
31583 +                       break;
31584 +               }
31585 +       }
31586 +       done_lh(&next_node_lock);
31587 +       return result;
31588 +}
31589 +
31590 +/* Append or expand hole in two steps:
31591 + * 1) zero the tail of the rightmost non-fake logical cluster (via
31592 + *    prepare_logical_cluster() with a HOLE_WINDOW);
31593 + * 2) expand hole via fake logical clusters (just increase i_size).
31594 + * Returns 0 on success or a negative error code. */
31595 +static int cryptcompress_append_hole(struct inode *inode /* with old size */,
31596 +                                    loff_t new_size)
31597 +{
31598 +       int result = 0;
31599 +       hint_t *hint;
31600 +       lock_handle *lh;
31601 +       loff_t hole_size;
31602 +       int nr_zeroes;
31603 +       struct reiser4_slide win;
31604 +       struct cluster_handle clust;
31605 +
31606 +       assert("edward-1133", inode->i_size < new_size);
31607 +       assert("edward-1134", reiser4_schedulable());
31608 +       assert("edward-1135", cryptcompress_inode_ok(inode));
31609 +       assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
31610 +       assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
31611 +
31612 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31613 +       if (hint == NULL)
31614 +               return RETERR(-ENOMEM);
31615 +       hint_init_zero(hint);
31616 +       lh = &hint->lh;
31617 +
31618 +       reiser4_slide_init(&win);
31619 +       cluster_init_read(&clust, &win);
31620 +       clust.hint = hint;
31621 +
31622 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31623 +       if (result)
31624 +               goto out;
31625 +       if (off_to_cloff(inode->i_size, inode) == 0)
31626 +               goto append_fake; /* NOTE(review): edward-1333 asserts otherwise */
31627 +       hole_size = new_size - inode->i_size;
31628 +       nr_zeroes =
31629 +               inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
31630 +       if (hole_size < nr_zeroes)
31631 +               nr_zeroes = hole_size;
31632 +       set_window(&clust, &win, inode, inode->i_size,
31633 +                  inode->i_size + nr_zeroes);
31634 +       win.stat = HOLE_WINDOW;
31635 +
31636 +       assert("edward-1137",
31637 +              clust.index == off_to_clust(inode->i_size, inode));
31638 +
31639 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_APPOV);
31640 +
31641 +       assert("edward-1271", !result || result == -ENOSPC);
31642 +       if (result)
31643 +               goto out;
31644 +       assert("edward-1139",
31645 +              clust.dstat == PREP_DISK_CLUSTER ||
31646 +              clust.dstat == UNPR_DISK_CLUSTER);
31647 +
31648 +       assert("edward-1431", hole_size >= nr_zeroes);
31649 +       if (hole_size == nr_zeroes)
31650 +               /* nothing to append anymore */
31651 +               goto out;
31652 + append_fake:
31653 +       INODE_SET_SIZE(inode, new_size);
31654 + out:
31655 +       done_lh(lh);
31656 +       kfree(hint);
31657 +       put_cluster_handle(&clust);
31658 +       return result;
31659 +}
31660 +
31661 +static int update_cryptcompress_size(struct inode *inode, loff_t new_size,
31662 +                                    int update_sd)
31663 +{      /* size callback for cut_file_items(); no-op (0) unless cluster-aligned */
31664 +       return (new_size & ((loff_t) (inode_cluster_size(inode)) - 1)
31665 +               ? 0 : reiser4_update_file_size(inode, new_size, update_sd));
31666 +}      /* NOTE(review): the mask test assumes a power-of-two cluster size */
31667 +
31668 +/* Prune cryptcompress file in two steps:
31669 + * 1) cut all nominated logical clusters except the leftmost one which
31670 + *    is to be partially truncated. Note, that there can be "holes"
31671 + *    represented by fake logical clusters.
31672 + * 2) set zeroes and capture leftmost partially truncated logical
31673 + *    cluster, if it is not fake; otherwise prune fake logical cluster
31674 + *    (just decrease i_size).
31675 + */
31676 +static int prune_cryptcompress(struct inode *inode, loff_t new_size,
31677 +                              int update_sd, cloff_t aidx)
31678 +{
31679 +       int result = 0;
31680 +       unsigned nr_zeroes;
31681 +       loff_t to_prune; /* NOTE(review): assigned below but never read */
31682 +       loff_t old_size; /* NOTE(review): assigned twice below, never read */
31683 +       cloff_t ridx;
31684 +
31685 +       hint_t *hint;
31686 +       lock_handle *lh;
31687 +       struct reiser4_slide win;
31688 +       struct cluster_handle clust;
31689 +
31690 +       assert("edward-1140", inode->i_size >= new_size);
31691 +       assert("edward-1141", reiser4_schedulable());
31692 +       assert("edward-1142", cryptcompress_inode_ok(inode));
31693 +       assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
31694 +
31695 +       old_size = inode->i_size;
31696 +
31697 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31698 +       if (hint == NULL)
31699 +               return RETERR(-ENOMEM);
31700 +       hint_init_zero(hint);
31701 +       lh = &hint->lh;
31702 +
31703 +       reiser4_slide_init(&win);
31704 +       cluster_init_read(&clust, &win);
31705 +       clust.hint = hint;
31706 +
31707 +       /* calculate index of the rightmost logical cluster
31708 +          that will be completely truncated */
31709 +       ridx = size_in_lc(new_size, inode);
31710 +
31711 +       /* truncate all disk clusters starting from @ridx */
31712 +       assert("edward-1174", ridx <= aidx);
31713 +       old_size = inode->i_size;
31714 +       if (ridx != aidx) {
31715 +               struct cryptcompress_info * info;
31716 +               info = cryptcompress_inode_data(inode);
31717 +               result = cut_file_items(inode,
31718 +                                       clust_to_off(ridx, inode),
31719 +                                       update_sd,
31720 +                                       clust_to_off(aidx, inode),
31721 +                                       update_cryptcompress_size);
31722 +               info->trunc_index = ULONG_MAX; /* reset whether or not the cut failed */
31723 +               if (result)
31724 +                       goto out;
31725 +       }
31726 +       /*
31727 +        * there can be pages of fake logical clusters, truncate them
31728 +        */
31729 +       truncate_inode_pages(inode->i_mapping, clust_to_off(ridx, inode));
31730 +       assert("edward-1524",
31731 +              pages_truncate_ok(inode, clust_to_pg(ridx, inode)));
31732 +       /*
31733 +        * now perform partial truncate of last logical cluster
31734 +        */
31735 +       if (!off_to_cloff(new_size, inode)) {
31736 +               /* no partial truncate is needed */
31737 +               assert("edward-1145", inode->i_size == new_size);
31738 +               goto truncate_fake;
31739 +       }
31740 +       assert("edward-1146", new_size < inode->i_size);
31741 +
31742 +       to_prune = inode->i_size - new_size;
31743 +
31744 +       /* check if the last logical cluster is fake */
31745 +       result = lookup_disk_cluster(inode, &aidx, ridx); /* overwrites @aidx */
31746 +       if (result)
31747 +               goto out;
31748 +       if (!aidx)
31749 +               /* yup, this is fake one */
31750 +               goto truncate_fake;
31751 +
31752 +       assert("edward-1148", aidx == ridx);
31753 +
31754 +       /* do partial truncate of the last page cluster,
31755 +          and try to capture this one */
31756 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31757 +       if (result)
31758 +               goto out;
31759 +       nr_zeroes = (off_to_pgoff(new_size) ?
31760 +                    PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
31761 +       set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
31762 +       win.stat = HOLE_WINDOW; /* window spans new_size .. new_size + nr_zeroes */
31763 +
31764 +       assert("edward-1149", clust.index == ridx - 1);
31765 +
31766 +       result = prepare_logical_cluster(inode, 0, 0, &clust, LC_TRUNC);
31767 +       if (result)
31768 +               goto out;
31769 +       assert("edward-1151",
31770 +              clust.dstat == PREP_DISK_CLUSTER ||
31771 +              clust.dstat == UNPR_DISK_CLUSTER);
31772 +
31773 +       assert("edward-1191", inode->i_size == new_size);
31774 +       assert("edward-1206", body_truncate_ok(inode, ridx));
31775 + truncate_fake:
31776 +       /* drop all the pages that don't have jnodes (i.e. pages
31777 +          which can not be truncated by cut_file_items() because
31778 +          of holes represented by fake disk clusters) including
31779 +          the pages of partially truncated cluster which was
31780 +          released by prepare_logical_cluster() */
31781 +       INODE_SET_SIZE(inode, new_size);
31782 +       truncate_inode_pages(inode->i_mapping, new_size);
31783 + out: /* cleanup shared by every exit taken after the hint was allocated */
31784 +       assert("edward-1334", !result || result == -ENOSPC);
31785 +       assert("edward-1497",
31786 +              pages_truncate_ok(inode, size_in_pages(new_size)));
31787 +
31788 +       done_lh(lh);
31789 +       kfree(hint);
31790 +       put_cluster_handle(&clust);
31791 +       return result;
31792 +}
31793 +
31794 +/* Prepare cryptcompress file for truncate: prune or append rightmost fake
31795 + * logical clusters (if any). Stat-data is updated only if @update_sd is set.
31796 + */
31797 +static int start_truncate_fake(struct inode *inode, cloff_t aidx,
31798 +                              loff_t new_size, int update_sd)
31799 +{
31800 +       int result = 0;
31801 +       loff_t bytes; /* size delta; an int would truncate deltas >= 2 GiB */
31802 +
31803 +       if (new_size > inode->i_size) {
31804 +               /* append */
31805 +               if (inode->i_size < clust_to_off(aidx, inode))
31806 +                       /* no fake bytes */
31807 +                       return 0;
31808 +               bytes = new_size - inode->i_size;
31809 +               INODE_SET_SIZE(inode, inode->i_size + bytes);
31810 +       } else {
31811 +               /* prune */
31812 +               if (inode->i_size <= clust_to_off(aidx, inode))
31813 +                       /* no fake bytes */
31814 +                       return 0;
31815 +               bytes = inode->i_size -
31816 +                       max(new_size, clust_to_off(aidx, inode));
31817 +               if (!bytes)
31818 +                       return 0;
31819 +               INODE_SET_SIZE(inode, inode->i_size - bytes);
31820 +               /* In the case of fake prune we need to drop page cluster.
31821 +                  There are only 2 cases for partially truncated page:
31822 +                  1. If it is dirty, therefore it is anonymous
31823 +                  (was dirtied via mmap), and will be captured
31824 +                  later via ->capture().
31825 +                  2. If it is clean, therefore it is filled by zeroes.
31826 +                  In both cases we don't need to make it dirty and
31827 +                  capture here.
31828 +                */
31829 +               truncate_inode_pages(inode->i_mapping, inode->i_size);
31830 +       }
31831 +       if (update_sd) /* reached only when i_size was actually changed above */
31832 +               result = update_sd_cryptcompress(inode);
31833 +       return result;
31834 +}
31835 +
31836 +/**
31837 + * Truncate or extend @inode to @new_size. Called by setattr_cryptcompress()
31838 + * (with @update_sd set) and by delete_object_cryptcompress() (@new_size 0).
31839 + */
31840 +static int cryptcompress_truncate(struct inode *inode, /* old size */
31841 +                                 loff_t new_size,      /* new size */
31842 +                                 int update_sd)
31843 +{
31844 +       int result;
31845 +       cloff_t aidx; /* filled in by find_fake_appended() */
31846 +
31847 +       result = find_fake_appended(inode, &aidx);
31848 +       if (result)
31849 +               return result;
31850 +       assert("edward-1208",
31851 +              ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
31852 +
31853 +       result = start_truncate_fake(inode, aidx, new_size, update_sd);
31854 +       if (result)
31855 +               return result;
31856 +       if (inode->i_size == new_size)
31857 +               /* nothing to truncate anymore */
31858 +               return 0;
31859 +       result = (inode->i_size < new_size ? /* grow via hole, else shrink */
31860 +                 cryptcompress_append_hole(inode, new_size) :
31861 +                 prune_cryptcompress(inode, new_size, update_sd, aidx));
31862 +       if (!result && update_sd)
31863 +               result = update_sd_cryptcompress(inode);
31864 +       return result;
31865 +}
31866 +
31867 +/* Capture an anonymous page cluster. (A page cluster is
31868 + * anonymous if it contains at least one anonymous page.)
31869 + */
31870 +static int capture_anon_page_cluster(struct cluster_handle * clust,
31871 +                                    struct inode * inode)
31872 +{
31873 +       int result;
31874 +
31875 +       assert("edward-1073", clust != NULL);
31876 +       assert("edward-1074", inode != NULL);
31877 +       assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
31878 +
31879 +       result = prepare_logical_cluster(inode, 0, 0, clust, LC_APPOV);
31880 +       if (result)
31881 +               return result; /* nothing below has been done yet */
31882 +       set_cluster_pages_dirty(clust, inode);
31883 +       result = checkin_logical_cluster(clust, inode);
31884 +       put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
31885 +       if (unlikely(result)) /* page cluster is put here only when checkin failed */
31886 +               put_page_cluster(clust, inode, WRITE_OP);
31887 +       return result;
31888 +}
31889 +
31890 +/* Starting from @index find tagged pages of the same page cluster.
31891 + * Clear the tag for each of them. Return number of found pages.
31892 + */
31893 +static int find_anon_page_cluster(struct address_space * mapping,
31894 +                                 pgoff_t * index, struct page ** pages)
31895 +{
31896 +       int i = 0;
31897 +       int found;
31898 +       spin_lock_irq(&mapping->tree_lock);
31899 +       do {
31900 +               /* looking for one page */
31901 +               found = radix_tree_gang_lookup_tag(&mapping->page_tree,
31902 +                                                  (void **)&pages[i],
31903 +                                                  *index, 1,
31904 +                                                  PAGECACHE_TAG_REISER4_MOVED);
31905 +               if (!found)
31906 +                       break;
31907 +               if (!same_page_cluster(pages[0], pages[i]))
31908 +                       break; /* other cluster: tag and *index are left untouched */
31909 +
31910 +               /* found */
31911 +               page_cache_get(pages[i]); /* reference is handed to the caller */
31912 +               *index = pages[i]->index + 1;
31913 +
31914 +               radix_tree_tag_clear(&mapping->page_tree,
31915 +                                    pages[i]->index,
31916 +                                    PAGECACHE_TAG_REISER4_MOVED);
31917 +               if (last_page_in_cluster(pages[i++]))
31918 +                       break;
31919 +       } while (1);
31920 +       spin_unlock_irq(&mapping->tree_lock);
31921 +       return i;
31922 +}
31923 +
31924 +#define MAX_PAGES_TO_CAPTURE  (1024) /* cap on to_capture in writepages_cryptcompress() */
31925 +
31926 +/* Capture anonymous page clusters; returns pages captured or a negative error */
31927 +static int capture_anon_pages(struct address_space * mapping, pgoff_t * index,
31928 +                             int to_capture)
31929 +{
31930 +       int count = 0;
31931 +       int found = 0;
31932 +       int result = 0;
31933 +       hint_t *hint;
31934 +       lock_handle *lh;
31935 +       struct inode * inode;
31936 +       struct cluster_handle clust;
31937 +       struct page * pages[MAX_CLUSTER_NRPAGES];
31938 +
31939 +       assert("edward-1127", mapping != NULL);
31940 +       assert("edward-1128", mapping->host != NULL);
31941 +       assert("edward-1440", mapping->host->i_mapping == mapping);
31942 +
31943 +       inode = mapping->host;
31944 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
31945 +       if (hint == NULL)
31946 +               return RETERR(-ENOMEM);
31947 +       hint_init_zero(hint);
31948 +       lh = &hint->lh;
31949 +
31950 +       cluster_init_read(&clust, NULL);
31951 +       clust.hint = hint;
31952 +
31953 +       result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
31954 +       if (result)
31955 +               goto out;
31956 +
31957 +       while (to_capture > 0) {
31958 +               found = find_anon_page_cluster(mapping, index, pages);
31959 +               if (!found) {
31960 +                       *index = (pgoff_t) - 1; /* tell the caller nothing is left */
31961 +                       break;
31962 +               }
31963 +               move_cluster_forward(&clust, inode, pages[0]->index);
31964 +               result = capture_anon_page_cluster(&clust, inode);
31965 +
31966 +               put_found_pages(pages, found); /* find_anon_page_cluster */
31967 +               if (result)
31968 +                       break;
31969 +               to_capture -= clust.nr_pages; /* counts clust.nr_pages, not @found */
31970 +               count += clust.nr_pages;
31971 +       }
31972 +       if (result) {
31973 +               warning("edward-1077",
31974 +                       "Capture failed (inode %llu, result=%i, captured=%d)\n",
31975 +                       (unsigned long long)get_inode_oid(inode), result, count);
31976 +       } else {
31977 +               assert("edward-1078", ergo(found > 0, count > 0));
31978 +               if (to_capture <= 0)
31979 +                       /* there may be left more pages */
31980 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
31981 +               result = count;
31982 +       }
31983 +      out: /* cleanup shared by every exit taken after the hint was allocated */
31984 +       done_lh(lh);
31985 +       kfree(hint);
31986 +       put_cluster_handle(&clust);
31987 +       return result;
31988 +}
31989 +
31990 +/* Returns true if inode's mapping has dirty pages which do not belong to
31991 +   any atom, i.e. pages tagged PAGECACHE_TAG_REISER4_MOVED in the page tree */
31992 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
31993 +{
31994 +       int result;
31995 +       spin_lock_irq(&inode->i_mapping->tree_lock);
31996 +       result = radix_tree_tagged(&inode->i_mapping->page_tree,
31997 +                                  PAGECACHE_TAG_REISER4_MOVED);
31998 +       spin_unlock_irq(&inode->i_mapping->tree_lock);
31999 +       return result;
32000 +}
32001 +
32002 +/* plugin->writepages: capture tagged anonymous pages; commit if WB_SYNC_ALL */
32003 +int writepages_cryptcompress(struct address_space *mapping,
32004 +                            struct writeback_control *wbc)
32005 +{
32006 +       int result = 0;
32007 +       long to_capture;
32008 +       pgoff_t nrpages;
32009 +       pgoff_t index = 0;
32010 +       struct inode *inode;
32011 +       struct cryptcompress_info *info;
32012 +
32013 +       inode = mapping->host;
32014 +       if (!cryptcompress_inode_has_anon_pages(inode))
32015 +               goto end; /* no tagged pages: skip the capture loop */
32016 +       info = cryptcompress_inode_data(inode);
32017 +       nrpages = size_in_pages(i_size_read(inode)); /* loop bound, sampled once */
32018 +
32019 +       if (wbc->sync_mode != WB_SYNC_ALL)
32020 +               to_capture = min(wbc->nr_to_write, (long)MAX_PAGES_TO_CAPTURE);
32021 +       else
32022 +               to_capture = MAX_PAGES_TO_CAPTURE;
32023 +       do {
32024 +               reiser4_context *ctx;
32025 +
32026 +               ctx = reiser4_init_context(inode->i_sb);
32027 +               if (IS_ERR(ctx)) {
32028 +                       result = PTR_ERR(ctx);
32029 +                       break;
32030 +               }
32031 +               /* avoid recursive calls to ->sync_inodes */
32032 +               ctx->nobalance = 1;
32033 +
32034 +               assert("edward-1079",
32035 +                      lock_stack_isclean(get_current_lock_stack()));
32036 +
32037 +               reiser4_txn_restart_current();
32038 +
32039 +               if (get_current_context()->entd) {
32040 +                       if (mutex_trylock(&info->checkin_mutex) == 0) {
32041 +                               /* the mutex might be occupied by
32042 +                                  entd caller */
32043 +                               result = RETERR(-EBUSY);
32044 +                               reiser4_exit_context(ctx);
32045 +                               break;
32046 +                       }
32047 +               } else
32048 +                       mutex_lock(&info->checkin_mutex);
32049 +
32050 +               result = capture_anon_pages(inode->i_mapping, &index,
32051 +                                           to_capture);
32052 +               mutex_unlock(&info->checkin_mutex);
32053 +
32054 +               if (result < 0) {
32055 +                       reiser4_exit_context(ctx);
32056 +                       break;
32057 +               }
32058 +               wbc->nr_to_write -= result; /* result is the number of pages captured */
32059 +               if (wbc->sync_mode != WB_SYNC_ALL) { /* one pass only unless syncing */
32060 +                       reiser4_exit_context(ctx);
32061 +                       break;
32062 +               }
32063 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
32064 +               reiser4_exit_context(ctx);
32065 +       } while (result >= 0 && index < nrpages);
32066 +
32067 + end: /* reached on every path, including errors from the loop above */
32068 +       if (is_in_reiser4_context()) {
32069 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
32070 +                       /* there are already pages to flush, flush them out,
32071 +                          do not delay until end of reiser4_sync_inodes */
32072 +                       reiser4_writeout(inode->i_sb, wbc);
32073 +                       get_current_context()->nr_captured = 0;
32074 +               }
32075 +       }
32076 +       return result;
32077 +}
32078 +
32079 +/* plugin->ioctl: not implemented, every command gets -ENOSYS */
32080 +int ioctl_cryptcompress(struct inode *inode, struct file *filp,
32081 +                       unsigned int cmd, unsigned long arg)
32082 +{
32083 +       return RETERR(-ENOSYS);
32084 +}
32085 +
32086 +/* plugin->mmap: reserve space for a stat-data update, then generic_file_mmap() */
32087 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
32088 +{
32089 +       int result;
32090 +       struct inode *inode;
32091 +       reiser4_context *ctx;
32092 +
32093 +       inode = file->f_dentry->d_inode;
32094 +       ctx = reiser4_init_context(inode->i_sb);
32095 +       if (IS_ERR(ctx))
32096 +               return PTR_ERR(ctx);
32097 +       /*
32098 +        * generic_file_mmap will do update_atime. Grab space for stat data
32099 +        * update.
32100 +        */
32101 +       result = reiser4_grab_space_force
32102 +               (inode_file_plugin(inode)->estimate.update(inode),
32103 +                BA_CAN_COMMIT);
32104 +       if (result) {
32105 +               reiser4_exit_context(ctx); /* context is released on every exit path */
32106 +               return result;
32107 +       }
32108 +       result = generic_file_mmap(file, vma);
32109 +       reiser4_exit_context(ctx);
32110 +       return result;
32111 +}
32112 +
32113 +/* plugin->delete_object: truncate the body to 0, drop pages, remove stat-data */
32114 +int delete_object_cryptcompress(struct inode *inode)
32115 +{
32116 +       int result;
32117 +       struct cryptcompress_info * info;
32118 +
32119 +       assert("edward-429", inode->i_nlink == 0);
32120 +
32121 +       reiser4_txn_restart_current();
32122 +       info = cryptcompress_inode_data(inode);
32123 +
32124 +       mutex_lock(&info->checkin_mutex);
32125 +       result = cryptcompress_truncate(inode, 0, 0);
32126 +       mutex_unlock(&info->checkin_mutex);
32127 +
32128 +       if (result) { /* failure is only logged; deletion continues below */
32129 +               warning("edward-430",
32130 +                       "cannot truncate cryptcompress file  %llu: %i",
32131 +                       (unsigned long long)get_inode_oid(inode),
32132 +                       result);
32133 +       }
32134 +       truncate_inode_pages(inode->i_mapping, 0);
32135 +       assert("edward-1487", pages_truncate_ok(inode, 0));
32136 +       /* and remove stat data */
32137 +       return reiser4_delete_object_common(inode);
32138 +}
32139 +
32140 +/*
32141 + * plugin->setattr
32142 + * This implements actual truncate (see comments in reiser4/page_cache.c)
32143 + */
32144 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
32145 +{
32146 +       int result;
32147 +       struct inode *inode;
32148 +       struct cryptcompress_info * info;
32149 +
32150 +       inode = dentry->d_inode;
32151 +       info = cryptcompress_inode_data(inode);
32152 +
32153 +       if (attr->ia_valid & ATTR_SIZE) { /* NOTE(review): other ia_valid bits are not applied here */
32154 +               if (i_size_read(inode) != attr->ia_size) {
32155 +                       reiser4_context *ctx;
32156 +                       loff_t old_size;
32157 +
32158 +                       ctx = reiser4_init_context(dentry->d_inode->i_sb);
32159 +                       if (IS_ERR(ctx))
32160 +                               return PTR_ERR(ctx);
32161 +                       result = setattr_pschedule_hook(inode);
32162 +                       if (result) {
32163 +                               context_set_commit_async(ctx);
32164 +                               reiser4_exit_context(ctx);
32165 +                               return result;
32166 +                       }
32167 +                       old_size = i_size_read(inode);
32168 +                       inode_check_scale(inode, old_size, attr->ia_size);
32169 +
32170 +                       mutex_lock(&info->checkin_mutex);
32171 +                       result = cryptcompress_truncate(inode,
32172 +                                                       attr->ia_size,
32173 +                                                       1/* update sd */);
32174 +                       mutex_unlock(&info->checkin_mutex);
32175 +                       if (result) {
32176 +                            warning("edward-1192",
32177 +                                    "truncate_cryptcompress failed: oid %llu, "
32178 +                                    "old size %lld, new size %lld, retval %d",
32179 +                                    (unsigned long long)
32180 +                                    get_inode_oid(inode), old_size,
32181 +                                    attr->ia_size, result);
32182 +                       }
32183 +                       context_set_commit_async(ctx);
32184 +                       reiser4_exit_context(ctx);
32185 +               } else
32186 +                       result = 0; /* size is unchanged: nothing to do */
32187 +       } else
32188 +               result = reiser4_setattr_common(dentry, attr);
32189 +       return result;
32190 +}
32191 +
32192 +/* plugin->release: free the per-file fsdata inside a reiser4 context */
32193 +int release_cryptcompress(struct inode *inode, struct file *file)
32194 +{
32195 +       reiser4_context *ctx = reiser4_init_context(inode->i_sb);
32196 +
32197 +       if (IS_ERR(ctx))
32198 +               return PTR_ERR(ctx);
32199 +       reiser4_free_file_fsdata(file);
32200 +       reiser4_exit_context(ctx);
32201 +       return 0;
32202 +}
32203 +
32204 +/* plugin->prepare_write: not supported, always returns -EINVAL */
32205 +int prepare_write_cryptcompress(struct file *file, struct page *page,
32206 +                               unsigned from, unsigned to)
32207 +{
32208 +       return -EINVAL;
32209 +}
32210 +
32211 +/* plugin->commit_write: must never be called; hits BUG() if it is */
32212 +int commit_write_cryptcompress(struct file *file, struct page *page,
32213 +                              unsigned from, unsigned to)
32214 +{
32215 +       BUG();
32216 +       return 0;
32217 +}
32218 +
32219 +/* plugin->bmap: not supported. NOTE(review): -EINVAL is returned as sector_t */
32220 +sector_t bmap_cryptcompress(struct address_space *mapping, sector_t lblock)
32221 +{
32222 +       return -EINVAL; /* if sector_t is unsigned this reads as a huge block number - confirm */
32223 +}
32224 +
32225 +/*
32226 +  Local variables:
32227 +  c-indentation-style: "K&R"
32228 +  mode-name: "LC"
32229 +  c-basic-offset: 8
32230 +  tab-width: 8
32231 +  fill-column: 80
32232 +  scroll-step: 1
32233 +  End:
32234 +*/
32235 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/cryptcompress.h linux-2.6.27/fs/reiser4/plugin/file/cryptcompress.h
32236 --- linux-2.6.27.orig/fs/reiser4/plugin/file/cryptcompress.h    1970-01-01 03:00:00.000000000 +0300
32237 +++ linux-2.6.27/fs/reiser4/plugin/file/cryptcompress.h 2008-10-12 18:20:01.000000000 +0400
32238 @@ -0,0 +1,616 @@
32239 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
32240 +/* See http://www.namesys.com/cryptcompress_design.html */
32241 +
32242 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
32243 +#define __FS_REISER4_CRYPTCOMPRESS_H__
32244 +
32245 +#include "../../page_cache.h"
32246 +#include "../compress/compress.h"
32247 +#include "../crypto/cipher.h"
32248 +
32249 +#include <linux/pagemap.h>
32250 +
32251 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT /* smallest cluster is one page */
32252 +#define MAX_CLUSTER_SHIFT 16 /* largest cluster is 2^16 bytes */
32253 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT) /* pages in the largest cluster */
32254 +#define DC_CHECKSUM_SIZE 4 /* presumably bytes of disk cluster checksum - confirm */
32255 +
32256 +#define MIN_LATTICE_FACTOR 1
32257 +#define MAX_LATTICE_FACTOR 32
32258 +
32259 +/* this mask contains all non-standard plugins that might
32260 +   be present in reiser4-specific part of inode managed by
32261 +   cryptcompress file plugin */
32262 +#define cryptcompress_mask                             \
32263 +       ((1 << PSET_FILE) |                             \
32264 +        (1 << PSET_CLUSTER) |                          \
32265 +        (1 << PSET_CIPHER) |                           \
32266 +        (1 << PSET_DIGEST) |                           \
32267 +        (1 << PSET_COMPRESSION) |                      \
32268 +        (1 << PSET_COMPRESSION_MODE))
32269 +
32270 +#if REISER4_DEBUG
32271 +static inline int cluster_shift_ok(int shift)
32272 +{      /* debug-only: true if MIN_CLUSTER_SHIFT <= @shift <= MAX_CLUSTER_SHIFT */
32273 +       return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
32274 +}
32275 +#endif
32276 +
32277 +#if REISER4_DEBUG /* pgcount helpers: read/INC assert the file plugin, DEC silently skips other plugins */
32278 +#define INODE_PGCOUNT(inode)                                           \
32279 +({                                                                     \
32280 +       assert("edward-1530", inode_file_plugin(inode) ==               \
32281 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
32282 +       atomic_read(&cryptcompress_inode_data(inode)->pgcount);         \
32283 + })
32284 +#define INODE_PGCOUNT_INC(inode)                                       \
32285 +do {                                                                   \
32286 +       assert("edward-1531", inode_file_plugin(inode) ==               \
32287 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));        \
32288 +       atomic_inc(&cryptcompress_inode_data(inode)->pgcount);          \
32289 +} while (0)
32290 +#define INODE_PGCOUNT_DEC(inode)                                       \
32291 +do {                                                                   \
32292 +       if (inode_file_plugin(inode) ==                                 \
32293 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))            \
32294 +               atomic_dec(&cryptcompress_inode_data(inode)->pgcount);  \
32295 +} while (0)
32296 +#else /* !REISER4_DEBUG: INODE_PGCOUNT() is 0, INC/DEC expand to nothing */
32297 +#define INODE_PGCOUNT(inode) (0)
32298 +#define INODE_PGCOUNT_INC(inode)
32299 +#define INODE_PGCOUNT_DEC(inode)
32300 +#endif /* REISER4_DEBUG */
32301 +
32302 +struct tfm_stream { /* one transform stream: a buffer and its size */
32303 +       __u8 *data; /* buffer from reiser4_vmalloc(), see alloc_ts_data() */
32304 +       size_t size; /* size recorded for @data; 0 when no buffer is attached */
32305 +};
32306 +
32307 +typedef enum { /* index into tfm_unit */
32308 +       INPUT_STREAM,
32309 +       OUTPUT_STREAM,
32310 +       LAST_STREAM /* number of streams; used as the tfm_unit array length */
32311 +} tfm_stream_id;
32312 +
32313 +typedef struct tfm_stream * tfm_unit[LAST_STREAM]; /* stream pointers, indexed by tfm_stream_id */
32314 +
32315 +static inline __u8 *ts_data(struct tfm_stream * stm)
32316 +{      /* returns the stream's buffer pointer; @stm must not be NULL */
32317 +       assert("edward-928", stm != NULL);
32318 +       return stm->data;
32319 +}
32320 +
32321 +static inline size_t ts_size(struct tfm_stream * stm)
32322 +{      /* returns the stream's recorded size; @stm must not be NULL */
32323 +       assert("edward-929", stm != NULL);
32324 +       return stm->size;
32325 +}
32326 +
32327 +static inline void set_ts_size(struct tfm_stream * stm, size_t size)
32328 +{      /* records @size in the stream; the buffer itself is not resized */
32329 +       assert("edward-930", stm != NULL);
32330 +
32331 +       stm->size = size;
32332 +}
32333 +
32334 +static inline int alloc_ts(struct tfm_stream ** stm)
32335 +{      /* allocate a zeroed descriptor into *@stm (must be NULL); 0 or -ENOMEM */
32336 +       assert("edward-931", stm);
32337 +       assert("edward-932", *stm == NULL);
32338 +
32339 +       *stm = kzalloc(sizeof(**stm), reiser4_ctx_gfp_mask_get());
32340 +       if (!*stm)
32341 +               return -ENOMEM;
32342 +       return 0;
32343 +}
32344 +
32345 +static inline void free_ts(struct tfm_stream * stm)
32346 +{      /* free the descriptor; its buffer must already have been released */
32347 +       assert("edward-933", !ts_data(stm));
32348 +       assert("edward-934", !ts_size(stm));
32349 +
32350 +       kfree(stm);
32351 +}
32352 +
32353 +static inline int alloc_ts_data(struct tfm_stream * stm, size_t size)
32354 +{      /* attach a @size-byte buffer to an empty stream; 0 or -ENOMEM */
32355 +       assert("edward-935", !ts_data(stm));
32356 +       assert("edward-936", !ts_size(stm));
32357 +       assert("edward-937", size != 0);
32358 +
32359 +       stm->data = reiser4_vmalloc(size);
32360 +       if (!stm->data)
32361 +               return -ENOMEM;
32362 +       set_ts_size(stm, size);
32363 +       return 0;
32364 +}
32365 +
32366 +static inline void free_ts_data(struct tfm_stream * stm)
32367 +{      /* vfree() the buffer, if any, and zero the whole descriptor */
32368 +       assert("edward-938", equi(ts_data(stm), ts_size(stm)));
32369 +
32370 +       if (ts_data(stm))
32371 +               vfree(ts_data(stm));
32372 +       memset(stm, 0, sizeof *stm);
32373 +}
32374 +
32375 +/* Write modes for item conversion in flush convert phase */
32376 +typedef enum {
32377 +       CRC_APPEND_ITEM = 1,
32378 +       CRC_OVERWRITE_ITEM = 2,
32379 +       CRC_CUT_ITEM = 3
32380 +} cryptcompress_write_mode_t;
32381 +
32382 +typedef enum {
32383 +       LC_INVAL  = 0,   /* invalid value */
32384 +       LC_APPOV = 1,    /* append and/or overwrite */
32385 +       LC_TRUNC = 2     /* truncate */
32386 +} logical_cluster_op;
32387 +
32388 +/* Transform cluster.
32389 + * Intermediate state between page cluster and disk cluster.
32390 + * It is used for data transform (compression/encryption).
32391 + */
32392 +struct tfm_cluster {
32393 +       coa_set coa;      /* compression algorithms info */
32394 +       tfm_unit tun;     /* plain and transformed streams */
32395 +       tfm_action act;   /* action used to index @coa, see alloc_coa() */
32396 +       int uptodate;
32397 +       int lsize;        /* number of bytes in logical cluster */
32398 +       int len;          /* length of the transform stream */
32399 +};
32400 +
32401 +static inline coa_t get_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32402 +                           tfm_action act)
32403 +{      /* returns the coa slot for compression plugin @id and action @act */
32404 +       return tc->coa[id][act];
32405 +}
32406 +
32407 +static inline void set_coa(struct tfm_cluster * tc, reiser4_compression_id id,
32408 +                          tfm_action act, coa_t coa)
32409 +{      /* stores @coa in the slot for plugin @id and action @act; no free of the old value */
32410 +       tc->coa[id][act] = coa;
32411 +}
32412 +
32413 +static inline int alloc_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32414 +{      /* call cplug->alloc() for tc->act and store the result; 0 or PTR_ERR() */
32415 +       coa_t coa;
32416 +
32417 +       coa = cplug->alloc(tc->act);
32418 +       if (IS_ERR(coa))
32419 +               return PTR_ERR(coa);
32420 +       set_coa(tc, cplug->h.id, tc->act, coa);
32421 +       return 0;
32422 +}
32423 +
32424 +static inline int
32425 +grab_coa(struct tfm_cluster * tc, compression_plugin * cplug)
32426 +{      /* allocate only if the plugin has ->alloc and the slot is still empty */
32427 +       return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
32428 +               alloc_coa(tc, cplug) : 0);
32429 +}
32430 +
32431 +static inline void free_coa_set(struct tfm_cluster * tc)
32432 +{
32433 +       tfm_action j;
32434 +       reiser4_compression_id i;
32435 +       compression_plugin *cplug;
32436 +
32437 +       assert("edward-810", tc != NULL);
32438 +
32439 +       for (j = 0; j < TFMA_LAST; j++)
32440 +               for (i = 0; i < LAST_COMPRESSION_ID; i++) {
32441 +                       if (!get_coa(tc, i, j))
32442 +                               continue;
32443 +                       cplug = compression_plugin_by_id(i);
32444 +                       assert("edward-812", cplug->free != NULL);
32445 +                       cplug->free(get_coa(tc, i, j), j);
32446 +                       set_coa(tc, i, j, 0);
32447 +               }
32448 +       return;
32449 +}
32450 +
32451 +static inline struct tfm_stream * get_tfm_stream(struct tfm_cluster * tc,
32452 +                                                tfm_stream_id id)
32453 +{
32454 +       return tc->tun[id];
32455 +}
32456 +
32457 +static inline void set_tfm_stream(struct tfm_cluster * tc,
32458 +                                 tfm_stream_id id, struct tfm_stream * ts)
32459 +{
32460 +       tc->tun[id] = ts;
32461 +}
32462 +
32463 +static inline __u8 *tfm_stream_data(struct tfm_cluster * tc, tfm_stream_id id)
32464 +{
32465 +       return ts_data(get_tfm_stream(tc, id));
32466 +}
32467 +
32468 +static inline void set_tfm_stream_data(struct tfm_cluster * tc,
32469 +                                      tfm_stream_id id, __u8 * data)
32470 +{
32471 +       get_tfm_stream(tc, id)->data = data;
32472 +}
32473 +
32474 +static inline size_t tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id)
32475 +{
32476 +       return ts_size(get_tfm_stream(tc, id));
32477 +}
32478 +
32479 +static inline void
32480 +set_tfm_stream_size(struct tfm_cluster * tc, tfm_stream_id id, size_t size)
32481 +{
32482 +       get_tfm_stream(tc, id)->size = size;
32483 +}
32484 +
32485 +static inline int
32486 +alloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32487 +{
32488 +       assert("edward-939", tc != NULL);
32489 +       assert("edward-940", !get_tfm_stream(tc, id));
32490 +
32491 +       tc->tun[id] = kzalloc(sizeof(struct tfm_stream),
32492 +                             reiser4_ctx_gfp_mask_get());
32493 +       if (!tc->tun[id])
32494 +               return -ENOMEM;
32495 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
32496 +}
32497 +
32498 +static inline int
32499 +realloc_tfm_stream(struct tfm_cluster * tc, size_t size, tfm_stream_id id)
32500 +{
32501 +       assert("edward-941", tfm_stream_size(tc, id) < size);
32502 +       free_ts_data(get_tfm_stream(tc, id));
32503 +       return alloc_ts_data(get_tfm_stream(tc, id), size);
32504 +}
32505 +
32506 +static inline void free_tfm_stream(struct tfm_cluster * tc, tfm_stream_id id)
32507 +{
32508 +       free_ts_data(get_tfm_stream(tc, id));
32509 +       free_ts(get_tfm_stream(tc, id));
32510 +       set_tfm_stream(tc, id, 0);
32511 +}
32512 +
32513 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
32514 +{
32515 +       return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
32516 +}
32517 +
32518 +static inline void free_tfm_unit(struct tfm_cluster * tc)
32519 +{
32520 +       tfm_stream_id id;
32521 +       for (id = 0; id < LAST_STREAM; id++) {
32522 +               if (!get_tfm_stream(tc, id))
32523 +                       continue;
32524 +               free_tfm_stream(tc, id);
32525 +       }
32526 +}
32527 +
32528 +static inline void put_tfm_cluster(struct tfm_cluster * tc)
32529 +{
32530 +       assert("edward-942", tc != NULL);
32531 +       free_coa_set(tc);
32532 +       free_tfm_unit(tc);
32533 +}
32534 +
32535 +static inline int tfm_cluster_is_uptodate(struct tfm_cluster * tc)
32536 +{
32537 +       assert("edward-943", tc != NULL);
32538 +       assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
32539 +       return (tc->uptodate == 1);
32540 +}
32541 +
32542 +static inline void tfm_cluster_set_uptodate(struct tfm_cluster * tc)
32543 +{
32544 +       assert("edward-945", tc != NULL);
32545 +       assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
32546 +       tc->uptodate = 1;
32547 +       return;
32548 +}
32549 +
32550 +static inline void tfm_cluster_clr_uptodate(struct tfm_cluster * tc)
32551 +{
32552 +       assert("edward-947", tc != NULL);
32553 +       assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
32554 +       tc->uptodate = 0;
32555 +       return;
32556 +}
32557 +
32558 +static inline int tfm_stream_is_set(struct tfm_cluster * tc, tfm_stream_id id)
32559 +{
32560 +       return (get_tfm_stream(tc, id) &&
32561 +               tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
32562 +}
32563 +
32564 +static inline int tfm_cluster_is_set(struct tfm_cluster * tc)
32565 +{
32566 +       int i;
32567 +       for (i = 0; i < LAST_STREAM; i++)
32568 +               if (!tfm_stream_is_set(tc, i))
32569 +                       return 0;
32570 +       return 1;
32571 +}
32572 +
32573 +static inline void alternate_streams(struct tfm_cluster * tc)
32574 +{
32575 +       struct tfm_stream *tmp = get_tfm_stream(tc, INPUT_STREAM);
32576 +
32577 +       set_tfm_stream(tc, INPUT_STREAM, get_tfm_stream(tc, OUTPUT_STREAM));
32578 +       set_tfm_stream(tc, OUTPUT_STREAM, tmp);
32579 +}
32580 +
32581 +/* Set of states to indicate a kind of data
32582 + * that will be written to the window */
32583 +typedef enum {
32584 +       DATA_WINDOW,            /* user's data */
32585 +       HOLE_WINDOW             /* zeroes (such kind of data can be written
32586 +                                * if we start to write from offset > i_size) */
32587 +} window_stat;
32588 +
32589 +/* Window (of logical cluster size) discretely sliding along a file.
32590 + * Is used to locate hole region in a logical cluster to be properly
32591 + * represented on disk.
32592 + * We split a write to cryptcompress file into writes to its logical
32593 + * clusters. Before writing to a logical cluster we set a window, i.e.
32594 + * calculate values of the following fields:
32595 + */
32596 +struct reiser4_slide {
32597 +       unsigned off;           /* offset to write from */
32598 +       unsigned count;         /* number of bytes to write */
32599 +       unsigned delta;         /* number of bytes to append to the hole */
32600 +       window_stat stat;       /* what kind of data will be written starting
32601 +                                  from @off */
32602 +};
32603 +
32604 +/* Possible states of a disk cluster */
32605 +typedef enum {
32606 +       INVAL_DISK_CLUSTER,     /* unknown state */
32607 +       PREP_DISK_CLUSTER,      /* disk cluster got converted by flush
32608 +                                * at least once */
32609 +       UNPR_DISK_CLUSTER,      /* disk cluster just created and should be
32610 +                                * converted by flush */
32611 +       FAKE_DISK_CLUSTER,      /* disk cluster exists neither in memory
32612 +                                * nor on disk */
32613 +       TRNC_DISK_CLUSTER       /* disk cluster is partially truncated */
32614 +} disk_cluster_stat;
32615 +
32616 +/* The following structure represents various stages of the same logical
32617 + * cluster of index @index:
32618 + * . fixed slide
32619 + * . page cluster         (stage in primary cache)
32620 + * . transform cluster    (transition stage)
32621 + * . disk cluster         (stage in secondary cache)
32622 + * This structure is used in transition and synchronizing operations, e.g.
32623 + * transform cluster is a transition state when synchronizing page cluster
32624 + * and disk cluster.
32625 + * FIXME: Encapsulate page cluster, disk cluster.
32626 + */
32627 +struct cluster_handle {
32628 +       cloff_t index;           /* offset in a file (unit is a cluster size) */
32629 +       int index_valid;         /* for validating the index above, if needed */
32630 +       struct file *file;       /* host file */
32631 +
32632 +       /* logical cluster */
32633 +       struct reiser4_slide *win; /* sliding window to locate holes */
32634 +       logical_cluster_op op;   /* logical cluster operation (truncate or
32635 +                                   append/overwrite) */
32636 +       /* transform cluster */
32637 +       struct tfm_cluster tc;   /* contains all needed info to synchronize
32638 +                                   page cluster and disk cluster */
32639 +        /* page cluster */
32640 +       int nr_pages;            /* number of pages of current checkin action */
32641 +       int old_nrpages;         /* number of pages of last checkin action */
32642 +       struct page **pages;     /* attached pages */
32643 +       jnode * node;            /* jnode for capture */
32644 +
32645 +       /* disk cluster */
32646 +       hint_t *hint;            /* current position in the tree */
32647 +       disk_cluster_stat dstat; /* state of the current disk cluster */
32648 +       int reserved;            /* is space for disk cluster reserved */
32649 +#if REISER4_DEBUG
32650 +       reiser4_context *ctx;
32651 +       int reserved_prepped;
32652 +       int reserved_unprepped;
32653 +#endif
32654 +
32655 +};
32656 +
32657 +static inline __u8 * tfm_input_data (struct cluster_handle * clust)
32658 +{
32659 +       return tfm_stream_data(&clust->tc, INPUT_STREAM);
32660 +}
32661 +
32662 +static inline __u8 * tfm_output_data (struct cluster_handle * clust)
32663 +{
32664 +       return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
32665 +}
32666 +
32667 +static inline int reset_cluster_pgset(struct cluster_handle * clust,
32668 +                                     int nrpages)
32669 +{
32670 +       assert("edward-1057", clust->pages != NULL);
32671 +       memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
32672 +       return 0;
32673 +}
32674 +
32675 +static inline int alloc_cluster_pgset(struct cluster_handle * clust,
32676 +                                     int nrpages)
32677 +{
32678 +       assert("edward-949", clust != NULL);
32679 +       assert("edward-1362", clust->pages == NULL);
32680 +       assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
32681 +       /* zeroed array of @nrpages page pointers; kcalloc checks n * size */
32682 +       clust->pages = kcalloc(nrpages, sizeof(*clust->pages),
32683 +                              reiser4_ctx_gfp_mask_get());
32684 +       if (!clust->pages)
32685 +               return RETERR(-ENOMEM);
32686 +       return 0;
32687 +}
32688 +
32689 +static inline void move_cluster_pgset(struct cluster_handle *clust,
32690 +                                     struct page ***pages, int * nr_pages)
32691 +{
32692 +       assert("edward-1545", clust != NULL && clust->pages != NULL);
32693 +       assert("edward-1546", pages != NULL && *pages == NULL);
32694 +       *pages = clust->pages;
32695 +       *nr_pages = clust->nr_pages;
32696 +       clust->pages = NULL;
32697 +}
32698 +
32699 +static inline void free_cluster_pgset(struct cluster_handle * clust)
32700 +{
32701 +       assert("edward-951", clust->pages != NULL);
32702 +       kfree(clust->pages);
32703 +       clust->pages = NULL;
32704 +}
32705 +
32706 +static inline void put_cluster_handle(struct cluster_handle * clust)
32707 +{
32708 +       assert("edward-435", clust != NULL);
32709 +
32710 +       put_tfm_cluster(&clust->tc);
32711 +       if (clust->pages)
32712 +               free_cluster_pgset(clust);
32713 +       memset(clust, 0, sizeof *clust);
32714 +}
32715 +
32716 +static inline void inc_keyload_count(struct reiser4_crypto_info * data)
32717 +{
32718 +       assert("edward-1410", data != NULL);
32719 +       data->keyload_count++;
32720 +}
32721 +
32722 +static inline void dec_keyload_count(struct reiser4_crypto_info * data)
32723 +{
32724 +       assert("edward-1411", data != NULL);
32725 +       assert("edward-1412", data->keyload_count > 0);
32726 +       data->keyload_count--;
32727 +}
32728 +
32729 +static inline int capture_cluster_jnode(jnode * node)
32730 +{
32731 +       return reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32732 +}
32733 +
32734 +/* cryptcompress specific part of reiser4_inode */
32735 +struct cryptcompress_info {
32736 +       struct mutex checkin_mutex;  /* This is to serialize
32737 +                                     * checkin_logical_cluster operations */
32738 +       cloff_t trunc_index;         /* Index of the leftmost truncated disk
32739 +                                     * cluster (to resolve races with read) */
32740 +       struct reiser4_crypto_info *crypt;
32741 +       /*
32742 +        * the following 2 fields are controlled by compression mode plugin
32743 +        */
32744 +       int compress_toggle;          /* Current status of compressibility */
32745 +       int lattice_factor;           /* Factor of dynamic lattice. FIXME: Have
32746 +                                      * a compression_toggle to keep the factor
32747 +                                      */
32748 +#if REISER4_DEBUG
32749 +       atomic_t pgcount;             /* number of grabbed pages */
32750 +#endif
32751 +};
32752 +
32753 +static inline void set_compression_toggle (struct cryptcompress_info * info, int val)
32754 +{
32755 +       info->compress_toggle = val;
32756 +}
32757 +
32758 +static inline int get_compression_toggle (struct cryptcompress_info * info)
32759 +{
32760 +       return info->compress_toggle;
32761 +}
32762 +
32763 +static inline int compression_is_on(struct cryptcompress_info * info)
32764 +{
32765 +       return get_compression_toggle(info) == 1;
32766 +}
32767 +
32768 +static inline void turn_on_compression(struct cryptcompress_info * info)
32769 +{
32770 +       set_compression_toggle(info, 1);
32771 +}
32772 +
32773 +static inline void turn_off_compression(struct cryptcompress_info * info)
32774 +{
32775 +       set_compression_toggle(info, 0);
32776 +}
32777 +
32778 +static inline void set_lattice_factor(struct cryptcompress_info * info, int val)
32779 +{
32780 +       info->lattice_factor = val;
32781 +}
32782 +
32783 +static inline int get_lattice_factor(struct cryptcompress_info * info)
32784 +{
32785 +       return info->lattice_factor;
32786 +}
32787 +
32788 +struct cryptcompress_info *cryptcompress_inode_data(const struct inode *);
32789 +int equal_to_rdk(znode *, const reiser4_key *);
32790 +int goto_right_neighbor(coord_t *, lock_handle *);
32791 +int cryptcompress_inode_ok(struct inode *inode);
32792 +int coord_is_unprepped_ctail(const coord_t * coord);
32793 +extern int do_readpage_ctail(struct inode *, struct cluster_handle *,
32794 +                            struct page * page, znode_lock_mode mode);
32795 +extern int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
32796 +                                         struct inode * inode);
32797 +extern int readpages_cryptcompress(struct file*, struct address_space*,
32798 +                                  struct list_head*, unsigned);
32799 +int bind_cryptcompress(struct inode *child, struct inode *parent);
32800 +void destroy_inode_cryptcompress(struct inode * inode);
32801 +int grab_page_cluster(struct inode *inode, struct cluster_handle * clust,
32802 +                     rw_op rw);
32803 +int write_pschedule_hook(struct file *file, struct inode * inode,
32804 +                        loff_t pos, struct cluster_handle * clust,
32805 +                        struct psched_context * cont);
32806 +int setattr_pschedule_hook(struct inode * inode);
32807 +struct reiser4_crypto_info * inode_crypto_info(struct inode * inode);
32808 +void inherit_crypto_info_common(struct inode * parent, struct inode * object,
32809 +                               int (*can_inherit)(struct inode * child,
32810 +                                                  struct inode * parent));
32811 +void reiser4_attach_crypto_info(struct inode * inode,
32812 +                               struct reiser4_crypto_info * info);
32813 +void change_crypto_info(struct inode * inode, struct reiser4_crypto_info * new);
32814 +struct reiser4_crypto_info * reiser4_alloc_crypto_info (struct inode * inode);
32815 +
32816 +static inline struct crypto_blkcipher * info_get_cipher(struct reiser4_crypto_info * info)
32817 +{
32818 +       return info->cipher;
32819 +}
32820 +
32821 +static inline void info_set_cipher(struct reiser4_crypto_info * info,
32822 +                                  struct crypto_blkcipher * tfm)
32823 +{
32824 +       info->cipher = tfm;
32825 +}
32826 +
32827 +static inline struct crypto_hash * info_get_digest(struct reiser4_crypto_info * info)
32828 +{
32829 +       return info->digest;
32830 +}
32831 +
32832 +static inline void info_set_digest(struct reiser4_crypto_info * info,
32833 +                                  struct crypto_hash * tfm)
32834 +{
32835 +       info->digest = tfm;
32836 +}
32837 +
32838 +static inline void put_cluster_page(struct page * page)
32839 +{
32840 +       page_cache_release(page);
32841 +}
32842 +
32843 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
32844 +
32845 +/* Make Linus happy.
32846 +   Local variables:
32847 +   c-indentation-style: "K&R"
32848 +   mode-name: "LC"
32849 +   c-basic-offset: 8
32850 +   tab-width: 8
32851 +   fill-column: 120
32852 +   scroll-step: 1
32853 +   End:
32854 +*/
32855 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/file.c linux-2.6.27/fs/reiser4/plugin/file/file.c
32856 --- linux-2.6.27.orig/fs/reiser4/plugin/file/file.c     1970-01-01 03:00:00.000000000 +0300
32857 +++ linux-2.6.27/fs/reiser4/plugin/file/file.c  2008-10-12 18:20:01.000000000 +0400
32858 @@ -0,0 +1,2728 @@
32859 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
32860 + * reiser4/README */
32861 +
32862 +/*
32863 + * this file contains implementations of inode/file/address_space/file plugin
32864 + * operations specific for "unix file plugin" (plugin id is
32865 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
32866 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
32867 + * no items but stat data)
32868 + */
32869 +
32870 +#include "../../inode.h"
32871 +#include "../../super.h"
32872 +#include "../../tree_walk.h"
32873 +#include "../../carry.h"
32874 +#include "../../page_cache.h"
32875 +#include "../../ioctl.h"
32876 +#include "../object.h"
32877 +#include "../cluster.h"
32878 +#include "../../safe_link.h"
32879 +
32880 +#include <linux/writeback.h>
32881 +#include <linux/pagevec.h>
32882 +#include <linux/syscalls.h>
32883 +
32884 +
32885 +static int unpack(struct file *file, struct inode *inode, int forever);
32886 +static void drop_access(struct unix_file_info *);
32887 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
32888 +                        znode_lock_mode lock_mode);
32889 +
32890 +/* Get exclusive access and make sure that file is not partially
32891 + * converted (It may happen that another process is doing tail
32892 + * conversion. If so, wait until it completes)
32893 + */
32894 +static inline void get_exclusive_access_careful(struct unix_file_info * uf_info,
32895 +                                               struct inode *inode)
32896 +{
32897 +        do {
32898 +               get_exclusive_access(uf_info);
32899 +               if (!reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))
32900 +                       break;
32901 +               drop_exclusive_access(uf_info);
32902 +               schedule();
32903 +       } while (1);
32904 +}
32905 +
32906 +/* get unix file plugin specific portion of inode */
32907 +struct unix_file_info *unix_file_inode_data(const struct inode *inode)
32908 +{
32909 +       return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
32910 +}
32911 +
32912 +/**
32913 + * equal_to_rdk - compare key and znode's right delimiting key
32914 + * @node: node whose right delimiting key to compare with @key
32915 + * @key: key to compare with @node's right delimiting key
32916 + *
32917 + * Returns true if @key is equal to right delimiting key of @node.
32918 + */
32919 +int equal_to_rdk(znode *node, const reiser4_key *key)
32920 +{
32921 +       int result;
32922 +
32923 +       read_lock_dk(znode_get_tree(node));
32924 +       result = keyeq(key, znode_get_rd_key(node));
32925 +       read_unlock_dk(znode_get_tree(node));
32926 +       return result;
32927 +}
32928 +
32929 +#if REISER4_DEBUG
32930 +
32931 +/**
32932 + * equal_to_ldk - compare key and znode's left delimiting key
32933 + * @node: node whose left delimiting key to compare with @key
32934 + * @key: key to compare with @node's left delimiting key
32935 + *
32936 + * Returns true if @key is equal to left delimiting key of @node.
32937 + */
32938 +int equal_to_ldk(znode *node, const reiser4_key *key)
32939 +{
32940 +       int result;
32941 +
32942 +       read_lock_dk(znode_get_tree(node));
32943 +       result = keyeq(key, znode_get_ld_key(node));
32944 +       read_unlock_dk(znode_get_tree(node));
32945 +       return result;
32946 +}
32947 +
32948 +/**
32949 + * check_coord - check whether coord corresponds to key
32950 + * @coord: coord to check
32951 + * @key: key @coord has to correspond to
32952 + *
32953 + * Returns true if @coord is set as if it was set as result of lookup with @key
32954 + * in coord->node.
32955 + */
32956 +static int check_coord(const coord_t *coord, const reiser4_key *key)
32957 +{
32958 +       coord_t twin;
32959 +
32960 +       node_plugin_by_node(coord->node)->lookup(coord->node, key,
32961 +                                                FIND_MAX_NOT_MORE_THAN, &twin);
32962 +       return coords_equal(coord, &twin);
32963 +}
32964 +
32965 +#endif /* REISER4_DEBUG */
32966 +
32967 +/**
32968 + * init_uf_coord - initialize extended coord
32969 + * @uf_coord: extended coord to initialize
32970 + * @lh: lock handle to attach to @uf_coord; it gets initialized as well
32971 + *
32972 + * The coord extension is zeroed and marked as not valid.
32973 + */
32974 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
32975 +{
32976 +       coord_init_zero(&uf_coord->coord);
32977 +       coord_clear_iplug(&uf_coord->coord);
32978 +       uf_coord->lh = lh;
32979 +       init_lh(lh);
32980 +       memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
32981 +       uf_coord->valid = 0;
32982 +}
32983 +
32984 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
32985 +{
32986 +       assert("vs-1333", uf_coord->valid == 0);
32987 +
32988 +       if (coord_is_between_items(&uf_coord->coord))
32989 +               return;
32990 +
32991 +       assert("vs-1348",
32992 +              item_plugin_by_coord(&uf_coord->coord)->s.file.
32993 +              init_coord_extension);
32994 +
32995 +       item_body_by_coord(&uf_coord->coord);
32996 +       item_plugin_by_coord(&uf_coord->coord)->s.file.
32997 +           init_coord_extension(uf_coord, offset);
32998 +}
32999 +
33000 +/**
33001 + * goto_right_neighbor - lock right neighbor, drop current node lock
33002 + * @coord: coord in the locked current node; re-set to right neighbor on success
33003 + * @lh: lock handle of current node; takes over the neighbor's lock on success
33004 + *
33005 + * Obtain lock on right neighbor and drop lock on current node.
33006 + */
33007 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
33008 +{
33009 +       int result;
33010 +       lock_handle lh_right;
33011 +
33012 +       assert("vs-1100", znode_is_locked(coord->node));
33013 +
33014 +       init_lh(&lh_right);
33015 +       result = reiser4_get_right_neighbor(&lh_right, coord->node,
33016 +                                           znode_is_wlocked(coord->node) ?
33017 +                                           ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
33018 +                                           GN_CAN_USE_UPPER_LEVELS);
33019 +       if (result) {
33020 +               done_lh(&lh_right);
33021 +               return result;
33022 +       }
33023 +
33024 +       /*
33025 +        * we hold two longterm locks on neighboring nodes. Unlock the left
33026 +        * one of them
33027 +        */
33028 +       done_lh(lh);
33029 +
33030 +       coord_init_first_unit_nocheck(coord, lh_right.node);
33031 +       move_lh(lh, &lh_right);
33032 +
33033 +       return 0;
33034 +
33035 +}
33036 +
33037 +/**
33038 + * set_file_state - set or verify container type of a file
33039 + * @uf_info: unix file specific part of inode
33040 + * @cbk_result: result of the tree lookup (nothing is done if it is an error)
33041 + * @level: level of the node found by the lookup (leaf or twig)
33042 + *
33043 + * This is to be used by find_file_item and in find_file_state to
33044 + * determine real state of file
33045 + */
33046 +static void set_file_state(struct unix_file_info *uf_info, int cbk_result,
33047 +                          tree_level level)
33048 +{
33049 +       if (cbk_errored(cbk_result))
33050 +               /* error happened in find_file_item */
33051 +               return;
33052 +
33053 +       assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
33054 +
33055 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33056 +               if (cbk_result == CBK_COORD_NOTFOUND)
33057 +                       uf_info->container = UF_CONTAINER_EMPTY;
33058 +               else if (level == LEAF_LEVEL)
33059 +                       uf_info->container = UF_CONTAINER_TAILS;
33060 +               else
33061 +                       uf_info->container = UF_CONTAINER_EXTENTS;
33062 +       } else {
33063 +               /*
33064 +                * file state is known, check whether it is set correctly if
33065 +                * file is not being tail converted
33066 +                */
33067 +               if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
33068 +                                           REISER4_PART_IN_CONV)) {
33069 +                       assert("vs-1162",
33070 +                              ergo(level == LEAF_LEVEL &&
33071 +                                   cbk_result == CBK_COORD_FOUND,
33072 +                                   uf_info->container == UF_CONTAINER_TAILS));
33073 +                       assert("vs-1165",
33074 +                              ergo(level == TWIG_LEVEL &&
33075 +                                   cbk_result == CBK_COORD_FOUND,
33076 +                                   uf_info->container == UF_CONTAINER_EXTENTS));
33077 +               }
33078 +       }
33079 +}
33080 +
33081 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
33082 +                         const reiser4_key *key, znode_lock_mode lock_mode,
33083 +                         struct inode *inode)
33084 +{
33085 +       return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
33086 +                                    FIND_MAX_NOT_MORE_THAN,
33087 +                                    TWIG_LEVEL, LEAF_LEVEL,
33088 +                                    (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
33089 +                                    (CBK_UNIQUE | CBK_FOR_INSERT),
33090 +                                    NULL /* ra_info */ );
33091 +}
33092 +
33093 +/**
33094 + * find_file_item - look for file item in the tree
33095 + * @hint: provides coordinate, lock handle, seal
33096 + * @key: key for search
33097 + * @lock_mode: mode of lock to put on returned node
33098 + * @inode: inode of the file being searched
33099 + *
33100 + * This finds position in the tree corresponding to @key. It first tries to use
33101 + * @hint (see hint_validate()); if that fails a tree lookup is done. The file
33102 + * state is only updated when no lookup error happened.
33103 + */
33104 +int find_file_item(hint_t *hint, const reiser4_key *key,
33105 +                  znode_lock_mode lock_mode,
33106 +                  struct inode *inode)
33107 +{
33108 +       int result;
33109 +       coord_t *coord;
33110 +       lock_handle *lh;
33111 +
33112 +       assert("nikita-3030", reiser4_schedulable());
33113 +       assert("vs-1707", hint != NULL);
33114 +       assert("vs-47", inode != NULL);
33115 +
33116 +       coord = &hint->ext_coord.coord;
33117 +       lh = hint->ext_coord.lh;
33118 +       init_lh(lh);
33119 +
33120 +       result = hint_validate(hint, key, 1 /* check key */, lock_mode);
33121 +       if (!result) {
33122 +               if (coord->between == AFTER_UNIT &&
33123 +                   equal_to_rdk(coord->node, key)) {
33124 +                       result = goto_right_neighbor(coord, lh);
33125 +                       if (result == -E_NO_NEIGHBOR)
33126 +                               return RETERR(-EIO);
33127 +                       if (result)
33128 +                               return result;
33129 +                       assert("vs-1152", equal_to_ldk(coord->node, key));
33130 +                       /*
33131 +                        * we moved to different node. Invalidate coord
33132 +                        * extension, zload is necessary to init it again
33133 +                        */
33134 +                       hint->ext_coord.valid = 0;
33135 +               }
33136 +
33137 +               set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
33138 +                              znode_get_level(coord->node));
33139 +
33140 +               return CBK_COORD_FOUND;
33141 +       }
33142 +
33143 +       coord_init_zero(coord);
33144 +       result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
33145 +       if (!cbk_errored(result)) /* coord->node may be unset on error */
33146 +               set_file_state(unix_file_inode_data(inode), result,
33147 +                              znode_get_level(coord->node));
33148 +       /* FIXME: we might already have coord extension initialized */
33149 +       hint->ext_coord.valid = 0;
33150 +       return result;
33151 +}
33152 +
33153 +/* plugin->u.file.write_flow = NULL
33154 +   plugin->u.file.read_flow = NULL */
33155 +
33156 +void hint_init_zero(hint_t * hint)
33157 +{
33158 +       memset(hint, 0, sizeof(*hint));
33159 +       init_lh(&hint->lh);
33160 +       hint->ext_coord.lh = &hint->lh;
33161 +}
33162 +
33163 +static int find_file_state(struct inode *inode, struct unix_file_info *uf_info)
33164 +{
33165 +       int result = 0;
33166 +       reiser4_key key;
33167 +       coord_t coord;
33168 +       lock_handle lh;
33169 +
33170 +       assert("vs-1628", ea_obtained(uf_info));
33171 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
33172 +               key_by_inode_and_offset_common(inode, 0, &key);
33173 +               init_lh(&lh);
33174 +               result = find_file_item_nohint(&coord, &lh, &key,
33175 +                                              ZNODE_READ_LOCK, inode);
33176 +               if (!cbk_errored(result)) { /* coord may be unset on error */
33177 +                       set_file_state(uf_info, result,
33178 +                                      znode_get_level(coord.node));
33179 +                       result = 0;
33180 +               }
33181 +               done_lh(&lh);
33182 +       }
33183 +       assert("vs-1074",
33184 +              ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
33185 +       reiser4_txn_restart_current();
33186 +       return result;
33187 +}
33188 +
33189 +/**
33190 + * Estimate and reserve space needed to truncate page
33191 + * which gets partially truncated: one block for page
33192 + * itself, stat-data update (estimate_one_insert_into_item)
33193 + * and one item insertion (estimate_one_insert_into_item)
33194 + * which may happen if page corresponds to hole extent and
33195 + * unallocated one will have to be created
33196 + */
33197 +static int reserve_partial_page(reiser4_tree * tree)
33198 +{
33199 +       grab_space_enable();
33200 +       return reiser4_grab_reserved(reiser4_get_current_sb(),
33201 +                                    1 +
33202 +                                    2 * estimate_one_insert_into_item(tree),
33203 +                                    BA_CAN_COMMIT);
33204 +}
33205 +
33206 +/* Estimate and reserve space needed to cut one item and update one stat data. */
33207 +static int reserve_cut_iteration(reiser4_tree * tree)
33208 +{
33209 +       __u64 estimate = estimate_one_item_removal(tree)
33210 +           + estimate_one_insert_into_item(tree);
33211 +
33212 +       assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
33213 +
33214 +       grab_space_enable();
33215 +       /* The estimate is doubled because a cut may delete more than one
33216 +          node. The result of reiser4_grab_reserved() is returned as is. */
33217 +       return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
33218 +                                    BA_CAN_COMMIT);
33219 +}
33220 +/* Set size of @inode to @new_size; optionally stamp times and update stat data. */
33221 +int reiser4_update_file_size(struct inode *inode, loff_t new_size,
33222 +                            int update_sd)
33223 +{
33224 +       int result = 0; /* returned as is when @update_sd is 0 */
33225 +
33226 +       INODE_SET_SIZE(inode, new_size);
33227 +       if (update_sd) {
33228 +               inode->i_ctime = inode->i_mtime = CURRENT_TIME;
33229 +               result = reiser4_update_sd(inode);
33230 +       }
33231 +       return result;
33232 +}
33233 +
33234 +/**
33235 + * cut_file_items - cut file items in offsets [@new_size, @cur_size - 1],
33236 + * repeating while the cut returns -E_REPEAT. Space is reserved before each
33237 + * iteration; @update_actor(inode, offset, @update_sd) records the new size.
33238 + */
33239 +int cut_file_items(struct inode *inode, loff_t new_size,
33240 +                  int update_sd, loff_t cur_size,
33241 +                  int (*update_actor) (struct inode *, loff_t, int))
33242 +{
33243 +       reiser4_key from_key, to_key;
33244 +       reiser4_key smallest_removed;
33245 +       file_plugin *fplug = inode_file_plugin(inode);
33246 +       int result;
33247 +       int progress = 0;
33248 +
33249 +       assert("vs-1248",
33250 +              fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
33251 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33252 +
33253 +       fplug->key_by_inode(inode, new_size, &from_key);
33254 +       to_key = from_key;
33255 +       set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
33256 +       /* this loop normally runs just once */
33257 +       while (1) {
33258 +               result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
33259 +               if (result)
33260 +                       break;
33261 +
33262 +               result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
33263 +                                                &smallest_removed, inode, 1,
33264 +                                                &progress);
33265 +               if (result == -E_REPEAT) {
33266 +                       /**
33267 +                        * -E_REPEAT is a signal to interrupt a long
33268 +                        * file truncation process
33269 +                        */
33270 +                       if (progress) {
33271 +                               result = update_actor(inode,
33272 +                                             get_key_offset(&smallest_removed),
33273 +                                             update_sd);
33274 +                               if (result)
33275 +                                       break;
33276 +                       }
33277 +                       /* the below does up(sbinfo->delete_mutex).
33278 +                        * Do not get fooled */
33279 +                       reiser4_release_reserved(inode->i_sb);
33280 +                       /**
33281 +                        * reiser4_cut_tree_object() was interrupted probably
33282 +                        * because current atom requires commit, we have to
33283 +                        * release transaction handle to allow atom commit.
33284 +                        */
33285 +                       reiser4_txn_restart_current();
33286 +                       continue;
33287 +               }
33288 +               if (result
33289 +                   && !(result == CBK_COORD_NOTFOUND && new_size == 0
33290 +                        && inode->i_size == 0))
33291 +                       break; /* error other than empty-file NOTFOUND */
33292 +
33293 +               set_key_offset(&smallest_removed, new_size);
33294 +               /* Final sd update after the file gets its correct size */
33295 +               result = update_actor(inode, get_key_offset(&smallest_removed),
33296 +                                     update_sd);
33297 +               break;
33298 +       }
33299 +
33300 +       /* the below does up(sbinfo->delete_mutex). Do not get fooled */
33301 +       reiser4_release_reserved(inode->i_sb);
33302 +
33303 +       return result;
33304 +}
33305 +
33306 +int find_or_create_extent(struct page *page); /* defined later in this file */
33307 +
33308 +/* part of truncate_file_body: called when new size is not bigger than i_size.
33309 +   Cuts file items, then zeroes the rest of a partially truncated last page */
33310 +static int shorten_file(struct inode *inode, loff_t new_size)
33311 +{
33312 +       int result;
33313 +       struct page *page;
33314 +       int padd_from;
33315 +       unsigned long index;
33316 +       struct unix_file_info *uf_info;
33317 +
33318 +       /*
33319 +        * all items of ordinary reiser4 file are grouped together. That is why
33320 +        * we can use reiser4_cut_tree. Plan B files (for instance) can not be
33321 +        * truncated that simply
33322 +        */
33323 +       result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
33324 +                               get_key_offset(reiser4_max_key()),
33325 +                               reiser4_update_file_size);
33326 +       if (result)
33327 +               return result;
33328 +
33329 +       uf_info = unix_file_inode_data(inode);
33330 +       assert("vs-1105", new_size == inode->i_size);
33331 +       if (new_size == 0) {
33332 +               uf_info->container = UF_CONTAINER_EMPTY;
33333 +               return 0;
33334 +       }
33335 +
33336 +       result = find_file_state(inode, uf_info);
33337 +       if (result)
33338 +               return result;
33339 +       if (uf_info->container == UF_CONTAINER_TAILS)
33340 +               /*
33341 +                * No need to worry about zeroing last page after new file
33342 +                * end
33343 +                */
33344 +               return 0;
33345 +
33346 +       padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
33347 +       if (!padd_from)
33348 +               /* file is truncated to page boundary */
33349 +               return 0;
33350 +
33351 +       result = reserve_partial_page(reiser4_tree_by_inode(inode));
33352 +       if (result) {
33353 +               reiser4_release_reserved(inode->i_sb);
33354 +               return result;
33355 +       }
33356 +
33357 +       /* last page is partially truncated - zero its content */
33358 +       index = (inode->i_size >> PAGE_CACHE_SHIFT);
33359 +       page = read_mapping_page(inode->i_mapping, index, NULL);
33360 +       if (IS_ERR(page)) {
33361 +               /*
33362 +                * the below does up(sbinfo->delete_mutex). Do not get
33363 +                * confused
33364 +                */
33365 +               reiser4_release_reserved(inode->i_sb);
33366 +               if (likely(PTR_ERR(page) == -EINVAL)) {
33367 +                       /* looks like file is built of tail items */
33368 +                       return 0;
33369 +               }
33370 +               return PTR_ERR(page);
33371 +       }
33372 +       wait_on_page_locked(page);
33373 +       if (!PageUptodate(page)) {
33374 +               page_cache_release(page);
33375 +               /*
33376 +                * the below does up(sbinfo->delete_mutex). Do not get
33377 +                * confused
33378 +                */
33379 +               reiser4_release_reserved(inode->i_sb);
33380 +               return RETERR(-EIO);
33381 +       }
33382 +
33383 +       /*
33384 +        * if page corresponds to hole extent unit - unallocated one will be
33385 +        * created here. This is not necessary
33386 +        */
33387 +       result = find_or_create_extent(page);
33388 +
33389 +       /*
33390 +        * FIXME: cut_file_items has already updated inode. Probably it would
33391 +        * be better to update it here when file is really truncated
33392 +        */
33393 +       if (result) {
33394 +               page_cache_release(page);
33395 +               /*
33396 +                * the below does up(sbinfo->delete_mutex). Do not get
33397 +                * confused
33398 +                */
33399 +               reiser4_release_reserved(inode->i_sb);
33400 +               return result;
33401 +       }
33402 +
33403 +       lock_page(page);
33404 +       assert("vs-1066", PageLocked(page));
33405 +       zero_user_segment(page, padd_from, PAGE_CACHE_SIZE);
33406 +       unlock_page(page);
33407 +       page_cache_release(page);
33408 +       /* the below does up(sbinfo->delete_mutex). Do not get confused */
33409 +       reiser4_release_reserved(inode->i_sb);
33410 +       return 0;
33411 +}
33412 +
33413 +/**
33414 + * should_have_notail - check whether file must be stored in unformatted nodes
33415 + * @uf_info: unix file info; its tplug (formatting plugin) is consulted
33416 + * @new_size: file size to ask the formatting plugin about
33417 + *
33418 + * Returns 1 if no formatting plugin is set, otherwise the negation of
33419 + * tplug->have_tail(): 0 means tail items, non-zero means unformatted nodes.
33420 + */
33421 +static int should_have_notail(const struct unix_file_info *uf_info, loff_t new_size)
33422 +{
33423 +       if (!uf_info->tplug)
33424 +               return 1;
33425 +       return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
33426 +                                         new_size);
33427 +
33428 +}
33429 +
33430 +/**
33431 + * truncate_file_body - change length of file
33432 + * @inode: inode of file
33433 + * @attr: attributes to set; attr->ia_size is the new file length
33434 + *
33435 + * Adjusts items file @inode is built of to match the new length. It may cut
33436 + * items or add them to represent a hole at the end of file. The caller has to
33437 + * obtain exclusive access to the file.
33438 + */
33439 +static int truncate_file_body(struct inode *inode, struct iattr *attr)
33440 +{
33441 +       int result;
33442 +       loff_t new_size = attr->ia_size;
33443 +
33444 +       if (inode->i_size < new_size) {
33445 +               /* expanding truncate */
33446 +               struct unix_file_info *uf_info = unix_file_inode_data(inode);
33447 +
33448 +               result = find_file_state(inode, uf_info);
33449 +               if (result)
33450 +                       return result;
33451 +
33452 +               if (should_have_notail(uf_info, new_size)) {
33453 +                       /*
33454 +                        * file of size @new_size has to be built of
33455 +                        * extents. If it is built of tails - convert to
33456 +                        * extents
33457 +                        */
33458 +                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
33459 +                               /*
33460 +                                * if file is being converted by another process
33461 +                                * - wait, dropping exclusive access meanwhile
33462 +                                */
33463 +                               while (1) {
33464 +                                       if (reiser4_inode_get_flag(inode,
33465 +                                                                  REISER4_PART_IN_CONV)) {
33466 +                                               drop_exclusive_access(uf_info);
33467 +                                               schedule();
33468 +                                               get_exclusive_access(uf_info);
33469 +                                               continue;
33470 +                                       }
33471 +                                       break;
33472 +                               }
33473 +
33474 +                               if (uf_info->container ==  UF_CONTAINER_TAILS) {
33475 +                                       result = tail2extent(uf_info);
33476 +                                       if (result)
33477 +                                               return result;
33478 +                               }
33479 +                       }
33480 +                       result = reiser4_write_extent(NULL, inode, NULL,
33481 +                                                     0, &new_size);
33482 +                       if (result)
33483 +                               return result;
33484 +                       uf_info->container = UF_CONTAINER_EXTENTS;
33485 +               } else {
33486 +                       if (uf_info->container ==  UF_CONTAINER_EXTENTS) {
33487 +                               result = reiser4_write_extent(NULL, inode, NULL,
33488 +                                                             0, &new_size);
33489 +                               if (result)
33490 +                                       return result;
33491 +                       } else {
33492 +                               result = reiser4_write_tail(NULL, inode, NULL,
33493 +                                                           0, &new_size);
33494 +                               if (result)
33495 +                                       return result;
33496 +                               uf_info->container = UF_CONTAINER_TAILS;
33497 +                       }
33498 +               }
33499 +               BUG_ON(result > 0);
33500 +               result = reiser4_update_file_size(inode, new_size, 1);
33501 +               BUG_ON(result != 0); /* NOTE(review): sd update failure is fatal here */
33502 +       } else
33503 +               result = shorten_file(inode, new_size);
33504 +       return result;
33505 +}
33506 +
33507 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
33508 +
33509 +/**
33510 + * load_file_hint - copy hint from struct file to local variable
33511 + * @file: file to get hint from, may be NULL
33512 + * @hint: structure to fill
33513 + *
33514 + * Reiser4 specific portion of struct file may contain information (hint)
33515 + * stored on exiting from previous read or write: seal of znode and coord
33516 + * within that znode where previous read or write stopped. It is copied to
33517 + * @hint if the seal is set, otherwise @hint is zero-initialized. Returns 0,
33518 + * or the error from reiser4_get_file_fsdata().
33519 + */
33520 +int load_file_hint(struct file *file, hint_t *hint)
33521 +{
33522 +       reiser4_file_fsdata *fsdata;
33523 +
33524 +       if (file) {
33525 +               fsdata = reiser4_get_file_fsdata(file);
33526 +               if (IS_ERR(fsdata))
33527 +                       return PTR_ERR(fsdata);
33528 +
33529 +               spin_lock_inode(file->f_dentry->d_inode);
33530 +               if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
33531 +                       *hint = fsdata->reg.hint;
33532 +                       init_lh(&hint->lh);
33533 +                       hint->ext_coord.lh = &hint->lh;
33534 +                       spin_unlock_inode(file->f_dentry->d_inode);
33535 +                       /*
33536 +                        * force re-validation of the coord on the first
33537 +                        * iteration of the read/write loop.
33538 +                        */
33539 +                       hint->ext_coord.valid = 0;
33540 +                       assert("nikita-19892", coords_equal(&hint->seal.coord1,
33541 +                                                           &hint->ext_coord.
33542 +                                                           coord));
33543 +                       return 0;
33544 +               }
33545 +               memset(&fsdata->reg.hint, 0, sizeof(hint_t));
33546 +               spin_unlock_inode(file->f_dentry->d_inode);
33547 +       }
33548 +       hint_init_zero(hint);
33549 +       return 0;
33550 +}
33551 +
33552 +/**
33553 + * save_file_hint - copy hint to reiser4 private struct file's part
33554 + * @file: file to save hint in, may be NULL
33555 + * @hint: hint to save, must not be NULL
33556 + *
33557 + * Copies @hint to reiser4 private part of struct file to speed up future
33558 + * accesses. Does nothing if @file is NULL or the seal of @hint is not set.
33559 + */
33560 +void save_file_hint(struct file *file, const hint_t *hint)
33561 +{
33562 +       reiser4_file_fsdata *fsdata;
33563 +
33564 +       assert("edward-1337", hint != NULL);
33565 +
33566 +       if (!file || !reiser4_seal_is_set(&hint->seal))
33567 +               return;
33568 +       fsdata = reiser4_get_file_fsdata(file);
33569 +       assert("vs-965", !IS_ERR(fsdata));
33570 +       assert("nikita-19891",
33571 +              coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
33572 +       assert("vs-30", hint->lh.owner == NULL);
33573 +       spin_lock_inode(file->f_dentry->d_inode);
33574 +       fsdata->reg.hint = *hint;
33575 +       spin_unlock_inode(file->f_dentry->d_inode);
33576 +       return;
33577 +}
33578 +/* Invalidate @hint: mark ext_coord invalid, call seal_done and done_lh on it. */
33579 +void reiser4_unset_hint(hint_t * hint)
33580 +{
33581 +       assert("vs-1315", hint);
33582 +       hint->ext_coord.valid = 0;
33583 +       reiser4_seal_done(&hint->seal);
33584 +       done_lh(&hint->lh);
33585 +}
33586 +
33587 +/* Seal the coord of @hint with @key and record key offset and @mode in @hint.
33588 +   The coord must already be set properly (checked in debug mode only) */
33589 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
33590 +                     znode_lock_mode mode)
33591 +{
33592 +       ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
33593 +       assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
33594 +
33595 +       reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
33596 +       hint->offset = get_key_offset(key);
33597 +       hint->mode = mode;
33598 +       done_lh(&hint->lh);
33599 +}
33600 +/* Returns the result of reiser4_seal_is_set() for the seal of @hint. */
33601 +int hint_is_set(const hint_t * hint)
33602 +{
33603 +       return reiser4_seal_is_set(&hint->seal);
33604 +}
33605 +/* Debug-only helper: compares locality, type, band, ordering and objectid. */
33606 +#if REISER4_DEBUG
33607 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
33608 +{
33609 +       return (get_key_locality(k1) == get_key_locality(k2) &&
33610 +               get_key_type(k1) == get_key_type(k2) &&
33611 +               get_key_band(k1) == get_key_band(k2) &&
33612 +               get_key_ordering(k1) == get_key_ordering(k2) &&
33613 +               get_key_objectid(k1) == get_key_objectid(k2));
33614 +}
33615 +#endif /* REISER4_DEBUG */
33616 +/* Validate seal of @hint for @key and @lock_mode; -E_REPEAT if hint is unusable. */
33617 +static int
33618 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
33619 +             znode_lock_mode lock_mode)
33620 +{
33621 +       if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
33622 +               /* hint either not set or set by different operation */
33623 +               return RETERR(-E_REPEAT);
33624 +
33625 +       assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
33626 +
33627 +       if (check_key && get_key_offset(key) != hint->offset)
33628 +               /* hint is set for different key */
33629 +               return RETERR(-E_REPEAT);
33630 +
33631 +       assert("vs-31", hint->ext_coord.lh == &hint->lh);
33632 +       return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
33633 +                                    hint->ext_coord.lh, lock_mode,
33634 +                                    ZNODE_LOCK_LOPRI);
33635 +}
33636 +
33637 +/**
33638 + * find_or_create_extent - get jnode of @page; if it has no block number, call
33639 + * reiser4_update_extent() (creates unallocated extent if there is none yet),
33640 + * otherwise capture the jnode and make it dirty. Returns 0 or an error code.
33641 + */
33642 +int find_or_create_extent(struct page *page)
33643 +{
33644 +       int result;
33645 +       struct inode *inode;
33646 +       int plugged_hole;
33647 +
33648 +       jnode *node;
33649 +
33650 +       assert("vs-1065", page->mapping && page->mapping->host);
33651 +       inode = page->mapping->host;
33652 +
33653 +       lock_page(page);
33654 +       node = jnode_of_page(page);
33655 +       if (IS_ERR(node)) {
33656 +               unlock_page(page);
33657 +               return PTR_ERR(node);
33658 +       }
33659 +       JF_SET(node, JNODE_WRITE_PREPARED);
33660 +       unlock_page(page);
33661 +
33662 +       if (node->blocknr == 0) {
33663 +               plugged_hole = 0;
33664 +               result = reiser4_update_extent(inode, node, page_offset(page),
33665 +                                              &plugged_hole);
33666 +               if (result) {
33667 +                       JF_CLR(node, JNODE_WRITE_PREPARED);
33668 +                       jput(node);
33669 +                       warning("edward-1549",
33670 +                               "reiser4_update_extent failed: %d", result);
33671 +                       return result;
33672 +               }
33673 +               if (plugged_hole)
33674 +                       reiser4_update_sd(inode); /* NOTE(review): result ignored */
33675 +       } else {
33676 +               spin_lock_jnode(node);
33677 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
33678 +               BUG_ON(result != 0);
33679 +               jnode_make_dirty_locked(node);
33680 +               spin_unlock_jnode(node);
33681 +       }
33682 +
33683 +       BUG_ON(node->atom == NULL);
33684 +       JF_CLR(node, JNODE_WRITE_PREPARED);
33685 +       jput(node); /* NOTE(review): node is still used below */
33686 +
33687 +       if (get_current_context()->entd) {
33688 +               entd_context *ent = get_entd_context(node->tree->super);
33689 +
33690 +               if (ent->cur_request->page == page)
33691 +                       ent->cur_request->node = node;
33692 +       }
33693 +       return 0;
33694 +}
33695 +
33696 +/**
33697 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
33698 + * @inode: inode to check
33699 + *
33700 + * Returns true if inode's mapping has dirty pages which do not belong to any
33701 + * atom, i.e. pages tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
33702 + * tree. Only the page tree is checked here (under tree_lock); jnodes tagged
33703 + * EFLUSH_TAG_ANONYMOUS are not looked at by this function.
33704 + */
33705 +static int has_anonymous_pages(struct inode *inode)
33706 +{
33707 +       int result;
33708 +
33709 +       spin_lock_irq(&inode->i_mapping->tree_lock);
33710 +       result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
33711 +       spin_unlock_irq(&inode->i_mapping->tree_lock);
33712 +       return result;
33713 +}
33714 +
33715 +/**
33716 + * capture_page_and_create_extent - reserve space and capture page
33717 + * @page: page to be captured
33718 + *
33719 + * Grabs space for extent creation and stat data update, then calls
33720 + * find_or_create_extent(). On failure sets PageError and returns the error.
33721 + */
33722 +static int capture_page_and_create_extent(struct page *page)
33723 +{
33724 +       int result;
33725 +       struct inode *inode;
33726 +
33727 +       assert("vs-1084", page->mapping && page->mapping->host);
33728 +       inode = page->mapping->host;
33729 +       assert("vs-1139",
33730 +              unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
33731 +       /* page belongs to file */
33732 +       assert("vs-1393",
33733 +              inode->i_size > page_offset(page));
33734 +
33735 +       /* page capture may require extent creation (if it does not exist yet)
33736 +          and stat data's update (number of blocks changes on extent
33737 +          creation) */
33738 +       grab_space_enable();
33739 +       result = reiser4_grab_space(2 * estimate_one_insert_into_item
33740 +                                   (reiser4_tree_by_inode(inode)),
33741 +                                   BA_CAN_COMMIT);
33742 +       if (likely(!result))
33743 +               result = find_or_create_extent(page);
33744 +
33745 +       if (result != 0)
33746 +               SetPageError(page);
33747 +       return result;
33748 +}
33749 +
33750 +/* Implementation of the commit_write method of struct address_space_operations
33751 +   for unix file plugin. @from and @to are not used; page is re-locked on exit */
33752 +int
33753 +commit_write_unix_file(struct file *file, struct page *page,
33754 +                      unsigned from, unsigned to)
33755 +{
33756 +       reiser4_context *ctx;
33757 +       struct inode *inode;
33758 +       int result;
33759 +
33760 +       assert("umka-3101", file != NULL);
33761 +       assert("umka-3102", page != NULL);
33762 +       assert("umka-3093", PageLocked(page));
33763 +
33764 +       SetPageUptodate(page);
33765 +
33766 +       inode = page->mapping->host;
33767 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
33768 +       if (IS_ERR(ctx))
33769 +               return PTR_ERR(ctx);
33770 +       page_cache_get(page); /* presumably pins page while it is unlocked */
33771 +       unlock_page(page); /* find_or_create_extent() locks the page itself */
33772 +       result = capture_page_and_create_extent(page);
33773 +       lock_page(page);
33774 +       page_cache_release(page);
33775 +
33776 +       /* don't commit transaction under inode semaphore */
33777 +       context_set_commit_async(ctx);
33778 +       reiser4_exit_context(ctx);
33779 +       return result;
33780 +}
33781 +
33782 +/*
33783 + * Support for "anonymous" pages and jnodes.
33784 + *
33785 + * When file is write-accessed through mmap pages can be dirtied from the user
33786 + * level. In this case kernel is not notified until one of following happens:
33787 + *
33788 + *     (1) msync()
33789 + *
33790 + *     (2) truncate() (either explicit or through unlink)
33791 + *
33792 + *     (3) VM scanner starts reclaiming mapped pages, dirtying them before
33793 + *     starting write-back.
33794 + *
33795 + * As a result of (3) ->writepage may be called on a dirty page without
33796 + * jnode. Such page is called "anonymous" in reiser4. Certain work-loads
33797 + * (iozone) generate huge number of anonymous pages. Emergency flush handles
33798 + * this situation by creating jnode for anonymous page, starting IO on the
33799 + * page, and marking jnode with JNODE_KEEPME bit so that it's not thrown out of
33800 + * memory. Such jnode is also called anonymous.
33801 + *
33802 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
33803 + * tree. This is done by capture_anonymous_*() functions below.
33804 + */
33805 +
33806 +/**
33807 + * capture_anonymous_page - involve page into transaction
33808 + * @page: page to deal with
33809 + *
33810 + * Makes sure @page has metadata in the tree, creates its jnode and captures
33811 + * it. Returns 1 on success, 0 for a page under writeback, else the error.
33812 + */
33813 +static int capture_anonymous_page(struct page *page)
33814 +{
33815 +       int result;
33816 +
33817 +       if (PageWriteback(page))
33818 +               /* FIXME: do nothing? */
33819 +               return 0;
33820 +
33821 +       result = capture_page_and_create_extent(page);
33822 +       if (result == 0) {
33823 +               result = 1;
33824 +       } else
33825 +               warning("nikita-3329",
33826 +                               "Cannot capture anon page: %i", result);
33827 +
33828 +       return result;
33829 +}
33830 +
33831 +/**
33832 + * capture_anonymous_pages - find and capture pages dirtied via mmap
33833 + * @mapping: address space where to look for pages
33834 + * @index: in: start index; out: where to continue, (pgoff_t)-1 if none found
33835 + * @to_capture: maximum number of pages to capture
33836 + *
33837 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
33838 + * captures (involves into atom) them. Returns number of captured pages or
33839 + * the error of the first failed capture (remaining pages are re-tagged).
33840 + */
33841 +static int
33842 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
33843 +                       unsigned int to_capture)
33844 +{
33845 +       int result;
33846 +       struct pagevec pvec;
33847 +       unsigned int i, count;
33848 +       int nr;
33849 +
33850 +       pagevec_init(&pvec, 0);
33851 +       count = min(pagevec_space(&pvec), to_capture);
33852 +       nr = 0;
33853 +
33854 +       /* find pages tagged MOVED */
33855 +       spin_lock_irq(&mapping->tree_lock);
33856 +       pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
33857 +                                            (void **)pvec.pages, *index, count,
33858 +                                            PAGECACHE_TAG_REISER4_MOVED);
33859 +       if (pagevec_count(&pvec) == 0) {
33860 +               /*
33861 +                * there are no pages tagged MOVED in mapping->page_tree
33862 +                * starting from *index
33863 +                */
33864 +               spin_unlock_irq(&mapping->tree_lock);
33865 +               *index = (pgoff_t)-1;
33866 +               return 0;
33867 +       }
33868 +
33869 +       /* clear MOVED tag for all found pages */
33870 +       for (i = 0; i < pagevec_count(&pvec); i++) {
33871 +               page_cache_get(pvec.pages[i]);
33872 +               radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
33873 +                                    PAGECACHE_TAG_REISER4_MOVED);
33874 +       }
33875 +       spin_unlock_irq(&mapping->tree_lock);
33876 +
33877 +
33878 +       *index = pvec.pages[i - 1]->index + 1;
33879 +
33880 +       for (i = 0; i < pagevec_count(&pvec); i++) {
33881 +               /*
33882 +                * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
33883 +                * reiser4_set_page_dirty_internal which is called when jnode is
33884 +                * captured
33885 +                */
33886 +               result = capture_anonymous_page(pvec.pages[i]);
33887 +               if (result == 1)
33888 +                       nr++;
33889 +               else {
33890 +                       if (result < 0) {
33891 +                               warning("vs-1454",
33892 +                                       "failed to capture page: "
33893 +                                       "result=%d, captured=%d)\n",
33894 +                                       result, i);
33895 +
33896 +                               /*
33897 +                                * set MOVED tag to all pages which left not
33898 +                                * captured
33899 +                                */
33900 +                               spin_lock_irq(&mapping->tree_lock);
33901 +                               for (; i < pagevec_count(&pvec); i ++) {
33902 +                                       radix_tree_tag_set(&mapping->page_tree,
33903 +                                                          pvec.pages[i]->index,
33904 +                                                          PAGECACHE_TAG_REISER4_MOVED);
33905 +                               }
33906 +                               spin_unlock_irq(&mapping->tree_lock);
33907 +
33908 +                               pagevec_release(&pvec);
33909 +                               return result;
33910 +                       } else {
33911 +                               /*
33912 +                                * result == 0. capture_anonymous_page returns
33913 +                                * 0 for Writeback-ed page. Set MOVED tag on
33914 +                                * that page
33915 +                                */
33916 +                               spin_lock_irq(&mapping->tree_lock);
33917 +                               radix_tree_tag_set(&mapping->page_tree,
33918 +                                                  pvec.pages[i]->index,
33919 +                                                  PAGECACHE_TAG_REISER4_MOVED);
33920 +                               spin_unlock_irq(&mapping->tree_lock);
33921 +                               if (i == 0)
33922 +                                       *index = pvec.pages[0]->index;
33923 +                               else
33924 +                                       *index = pvec.pages[i - 1]->index + 1;
33925 +                       }
33926 +               }
33927 +       }
33928 +       pagevec_release(&pvec);
33929 +       return nr;
33930 +}
33931 +
33932 +/**
33933 + * capture_anonymous_jnodes - find and capture anonymous jnodes
33934 + * @mapping: address space where to look for jnodes (unused here)
33935 + * @from: start index; set to @to on return
33936 + * @to: end index
33937 + * @to_capture: maximum number of jnodes to capture (unused here)
33938 + *
33939 + * This version is a stub: it does not look for jnodes tagged
33940 + * EFLUSH_TAG_ANONYMOUS, it only advances *@from to @to and returns 0, i.e.
33941 + * reports that no jnodes were captured.
33942 + */
33943 +static int
33944 +capture_anonymous_jnodes(struct address_space *mapping,
33945 +                        pgoff_t *from, pgoff_t to, int to_capture)
33946 +{
33947 +       *from = to;
33948 +       return 0;
33949 +}
33950 +
33951 +/*
33952 + * Commit atom of the jnode of @page; retried while sync returns -E_REPEAT.
33953 + */
33954 +static int sync_page(struct page *page)
33955 +{
33956 +       int result;
33957 +       do {
33958 +               jnode *node;
33959 +               txn_atom *atom;
33960 +
33961 +               lock_page(page);
33962 +               node = jprivate(page);
33963 +               if (node != NULL) {
33964 +                       spin_lock_jnode(node);
33965 +                       atom = jnode_get_atom(node);
33966 +                       spin_unlock_jnode(node);
33967 +               } else
33968 +                       atom = NULL;
33969 +               unlock_page(page);
33970 +               result = reiser4_sync_atom(atom);
33971 +       } while (result == -E_REPEAT);
33972 +       /*
33973 +        * ZAM-FIXME-HANS: document the logic of this loop, is it just to
33974 +        * handle the case where more pages get added to the atom while we are
33975 +        * syncing it?
33976 +        */
33977 +       assert("nikita-3485", ergo(result == 0,
33978 +                                  get_current_context()->trans->atom == NULL));
33979 +       return result;
33980 +}
33981 +
33982 +/*
33983 + * Commit atoms of all pages of @inode: walk the mapping's page tree by index
33984 + * and call sync_page() on each page found; stop at the first nonzero result.
33985 + */
33986 +static int sync_page_list(struct inode *inode)
33987 +{
33988 +       int result;
33989 +       struct address_space *mapping;
33990 +       unsigned long from;     /* start index for radix_tree_gang_lookup */
33991 +       unsigned int found;     /* return value for radix_tree_gang_lookup */
33992 +
33993 +       mapping = inode->i_mapping;
33994 +       from = 0;
33995 +       result = 0;
33996 +       spin_lock_irq(&mapping->tree_lock);
33997 +       while (result == 0) {
33998 +               struct page *page;
33999 +
34000 +               found =
34001 +                   radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
34002 +                                          from, 1);
34003 +               assert("edward-1550", found < 2);
34004 +               if (found == 0)
34005 +                       break;
34006 +               /**
34007 +                * page may not leave radix tree because it is protected from
34008 +                * truncating by inode->i_mutex locked by sys_fsync
34009 +                */
34010 +               page_cache_get(page);
34011 +               spin_unlock_irq(&mapping->tree_lock);
34012 +               /* tree_lock is dropped around sync_page(); the reference taken above is kept */
34013 +               from = page->index + 1; /* next lookup starts right after this page */
34014 +
34015 +               result = sync_page(page);
34016 +
34017 +               page_cache_release(page);
34018 +               spin_lock_irq(&mapping->tree_lock);
34019 +       }
34020 +
34021 +       spin_unlock_irq(&mapping->tree_lock);
34022 +       return result;
34023 +}
34024 +/* Commit the atoms holding @inode's data; the method depends on the file's container. */
34025 +static int commit_file_atoms(struct inode *inode)
34026 +{
34027 +       int result;
34028 +       struct unix_file_info *uf_info;
34029 +
34030 +       uf_info = unix_file_inode_data(inode);
34031 +
34032 +       get_exclusive_access(uf_info);
34033 +       /*
34034 +        * find what items file is made from
34035 +        */
34036 +       result = find_file_state(inode, uf_info);
34037 +       drop_exclusive_access(uf_info);
34038 +       if (result != 0)
34039 +               return result;
34040 +
34041 +       /*
34042 +        * file state cannot change because we are under ->i_mutex
34043 +        */
34044 +       switch (uf_info->container) {
34045 +       case UF_CONTAINER_EXTENTS:
34046 +               /* find_file_state might open or join an atom */
34047 +               reiser4_txn_restart_current();
34048 +               result =
34049 +                   /*
34050 +                    * when we are called by
34051 +                    * filemap_fdatawrite->
34052 +                    *    do_writepages()->
34053 +                    *       reiser4_writepages()
34054 +                    *
34055 +                    * inode->i_mapping->dirty_pages are spliced into
34056 +                    * ->io_pages, leaving ->dirty_pages dirty.
34057 +                    *
34058 +                    * When we are called from
34059 +                    * reiser4_fsync()->sync_unix_file(), we have to
34060 +                    * commit atoms of all pages on the ->dirty_list.
34061 +                    *
34062 +                    * So for simplicity we just commit ->io_pages and
34063 +                    * ->dirty_pages.
34064 +                    */
34065 +                   sync_page_list(inode);
34066 +               break;
34067 +       case UF_CONTAINER_TAILS:
34068 +               /*
34069 +                * NOTE-NIKITA probably we can be smarter for tails. For now
34070 +                * just commit all existing atoms.
34071 +                */
34072 +               result = txnmgr_force_commit_all(inode->i_sb, 0);
34073 +               break;
34074 +       case UF_CONTAINER_EMPTY:
34075 +               result = 0;
34076 +               break;
34077 +       case UF_CONTAINER_UNKNOWN:
34078 +       default:
34079 +               result = -EIO;
34080 +               break;
34081 +       }
34082 +
34083 +       /*
34084 +        * commit current transaction: there can be captured nodes from
34085 +        * find_file_state() and finish_conversion().
34086 +        */
34087 +       reiser4_txn_restart_current();
34088 +       return result;
34089 +}
34090 +
34091 +/**
34092 + * writepages_unix_file - writepages of struct address_space_operations
34093 + * @mapping: address space of the file whose anonymous pages are to be captured
34094 + * @wbc: writeback control (range_start, sync_mode and nr_to_write are used)
34095 + *
34096 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
34097 + * pages which are dirtied via mmapping. Anonymous jnodes are ones which were
34098 + * created by reiser4_writepage. With WB_SYNC_ALL the file's atoms are also committed.
34099 + */
34100 +int writepages_unix_file(struct address_space *mapping,
34101 +                    struct writeback_control *wbc)
34102 +{
34103 +       int result;
34104 +       struct unix_file_info *uf_info;
34105 +       pgoff_t pindex, jindex, nr_pages;
34106 +       long to_capture;
34107 +       struct inode *inode;
34108 +
34109 +       inode = mapping->host;
34110 +       if (!has_anonymous_pages(inode)) {
34111 +               result = 0;
34112 +               goto end;
34113 +       }
34114 +       jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
34115 +       result = 0;
34116 +       nr_pages = size_in_pages(i_size_read(inode));
34117 +
34118 +       uf_info = unix_file_inode_data(inode);
34119 +
34120 +       do {
34121 +               reiser4_context *ctx;
34122 +
34123 +               if (wbc->sync_mode != WB_SYNC_ALL)
34124 +                       to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
34125 +               else
34126 +                       to_capture = CAPTURE_APAGE_BURST;
34127 +
34128 +               ctx = reiser4_init_context(inode->i_sb);
34129 +               if (IS_ERR(ctx)) {
34130 +                       result = PTR_ERR(ctx);
34131 +                       break;
34132 +               }
34133 +               /* avoid recursive calls to ->sync_inodes */
34134 +               ctx->nobalance = 1;
34135 +               assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
34136 +               assert("edward-1551", LOCK_CNT_NIL(inode_sem_w));
34137 +               assert("edward-1552", LOCK_CNT_NIL(inode_sem_r));
34138 +
34139 +               reiser4_txn_restart_current();
34140 +
34141 +               /* we have to get nonexclusive access to the file */
34142 +               if (get_current_context()->entd) {
34143 +                       /*
34144 +                        * use nonblocking version of nonexclusive_access to
34145 +                        * avoid deadlock which might look like the following:
34146 +                        * process P1 holds NEA on file F1 and called entd to
34147 +                        * reclaim some memory. Entd works for P1 and is going
34148 +                        * to capture pages of file F2. To do that entd has to
34149 +                        * get NEA to F2. F2 is held by process P2 which also
34150 +                        * called entd. But entd is serving P1 at the moment
34151 +                        * and P2 has to wait. Process P3 is trying to get EA to
34152 +                        * file F2. Existence of pending EA request to file F2
34153 +                        * makes it impossible for entd to get NEA to file
34154 +                        * F2. None of these processes can continue. Using
34155 +                        * nonblocking version of getting NEA is supposed to
34156 +                        * avoid this deadlock.
34157 +                        */
34158 +                       if (try_to_get_nonexclusive_access(uf_info) == 0) {
34159 +                               result = RETERR(-EBUSY);
34160 +                               reiser4_exit_context(ctx);
34161 +                               break;
34162 +                       }
34163 +               } else
34164 +                       get_nonexclusive_access(uf_info);
34165 +
34166 +               while (to_capture > 0) {
34167 +                       pgoff_t start;
34168 +
34169 +                       assert("vs-1727", jindex <= pindex);
34170 +                       if (pindex == jindex) {
34171 +                               start = pindex;
34172 +                               result =
34173 +                                   capture_anonymous_pages(inode->i_mapping,
34174 +                                                           &pindex,
34175 +                                                           to_capture);
34176 +                               if (result <= 0)
34177 +                                       break;
34178 +                               to_capture -= result;
34179 +                               wbc->nr_to_write -= result;
34180 +                               if (start + result == pindex) {
34181 +                                       jindex = pindex;
34182 +                                       continue;
34183 +                               }
34184 +                               if (to_capture <= 0)
34185 +                                       break;
34186 +                       }
34187 +                       /* deal with anonymous jnodes between jindex and pindex */
34188 +                       result =
34189 +                           capture_anonymous_jnodes(inode->i_mapping, &jindex,
34190 +                                                    pindex, to_capture);
34191 +                       if (result < 0)
34192 +                               break;
34193 +                       to_capture -= result;
34194 +                       get_current_context()->nr_captured += result;
34195 +
34196 +                       if (jindex == (pgoff_t) - 1) {
34197 +                               assert("vs-1728", pindex == (pgoff_t) - 1);
34198 +                               break;
34199 +                       }
34200 +               }
34201 +               if (to_capture <= 0)
34202 +                       /* there may be left more pages */
34203 +                       __mark_inode_dirty(inode, I_DIRTY_PAGES);
34204 +
34205 +               drop_nonexclusive_access(uf_info);
34206 +               if (result < 0) {
34207 +                       /* error happened */
34208 +                       reiser4_exit_context(ctx);
34209 +                       return result;
34210 +               }
34211 +               if (wbc->sync_mode != WB_SYNC_ALL) {
34212 +                       reiser4_exit_context(ctx);
34213 +                       return 0;
34214 +               }
34215 +               result = commit_file_atoms(inode);
34216 +               reiser4_exit_context(ctx);
34217 +               if (pindex >= nr_pages && jindex == pindex)
34218 +                       break;
34219 +       } while (1);
34220 +
34221 +      end:
34222 +       if (is_in_reiser4_context()) {
34223 +               if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34224 +                       /*
34225 +                        * there are already pages to flush, flush them out, do
34226 +                        * not delay until end of reiser4_sync_inodes
34227 +                        */
34228 +                       reiser4_writeout(inode->i_sb, wbc);
34229 +                       get_current_context()->nr_captured = 0;
34230 +               }
34231 +       }
34232 +       return result;
34233 +}
34234 +
34235 +/**
34236 + * readpage_unix_file - readpage of struct address_space_operations
34237 + * @file: file the page belongs to; its hint is loaded and saved here
34238 + * @page: locked page which is not up to date
34239 + *
34240 + * Compose a key and search for item containing information about @page
34241 + * data. If item is found - its readpage method is called. A page lying
34242 + * at or beyond i_size is zero-filled without a tree search. */
34243 +int readpage_unix_file(struct file *file, struct page *page)
34244 +{
34245 +       reiser4_context *ctx;
34246 +       int result;
34247 +       struct inode *inode;
34248 +       reiser4_key key;
34249 +       item_plugin *iplug;
34250 +       hint_t *hint;
34251 +       lock_handle *lh;
34252 +       coord_t *coord;
34253 +
34254 +       assert("vs-1062", PageLocked(page));
34255 +       assert("vs-976", !PageUptodate(page));
34256 +       assert("vs-1061", page->mapping && page->mapping->host);
34257 +
34258 +       if (page->mapping->host->i_size <= page_offset(page)) {
34259 +               /* page is out of file */
34260 +               zero_user(page, 0, PAGE_CACHE_SIZE);
34261 +               SetPageUptodate(page);
34262 +               unlock_page(page);
34263 +               return 0;
34264 +       }
34265 +
34266 +       inode = page->mapping->host;
34267 +       ctx = reiser4_init_context(inode->i_sb);
34268 +       if (IS_ERR(ctx)) {
34269 +               unlock_page(page);
34270 +               return PTR_ERR(ctx);
34271 +       }
34272 +
34273 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34274 +       if (hint == NULL) {
34275 +               unlock_page(page);
34276 +               reiser4_exit_context(ctx);
34277 +               return RETERR(-ENOMEM);
34278 +       }
34279 +
34280 +       result = load_file_hint(file, hint);
34281 +       if (result) {
34282 +               kfree(hint);
34283 +               unlock_page(page);
34284 +               reiser4_exit_context(ctx);
34285 +               return result;
34286 +       }
34287 +       lh = &hint->lh;
34288 +
34289 +       /* get key of first byte of the page */
34290 +       key_by_inode_and_offset_common(inode, page_offset(page), &key);
34291 +
34292 +       /* look for file metadata corresponding to first byte of page */
34293 +       page_cache_get(page);
34294 +       unlock_page(page);
34295 +       result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
34296 +       lock_page(page);
34297 +       page_cache_release(page);
34298 +
34299 +       if (page->mapping == NULL) {
34300 +               /*
34301 +                * readpage allows truncate to run concurrently. Page was
34302 +                * truncated while it was not locked
34303 +                */
34304 +               done_lh(lh);
34305 +               kfree(hint);
34306 +               unlock_page(page);
34307 +               reiser4_txn_restart(ctx);
34308 +               reiser4_exit_context(ctx);
34309 +               return -EINVAL;
34310 +       }
34311 +
34312 +       if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
34313 +               if (result == CBK_COORD_FOUND &&
34314 +                   hint->ext_coord.coord.between != AT_UNIT)
34315 +                       /* file is truncated */
34316 +                       result = -EINVAL;
34317 +               done_lh(lh);
34318 +               kfree(hint);
34319 +               unlock_page(page);
34320 +               reiser4_txn_restart(ctx);
34321 +               reiser4_exit_context(ctx);
34322 +               return result;
34323 +       }
34324 +
34325 +       /*
34326 +        * item corresponding to page is found. It can not be removed because
34327 +        * znode lock is held
34328 +        */
34329 +       if (PageUptodate(page)) {
34330 +               done_lh(lh);
34331 +               kfree(hint);
34332 +               unlock_page(page);
34333 +               reiser4_txn_restart(ctx);
34334 +               reiser4_exit_context(ctx);
34335 +               return 0;
34336 +       }
34337 +
34338 +       coord = &hint->ext_coord.coord;
34339 +       result = zload(coord->node);
34340 +       if (result) {
34341 +               done_lh(lh);
34342 +               kfree(hint);
34343 +               unlock_page(page);
34344 +               reiser4_txn_restart(ctx);
34345 +               reiser4_exit_context(ctx);
34346 +               return result;
34347 +       }
34348 +
34349 +       validate_extended_coord(&hint->ext_coord, page_offset(page));
34350 +
34351 +       if (!coord_is_existing_unit(coord)) {
34352 +               /* this indicates corruption */
34353 +               warning("vs-280",
34354 +                       "Looking for page %lu of file %llu (size %lli). "
34355 +                       "No file items found (%d). File is corrupted?\n",
34356 +                       page->index, (unsigned long long)get_inode_oid(inode),
34357 +                       inode->i_size, result);
34358 +               zrelse(coord->node);
34359 +               done_lh(lh);
34360 +               kfree(hint);
34361 +               unlock_page(page);
34362 +               reiser4_txn_restart(ctx);
34363 +               reiser4_exit_context(ctx);
34364 +               return RETERR(-EIO);
34365 +       }
34366 +
34367 +       /*
34368 +        * call readpage method of the plugin of the found item; fail with
34369 +        * -EINVAL if the plugin has none
34370 +        */
34371 +       iplug = item_plugin_by_coord(coord);
34372 +       if (iplug->s.file.readpage)
34373 +               result = iplug->s.file.readpage(coord, page);
34374 +       else
34375 +               result = RETERR(-EINVAL);
34376 +
34377 +       if (!result) {
34378 +               set_key_offset(&key,
34379 +                              (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
34380 +               /* FIXME should call reiser4_set_hint() */
34381 +               reiser4_unset_hint(hint);
34382 +       } else {
34383 +               unlock_page(page);
34384 +               reiser4_unset_hint(hint);
34385 +       }
34386 +       assert("vs-979",
34387 +              ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
34388 +       assert("vs-9791", ergo(result != 0, !PageLocked(page)));
34389 +
34390 +       zrelse(coord->node);
34391 +       done_lh(lh);
34392 +
34393 +       save_file_hint(file, hint);
34394 +       kfree(hint);
34395 +
34396 +       /*
34397 +        * FIXME: explain why it is needed. HINT: page allocation in write can
34398 +        * not be done when atom is not NULL because reiser4_writepage can not
34399 +        * kick entd and have to eflush
34400 +        */
34401 +       reiser4_txn_restart(ctx);
34402 +       reiser4_exit_context(ctx);
34403 +       return result;
34404 +}
34405 +
34406 +struct uf_readpages_context {
34407 +       lock_handle lh; /* lock handle kept between uf_readpages_filler() calls */
34408 +       coord_t coord;  /* coord found by the last tree search in the filler */
34409 +};
34410 +
34411 +/* A callback function for readpages_unix_file/read_cache_pages.
34412 + * If the item found on the twig level is not an extent, return error (-EIO).
34413 + *
34414 + * @data -- a pointer to struct uf_readpages_context object,
34415 + *            to save the twig lock and the coord between
34416 + *            read_cache_page iterations.
34417 + * @page -- page to start read for; it arrives locked.
34418 + */
34419 +static int uf_readpages_filler(void * data, struct page * page)
34420 +{
34421 +       struct uf_readpages_context *rc = data;
34422 +       jnode * node;
34423 +       int ret = 0;
34424 +       reiser4_extent *ext;
34425 +       __u64 ext_index;
34426 +       int cbk_done = 0;
34427 +       struct address_space * mapping = page->mapping;
34428 +
34429 +       if (PageUptodate(page)) {
34430 +               unlock_page(page);
34431 +               return 0;
34432 +       }
34433 +       page_cache_get(page);
34434 +
34435 +       if (rc->lh.node == 0) {
34436 +               /* no twig lock  - have to do tree search. */
34437 +               reiser4_key key;
34438 +       repeat:
34439 +               unlock_page(page);
34440 +               key_by_inode_and_offset_common(
34441 +                       mapping->host, page_offset(page), &key);
34442 +               ret = coord_by_key(
34443 +                       &get_super_private(mapping->host->i_sb)->tree,
34444 +                       &key, &rc->coord, &rc->lh,
34445 +                       ZNODE_READ_LOCK, FIND_EXACT,
34446 +                       TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
34447 +               if (unlikely(ret))
34448 +                       goto exit;
34449 +               lock_page(page);
34450 +               if (PageUptodate(page))
34451 +                       goto unlock;
34452 +               cbk_done = 1;
34453 +       }
34454 +       ret = zload(rc->coord.node);
34455 +       if (unlikely(ret))
34456 +               goto unlock;
34457 +       if (!coord_is_existing_item(&rc->coord) ||
34458 +           !item_is_extent(&rc->coord)) {
34459 +               zrelse(rc->coord.node);
34460 +               ret = RETERR(-EIO);
34461 +               goto unlock;
34462 +       }
34463 +       ext = extent_by_coord(&rc->coord);
34464 +       ext_index = extent_unit_index(&rc->coord);
34465 +       if (page->index < ext_index ||
34466 +           page->index >= ext_index + extent_get_width(ext)) {
34467 +               /* the page index doesn't belong to the extent unit
34468 +                  which the coord points to - release the lock and
34469 +                  repeat with tree search. */
34470 +               zrelse(rc->coord.node);
34471 +               done_lh(&rc->lh);
34472 +               /* we can be here after a CBK call only in case of
34473 +                  corruption of the tree or the tree lookup algorithm bug. */
34474 +               if (unlikely(cbk_done)) {
34475 +                       ret = RETERR(-EIO);
34476 +                       goto unlock;
34477 +               }
34478 +               goto repeat;
34479 +       }
34480 +       node = jnode_of_page(page);
34481 +       if (unlikely(IS_ERR(node))) {
34482 +               zrelse(rc->coord.node);
34483 +               ret = PTR_ERR(node);
34484 +               goto unlock;
34485 +       }
34486 +       ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
34487 +       jput(node);
34488 +       zrelse(rc->coord.node);
34489 +       if (likely(!ret))
34490 +               goto exit;
34491 + unlock:
34492 +       unlock_page(page);
34493 + exit:
34494 +       page_cache_release(page);
34495 +       return ret;
34496 +}
34497 +
34498 +/**
34499 + * readpages_unix_file - called by the readahead code; runs uf_readpages_filler
34500 + * on the pages of @pages via read_cache_pages() and returns its result
34501 + */
34502 +int readpages_unix_file(
34503 +       struct file *file, struct address_space *mapping,
34504 +       struct list_head *pages, unsigned nr_pages)
34505 +{
34506 +       reiser4_context *ctx;
34507 +       struct uf_readpages_context rc;
34508 +       int ret;
34509 +
34510 +       ctx = reiser4_init_context(mapping->host->i_sb);
34511 +       if (IS_ERR(ctx)) {
34512 +               put_pages_list(pages);
34513 +               return PTR_ERR(ctx);
34514 +       }
34515 +       init_lh(&rc.lh);
34516 +       ret = read_cache_pages(mapping, pages,  uf_readpages_filler, &rc);
34517 +       done_lh(&rc.lh);
34518 +       context_set_commit_async(ctx);
34519 +       /* close the transaction to protect further page allocation from deadlocks */
34520 +       reiser4_txn_restart(ctx);
34521 +       reiser4_exit_context(ctx);
34522 +       return ret;
34523 +}
34524 +
34525 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
34526 +                                               loff_t count UNUSED_ARG)
34527 +{
34528 +       /* Reserve space for updating the stat data item: the estimate is
34529 +          whatever estimate_update_common() returns; @count is not used */
34530 +       assert("vs-1249",
34531 +              inode_file_plugin(inode)->estimate.update ==
34532 +              estimate_update_common);
34533 +       return estimate_update_common(inode);
34534 +}
34535 +
34536 +/* this is called with nonexclusive access obtained, file's container can not change */
34537 +static ssize_t read_file(hint_t *hint, struct file *file,      /* file to read from */
34538 +                        char __user *buf,      /* address of user-space buffer */
34539 +                        size_t count,  /* number of bytes to read */
34540 +                        loff_t *off)   /* offset to read from; not advanced here */
34541 +{
34542 +       int result;
34543 +       struct inode *inode;
34544 +       flow_t flow;
34545 +       int (*read_f) (struct file *, flow_t *, hint_t *);
34546 +       coord_t *coord;
34547 +       znode *loaded;
34548 +
34549 +       inode = file->f_dentry->d_inode;
34550 +
34551 +       /* build flow */
34552 +       assert("vs-1250",
34553 +              inode_file_plugin(inode)->flow_by_inode ==
34554 +              flow_by_inode_unix_file);
34555 +       result =
34556 +           flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
34557 +                                   *off, READ_OP, &flow);
34558 +       if (unlikely(result))
34559 +               return result;
34560 +
34561 +       /* get seal and coord sealed with it from reiser4 private data
34562 +          of struct file.  The coord will tell us where our last read
34563 +          of this file finished, and the seal will help to determine
34564 +          if that location is still valid.
34565 +        */
34566 +       coord = &hint->ext_coord.coord;
34567 +       while (flow.length && result == 0) {
34568 +               result =
34569 +                       find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
34570 +               if (cbk_errored(result))
34571 +                       /* error happened */
34572 +                       break;
34573 +
34574 +               if (coord->between != AT_UNIT) {
34575 +                       /* there were no items corresponding to given offset */
34576 +                       done_lh(hint->ext_coord.lh);
34577 +                       break;
34578 +               }
34579 +
34580 +               loaded = coord->node;
34581 +               result = zload(loaded);
34582 +               if (unlikely(result)) {
34583 +                       done_lh(hint->ext_coord.lh);
34584 +                       break;
34585 +               }
34586 +
34587 +               if (hint->ext_coord.valid == 0)
34588 +                       validate_extended_coord(&hint->ext_coord,
34589 +                                               get_key_offset(&flow.key));
34590 +
34591 +               assert("vs-4", hint->ext_coord.valid == 1);
34592 +               assert("vs-33", hint->ext_coord.lh == &hint->lh);
34593 +               /* call item's read method */
34594 +               read_f = item_plugin_by_coord(coord)->s.file.read;
34595 +               result = read_f(file, &flow, hint);
34596 +               zrelse(loaded);
34597 +               done_lh(hint->ext_coord.lh);
34598 +       }
34599 +       /* number of bytes consumed from the flow, or @result if that number is zero */
34600 +       return (count - flow.length) ? (count - flow.length) : result;
34601 +}
34602 +
34603 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
34604 +
34605 +/**
34606 + * read_unix_file - read of struct file_operations
34607 + * @file: file to read from
34608 + * @buf: address of user-space buffer
34609 + * @read_amount: number of bytes to read
34610 + * @off: position in file to read from
34611 + *
34612 + * vfs read method of the unix file plugin: extent files are read by do_sync_read(),
34613 + * tail (or partially converted) files by read_unix_file_container_tails().
34614 + */
34615 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
34616 +                      loff_t *off)
34617 +{
34618 +       reiser4_context *ctx;
34619 +       ssize_t result;
34620 +       struct inode *inode;
34621 +       struct unix_file_info *uf_info;
34622 +
34623 +       if (unlikely(read_amount == 0))
34624 +               return 0;
34625 +
34626 +       assert("umka-072", file != NULL);
34627 +       assert("umka-074", off != NULL);
34628 +       inode = file->f_dentry->d_inode;
34629 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34630 +
34631 +       ctx = reiser4_init_context(inode->i_sb);
34632 +       if (IS_ERR(ctx))
34633 +               return PTR_ERR(ctx);
34634 +       uf_info = unix_file_inode_data(inode);
34635 +       if (uf_info->container == UF_CONTAINER_UNKNOWN) {
34636 +               get_exclusive_access(uf_info);
34637 +               result = find_file_state(inode, uf_info);
34638 +               if (unlikely(result != 0))
34639 +                       goto out;
34640 +       } else
34641 +               get_nonexclusive_access(uf_info);
34642 +       result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
34643 +                                         BA_CAN_COMMIT);
34644 +       if (unlikely(result != 0))
34645 +               goto out;
34646 +       if (uf_info->container == UF_CONTAINER_EXTENTS){
34647 +               result = do_sync_read(file, buf, read_amount, off);
34648 +       } else if (uf_info->container == UF_CONTAINER_TAILS ||
34649 +                  reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
34650 +                  reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34651 +               result = read_unix_file_container_tails(file, buf, read_amount, off);
34652 +       } else {
34653 +               assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
34654 +               result = 0;
34655 +       }
34656 +out:
34657 +       drop_access(uf_info);
34658 +       context_set_commit_async(ctx);
34659 +       reiser4_exit_context(ctx);
34660 +       return result;
34661 +}
34662 +/* Read a file built of tails, at most one page per read_file() call; returns bytes read or an error. */
34663 +static ssize_t read_unix_file_container_tails(
34664 +       struct file *file, char __user *buf, size_t read_amount, loff_t *off)
34665 +{
34666 +       int result;
34667 +       struct inode *inode;
34668 +       hint_t *hint;
34669 +       size_t count, left;
34670 +       ssize_t read;   /* signed: read_file() returns a negative error code on failure */
34671 +       loff_t size;
34672 +
34673 +       assert("umka-072", file != NULL);
34674 +       assert("umka-074", off != NULL);
34675 +       inode = file->f_dentry->d_inode;
34676 +       assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34677 +
34678 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34679 +       if (hint == NULL)
34680 +               return RETERR(-ENOMEM);
34681 +
34682 +       result = load_file_hint(file, hint);
34683 +       if (result) {
34684 +               kfree(hint);
34685 +               return result;
34686 +       }
34687 +
34688 +       left = read_amount;
34689 +       count = 0;
34690 +       while (left > 0) {
34691 +               reiser4_txn_restart_current();
34692 +               size = i_size_read(inode);
34693 +               if (*off >= size)
34694 +                       /* position to read from is past the end of file */
34695 +                       break;
34696 +               if (*off + left > size)
34697 +                       left = size - *off;
34698 +               /* faultin user page; on failure leave through the common exit so hint is freed */
34699 +               result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
34700 +               if (result) {
34701 +                       result = RETERR(-EFAULT);
34702 +                       break;
34703 +               }
34704 +               read = read_file(hint, file, buf,
34705 +                                left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
34706 +                                off);
34707 +               if (read <= 0) {        /* error, or no progress (would loop forever) */
34708 +                       result = read;
34709 +                       break;
34710 +               }
34711 +               left -= read;
34712 +               buf += read;
34713 +
34714 +               /* update position in a file */
34715 +               *off += read;
34716 +               /* total number of read bytes */
34717 +               count += read;
34718 +       }
34719 +       done_lh(&hint->lh);
34720 +       save_file_hint(file, hint);
34721 +       kfree(hint);
34722 +       if (count)
34723 +               file_accessed(file);
34724 +       /* return number of read bytes or error code if nothing is read */
34725 +       return count ? count : result;
34726 +}
34727 +
34728 +/* This function takes care of @inode's cached pages. It unconditionally
34729 +   asks reiser4_invalidate_pages() to drop the pages of the file, starting
34730 +   at index 0 and covering as many pages as i_size spans, and then calls
34731 +   unpack(), which converts the file from tails to extents if it is built
34732 +   of tails. The invalidation is done in order to not manage two copies of
34733 +   the data (one in the page cache and a second one in the tails themselves)
34734 +   when a file consisting of tails gets mapped.
34735 +
34736 +   No read-only check is made here: mmap_unix_file() tests IS_RDONLY()
34737 +   itself before calling this. Returns the result of unpack(). */
34738 +static int check_pages_unix_file(struct file *file, struct inode *inode)
34739 +{
34740 +       reiser4_invalidate_pages(inode->i_mapping, 0,
34741 +                                (inode->i_size + PAGE_CACHE_SIZE -
34742 +                                 1) >> PAGE_CACHE_SHIFT, 0);
34743 +       return unpack(file, inode, 0 /* not forever */ );
34744 +}
34745 +
34746 +/**
34747 + * mmap_unix_file - mmap of struct file_operations
34748 + * @file: file to mmap
34749 + * @vma: memory area being mapped; its vm_flags decide whether to convert
34750 + *
34751 + * This is the implementation of vfs's mmap method of struct file_operations
34752 + * for the unix file plugin. It converts a tail-built file to extents if
34753 + * necessary and, on success, sets reiser4_inode's flag REISER4_HAS_MMAP.
34754 + */
34755 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
34756 +{
34757 +       reiser4_context *ctx;
34758 +       int result;
34759 +       struct inode *inode;
34760 +       struct unix_file_info *uf_info;
34761 +       reiser4_block_nr needed;
34762 +
34763 +       inode = file->f_dentry->d_inode;
34764 +       ctx = reiser4_init_context(inode->i_sb);
34765 +       if (IS_ERR(ctx))
34766 +               return PTR_ERR(ctx);
34767 +
34768 +       uf_info = unix_file_inode_data(inode);
34769 +       /* exclusive access is dropped again on every return path below */
34770 +       get_exclusive_access_careful(uf_info, inode);
34771 +       /* conversion is considered if VM_MAYWRITE or VM_SHARED (either) is set */
34772 +       if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
34773 +               /*
34774 +                * we need file built of extent items. If it is still built of
34775 +                * tail items we have to convert it. Find what items the file
34776 +                * is built of
34777 +                */
34778 +               result = find_file_state(inode, uf_info);
34779 +               if (result != 0) {
34780 +                       drop_exclusive_access(uf_info);
34781 +                       reiser4_exit_context(ctx);
34782 +                       return result;
34783 +               }
34784 +
34785 +               assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
34786 +                                  uf_info->container == UF_CONTAINER_EXTENTS ||
34787 +                                  uf_info->container == UF_CONTAINER_EMPTY));
34788 +               if (uf_info->container == UF_CONTAINER_TAILS) {
34789 +                       /*
34790 +                        * invalidate all pages and convert file from tails to
34791 +                        * extents
34792 +                        */
34793 +                       result = check_pages_unix_file(file, inode);
34794 +                       if (result) {
34795 +                               drop_exclusive_access(uf_info);
34796 +                               reiser4_exit_context(ctx);
34797 +                               return result;
34798 +                       }
34799 +               }
34800 +       }
34801 +
34802 +       /*
34803 +        * generic_file_mmap will do update_atime. Grab space for stat data
34804 +        * update.
34805 +        */
34806 +       needed = inode_file_plugin(inode)->estimate.update(inode);
34807 +       result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
34808 +       if (result) {
34809 +               drop_exclusive_access(uf_info);
34810 +               reiser4_exit_context(ctx);
34811 +               return result;
34812 +       }
34813 +
34814 +       result = generic_file_mmap(file, vma);
34815 +       if (result == 0) {
34816 +               /* mark file as having mapping. */
34817 +               reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
34818 +       }
34819 +
34820 +       drop_exclusive_access(uf_info);
34821 +       reiser4_exit_context(ctx);
34822 +       return result;
34823 +}
34824 +
34825 +/**
34826 + * find_first_item - find the item responsible for the first byte of a file
34827 + * @inode: inode of the file to look at
34828 + * Returns the item id (EXTENT_POINTER_ID or FORMATTING_ID) on success, the
34829 + * lookup result if it is not CBK_COORD_FOUND, or an error (-EIO, zload's).
34830 + */
34831 +static int find_first_item(struct inode *inode)
34832 +{
34833 +       coord_t coord;
34834 +       lock_handle lh;
34835 +       reiser4_key key;
34836 +       int result;
34837 +
34838 +       coord_init_zero(&coord);
34839 +       init_lh(&lh);
34840 +       inode_file_plugin(inode)->key_by_inode(inode, 0, &key); /* key of offset 0 */
34841 +       result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
34842 +                                      inode);
34843 +       if (result == CBK_COORD_FOUND) {
34844 +               if (coord.between == AT_UNIT) {
34845 +                       result = zload(coord.node);
34846 +                       if (result == 0) {
34847 +                               result = item_id_by_coord(&coord);
34848 +                               zrelse(coord.node);
34849 +                               if (result != EXTENT_POINTER_ID &&
34850 +                                   result != FORMATTING_ID)
34851 +                                       result = RETERR(-EIO); /* unexpected item type */
34852 +                       }
34853 +               } else
34854 +                       result = RETERR(-EIO); /* found, but coord is not at a unit */
34855 +       }
34856 +       done_lh(&lh);
34857 +       return result;
34858 +}
34859 +
34860 +/**
34861 + * open_unix_file - finish an interrupted tail conversion when a file is opened
34862 + * @inode: inode of the file being opened
34863 + * @file: file being opened; passed on to extent2tail()
34864 + *
34865 + * If filesystem is not readonly - complete uncompleted tail conversion if
34866 + * there was one (REISER4_PART_MIXED is set). Returns 0 or an error code.
34867 + */
34868 +int open_unix_file(struct inode *inode, struct file *file)
34869 +{
34870 +       int result;
34871 +       reiser4_context *ctx;
34872 +       struct unix_file_info *uf_info;
34873 +
34874 +       if (IS_RDONLY(inode))
34875 +               return 0;
34876 +       /* cheap check without any locking; repeated below under exclusive access */
34877 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
34878 +               return 0;
34879 +
34880 +       ctx = reiser4_init_context(inode->i_sb);
34881 +       if (IS_ERR(ctx))
34882 +               return PTR_ERR(ctx);
34883 +
34884 +       uf_info = unix_file_inode_data(inode);
34885 +
34886 +       get_exclusive_access_careful(uf_info, inode);
34887 +
34888 +       if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
34889 +               /*
34890 +                * other process completed the conversion
34891 +                */
34892 +               drop_exclusive_access(uf_info);
34893 +               reiser4_exit_context(ctx);
34894 +               return 0;
34895 +       }
34896 +
34897 +       /*
34898 +        * file left in semi converted state after unclean shutdown or another
34899 +        * thread is doing conversion and dropped exclusive access while doing
34900 +        * balance dirty pages. Complete the conversion
34901 +        */
34902 +       result = find_first_item(inode);
34903 +       if (result == EXTENT_POINTER_ID)
34904 +               /*
34905 +                * first item is extent, therefore there was incomplete
34906 +                * tail2extent conversion. Complete it
34907 +                */
34908 +               result = tail2extent(unix_file_inode_data(inode));
34909 +       else if (result == FORMATTING_ID)
34910 +               /*
34911 +                * first item is formatting item, therefore there was
34912 +                * incomplete extent2tail conversion. Complete it
34913 +                */
34914 +               result = extent2tail(file, unix_file_inode_data(inode));
34915 +       else
34916 +               result = -EIO; /* any other find_first_item() result is reported as -EIO */
34917 +
34918 +       assert("vs-1712",
34919 +              ergo(result == 0,
34920 +                   (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
34921 +                    !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
34922 +       drop_exclusive_access(uf_info);
34923 +       reiser4_exit_context(ctx);
34924 +       return result;
34925 +}
34926 +
34927 +#define NEITHER_OBTAINED 0 /* write_unix_file() holds no access to the file */
34928 +#define EA_OBTAINED 1 /* exclusive access was taken */
34929 +#define NEA_OBTAINED 2 /* nonexclusive access was taken */
34930 +
34931 +static void drop_access(struct unix_file_info *uf_info) /* drop held access */
34932 +{
34933 +       if (!uf_info->exclusive_use) /* flag tells which kind of access to drop */
34934 +               drop_nonexclusive_access(uf_info);
34935 +       else
34936 +               drop_exclusive_access(uf_info);
34937 +}
34938 +
34939 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
34940 +                             __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__) /* printk prefixed with file, line and function */
34941 +
34942 +/**
34943 + * write_unix_file - private ->write() method of unix_file plugin.
34944 + *
34945 + * @file: file to write to
34946 + * @buf: address of user-space buffer
34947 + * @count: number of bytes to write
34948 + * @pos: position in file to write to
34949 + * @cont: unused argument, as we don't perform plugin conversion when being
34950 + * managed by unix_file plugin.
34951 + */
34952 +ssize_t write_unix_file(struct file *file, const char __user *buf,
34953 +                       size_t count, loff_t *pos, struct psched_context *cont)
34954 +{
34955 +       int result;
34956 +       reiser4_context *ctx;
34957 +       struct inode *inode;
34958 +       struct unix_file_info *uf_info;
34959 +       ssize_t written;
34960 +       int try_free_space;
34961 +       int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
34962 +       size_t left;
34963 +       ssize_t (*write_op)(struct file *, struct inode *,
34964 +                           const char __user *, size_t,
34965 +                           loff_t *pos);
34966 +       int ea;
34967 +       loff_t new_size;
34968 +
34969 +       ctx = get_current_context();
34970 +       inode = file->f_dentry->d_inode;
34971 +
34972 +       assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34973 +       assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
34974 +
34975 +       /* check amount of bytes to write and writing position */
34976 +       result = generic_write_checks(file, pos, &count, 0);
34977 +       if (result) {
34978 +               context_set_commit_async(ctx);
34979 +               return result;
34980 +       }
34981 +
34982 +       result = file_remove_suid(file);
34983 +       if (result) {
34984 +               context_set_commit_async(ctx);
34985 +               return result;
34986 +       }
34987 +       /* remove_suid might create a transaction */
34988 +       reiser4_txn_restart(ctx);
34989 +
34990 +       uf_info = unix_file_inode_data(inode);
34991 +
34992 +       current->backing_dev_info = inode->i_mapping->backing_dev_info;
34993 +       written = 0;
34994 +       try_free_space = 1; /* allow one commit-and-retry when a write hits -ENOSPC */
34995 +       left = count;
34996 +       ea = NEITHER_OBTAINED;
34997 +
34998 +       new_size = i_size_read(inode);
34999 +       if (*pos + count > new_size)
35000 +               new_size = *pos + count;
35001 +       /* each iteration writes at most @to_write bytes under its own access */
35002 +       while (left) {
35003 +               if (left < to_write)
35004 +                       to_write = left;
35005 +
35006 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
35007 +                       get_exclusive_access(uf_info);
35008 +                       ea = EA_OBTAINED;
35009 +                       if (uf_info->container != UF_CONTAINER_EMPTY) {
35010 +                               /* file is made not empty by another process */
35011 +                               drop_exclusive_access(uf_info);
35012 +                               ea = NEITHER_OBTAINED;
35013 +                               continue;
35014 +                       }
35015 +               } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35016 +                       /*
35017 +                        * get exclusive access directly just to not have to
35018 +                        * re-obtain it if file will appear empty
35019 +                        */
35020 +                       get_exclusive_access(uf_info);
35021 +                       ea = EA_OBTAINED;
35022 +                       result = find_file_state(inode, uf_info);
35023 +                       if (result) {
35024 +                               drop_exclusive_access(uf_info);
35025 +                               ea = NEITHER_OBTAINED;
35026 +                               break;
35027 +                       }
35028 +               } else {
35029 +                       get_nonexclusive_access(uf_info);
35030 +                       ea = NEA_OBTAINED;
35031 +               }
35032 +
35033 +               /* either EA or NEA is obtained. Choose item write method */
35034 +               if (uf_info->container == UF_CONTAINER_EXTENTS) {
35035 +                       /* file is built of extent items */
35036 +                       write_op = reiser4_write_extent;
35037 +               } else if (uf_info->container == UF_CONTAINER_EMPTY) {
35038 +                       /* file is empty */
35039 +                       if (should_have_notail(uf_info, new_size))
35040 +                               write_op = reiser4_write_extent;
35041 +                       else
35042 +                               write_op = reiser4_write_tail;
35043 +               } else {
35044 +                       /* file is built of tail items */
35045 +                       if (should_have_notail(uf_info, new_size)) {
35046 +                               if (ea == NEA_OBTAINED) {
35047 +                                       drop_nonexclusive_access(uf_info);
35048 +                                       get_exclusive_access(uf_info);
35049 +                                       ea = EA_OBTAINED;
35050 +                               }
35051 +                               if (uf_info->container == UF_CONTAINER_TAILS) {
35052 +                                       /*
35053 +                                        * if file is being converted by another
35054 +                                        * process - wait until it completes
35055 +                                        */
35056 +                                       while (1) {
35057 +                                               if (reiser4_inode_get_flag(inode,
35058 +                                                                          REISER4_PART_IN_CONV)) {
35059 +                                                       drop_exclusive_access(uf_info);
35060 +                                                       schedule();
35061 +                                                       get_exclusive_access(uf_info);
35062 +                                                       continue;
35063 +                                               }
35064 +                                               break;
35065 +                                       }
35066 +                                       if (uf_info->container ==  UF_CONTAINER_TAILS) {
35067 +                                               result = tail2extent(uf_info);
35068 +                                               if (result) {
35069 +                                                       drop_exclusive_access(uf_info);
35070 +                                                       context_set_commit_async(ctx);
35071 +                                                       break;
35072 +                                               }
35073 +                                       }
35074 +                               }
35075 +                               drop_exclusive_access(uf_info);
35076 +                               ea = NEITHER_OBTAINED;
35077 +                               continue;
35078 +                       }
35079 +                       write_op = reiser4_write_tail;
35080 +               }
35081 +
35082 +               written = write_op(file, inode, buf, to_write, pos);
35083 +               if (written == -ENOSPC && try_free_space) {
35084 +                       drop_access(uf_info);
35085 +                       txnmgr_force_commit_all(inode->i_sb, 0);
35086 +                       try_free_space = 0;
35087 +                       continue;
35088 +               }
35089 +               if (written < 0) {
35090 +                       drop_access(uf_info);
35091 +                       result = written;
35092 +                       break;
35093 +               }
35094 +               /* something is written. */
35095 +               if (uf_info->container == UF_CONTAINER_EMPTY) {
35096 +                       assert("edward-1553", ea == EA_OBTAINED);
35097 +                       uf_info->container =
35098 +                               (write_op == reiser4_write_extent) ?
35099 +                               UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
35100 +               } else {
35101 +                       assert("edward-1554", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
35102 +                                       write_op == reiser4_write_extent));
35103 +                       assert("edward-1555", ergo(uf_info->container == UF_CONTAINER_TAILS,
35104 +                                       write_op == reiser4_write_tail));
35105 +               }
35106 +               if (*pos + written > inode->i_size)
35107 +                       INODE_SET_FIELD(inode, i_size, *pos + written);
35108 +               file_update_time(file);
35109 +               result = reiser4_update_sd(inode);
35110 +               if (result) {
35111 +                       current->backing_dev_info = NULL;
35112 +                       drop_access(uf_info);
35113 +                       context_set_commit_async(ctx);
35114 +                       break;
35115 +               }
35116 +               drop_access(uf_info);
35117 +               ea = NEITHER_OBTAINED;
35118 +               reiser4_txn_restart(ctx);
35119 +               current->journal_info = NULL;
35120 +               /*
35121 +                * tell VM how many pages were dirtied. Maybe number of pages
35122 +                * which were dirty already should not be counted
35123 +                */
35124 +               balance_dirty_pages_ratelimited_nr(inode->i_mapping,
35125 +                                                  (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
35126 +               current->journal_info = ctx;
35127 +
35128 +               left -= written;
35129 +               buf += written;
35130 +               *pos += written;
35131 +       }
35132 +       if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
35133 +               reiser4_txn_restart_current();
35134 +               grab_space_enable();
35135 +               result = reiser4_sync_file_common(file, file->f_dentry,
35136 +                                                 0 /* data and stat data */);
35137 +               if (result)
35138 +                       warning("reiser4-7", "failed to sync file %llu",
35139 +                               (unsigned long long)get_inode_oid(inode));
35140 +       }
35141 +
35142 +       current->backing_dev_info = NULL;
35143 +
35144 +       /*
35145 +        * return number of written bytes or error code if nothing is
35146 +        * written. Note, that it does not work correctly in case when
35147 +        * sync_unix_file returns error
35148 +        */
35149 +       return (count - left) ? (count - left) : result;
35150 +}
35151 +
35152 +/**
35153 + * release_unix_file - release of struct file_operations
35154 + * @inode: inode of released file
35155 + * @file: file to release
35156 + *
35157 + * Implementation of release method of struct file_operations for unix file
35158 + * plugin. If the dentry's reference count is 1 - convert all extent items
35159 + * into tail items if necessary. Frees reiser4 specific file data.
35160 + */
35161 +int release_unix_file(struct inode *inode, struct file *file)
35162 +{
35163 +       reiser4_context *ctx;
35164 +       struct unix_file_info *uf_info;
35165 +       int result;
35166 +       int in_reiser4;
35167 +       /* sampled before reiser4_init_context() below sets up a context */
35168 +       in_reiser4 = is_in_reiser4_context();
35169 +
35170 +       ctx = reiser4_init_context(inode->i_sb);
35171 +       if (IS_ERR(ctx))
35172 +               return PTR_ERR(ctx);
35173 +
35174 +       result = 0;
35175 +       if (in_reiser4 == 0) {
35176 +               uf_info = unix_file_inode_data(inode);
35177 +
35178 +               get_exclusive_access_careful(uf_info, inode);
35179 +               if (atomic_read(&file->f_dentry->d_count) == 1 &&
35180 +                   uf_info->container == UF_CONTAINER_EXTENTS &&
35181 +                   !should_have_notail(uf_info, inode->i_size) &&
35182 +                   !rofs_inode(inode)) {
35183 +                       result = extent2tail(file, uf_info);
35184 +                       if (result != 0) {
35185 +                               context_set_commit_async(ctx);
35186 +                               warning("nikita-3233",
35187 +                                       "Failed (%d) to convert in %s (%llu)",
35188 +                                       result, __FUNCTION__,
35189 +                                       (unsigned long long)
35190 +                                       get_inode_oid(inode));
35191 +                       }
35192 +               }
35193 +               drop_exclusive_access(uf_info);
35194 +       } else {
35195 +               /*
35196 +                  we are within reiser4 context already. How latter is
35197 +                  possible? Simple:
35198 +
35199 +                  (gdb) bt
35200 +                  #0  get_exclusive_access ()
35201 +                  #2  0xc01e56d3 in release_unix_file ()
35202 +                  #3  0xc01c3643 in reiser4_release ()
35203 +                  #4  0xc014cae0 in __fput ()
35204 +                  #5  0xc013ffc3 in remove_vm_struct ()
35205 +                  #6  0xc0141786 in exit_mmap ()
35206 +                  #7  0xc0118480 in mmput ()
35207 +                  #8  0xc0133205 in oom_kill ()
35208 +                  #9  0xc01332d1 in out_of_memory ()
35209 +                  #10 0xc013bc1d in try_to_free_pages ()
35210 +                  #11 0xc013427b in __alloc_pages ()
35211 +                  #12 0xc013f058 in do_anonymous_page ()
35212 +                  #13 0xc013f19d in do_no_page ()
35213 +                  #14 0xc013f60e in handle_mm_fault ()
35214 +                  #15 0xc01131e5 in do_page_fault ()
35215 +                  #16 0xc0104935 in error_code ()
35216 +                  #17 0xc025c0c6 in __copy_to_user_ll ()
35217 +                  #18 0xc01d496f in reiser4_read_tail ()
35218 +                  #19 0xc01e4def in read_unix_file ()
35219 +                  #20 0xc01c3504 in reiser4_read ()
35220 +                  #21 0xc014bd4f in vfs_read ()
35221 +                  #22 0xc014bf66 in sys_read ()
35222 +                */
35223 +               warning("vs-44", "out of memory?");
35224 +       }
35225 +       /* done in both branches, whether or not a conversion was attempted */
35226 +       reiser4_free_file_fsdata(file);
35227 +
35228 +       reiser4_exit_context(ctx);
35229 +       return result;
35230 +}
35231 +
35232 +static void set_file_notail(struct inode *inode)
35233 +{
35234 +       formatting_plugin *tplug;
35235 +
35236 +       /* put the NEVER_TAILS formatting plugin into @inode's plugin set; */
35237 +       /* the caller, unpack(), writes the stat data afterwards */
35238 +       tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
35239 +       force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
35240 +}
35241 +
35242 +/* if file is built of tails - convert it to extents; @forever also pins notail */
35243 +static int unpack(struct file *filp, struct inode *inode, int forever)
35244 +{
35245 +       int result = 0;
35246 +       struct unix_file_info *uf_info;
35247 +
35248 +       uf_info = unix_file_inode_data(inode);
35249 +       assert("vs-1628", ea_obtained(uf_info));
35250 +
35251 +       result = find_file_state(inode, uf_info);
35252 +       if (result)
35253 +               return result;
35254 +       assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
35255 +
35256 +       if (uf_info->container == UF_CONTAINER_TAILS) {
35257 +               /*
35258 +                * if file is being converted by another process - wait until it
35259 +                * completes
35260 +                */
35261 +               while (1) {
35262 +                       if (reiser4_inode_get_flag(inode,
35263 +                                                  REISER4_PART_IN_CONV)) {
35264 +                               drop_exclusive_access(uf_info);
35265 +                               schedule();
35266 +                               get_exclusive_access(uf_info);
35267 +                               continue;
35268 +                       }
35269 +                       break;
35270 +               }
35271 +               if (uf_info->container == UF_CONTAINER_TAILS) {
35272 +                       result = tail2extent(uf_info);
35273 +                       if (result)
35274 +                               return result;
35275 +               }
35276 +       }
35277 +       if (forever) {
35278 +               /* save new formatting plugin in stat data */
35279 +               __u64 tograb;
35280 +
35281 +               set_file_notail(inode);
35282 +               grab_space_enable();
35283 +               tograb = inode_file_plugin(inode)->estimate.update(inode);
35284 +               result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
35285 +               if (result == 0) /* update stat data only if space was reserved */
35286 +                       result = reiser4_update_sd(inode);
35287 +       }
35288 +
35289 +       return result;
35290 +}
35291 +
35292 +/* implementation of vfs' ioctl method of struct file_operations for unix file
35293 +   plugin. Only REISER4_IOC_UNPACK is handled; other commands get -ENOSYS.
35294 +*/
35295 +int
35296 +ioctl_unix_file(struct inode *inode, struct file *filp,
35297 +               unsigned int cmd, unsigned long arg UNUSED_ARG)
35298 +{
35299 +       reiser4_context *ctx;
35300 +       int result;
35301 +
35302 +       ctx = reiser4_init_context(inode->i_sb);
35303 +       if (IS_ERR(ctx))
35304 +               return PTR_ERR(ctx);
35305 +
35306 +       switch (cmd) {
35307 +       case REISER4_IOC_UNPACK:
35308 +               get_exclusive_access(unix_file_inode_data(inode)); /* unpack() asserts it */
35309 +               result = unpack(filp, inode, 1 /* forever */ );
35310 +               drop_exclusive_access(unix_file_inode_data(inode));
35311 +               break;
35312 +
35313 +       default:
35314 +               result = RETERR(-ENOSYS);
35315 +               break;
35316 +       }
35317 +       reiser4_exit_context(ctx);
35318 +       return result;
35319 +}
35320 +
35321 +/* implementation of vfs' bmap method of struct address_space_operations for
35322 +   unix file plugin: returns what the item's ->get_block() reports for @lblock
35323 +*/
35324 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
35325 +{
35326 +       reiser4_context *ctx;
35327 +       sector_t result;
35328 +       reiser4_key key;
35329 +       coord_t coord;
35330 +       lock_handle lh;
35331 +       struct inode *inode;
35332 +       item_plugin *iplug;
35333 +       sector_t block;
35334 +       /* NOTE(review): error codes below are returned through sector_t - confirm callers expect that */
35335 +       inode = mapping->host;
35336 +
35337 +       ctx = reiser4_init_context(inode->i_sb);
35338 +       if (IS_ERR(ctx))
35339 +               return PTR_ERR(ctx);
35340 +       key_by_inode_and_offset_common(inode,
35341 +                                      (loff_t) lblock * current_blocksize,
35342 +                                      &key);
35343 +
35344 +       init_lh(&lh);
35345 +       result =
35346 +           find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
35347 +       if (cbk_errored(result)) {
35348 +               done_lh(&lh);
35349 +               reiser4_exit_context(ctx);
35350 +               return result;
35351 +       }
35352 +
35353 +       result = zload(coord.node);
35354 +       if (result) {
35355 +               done_lh(&lh);
35356 +               reiser4_exit_context(ctx);
35357 +               return result;
35358 +       }
35359 +
35360 +       iplug = item_plugin_by_coord(&coord);
35361 +       if (iplug->s.file.get_block) {
35362 +               result = iplug->s.file.get_block(&coord, lblock, &block);
35363 +               if (result == 0)
35364 +                       result = block;
35365 +       } else
35366 +               result = RETERR(-EINVAL); /* item plugin has no ->get_block() */
35367 +
35368 +       zrelse(coord.node);
35369 +       done_lh(&lh);
35370 +       reiser4_exit_context(ctx);
35371 +       return result;
35372 +}
35373 +
35374 +/**
35375 + * flow_by_inode_unix_file - initialize structure flow
35376 + * @inode: inode of file for which read or write is about to be performed
35377 + * @buf: buffer to perform read to or write from
35378 + * @user: flag showing whether @buf is user space or kernel space
35379 + * @size: size of buffer @buf
35380 + * @off: start offset for read or write
35381 + * @op: READ or WRITE
35382 + * @flow: flow structure to fill in
35383 + *
35384 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
35385 + */
35386 +int flow_by_inode_unix_file(struct inode *inode,
35387 +                           const char __user *buf, int user,
35388 +                           loff_t size, loff_t off,
35389 +                           rw_op op, flow_t *flow)
35390 +{
35391 +       assert("nikita-1100", inode != NULL);
35392 +
35393 +       flow->length = size;
35394 +       memcpy(&flow->data, &buf, sizeof(buf)); /* copies the pointer value, not the data */
35395 +       flow->user = user;
35396 +       flow->op = op;
35397 +       assert("nikita-1931", inode_file_plugin(inode) != NULL);
35398 +       assert("nikita-1932",
35399 +              inode_file_plugin(inode)->key_by_inode ==
35400 +              key_by_inode_and_offset_common);
35401 +       /* calculate key of write position and insert it into flow->key */
35402 +       return key_by_inode_and_offset_common(inode, off, &flow->key);
35403 +}
35404 +
35405 +/* plugin->u.file.set_plug_in_sd = NULL
35406 +   plugin->u.file.set_plug_in_inode = NULL
35407 +   plugin->u.file.create_blank_sd = NULL */
35408 +/* plugin->u.file.delete */
35409 +/*
35410 +   plugin->u.file.add_link = reiser4_add_link_common
35411 +   plugin->u.file.rem_link = NULL */
35412 +
35413 +/* plugin->u.file.owns_item
35414 +   owns_item_common() plus an item plugin group check and an assertion */
35415 +/* Audited by: green(2002.06.15) */
35416 +int
35417 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
35418 +                   const coord_t * coord /* coord to check */ )
35419 +{
35420 +       int result;
35421 +
35422 +       result = owns_item_common(inode, coord);
35423 +       if (!result)
35424 +               return 0;
35425 +       if (!plugin_of_group(item_plugin_by_coord(coord),
35426 +                            UNIX_FILE_METADATA_ITEM_TYPE))
35427 +               return 0; /* item plugin is not of the unix file item group */
35428 +       assert("vs-547",
35429 +              item_id_by_coord(coord) == EXTENT_POINTER_ID ||
35430 +              item_id_by_coord(coord) == FORMATTING_ID);
35431 +       return 1;
35432 +}
35433 +
35434 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
35435 +{
35436 +       int result;
35437 +       int s_result;
35438 +       loff_t old_size;
35439 +       reiser4_tree *tree;
35440 +
35441 +       inode_check_scale(inode, inode->i_size, attr->ia_size);
35442 +
35443 +       old_size = inode->i_size;
35444 +       tree = reiser4_tree_by_inode(inode);
35445 +       /* add a SAFE_TRUNCATE safe-link, then call truncate_file_body() */
35446 +       result = safe_link_grab(tree, BA_CAN_COMMIT);
35447 +       if (result == 0)
35448 +               result = safe_link_add(inode, SAFE_TRUNCATE);
35449 +       if (result == 0)
35450 +               result = truncate_file_body(inode, attr);
35451 +       if (result)
35452 +               warning("vs-1588", "truncate_file failed: oid %lli, "
35453 +                       "old size %lld, new size %lld, retval %d",
35454 +                       (unsigned long long)get_inode_oid(inode),
35455 +                       old_size, attr->ia_size, result);
35456 +       /* safe-link removal is attempted regardless of @result; its failure is only logged */
35457 +       s_result = safe_link_grab(tree, BA_CAN_COMMIT);
35458 +       if (s_result == 0)
35459 +               s_result =
35460 +                   safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
35461 +       if (s_result != 0) {
35462 +               warning("nikita-3417", "Cannot kill safelink %lli: %i",
35463 +                       (unsigned long long)get_inode_oid(inode), s_result);
35464 +       }
35465 +       safe_link_release(tree);
35466 +       return result; /* result of the grab/add/truncate sequence, not of the cleanup */
35467 +}
35468 +
35469 +/* plugin->u.file.setattr method */
35470 +/* A size change (ATTR_SIZE) is done by setattr_truncate() under exclusive
35471 +   access to the file; anything else goes to reiser4_setattr_common() */
35472 +int setattr_unix_file(struct dentry *dentry,   /* Object to change attributes */
35473 +                     struct iattr *attr /* change description */ )
35474 +{
35475 +       int result;
35476 +
35477 +       if (attr->ia_valid & ATTR_SIZE) {
35478 +               reiser4_context *ctx;
35479 +               struct unix_file_info *uf_info;
35480 +
35481 +               /* truncate does reservation itself and requires exclusive
35482 +                  access obtained */
35483 +               ctx = reiser4_init_context(dentry->d_inode->i_sb);
35484 +               if (IS_ERR(ctx))
35485 +                       return PTR_ERR(ctx);
35486 +
35487 +               uf_info = unix_file_inode_data(dentry->d_inode);
35488 +               get_exclusive_access_careful(uf_info, dentry->d_inode);
35489 +               result = setattr_truncate(dentry->d_inode, attr);
35490 +               drop_exclusive_access(uf_info);
35491 +               context_set_commit_async(ctx); /* set after the access is dropped */
35492 +               reiser4_exit_context(ctx);
35493 +       } else
35494 +               result = reiser4_setattr_common(dentry, attr);
35495 +
35496 +       return result;
35497 +}
35498 +
35499 +/* plugin->u.file.init_inode_data: set up the unix-file part of @inode */
35500 +void
35501 +init_inode_data_unix_file(struct inode *inode,
35502 +                         reiser4_object_create_data * crd, int create)
35503 +{
35504 +       struct unix_file_info *data;
35505 +       /* the container is EMPTY only when @create is set, else UNKNOWN */
35506 +       data = unix_file_inode_data(inode);
35507 +       data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
35508 +       init_rwsem(&data->latch);
35509 +       data->tplug = inode_formatting_plugin(inode);
35510 +       data->exclusive_use = 0;
35511 +
35512 +#if REISER4_DEBUG
35513 +       data->ea_owner = NULL;
35514 +       atomic_set(&data->nr_neas, 0);
35515 +#endif /* REISER4_DEBUG */
35516 +       init_inode_ordering(inode, crd, create);
35517 +}
35518 +
35519 +/**
35520 + * delete_object_unix_file - delete_object of file_plugin
35521 + * @inode: inode to be deleted
35522 + *
35523 + * Truncates file to length 0, removes stat data and safe link.
35524 + */
35525 +int delete_object_unix_file(struct inode *inode)
35526 +{
35527 +       struct unix_file_info *uf_info;
35528 +       int result;
35529 +
35530 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
35531 +               return 0;
35532 +
35533 +       /* truncate file body first */
35534 +       uf_info = unix_file_inode_data(inode);
35535 +       get_exclusive_access(uf_info);
35536 +       result = shorten_file(inode, 0 /* size */ );
35537 +       drop_exclusive_access(uf_info);
35538 +       /* a truncate failure is only logged; removal goes on regardless */
35539 +       if (result)
35540 +               warning("edward-1556",
35541 +                       "failed to truncate file (%llu) on removal: %d",
35542 +                       (unsigned long long)get_inode_oid(inode), result);
35543 +
35544 +       /* remove stat data and safe link */
35545 +       return reiser4_delete_object_common(inode);
35546 +}
35547 +
35548 +int
35549 +prepare_write_unix_file(struct file *file, struct page *page,
35550 +                       unsigned from, unsigned to)
35551 +{
35552 +       reiser4_context *ctx;
35553 +       struct unix_file_info *uf_info;
35554 +       int ret;
35555 +       /* do_prepare_write() wrapped into a context and exclusive access */
35556 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
35557 +       if (IS_ERR(ctx))
35558 +               return PTR_ERR(ctx);
35559 +
35560 +       uf_info = unix_file_inode_data(file->f_dentry->d_inode);
35561 +       get_exclusive_access(uf_info);
35562 +       ret = find_file_state(file->f_dentry->d_inode, uf_info);
35563 +       if (ret == 0) {
35564 +               if (uf_info->container == UF_CONTAINER_TAILS)
35565 +                       ret = -EINVAL; /* refused for UF_CONTAINER_TAILS */
35566 +               else
35567 +                       ret = do_prepare_write(file, page, from, to);
35568 +       }
35569 +       drop_exclusive_access(uf_info);
35570 +
35571 +       /* don't commit transaction under inode semaphore */
35572 +       context_set_commit_async(ctx);
35573 +       reiser4_exit_context(ctx);
35574 +       return ret;
35575 +}
35576 +
35577 +/*
35578 + * Local variables:
35579 + * c-indentation-style: "K&R"
35580 + * mode-name: "LC"
35581 + * c-basic-offset: 8
35582 + * tab-width: 8
35583 + * fill-column: 79
35584 + * scroll-step: 1
35585 + * End:
35586 + */
35587 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/file_conversion.c linux-2.6.27/fs/reiser4/plugin/file/file_conversion.c
35588 --- linux-2.6.27.orig/fs/reiser4/plugin/file/file_conversion.c  1970-01-01 03:00:00.000000000 +0300
35589 +++ linux-2.6.27/fs/reiser4/plugin/file/file_conversion.c       2008-10-12 18:20:01.000000000 +0400
35590 @@ -0,0 +1,689 @@
35591 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
35592 +   licensing governed by reiser4/README */
35593 +
35594 +/**
35595 + * This file contains plugin schedule hooks, and plugin conversion methods.
35596 + *
35597 + * Plugin schedule hook makes a decision (at plugin schedule point) about the
35598 + * most reasonable plugins for managing a regular file. Usually such a decision
35599 + * is made by some O(1)-heuristic.
35600 + *
35601 + * By default we assign a unix_file plugin id when writing incompressible file
35602 + * managed by cryptcompress plugin id. Currently used heuristic for estimating
35603 + * compressibility is very simple: if first complete logical cluster (64K by
35604 + * default) of a file is incompressible, then we make a decision, that the whole
35605 + * file is incompressible (*).
35606 + *
35607 + * To enable a conversion we install a special "magic" compression mode plugin
35608 + * (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for details)
35609 + * at file creation time (**).
35610 + *
35611 + * Note, that we don't perform back conversion (unix_file->cryptcompress)
35612 + * because of compatibility reasons (see http://dev.namesys.com/Version4.X.Y
35613 + * for details).
35614 + *
35615 + * The conversion is accompanied by rebuilding disk structures of a file, so it
35616 + * is important to protect them from being accessed by other plugins which
35617 + * don't expect them to be in such an inconsistent state. To provide this
35618 + * protection we serialize readers and writers of a file's conversion set (FCS).
35619 + *
35620 + * We define FCS as a file plugin installed in inode's pset plus file's data
35621 + * and metadata that this file plugin manipulates (items, etc).
35622 + * Note, that FCS is defined per file.
35623 + * FCS reader is defined as a set of instructions of the following type:
35624 + * {inode_file_plugin(inode)->method()} (I.e. retrieving a file plugin id
35625 + * conjoined with all method's instructions should be atomic).
35626 + * FCS writer is a set of instructions that perform file plugin conversion
35627 + * (convert items, update pset, etc).
35628 + * Example:
35629 + * reiser4_write_careful() supplied to VFS as a ->write() file operation is
35630 + * composed of the following (optional) instructions:
35631 + *             1              2                         3
35632 + * *********************** ####### -------------------------------------------->
35633 + *
35634 + * 1) "****" are instructions performed on behalf of cryptcompress file plugin;
35635 + * 2) "####" is a FCS writer (performing a conversion cryptcompress->unix_file);
35636 + * 3) "----" are instructions performed on behalf of unix_file plugin;
35637 + * Here (1) and (3) are FCS readers.
35638 + *
35639 + * In this example FCS readers and writers are already serialized (by design),
35640 + * however there can be readers and writers executing at the same time in
35641 + * different contexts, so we need a common mechanism of serialization.
35642 + *
35643 + * Currently serialization of FCS readers and writers is performed via acquiring
35644 + * a special per-inode rw-semaphore (conv_sem). And yes, {down, up}_read is for
35645 + * FCS readers, and  {down, up}_write is for FCS writers, see the macros below
35646 + * for passive/active protection.
35647 + *
35648 + * ---
35649 + * (*)  This heuristic can be changed to a better one (benchmarking is needed).
35650 + * (**) This technique allows the enable/disable state to be kept on disk.
35651 + */
35652 +
35653 +#include "../../inode.h"
35654 +#include "../cluster.h"
35655 +#include "file.h"
35656 +
35657 +#define conversion_enabled(inode)                                      \
35658 +        (inode_compression_mode_plugin(inode) ==                      \
35659 +         compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
35660 +
35661 +/**
35662 + * Sections that read or write @pset are not permanently critical: a
35663 + * cryptcompress file can be converted only while conversion is enabled,
35664 + * i.e. while conversion_enabled() above finds the CONVX compression mode
35665 + * installed, and we don't perform back conversion. The following helper
35666 + * macro decides if protection is needed (locks are additional overhead).
35667 + */
35668 +#define should_protect(inode)                                          \
35669 +       (inode_file_plugin(inode) ==                                    \
35670 +        file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) &&             \
35671 +        conversion_enabled(inode))
35672 +/**
35673 + * To avoid confusion with read/write file operations, we'll speak about
35674 + * "passive" protection for FCS readers and "active" protection for FCS
35675 + * writers. All methods with active or passive protection have suffix
35676 + * "careful".
35677 + */
35678 +/**
35679 + * Passive protection: call the current file plugin's @method with conv_sem
35680 + * held for read if should_protect(inode). Requires "inode" in scope.
35681 + * @type - type of the value of the compound statement (absent in _VOID);
35682 + * @method - name of an operation supplied to VFS (the reiser4 file plugin
35683 + * must have a method with such name); @args - its parenthesized arguments.
35684 + * NOTE(review): the up_read() after the call depends on re-evaluating
35685 + * should_protect() - confirm @method cannot change its outcome.
35686 + */
35687 +#define PROT_PASSIVE(type, method, args)                               \
35688 +({                                                                     \
35689 +       type _result;                                                   \
35690 +       struct rw_semaphore * guard =                                   \
35691 +               &reiser4_inode_data(inode)->conv_sem;                   \
35692 +                                                                       \
35693 +       if (should_protect(inode)) {                                    \
35694 +               down_read(guard);                                       \
35695 +               if (!should_protect(inode))                             \
35696 +                       up_read(guard);                                 \
35697 +       }                                                               \
35698 +       _result = inode_file_plugin(inode)->method args;                \
35699 +       if (should_protect(inode))                                      \
35700 +               up_read(guard);                                         \
35701 +       _result;                                                        \
35702 +})
35703 +
35704 +#define PROT_PASSIVE_VOID(method, args)                                        \
35705 +({                                                                     \
35706 +       struct rw_semaphore * guard =                                   \
35707 +               &reiser4_inode_data(inode)->conv_sem;                   \
35708 +                                                                       \
35709 +       if (should_protect(inode)) {                                    \
35710 +               down_read(guard);                                       \
35711 +               if (!should_protect(inode))                             \
35712 +                       up_read(guard);                                 \
35713 +       }                                                               \
35714 +       inode_file_plugin(inode)->method args;                          \
35715 +                                                                       \
35716 +       if (should_protect(inode))                                      \
35717 +               up_read(guard);                                         \
35718 +})
35719 +
35720 +/* Pass management to the unix-file plugin with "notail" policy (@file unused) */
35721 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
35722 +{
35723 +       int result;
35724 +       reiser4_inode *info;
35725 +       struct unix_file_info * uf;
35726 +       info = reiser4_inode_data(inode);
35727 +
35728 +       result = aset_set_unsafe(&info->pset,
35729 +                           PSET_FILE,
35730 +                           (reiser4_plugin *)
35731 +                           file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
35732 +       if (result)
35733 +               return result;
35734 +       result = aset_set_unsafe(&info->pset,
35735 +                           PSET_FORMATTING,
35736 +                           (reiser4_plugin *)
35737 +                           formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
35738 +       if (result)
35739 +               return result;
35740 +       /* get rid of non-standard plugins */
35741 +       info->plugin_mask &= ~cryptcompress_mask;
35742 +       /* get rid of plugin stat-data extension */
35743 +       info->extmask &= ~(1 << PLUGIN_STAT);
35744 +
35745 +       reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
35746 +
35747 +       /* FIXME use init_inode_data_unix_file() instead,
35748 +          but avoid init_inode_ordering() */
35749 +       /* Init unix-file specific part of inode */
35750 +       uf = unix_file_inode_data(inode);
35751 +       uf->container = UF_CONTAINER_UNKNOWN;
35752 +       init_rwsem(&uf->latch);
35753 +       uf->tplug = inode_formatting_plugin(inode);
35754 +       uf->exclusive_use = 0;
35755 +#if REISER4_DEBUG
35756 +       uf->ea_owner = NULL;
35757 +       atomic_set(&uf->nr_neas, 0);
35758 +#endif /* REISER4_DEBUG */
35759 +       /**
35760 +        * we were careful for file_ops, inode_ops and as_ops
35761 +        * to be invariant for plugin conversion, so there is
35762 +        * no need to update ones already installed in the
35763 +        * vfs's residence.
35764 +        */
35765 +       return 0;
35766 +}
35767 +
35768 +#if REISER4_DEBUG /* check used by assertion "edward-1500" */
35769 +static int disabled_conversion_inode_ok(struct inode * inode)
35770 +{
35771 +       __u64 extmask = reiser4_inode_data(inode)->extmask;
35772 +       __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
35773 +       /* true when all the bits tested below are set in the two masks */
35774 +       return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
35775 +               (extmask & (1 << UNIX_STAT)) &&
35776 +               (extmask & (1 << LARGE_TIMES_STAT)) &&
35777 +               (extmask & (1 << PLUGIN_STAT)) &&
35778 +               (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
35779 +}
35780 +#endif /* REISER4_DEBUG */
35781 +
35782 +/**
35783 + * Disable future attempts to schedule/convert file plugin.
35784 + * This function is called by plugin schedule hooks.
35785 + * Returns what force_plugin_pset() returns.
35786 + * To disable conversion we assign a compression mode plugin id different
35787 + * from CONVX_COMPRESSION_MODE_ID (LATTD_COMPRESSION_MODE_ID is used here).
35788 + */
35789 +static int disable_conversion(struct inode * inode)
35790 +{
35791 +       int result;
35792 +       result =
35793 +              force_plugin_pset(inode,
35794 +                                PSET_COMPRESSION_MODE,
35795 +                                (reiser4_plugin *)compression_mode_plugin_by_id
35796 +                                (LATTD_COMPRESSION_MODE_ID));
35797 +       assert("edward-1500",
35798 +              ergo(!result, disabled_conversion_inode_ok(inode)));
35799 +       return result;
35800 +}
35801 +
35802 +/**
35803 + * Check if we really have achieved plugin scheduling point (@clust unused)
35804 + */
35805 +static int check_psched_point(struct inode * inode,
35806 +                             loff_t pos /* position in the
35807 +                                           file to write from */,
35808 +                             struct cluster_handle * clust,
35809 +                             struct psched_context * cont)
35810 +{
35811 +       assert("edward-1505", conversion_enabled(inode));
35812 +       /*
35813 +        * if file size is more than cluster size, then compressible
35814 +        * status must have been figured out (i.e. compression was disabled,
35815 +        * or file plugin was converted to unix_file)
35816 +        */
35817 +       assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
35818 +
35819 +       if (pos > inode->i_size)
35820 +               /* first logical cluster will contain a (partial) hole */
35821 +               return disable_conversion(inode);
35822 +       if (pos < inode_cluster_size(inode))
35823 +               /* writing to the first logical cluster */
35824 +               return 0;
35825 +       /*
35826 +        * here we have:
35827 +        * cluster_size <= pos <= i_size <= cluster_size,
35828 +        * and, hence,  pos == i_size == cluster_size
35829 +        */
35830 +       assert("edward-1498",
35831 +              pos == inode->i_size &&
35832 +              pos == inode_cluster_size(inode));
35833 +       assert("edward-1539", cont != NULL);
35834 +       assert("edward-1540", cont->state == PSCHED_INVAL_STATE);
35835 +       /* report the scheduling point to the caller through @cont */
35836 +       cont->state = PSCHED_SCHED_POINT;
35837 +       return 0;
35838 +}
35839 +
35840 +static void start_check_compressibility(struct inode * inode,
35841 +                                       struct cluster_handle * clust,
35842 +                                       hint_t * hint)
35843 +{
35844 +       assert("edward-1507", clust->index == 1);
35845 +       assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
35846 +       assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
35847 +       /* retarget @clust: zeroed @hint, previous logical cluster */
35848 +       hint_init_zero(hint);
35849 +       clust->hint = hint;
35850 +       clust->index --;
35851 +       clust->nr_pages = size_in_pages(lbytes(clust->index, inode));
35852 +
35853 +       /* first logical cluster (of index #0) must be complete */
35854 +       assert("edward-1510", lbytes(clust->index, inode) ==
35855 +              inode_cluster_size(inode));
35856 +}
35857 +
35858 +static void finish_check_compressibility(struct inode * inode,
35859 +                                        struct cluster_handle * clust,
35860 +                                        hint_t * hint)
35861 +{
35862 +       reiser4_unset_hint(clust->hint); /* the hint currently attached */
35863 +       clust->hint = hint; /* attach the hint supplied by the caller */
35864 +       clust->index ++; /* reverts the decrement of start_check_compressibility() */
35865 +}
35866 +
35867 +#if REISER4_DEBUG /* check used by assertion "edward-1512" */
35868 +static int prepped_dclust_ok(hint_t * hint)
35869 +{
35870 +       reiser4_key key;
35871 +       coord_t * coord = &hint->ext_coord.coord;
35872 +       /* examine the item that the hint's coord points to */
35873 +       item_key_by_coord(coord, &key);
35874 +       return (item_id_by_coord(coord) == CTAIL_ID &&
35875 +               !coord_is_unprepped_ctail(coord) &&
35876 +               (get_key_offset(&key) + nr_units_ctail(coord) ==
35877 +                dclust_get_extension_dsize(hint)));
35878 +}
35879 +#endif /* REISER4_DEBUG */
35880 +
35881 +#define fifty_persent(size) ((size) >> 1)
35882 +/* evaluation of data compressibility: output is below half of the input */
35883 +#define data_is_compressible(osize, isize)             \
35884 +       ((osize) < fifty_persent(isize))
35885 +
35886 +/**
35887 + * A simple O(1)-heuristic for compressibility, called at most once per file.
35888 + * Read first logical cluster (of index #0), estimate its compressibility and
35889 + * save the verdict in @cont->state. NOTE(review): failure paths return
35890 + * without finish_check_compressibility(), so @clust keeps index 0 and a hint
35891 + * pointing at the local tmp_hint - confirm callers do not use them. */
35892 +static int read_check_compressibility(struct inode * inode,
35893 +                                     struct cluster_handle * clust,
35894 +                                     struct psched_context * cont)
35895 +{
35896 +       int i;
35897 +       int result;
35898 +       __u32 dst_len;
35899 +       hint_t tmp_hint;
35900 +       hint_t * cur_hint = clust->hint;
35901 +       assert("edward-1541", cont->state == PSCHED_SCHED_POINT);
35902 +
35903 +       start_check_compressibility(inode, clust, &tmp_hint);
35904 +
35905 +       reset_cluster_pgset(clust, cluster_nrpages(inode));
35906 +       result = grab_page_cluster(inode, clust, READ_OP);
35907 +       if (result)
35908 +               return result;
35909 +       /* Read page cluster here */
35910 +       for (i = 0; i < clust->nr_pages; i++) {
35911 +               struct page *page = clust->pages[i];
35912 +               lock_page(page);
35913 +               result = do_readpage_ctail(inode, clust, page,
35914 +                                          ZNODE_READ_LOCK);
35915 +               unlock_page(page);
35916 +               if (result)
35917 +                       goto error;
35918 +       }
35919 +       tfm_cluster_clr_uptodate(&clust->tc);
35920 +
35921 +       cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
35922 +
35923 +       if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
35924 +               /* length of compressed data is known, no need to compress */
35925 +               assert("edward-1511",
35926 +                      znode_is_any_locked(tmp_hint.lh.node));
35927 +               assert("edward-1512",
35928 +                      WITH_DATA(tmp_hint.ext_coord.coord.node,
35929 +                                prepped_dclust_ok(&tmp_hint)));
35930 +               dst_len = dclust_get_extension_dsize(&tmp_hint);
35931 +       }
35932 +       else {
35933 +               struct tfm_cluster * tc = &clust->tc;
35934 +               compression_plugin * cplug = inode_compression_plugin(inode);
35935 +               result = grab_tfm_stream(inode, tc, INPUT_STREAM);
35936 +               if (result)
35937 +                       goto error;
35938 +               for (i = 0; i < clust->nr_pages; i++) {
35939 +                       char *data;
35940 +                       lock_page(clust->pages[i]);
35941 +                       BUG_ON(!PageUptodate(clust->pages[i]));
35942 +                       data = kmap(clust->pages[i]);
35943 +                       memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
35944 +                              data, PAGE_CACHE_SIZE);
35945 +                       kunmap(clust->pages[i]);
35946 +                       unlock_page(clust->pages[i]);
35947 +               }
35948 +               result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
35949 +               if (result)
35950 +                       goto error;
35951 +               result = grab_coa(tc, cplug);
35952 +               if (result)
35953 +                       goto error;
35954 +               tc->len = tc->lsize = lbytes(clust->index, inode);
35955 +               assert("edward-1513", tc->len == inode_cluster_size(inode));
35956 +               dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
35957 +               cplug->compress(get_coa(tc, cplug->h.id, tc->act),
35958 +                               tfm_input_data(clust), tc->len,
35959 +                               tfm_output_data(clust), &dst_len);
35960 +               assert("edward-1514",
35961 +                      dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
35962 +       }
35963 +       finish_check_compressibility(inode, clust, cur_hint);
35964 +       cont->state =
35965 +               (data_is_compressible(dst_len, inode_cluster_size(inode)) ?
35966 +                PSCHED_REMAINS_OLD :
35967 +                PSCHED_ASSIGNED_NEW);
35968 +       return 0;
35969 + error:
35970 +       put_page_cluster(clust, inode, READ_OP);
35971 +       return result;
35972 +}
35973 +
35974 +/* Cut disk cluster of index @idx: one cluster size of keys from its offset */
35975 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
35976 +{
35977 +       reiser4_key from, to;
35978 +       assert("edward-1515", inode_file_plugin(inode) ==
35979 +              file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35980 +       key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
35981 +       to = from; /* same key, only the offset is changed below */
35982 +       set_key_offset(&to,
35983 +                      get_key_offset(&from) + inode_cluster_size(inode) - 1);
35984 +       return reiser4_cut_tree(reiser4_tree_by_inode(inode),
35985 +                               &from, &to, inode, 0);
35986 +}
35987 +
35988 +static int reserve_cryptcompress2unixfile(struct inode *inode)
35989 +{
35990 +       reiser4_block_nr unformatted_nodes;
35991 +       reiser4_tree *tree;
35992 +       /* grab the disk space needed by cryptcompress2unixfile() */
35993 +       tree = reiser4_tree_by_inode(inode);
35994 +
35995 +       /* number of unformatted nodes which will be created */
35996 +       unformatted_nodes = cluster_nrpages(inode); /* N */
35997 +
35998 +       /*
35999 +        * space required for converting one logical cluster (N pages):
36000 +        *
36001 +        *     1. kill ctail items
36002 +        *
36003 +        *     2. insert N unformatted nodes
36004 +        *
36005 +        *     3. insert N (worst-case single-block
36006 +        *     extents) extent units.
36007 +        *
36008 +        *     4. drilling to the leaf level by coord_by_key()
36009 +        *
36010 +        *     5. possible update of stat-data
36011 +        *
36012 +        */
36013 +       grab_space_enable();
36014 +       return reiser4_grab_space
36015 +               (2 * tree->height +
36016 +                unformatted_nodes  +
36017 +                unformatted_nodes * estimate_one_insert_into_item(tree) +
36018 +                1 + estimate_one_insert_item(tree) +
36019 +                inode_file_plugin(inode)->estimate.update(inode),
36020 +                BA_CAN_COMMIT);
36021 +}
36022 +
36023 +/**
36024 + * Convert cryptcompress file plugin to unix_file plugin; the pages held in
36025 + * @cont are re-inserted as extents. Grabbed space is freed on all paths. */
36026 +static int cryptcompress2unixfile(struct file * file, struct inode * inode,
36027 +                                 struct psched_context * cont)
36028 +{
36029 +       int i;
36030 +       int result = 0;
36031 +       struct cryptcompress_info *cr_info;
36032 +       struct unix_file_info *uf_info;
36033 +       assert("edward-1516", cont->pages[0]->index == 0);
36034 +
36035 +       /* NOTE(review): cr_info is fetched but not used below in this function */
36036 +       cr_info = cryptcompress_inode_data(inode);
36037 +       result = reserve_cryptcompress2unixfile(inode);
36038 +       if (result)
36039 +               goto out;
36040 +       /* tell kill_hook to not truncate pages (flag is not cleared here) */
36041 +       reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
36042 +       result = cut_disk_cluster(inode, 0);
36043 +       if (result)
36044 +               goto out;
36045 +       /* captured jnode of cluster and associated resources (pages,
36046 +          reserved disk space) were released by ->kill_hook() method
36047 +          of the item plugin */
36048 +
36049 +       result = __cryptcompress2unixfile(file, inode);
36050 +       if (result)
36051 +               goto out;
36052 +       /* At this point file is managed by unix file plugin */
36053 +
36054 +       uf_info = unix_file_inode_data(inode);
36055 +
36056 +       assert("edward-1518",
36057 +              ergo(jprivate(cont->pages[0]),
36058 +                   !jnode_is_cluster_page(jprivate(cont->pages[0]))));
36059 +       for(i = 0; i < cont->nr_pages; i++) {
36060 +               assert("edward-1519", cont->pages[i]);
36061 +               assert("edward-1520", PageUptodate(cont->pages[i]));
36062 +
36063 +               result = find_or_create_extent(cont->pages[i]);
36064 +               if (result)
36065 +                       break;
36066 +       }
36067 +       if (unlikely(result))
36068 +               goto out;
36069 +       uf_info->container = UF_CONTAINER_EXTENTS;
36070 +       result = reiser4_update_sd(inode);
36071 + out:
36072 +       all_grabbed2free();
36073 +       return result;
36074 +}
36075 +
36076 +#define convert_file_plugin cryptcompress2unixfile /* alias used by callers */
36077 +
36078 +/**
36079 + * This is called by ->write() method of a cryptcompress file plugin.
36080 + * Make a decision about the most reasonable file plugin id to manage
36081 + * the file; the decision is left in cont->state. Returns 0 or an error.
36082 + */
36083 +int write_pschedule_hook(struct file * file, struct inode * inode,
36084 +                        loff_t pos, struct cluster_handle * clust,
36085 +                        struct psched_context * cont)
36086 +{
36087 +       int result;
36088 +       if (!conversion_enabled(inode))
36089 +               return 0;
36090 +       result = check_psched_point(inode, pos, clust, cont);
36091 +       if (result || cont->state != PSCHED_SCHED_POINT)
36092 +               return result;
36093 +       result = read_check_compressibility(inode, clust, cont);
36094 +       if (result)
36095 +               return result;
36096 +       if (cont->state == PSCHED_REMAINS_OLD) {
36097 +               put_page_cluster(clust, inode, READ_OP);
36098 +               return disable_conversion(inode);
36099 +       }
36100 +       assert("edward-1543", cont->state == PSCHED_ASSIGNED_NEW);
36101 +       /*
36102 +        * page cluster is grabbed and uptodate. It will be
36103 +        * released with a pgset after plugin conversion is
36104 +        * finished, see put_psched_context().
36105 +        */
36106 +       reiser4_unset_hint(clust->hint);
36107 +       move_cluster_pgset(clust, &cont->pages, &cont->nr_pages);
36108 +       return 0;
36109 +}
36110 +
36111 +/**
36112 + * Called by ->setattr() of cryptcompress file plugin: turns conversion off
36113 + * if it is still enabled; returns 0 or disable_conversion()'s result. */
36114 +int setattr_pschedule_hook(struct inode * inode)
36115 +{
36116 +       if (conversion_enabled(inode))
36117 +               return disable_conversion(inode);
36118 +       return 0;
36119 +}
36120 +
36121 +static inline void init_psched_context(struct psched_context * cont)
36122 +{
36123 +       memset(cont, 0, sizeof(*cont)); /* zeroes every field, ->pages included */
36124 +}
36125 +
36126 +static inline void done_psched_context(struct psched_context * cont,
36127 +                                      struct inode * inode)
36128 +{
36129 +       if (cont->pages) { /* nothing to release when no page set is attached */
36130 +               __put_page_cluster(0, cont->nr_pages, cont->pages, inode);
36131 +               kfree(cont->pages); /* then free the pointer array itself */
36132 +       }
36133 +}
36134 +/**
36135 + * Here are wrappers with "protection", aka Reiser4 "careful" methods.
36136 + * They are used by vfs (as methods of file_ops, inode_ops or as_ops),
36137 + * which is not aware of plugin conversion performed by Reiser4.
36138 + */
36139 +
36140 +/*
36141 + * Wrappers with active protection for:
36142 + *
36143 + * ->write();
36144 + */
36145 +
36146 +/*
36147 + * ->write() file operation supplied to VFS.
36148 + * Write a file in 3 steps (some of them can be optional).
36149 + */
36150 +ssize_t reiser4_write_careful(struct file *file, const char __user *buf,
36151 +                             size_t count, loff_t *off)
36152 +{
36153 +       int result;
36154 +       reiser4_context *ctx;
36155 +       ssize_t written_old = 0; /* bytes written with initial plugin */
36156 +       ssize_t written_new = 0; /* bytes written with new plugin */
36157 +       struct psched_context cont;
36158 +       struct inode * inode = file->f_dentry->d_inode;
36159 +
36160 +       ctx = reiser4_init_context(inode->i_sb);
36161 +       if (IS_ERR(ctx))
36162 +               return PTR_ERR(ctx);
36163 +       init_psched_context(&cont);
36164 +       mutex_lock(&inode->i_mutex);
36165 +       /**
36166 +        * First step.
36167 +        * Start write with initial file plugin.
36168 +        * Keep a plugin schedule status at @cont (if any).
36169 +        */
36170 +       written_old = inode_file_plugin(inode)->write(file,
36171 +                                                     buf,
36172 +                                                     count,
36173 +                                                     off,
36174 +                                                     &cont);
36175 +       if (cont.state != PSCHED_ASSIGNED_NEW || written_old < 0)
36176 +               goto exit;
36177 +       /**
36178 +        * Second step.
36179 +        * New file plugin has been scheduled.
36180 +        * Perform conversion to the new plugin.
36181 +        */
36182 +       down_read(&reiser4_inode_data(inode)->conv_sem);
36183 +       result = convert_file_plugin(file, inode, &cont);
36184 +       up_read(&reiser4_inode_data(inode)->conv_sem);
36185 +       if (result) {
36186 +               warning("edward-1544",
36187 +                       "Inode %llu: file plugin conversion failed (%d)",
36188 +                       (unsigned long long)get_inode_oid(inode),
36189 +                       result);
36190 +               context_set_commit_async(ctx);
36191 +               goto exit;
36192 +       }
36193 +       reiser4_txn_restart(ctx);
36194 +       /**
36195 +        * Third step:
36196 +        * Finish write with the new file plugin.
36197 +        */
36198 +       assert("edward-1536",
36199 +              inode_file_plugin(inode) ==
36200 +              file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
36201 +
36202 +       written_new = inode_file_plugin(inode)->write(file,
36203 +                                                     buf + written_old,
36204 +                                                     count - written_old,
36205 +                                                     off,
36206 +                                                     NULL);
36207 + exit:
36208 +       mutex_unlock(&inode->i_mutex);
36209 +       done_psched_context(&cont, inode);
36210 +       reiser4_exit_context(ctx);
36211 +
36212 +       return written_old + (written_new < 0 ? 0 : written_new);
36213 +}
36214 +
36215 +/* Wrappers with passive protection for:
36216 + *
36217 + * ->open();
36218 + * ->read();
36219 + * ->ioctl();
36220 + * ->mmap();
36221 + * ->release();
36222 + * ->bmap().
36223 + */
36224 +
36225 +int reiser4_open_careful(struct inode *inode, struct file *file)
36226 +{
36227 +       return PROT_PASSIVE(int, open, (inode, file));
36228 +}
36229 +
36230 +ssize_t reiser4_read_careful(struct file * file, char __user * buf,
36231 +                            size_t size, loff_t * off)
36232 +{
36233 +       struct inode * inode = file->f_dentry->d_inode;
36234 +       return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
36235 +}
36236 +
36237 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36238 +                         unsigned int cmd, unsigned long arg)
36239 +{
36240 +       return PROT_PASSIVE(int, ioctl, (inode, filp, cmd, arg));
36241 +}
36242 +
36243 +int reiser4_mmap_careful(struct file *file, struct vm_area_struct *vma)
36244 +{
36245 +       struct inode *inode = file->f_dentry->d_inode;
36246 +       return PROT_PASSIVE(int, mmap, (file, vma));
36247 +}
36248 +
36249 +int reiser4_release_careful(struct inode *inode, struct file *file)
36250 +{
36251 +       return PROT_PASSIVE(int, release, (inode, file));
36252 +}
36253 +
36254 +sector_t reiser4_bmap_careful(struct address_space * mapping, sector_t lblock)
36255 +{
36256 +       struct inode *inode = mapping->host;
36257 +       return PROT_PASSIVE(sector_t, bmap, (mapping, lblock));
36258 +}
36259 +
36260 +/*
36261 + * Wrappers without protection for:
36262 + *
36263 + * ->setattr()
36264 + */
36265 +int reiser4_setattr(struct dentry *dentry, struct iattr *attr)
36266 +{
36267 +       return inode_file_plugin(dentry->d_inode)->setattr(dentry, attr);
36268 +}
36269 +
36270 +/*
36271 +  Local variables:
36272 +  c-indentation-style: "K&R"
36273 +  mode-name: "LC"
36274 +  c-basic-offset: 8
36275 +  tab-width: 8
36276 +  fill-column: 80
36277 +  scroll-step: 1
36278 +  End:
36279 +*/
36280 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/file.h linux-2.6.27/fs/reiser4/plugin/file/file.h
36281 --- linux-2.6.27.orig/fs/reiser4/plugin/file/file.h     1970-01-01 03:00:00.000000000 +0300
36282 +++ linux-2.6.27/fs/reiser4/plugin/file/file.h  2008-10-12 18:20:01.000000000 +0400
36283 @@ -0,0 +1,331 @@
36284 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
36285 + * reiser4/README */
36286 +
36287 +/* this file contains declarations of methods implementing
36288 +   file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
36289 +   and SYMLINK_FILE_PLUGIN_ID) */
36290 +
36291 +#if !defined( __REISER4_FILE_H__ )
36292 +#define __REISER4_FILE_H__
36293 +
36294 +/* possible states when scheduling a new file plugin */
36295 +typedef enum {
36296 +       PSCHED_INVAL_STATE,    /* invalid state */
36297 +       PSCHED_SCHED_POINT,    /* scheduling point has been achieved */
36298 +       PSCHED_REMAINS_OLD,    /* made a decision to be managed by old plugin */
36299 +       PSCHED_ASSIGNED_NEW    /* new plugin has been scheduled */
36300 +} psched_state;
36301 +
36302 +struct psched_context {
36303 +       int nr_pages;
36304 +       struct page **pages;
36305 +       psched_state state;
36306 +};
36307 +
36308 +/**
36309 + * Declarations of common/careful/generic methods.
36310 + * Suppose ->foo() is a vfs method (of f_ops, i_ops, or a_ops);
36311 + * Then common reiser4 method for foo looks like reiser4_foo_common;
36312 + * careful method looks like reiser4_foo_careful;
36313 + * generic method looks like reiser4_foo.
36314 + *
36315 + * Common method is a simple instruction set eligible for more
36316 + * than one plugin id.
36317 + *
36318 + * Generic method looks at the plugin installed in inode's
36319 + * plugin set and calls its appropriate method.
36320 + *
36321 + * Careful method looks like generic method with protected pset
36322 + * (see plugin/file/file_conversion.c for details).
36323 + */
36324 +
36325 +/* inode operations */
36326 +int reiser4_setattr(struct dentry *, struct iattr *);
36327 +
36328 +/* file operations */
36329 +ssize_t reiser4_read_careful(struct file *, char __user *buf,
36330 +                            size_t count, loff_t *off);
36331 +ssize_t reiser4_write_careful(struct file *, const char __user *buf,
36332 +                             size_t count, loff_t * off);
36333 +int reiser4_ioctl_careful(struct inode *inode, struct file *filp,
36334 +                         unsigned int cmd, unsigned long arg);
36335 +int reiser4_mmap_careful(struct file *, struct vm_area_struct *);
36336 +int reiser4_open_careful(struct inode *inode, struct file *file);
36337 +int reiser4_release_careful(struct inode *, struct file *);
36338 +int reiser4_sync_file_common(struct file *, struct dentry *, int datasync);
36339 +
36340 +/* address space operations */
36341 +int reiser4_readpage(struct file *, struct page *);
36342 +int reiser4_readpages(struct file*, struct address_space*, struct list_head*,
36343 +                     unsigned);
36344 +int reiser4_writepages(struct address_space *, struct writeback_control *);
36345 +int reiser4_prepare_write(struct file *, struct page *, unsigned from,
36346 +                         unsigned to);
36347 +int reiser4_commit_write(struct file *, struct page *, unsigned from,
36348 +                        unsigned to);
36349 +sector_t reiser4_bmap_careful(struct address_space *, sector_t lblock);
36350 +
36351 +/*
36352 + * Private methods of unix-file plugin
36353 + * (UNIX_FILE_PLUGIN_ID)
36354 + */
36355 +
36356 +/* private inode operations */
36357 +int setattr_unix_file(struct dentry *, struct iattr *);
36358 +
36359 +/* private file operations */
36360 +
36361 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
36362 +                      loff_t *off);
36363 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
36364 +                       loff_t * off, struct psched_context * cont);
36365 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
36366 +                   unsigned long arg);
36367 +int mmap_unix_file(struct file *, struct vm_area_struct *);
36368 +int open_unix_file(struct inode *, struct file *);
36369 +int release_unix_file(struct inode *, struct file *);
36370 +
36371 +/* private address space operations */
36372 +int readpage_unix_file(struct file *, struct page *);
36373 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
36374 +int writepages_unix_file(struct address_space *, struct writeback_control *);
36375 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
36376 +                           unsigned to);
36377 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
36378 +                          unsigned to);
36379 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
36380 +
36381 +/* other private methods */
36382 +int delete_object_unix_file(struct inode *);
36383 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
36384 +                           int user, loff_t, loff_t, rw_op, flow_t *);
36385 +int owns_item_unix_file(const struct inode *, const coord_t *);
36386 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
36387 +                              int create);
36388 +
36389 +/*
36390 + * Private methods of cryptcompress file plugin
36391 + * (CRYPTCOMPRESS_FILE_PLUGIN_ID)
36392 + */
36393 +
36394 +/* private inode operations */
36395 +int setattr_cryptcompress(struct dentry *, struct iattr *);
36396 +
36397 +/* private file operations */
36398 +ssize_t read_cryptcompress(struct file *, char __user *buf,
36399 +                          size_t count, loff_t *off);
36400 +ssize_t write_cryptcompress(struct file *, const char __user *buf,
36401 +                           size_t count, loff_t * off,
36402 +                           struct psched_context *cont);
36403 +int ioctl_cryptcompress(struct inode *, struct file *, unsigned int cmd,
36404 +                       unsigned long arg);
36405 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
36406 +int open_cryptcompress(struct inode *, struct file *);
36407 +int release_cryptcompress(struct inode *, struct file *);
36408 +
36409 +/* private address space operations */
36410 +int readpage_cryptcompress(struct file *, struct page *);
36411 +int readpages_cryptcompress(struct file*, struct address_space*,
36412 +                           struct list_head*, unsigned);
36413 +int writepages_cryptcompress(struct address_space *,
36414 +                            struct writeback_control *);
36415 +int prepare_write_cryptcompress(struct file *, struct page *, unsigned from,
36416 +                               unsigned to);
36417 +int commit_write_cryptcompress(struct file *, struct page *, unsigned from,
36418 +                              unsigned to);
36419 +sector_t bmap_cryptcompress(struct address_space *, sector_t lblock);
36420 +
36421 +/* other private methods */
36422 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
36423 +                               int user, loff_t, loff_t, rw_op, flow_t *);
36424 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
36425 +int create_object_cryptcompress(struct inode *, struct inode *,
36426 +                               reiser4_object_create_data *);
36427 +int delete_object_cryptcompress(struct inode *);
36428 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
36429 +                                  int create);
36430 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
36431 +                                 const reiser4_key * to_key,
36432 +                                 reiser4_key * smallest_removed,
36433 +                                 struct inode *object, int truncate,
36434 +                                 int *progress);
36435 +void destroy_inode_cryptcompress(struct inode *);
36436 +
36437 +/*
36438 + * Private methods of symlink file plugin
36439 + * (SYMLINK_FILE_PLUGIN_ID)
36440 + */
36441 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
36442 +                          reiser4_object_create_data *);
36443 +void destroy_inode_symlink(struct inode *);
36444 +
36445 +/*
36446 + * all the write into unix file is performed by item write method. Write method
36447 + * of unix file plugin only decides which item plugin (extent or tail) and in
36448 + * which mode (one from the enum below) to call
36449 + */
36450 +typedef enum {
36451 +       FIRST_ITEM = 1,
36452 +       APPEND_ITEM = 2,
36453 +       OVERWRITE_ITEM = 3
36454 +} write_mode_t;
36455 +
36456 +/* unix file may be in one of the following states */
36457 +typedef enum {
36458 +       UF_CONTAINER_UNKNOWN = 0,
36459 +       UF_CONTAINER_TAILS = 1,
36460 +       UF_CONTAINER_EXTENTS = 2,
36461 +       UF_CONTAINER_EMPTY = 3
36462 +} file_container_t;
36463 +
36464 +struct formatting_plugin;
36465 +struct inode;
36466 +
36467 +/* unix file plugin specific part of reiser4 inode */
36468 +struct unix_file_info {
36469 +       /*
36470 +        * this read-write lock protects file containerization change. Accesses
36471 +        * which do not change file containerization (see file_container_t)
36472 +        * (read, readpage, writepage, write (until tail conversion is
36473 +        * involved)) take read-lock. Accesses which modify file
36474 +        * containerization (truncate, conversion from tail to extent and back)
36475 +        * take write-lock.
36476 +        */
36477 +       struct rw_semaphore latch;
36478 +       /* this enum specifies which items are used to build the file */
36479 +       file_container_t container;
36480 +       /*
36481 +        * plugin which controls when file is to be converted to extents and
36482 +        * back to tail
36483 +        */
36484 +       struct formatting_plugin *tplug;
36485 +       /* if this is set, file is in exclusive use */
36486 +       int exclusive_use;
36487 +#if REISER4_DEBUG
36488 +       /* pointer to task struct of thread owning exclusive access to file */
36489 +       void *ea_owner;
36490 +       atomic_t nr_neas;
36491 +       void *last_reader;
36492 +#endif
36493 +};
36494 +
36495 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
36496 +void get_exclusive_access(struct unix_file_info *);
36497 +void drop_exclusive_access(struct unix_file_info *);
36498 +void get_nonexclusive_access(struct unix_file_info *);
36499 +void drop_nonexclusive_access(struct unix_file_info *);
36500 +int try_to_get_nonexclusive_access(struct unix_file_info *);
36501 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
36502 +                  struct inode *);
36503 +int find_file_item_nohint(coord_t *, lock_handle *,
36504 +                         const reiser4_key *, znode_lock_mode,
36505 +                         struct inode *);
36506 +
36507 +int load_file_hint(struct file *, hint_t *);
36508 +void save_file_hint(struct file *, const hint_t *);
36509 +
36510 +#include "../item/extent.h"
36511 +#include "../item/tail.h"
36512 +#include "../item/ctail.h"
36513 +
36514 +struct uf_coord {
36515 +       coord_t coord;
36516 +       lock_handle *lh;
36517 +       int valid;
36518 +       union {
36519 +               struct extent_coord_extension extent;
36520 +               struct tail_coord_extension tail;
36521 +               struct ctail_coord_extension ctail;
36522 +       } extension;
36523 +};
36524 +
36525 +#include "../../forward.h"
36526 +#include "../../seal.h"
36527 +#include "../../lock.h"
36528 +
36529 +/*
36530 + * This structure is used to speed up file operations (reads and writes).  A
36531 + * hint is a suggestion about where a key resolved to last time.  A seal
36532 + * indicates whether a node has been modified since a hint was last recorded.
36533 + * You check the seal, and if the seal is still valid, you can use the hint
36534 + * without traversing the tree again.
36535 + */
36536 +struct hint {
36537 +       seal_t seal; /* a seal over last file item accessed */
36538 +       uf_coord_t ext_coord;
36539 +       loff_t offset;
36540 +       znode_lock_mode mode;
36541 +       lock_handle lh;
36542 +};
36543 +
36544 +static inline int hint_is_valid(hint_t * hint)
36545 +{
36546 +       return hint->ext_coord.valid;
36547 +}
36548 +
36549 +static inline void hint_set_valid(hint_t * hint)
36550 +{
36551 +       hint->ext_coord.valid = 1;
36552 +}
36553 +
36554 +static inline void hint_clr_valid(hint_t * hint)
36555 +{
36556 +       hint->ext_coord.valid = 0;
36557 +}
36558 +
36559 +int load_file_hint(struct file *, hint_t *);
36560 +void save_file_hint(struct file *, const hint_t *);
36561 +void hint_init_zero(hint_t *);
36562 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
36563 +int hint_is_set(const hint_t *);
36564 +void reiser4_unset_hint(hint_t *);
36565 +
36566 +int reiser4_update_file_size(struct inode *, loff_t, int update_sd);
36567 +int cut_file_items(struct inode *, loff_t new_size,
36568 +                  int update_sd, loff_t cur_size,
36569 +                  int (*update_actor) (struct inode *, loff_t, int));
36570 +#if REISER4_DEBUG
36571 +
36572 +/* return 1 if exclusive access is obtained, 0 otherwise */
36573 +static inline int ea_obtained(struct unix_file_info * uf_info)
36574 +{
36575 +       int ret;
36576 +
36577 +       ret = down_read_trylock(&uf_info->latch);
36578 +       if (ret)
36579 +               up_read(&uf_info->latch);
36580 +       return !ret;
36581 +}
36582 +
36583 +#endif
36584 +
36585 +#define WRITE_GRANULARITY 32
36586 +
36587 +int tail2extent(struct unix_file_info *);
36588 +int extent2tail(struct file *, struct unix_file_info *);
36589 +
36590 +int goto_right_neighbor(coord_t *, lock_handle *);
36591 +int find_or_create_extent(struct page *);
36592 +int equal_to_ldk(znode *, const reiser4_key *);
36593 +
36594 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
36595 +
36596 +static inline int cbk_errored(int cbk_result)
36597 +{
36598 +       return (cbk_result != CBK_COORD_NOTFOUND
36599 +               && cbk_result != CBK_COORD_FOUND);
36600 +}
36601 +
36602 +/* __REISER4_FILE_H__ */
36603 +#endif
36604 +
36605 +/*
36606 + * Local variables:
36607 + * c-indentation-style: "K&R"
36608 + * mode-name: "LC"
36609 + * c-basic-offset: 8
36610 + * tab-width: 8
36611 + * fill-column: 79
36612 + * scroll-step: 1
36613 + * End:
36614 +*/
36615 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/Makefile linux-2.6.27/fs/reiser4/plugin/file/Makefile
36616 --- linux-2.6.27.orig/fs/reiser4/plugin/file/Makefile   1970-01-01 03:00:00.000000000 +0300
36617 +++ linux-2.6.27/fs/reiser4/plugin/file/Makefile        2008-10-12 18:20:01.000000000 +0400
36618 @@ -0,0 +1,7 @@
36619 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
36620 +
36621 +file_plugins-objs :=           \
36622 +       file.o                  \
36623 +       tail_conversion.o       \
36624 +       symlink.o               \
36625 +       cryptcompress.o
36626 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/symfile.c linux-2.6.27/fs/reiser4/plugin/file/symfile.c
36627 --- linux-2.6.27.orig/fs/reiser4/plugin/file/symfile.c  1970-01-01 03:00:00.000000000 +0300
36628 +++ linux-2.6.27/fs/reiser4/plugin/file/symfile.c       2008-10-12 18:20:01.000000000 +0400
36629 @@ -0,0 +1,87 @@
36630 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36631 +
36632 +/* Symfiles are a generalization of Unix symlinks.
36633 +
36634 +   A symfile when read behaves as though you took its contents and
36635 +   substituted them into the reiser4 naming system as the right hand side
36636 +   of an assignment, and then read that which you had assigned to it.
36637 +
36638 +   A key issue for symfiles is how to implement writes through to
36639 +   subfiles.  In general, one must have some method of determining what
36640 +   of that which is written to the symfile is written to what subfile.
36641 +   This can be done by use of custom plugin methods written by users, or
36642 +   by using a few general methods we provide for those willing to endure
36643 +   the insertion of delimiters into what is read.
36644 +
36645 +   Writing to symfiles without delimiters to denote what is written to
36646 +   what subfile is not supported by any plugins we provide in this
36647 +   release.  Our most sophisticated support for writes is that embodied
36648 +   by the invert plugin (see invert.c).
36649 +
36650 +   A read only version of the /etc/passwd file might be
36651 +   constructed as a symfile whose contents are as follows:
36652 +
36653 +   /etc/passwd/userlines/*
36654 +
36655 +   or
36656 +
36657 +   /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
36658 +
36659 +   or
36660 +
36661 +   /etc/passwd/userlines/(demidov+edward+reiser+root)
36662 +
36663 +   A symfile with contents
36664 +
36665 +   /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
36666 +
36667 +   will return when read
36668 +
36669 +   The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
36670 +
36671 +   and write of what has been read will not be possible to implement as
36672 +   an identity operation because there are no delimiters denoting the
36673 +   boundaries of what is to be written to what subfile.
36674 +
36675 +   Note that one could make this a read/write symfile if one specified
36676 +   delimiters, and the write method understood those delimiters delimited
36677 +   what was written to subfiles.
36678 +
36679 +   So, specifying the symfile in a manner that allows writes:
36680 +
36681 +   /etc/passwd/userlines/demidov+"(
36682 +   )+/etc/passwd/userlines/edward+"(
36683 +   )+/etc/passwd/userlines/reiser+"(
36684 +   )+/etc/passwd/userlines/root+"(
36685 +   )
36686 +
36687 +   or
36688 +
36689 +   /etc/passwd/userlines/(demidov+"(
36690 +   )+edward+"(
36691 +   )+reiser+"(
36692 +   )+root+"(
36693 +   ))
36694 +
36695 +   and the file demidov might be specified as:
36696 +
36697 +   /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
36698 +
36699 +   or
36700 +
36701 +   /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
36702 +
36703 +   Notice that if the file demidov has a carriage return in it, the
36704 +   parsing fails, but then if you put carriage returns in the wrong place
36705 +   in a normal /etc/passwd file it breaks things also.
36706 +
36707 +   Note that it is forbidden to have no text between two interpolations
36708 +   if one wants to be able to define what parts of a write go to what
36709 +   subfiles referenced in an interpolation.
36710 +
36711 +   If one wants to be able to add new lines by writing to the file, one
36712 +   must either write a custom plugin for /etc/passwd that knows how to
36713 +   name an added line, or one must use an invert, or one must use a more
36714 +   sophisticated symfile syntax that we are not planning to write for
36715 +   version 4.0.
36716 +*/
36717 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/symlink.c linux-2.6.27/fs/reiser4/plugin/file/symlink.c
36718 --- linux-2.6.27.orig/fs/reiser4/plugin/file/symlink.c  1970-01-01 03:00:00.000000000 +0300
36719 +++ linux-2.6.27/fs/reiser4/plugin/file/symlink.c       2008-10-12 18:20:01.000000000 +0400
36720 @@ -0,0 +1,95 @@
36721 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
36722 +
36723 +#include "../../inode.h"
36724 +
36725 +#include <linux/types.h>
36726 +#include <linux/fs.h>
36727 +
36728 +/* file plugin methods specific for symlink files
36729 +   (SYMLINK_FILE_PLUGIN_ID) */
36730 +
36731 +/* this is implementation of create_object method of file plugin for
36732 +   SYMLINK_FILE_PLUGIN_ID
36733 + */
36734 +
36735 +/**
36736 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
36737 + * @symlink: inode of symlink object
36738 + * @dir: inode of parent directory
36739 + * @data: parameters of new object
36740 + *
36741 + * Inserts stat data with symlink extension into the tree.
36742 + */
36743 +int reiser4_create_symlink(struct inode *symlink,
36744 +                          struct inode *dir UNUSED_ARG,
36745 +                          reiser4_object_create_data *data /* info passed to us
36746 +                                                            * this is filled by
36747 +                                                            * reiser4() syscall
36748 +                                                            * in particular */)
36749 +{
36750 +       int result;
36751 +
36752 +       assert("nikita-680", symlink != NULL);
36753 +       assert("nikita-681", S_ISLNK(symlink->i_mode));
36754 +       assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
36755 +       assert("nikita-682", dir != NULL);
36756 +       assert("nikita-684", data != NULL);
36757 +       assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
36758 +
36759 +       /*
36760 +        * stat data of symlink has symlink extension in which we store
36761 +        * symlink content, that is, path symlink is pointing to.
36762 +        */
36763 +       reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
36764 +
36765 +       assert("vs-838", symlink->i_private == NULL);
36766 +       symlink->i_private = (void *)data->name;
36767 +
36768 +       assert("vs-843", symlink->i_size == 0);
36769 +       INODE_SET_FIELD(symlink, i_size, strlen(data->name));
36770 +
36771 +       /* insert stat data appended with data->name */
36772 +       result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
36773 +       if (result) {
36774 +               /* FIXME-VS: Make sure that symlink->i_private is not attached
36775 +                  to kmalloced data */
36776 +               INODE_SET_FIELD(symlink, i_size, 0);
36777 +       } else {
36778 +               assert("vs-849", symlink->i_private
36779 +                      && reiser4_inode_get_flag(symlink,
36780 +                                                REISER4_GENERIC_PTR_USED));
36781 +               assert("vs-850",
36782 +                      !memcmp((char *)symlink->i_private, data->name,
36783 +                              (size_t) symlink->i_size + 1));
36784 +       }
36785 +       return result;
36786 +}
36787 +
36788 +/* this is implementation of destroy_inode method of file plugin for
36789 +   SYMLINK_FILE_PLUGIN_ID
36790 + */
36791 +void destroy_inode_symlink(struct inode *inode)
36792 +{
36793 +       assert("edward-799",
36794 +              inode_file_plugin(inode) ==
36795 +              file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
36796 +       assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
36797 +       assert("edward-801", reiser4_inode_get_flag(inode,
36798 +                                                   REISER4_GENERIC_PTR_USED));
36799 +       assert("vs-839", S_ISLNK(inode->i_mode));
36800 +
36801 +       kfree(inode->i_private);
36802 +       inode->i_private = NULL;
36803 +       reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
36804 +}
36805 +
36806 +/*
36807 +  Local variables:
36808 +  c-indentation-style: "K&R"
36809 +  mode-name: "LC"
36810 +  c-basic-offset: 8
36811 +  tab-width: 8
36812 +  fill-column: 80
36813 +  scroll-step: 1
36814 +  End:
36815 +*/
36816 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file/tail_conversion.c linux-2.6.27/fs/reiser4/plugin/file/tail_conversion.c
36817 --- linux-2.6.27.orig/fs/reiser4/plugin/file/tail_conversion.c  1970-01-01 03:00:00.000000000 +0300
36818 +++ linux-2.6.27/fs/reiser4/plugin/file/tail_conversion.c       2008-10-12 18:20:01.000000000 +0400
36819 @@ -0,0 +1,737 @@
36820 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
36821 +
36822 +#include "../../inode.h"
36823 +#include "../../super.h"
36824 +#include "../../page_cache.h"
36825 +#include "../../carry.h"
36826 +#include "../../safe_link.h"
36827 +#include "../../vfs_ops.h"
36828 +
36829 +#include <linux/writeback.h>
36830 +
36831 +/* this file contains:
36832 +   tail2extent and extent2tail */
36833 +
36834 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
36835 +void get_exclusive_access(struct unix_file_info * uf_info)
36836 +{
36837 +       assert("nikita-3028", reiser4_schedulable());
36838 +       assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
36839 +       assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
36840 +       /*
36841 +        * "deadlock avoidance": sometimes we commit a transaction under
36842 +        * rw-semaphore on a file. Such commit can deadlock with another
36843 +        * thread that captured some block (hence preventing atom from being
36844 +        * committed) and waits on rw-semaphore.
36845 +        */
36846 +       reiser4_txn_restart_current();
36847 +       LOCK_CNT_INC(inode_sem_w);
36848 +       down_write(&uf_info->latch);
36849 +       uf_info->exclusive_use = 1;
36850 +       assert("vs-1713", uf_info->ea_owner == NULL);
36851 +       assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
36852 +       ON_DEBUG(uf_info->ea_owner = current);
36853 +}
36854 +
36855 +void drop_exclusive_access(struct unix_file_info * uf_info)
36856 +{
36857 +       assert("vs-1714", uf_info->ea_owner == current);
36858 +       assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
36859 +       ON_DEBUG(uf_info->ea_owner = NULL);
36860 +       uf_info->exclusive_use = 0;
36861 +       up_write(&uf_info->latch);
36862 +       assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
36863 +       assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
36864 +       LOCK_CNT_DEC(inode_sem_w);
36865 +       reiser4_txn_restart_current();
36866 +}
36867 +
36868 +/**
36869 + * nea_grabbed - do something when file semaphore is down_read-ed
36870 + * @uf_info: unix file specific part of inode
36871 + *
36872 + * This is called when nonexclusive access is obtained on file. All it does is
36873 + * for debugging purposes.
36874 + */
36875 +static void nea_grabbed(struct unix_file_info *uf_info)
36876 +{
36877 +#if REISER4_DEBUG
36878 +       LOCK_CNT_INC(inode_sem_r);
36879 +       assert("vs-1716", uf_info->ea_owner == NULL);
36880 +       atomic_inc(&uf_info->nr_neas);
36881 +       uf_info->last_reader = current;
36882 +#endif
36883 +}
36884 +
36885 +/**
36886 + * get_nonexclusive_access - get nonexclusive access to a file
36887 + * @uf_info: unix file specific part of inode to obtain access to
36888 + *
36889 + * Takes the file latch for read; this is done before read, write, readpage.
36890 + */
36891 +void get_nonexclusive_access(struct unix_file_info *uf_info)
36892 +{
36893 +       assert("nikita-3029", reiser4_schedulable());
36894 +       assert("nikita-3361", get_current_context()->trans->atom == NULL);
36895 +
36896 +       down_read(&uf_info->latch);
36897 +       nea_grabbed(uf_info);   /* debugging bookkeeping only */
36898 +}
36899 +
36900 +/**
36901 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
36902 + * @uf_info: unix file specific part of inode to obtain access to
36903 + *
36904 + * Non-blocking variant: returns what down_read_trylock() on the latch returns.
36905 + */
36906 +int try_to_get_nonexclusive_access(struct unix_file_info *uf_info)
36907 +{
36908 +       int locked = down_read_trylock(&uf_info->latch);
36909 +
36910 +       /* do the debugging bookkeeping only if the latch was really taken */
36911 +       if (locked != 0)
36912 +               nea_grabbed(uf_info);
36913 +       return locked;
36914 +}
36915 +
36916 +void drop_nonexclusive_access(struct unix_file_info * uf_info)
36917 +{      /* releases the read side of the file latch held by the caller */
36918 +       assert("vs-1718", uf_info->ea_owner == NULL);
36919 +       assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
36920 +       ON_DEBUG(atomic_dec(&uf_info->nr_neas));
36921 +
36922 +       up_read(&uf_info->latch);
36923 +
36924 +       LOCK_CNT_DEC(inode_sem_r);
36925 +       reiser4_txn_restart_current();
36926 +}
36927 +
36928 +/* Helper of tail2extent: remove from the tree every item that covers the
36929 +   @count bytes of @inode beginning at @offset */
36930 +/* Audited by: green(2002.06.15) */
36931 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
36932 +{
36933 +       reiser4_key first;      /* key of the first byte to be cut */
36934 +       reiser4_key last;       /* key of the last byte to be cut */
36935 +       int ret;
36936 +
36937 +       /* AUDIT: How about putting an assertion here, what would check
36938 +          all provided range is covered by tail items only? */
36939 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &first);
36940 +       last = first;
36941 +       set_key_offset(&last, (__u64) (offset + count - 1));
36942 +
36943 +       /* everything between the two keys goes away */
36944 +       ret = reiser4_cut_tree(reiser4_tree_by_inode(inode), &first, &last,
36945 +                              inode, 0);
36946 +       return ret;
36947 +}
36948 +
36949 +/* drop the references held in the first @nr_pages slots of @pages and reset
36950 +   the slots to NULL, stopping at the first empty slot */
36951 +static void release_all_pages(struct page **pages, unsigned nr_pages)
36952 +{
36953 +       unsigned pos = 0;
36954 +
36955 +       /* filled slots are expected to be contiguous, starting from slot 0 */
36956 +       for (; pos < nr_pages && pages[pos] != NULL; pos++) {
36957 +               page_cache_release(pages[pos]);
36958 +               pages[pos] = NULL;
36959 +       }
36960 +#if REISER4_DEBUG
36961 +       /* everything after the first empty slot must be empty as well */
36962 +       while (pos < nr_pages && ++pos < nr_pages)
36963 +               assert("vs-1620", pages[pos] == NULL);
36964 +#endif
36965 +}
36966 +
36967 +/* part of tail2extent. Replace tail items with an extent one. Content of the
36968 +   tail items being cut (@count bytes) is already copied into @pages (@nr_pages
36969 +   of them). find_or_create_extent() is called to create extents corresponding
36970 +   to those pages. Returns 0 on success, or the first error encountered */
36971 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
36972 +{
36973 +       int result;
36974 +       unsigned i;
36975 +       STORE_COUNTERS;
36976 +
36977 +       if (nr_pages == 0)
36978 +               return 0;       /* nothing to replace */
36979 +
36980 +       assert("vs-596", pages[0]);
36981 +
36982 +       /* cut copied items; the range starts at the offset of the first page */
36983 +       result = cut_formatting_items(inode, page_offset(pages[0]), count);
36984 +       if (result)
36985 +               return result;
36986 +
36987 +       CHECK_COUNTERS;
36988 +
36989 +       /* put into tree replacement for just removed items: extent item, namely */
36990 +       for (i = 0; i < nr_pages; i++) {
36991 +               result = add_to_page_cache_lru(pages[i], inode->i_mapping,
36992 +                                              pages[i]->index,
36993 +                                              mapping_gfp_mask(inode->
36994 +                                                               i_mapping));
36995 +               if (result)
36996 +                       break;  /* remaining pages are not inserted */
36997 +               unlock_page(pages[i]);
36998 +               result = find_or_create_extent(pages[i]);
36999 +               if (result)
37000 +                       break;  /* page is left not marked uptodate */
37001 +               SetPageUptodate(pages[i]);
37002 +       }
37003 +       return result;
37004 +}
37005 +
37006 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
37007 +                                * items, i.e. pages converted per iteration */
37008 +
37009 +/*
37010 + * Reserve disk space for one iteration of tail2extent(), i.e. for conversion
37011 + * of up to TAIL2EXTENT_PAGE_NUM pages. Returns what reiser4_grab_space() does.
37012 + */
37013 +static int reserve_tail2extent_iteration(struct inode *inode)
37014 +{
37015 +       reiser4_tree *tree;
37016 +
37017 +       tree = reiser4_tree_by_inode(inode);
37018 +
37019 +       /*
37020 +        * space required for one iteration of tail->extent conversion:
37021 +        *
37022 +        *     1. kill N tail items
37023 +        *
37024 +        *     2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
37025 +        *
37026 +        *     3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
37027 +        *     extents) extent units.
37028 +        *
37029 +        *     4. drilling to the leaf level by coord_by_key()
37030 +        *
37031 +        *     5. possible update of stat-data
37032 +        *
37033 +        */
37034 +       grab_space_enable();
37035 +       return reiser4_grab_space
37036 +           (2 * tree->height +
37037 +            TAIL2EXTENT_PAGE_NUM +
37038 +            TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
37039 +            1 + estimate_one_insert_item(tree) +
37040 +            inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
37041 +}
37042 +
37043 +/* Clear REISER4_PART_MIXED and update stat-data; a failure is only logged */
37044 +static int complete_conversion(struct inode *inode)
37045 +{
37046 +       int ret;
37047 +
37048 +       grab_space_enable();
37049 +       ret = reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
37050 +                                BA_CAN_COMMIT);
37051 +       if (!ret) {
37052 +               reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
37053 +               ret = reiser4_update_sd(inode);
37054 +       }
37055 +       if (ret != 0)
37056 +               warning("vs-1696", "Failed to clear converting bit of %llu: %i",
37057 +                       (unsigned long long)get_inode_oid(inode), ret);
37058 +       /* success is reported to the caller unconditionally */
37059 +       return 0;
37060 +}
37061 +
37062 +/**
37063 + * find_start - find offset of the first item of the given type in a file
37064 + * @inode: inode of the file being searched
37065 + * @id: id of the item plugin to look for
37066 + * @offset: in: offset the search starts from; out: offset the search ended at
37067 + *
37068 + * Used by tail2extent and extent2tail to detect where a previous uncompleted
37069 + * conversion stopped. Returns 0 if an item of type @id is found, error if not.
37070 + */
37071 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
37072 +{
37073 +       int result;
37074 +       lock_handle lh;
37075 +       coord_t coord;
37076 +       struct unix_file_info *ufo;
37077 +       int found;
37078 +       reiser4_key key;
37079 +
37080 +       ufo = unix_file_inode_data(inode);      /* NOTE(review): not used below */
37081 +       init_lh(&lh);
37082 +       result = 0;
37083 +       found = 0;
37084 +       inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
37085 +       do {
37086 +               init_lh(&lh);
37087 +               result = find_file_item_nohint(&coord, &lh, &key,
37088 +                                              ZNODE_READ_LOCK, inode);
37089 +
37090 +               if (result == CBK_COORD_FOUND) {
37091 +                       if (coord.between == AT_UNIT) {
37092 +                               /*coord_clear_iplug(&coord); */
37093 +                               result = zload(coord.node);
37094 +                               if (result == 0) {
37095 +                                       if (item_id_by_coord(&coord) == id)
37096 +                                               found = 1;
37097 +                                       else    /* presumably moves @key past this item - TODO confirm */
37098 +                                               item_plugin_by_coord(&coord)->s.
37099 +                                                   file.append_key(&coord,
37100 +                                                                   &key);
37101 +                                       zrelse(coord.node);
37102 +                               }
37103 +                       } else  /* no unit at @key: report that nothing was found */
37104 +                               result = RETERR(-ENOENT);
37105 +               }
37106 +               done_lh(&lh);
37107 +       } while (result == 0 && !found);
37108 +       *offset = get_key_offset(&key);         /* stored on failure as well */
37109 +       return result;
37110 +}
37111 +
37112 +/**
37113 + * tail2extent - convert tail items of a file to extent items
37114 + * @uf_info: unix file specific part of inode of the file to convert
37115 + * Caller holds exclusive access (asserted); it is dropped and re-taken after
37116 + * each converted batch of pages. Returns 0 on success, error code otherwise.
37117 + */
37118 +int tail2extent(struct unix_file_info *uf_info)
37119 +{
37120 +       int result;
37121 +       reiser4_key key;        /* key of next byte to be moved to page */
37122 +       char *p_data;           /* data of page */
37123 +       unsigned page_off = 0,  /* offset within the page where to copy data */
37124 +           count;              /* number of bytes of item which can be
37125 +                                * copied to page */
37126 +       struct page *pages[TAIL2EXTENT_PAGE_NUM];
37127 +       struct page *page;
37128 +       int done;               /* set to 1 when all file is read */
37129 +       char *item;
37130 +       int i;
37131 +       struct inode *inode;
37132 +       int first_iteration;
37133 +       int bytes;
37134 +       __u64 offset;
37135 +
37136 +       assert("nikita-3362", ea_obtained(uf_info));
37137 +       inode = unix_file_info_to_inode(uf_info);
37138 +       assert("nikita-3412", !IS_RDONLY(inode));
37139 +       assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
37140 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37141 +
37142 +       offset = 0;
37143 +       first_iteration = 1;
37144 +       result = 0;
37145 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37146 +               /*
37147 +                * file is marked on disk as there was a conversion which did
37148 +                * not complete due to either crash or some error. Find which
37149 +                * offset tail conversion stopped at
37150 +                */
37151 +               result = find_start(inode, FORMATTING_ID, &offset);
37152 +               if (result == -ENOENT) {
37153 +                       /* no tail items found, everything is converted */
37154 +                       uf_info->container = UF_CONTAINER_EXTENTS;
37155 +                       complete_conversion(inode);
37156 +                       return 0;
37157 +               } else if (result != 0)
37158 +                       /* some other error */
37159 +                       return result;
37160 +               first_iteration = 0;
37161 +       }
37162 +
37163 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37164 +
37165 +       /* get key of the byte the conversion starts from */
37166 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
37167 +
37168 +       done = 0;
37169 +       while (done == 0) {
37170 +               memset(pages, 0, sizeof(pages));
37171 +               result = reserve_tail2extent_iteration(inode);
37172 +               if (result != 0) {
37173 +                       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37174 +                       goto out;
37175 +               }
37176 +               if (first_iteration) {
37177 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37178 +                       reiser4_update_sd(inode);
37179 +                       first_iteration = 0;
37180 +               }
37181 +               bytes = 0;
37182 +               for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
37183 +                       assert("vs-598",
37184 +                              (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
37185 +                       page = alloc_page(reiser4_ctx_gfp_mask_get());
37186 +                       if (!page) {
37187 +                               result = RETERR(-ENOMEM);
37188 +                               goto error;
37189 +                       }
37190 +
37191 +                       page->index =
37192 +                           (unsigned long)(get_key_offset(&key) >>
37193 +                                           PAGE_CACHE_SHIFT);
37194 +                       /*
37195 +                        * usually when one is going to longterm lock znode (as
37196 +                        * find_file_item does, for instance) he must not hold
37197 +                        * locked pages. However, there is an exception for
37198 +                        * case tail2extent. Pages appearing here are not
37199 +                        * reachable to everyone else, they are clean, they do
37200 +                        * not have jnodes attached so keeping them locked do
37201 +                        * not risk deadlock appearance
37202 +                        */
37203 +                       assert("vs-983", !PagePrivate(page));
37204 +                       reiser4_invalidate_pages(inode->i_mapping, page->index,
37205 +                                                1, 0);
37206 +
37207 +                       for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
37208 +                               coord_t coord;
37209 +                               lock_handle lh;
37210 +
37211 +                               /* get next item */
37212 +                               /* FIXME: we might want to readahead here */
37213 +                               init_lh(&lh);
37214 +                               result =
37215 +                                   find_file_item_nohint(&coord, &lh, &key,
37216 +                                                         ZNODE_READ_LOCK,
37217 +                                                         inode);
37218 +                               if (result != CBK_COORD_FOUND) {
37219 +                                       /*
37220 +                                        * error happened or no items of file
37221 +                                        * were found
37222 +                                        */
37223 +                                       done_lh(&lh);
37224 +                                       page_cache_release(page);
37225 +                                       goto error;
37226 +                               }
37227 +
37228 +                               if (coord.between == AFTER_UNIT) {
37229 +                                       /*
37230 +                                        * end of file is reached. Pad page
37231 +                                        * with zeros
37232 +                                        */
37233 +                                       done_lh(&lh);
37234 +                                       done = 1;
37235 +                                       p_data = kmap_atomic(page, KM_USER0);
37236 +                                       memset(p_data + page_off, 0,
37237 +                                              PAGE_CACHE_SIZE - page_off);
37238 +                                       kunmap_atomic(p_data, KM_USER0);
37239 +                                       break;
37240 +                               }
37241 +
37242 +                               result = zload(coord.node);
37243 +                               if (result) {
37244 +                                       page_cache_release(page);
37245 +                                       done_lh(&lh);
37246 +                                       goto error;
37247 +                               }
37248 +                               assert("vs-856", coord.between == AT_UNIT);
37249 +                               item = ((char *)item_body_by_coord(&coord)) +
37250 +                                       coord.unit_pos;
37251 +
37252 +                               /* how many bytes to copy */
37253 +                               count =
37254 +                                   item_length_by_coord(&coord) -
37255 +                                   coord.unit_pos;
37256 +                               /* limit length of copy to end of page */
37257 +                               if (count > PAGE_CACHE_SIZE - page_off)
37258 +                                       count = PAGE_CACHE_SIZE - page_off;
37259 +
37260 +                               /*
37261 +                                * copy item (as much as will fit starting from
37262 +                                * the current unit of the item) into the page
37263 +                                */
37264 +                               p_data = kmap_atomic(page, KM_USER0);
37265 +                               memcpy(p_data + page_off, item, count);
37266 +                               kunmap_atomic(p_data, KM_USER0);
37267 +
37268 +                               page_off += count;
37269 +                               bytes += count;
37270 +                               set_key_offset(&key,
37271 +                                              get_key_offset(&key) + count);
37272 +
37273 +                               zrelse(coord.node);
37274 +                               done_lh(&lh);
37275 +                       } /* end of loop which fills one page by content of
37276 +                          * formatting items */
37277 +
37278 +                       if (page_off) {
37279 +                               /* something was copied into page */
37280 +                               pages[i] = page;
37281 +                       } else {
37282 +                               page_cache_release(page);
37283 +                               assert("vs-1648", done == 1);
37284 +                               break;
37285 +                       }
37286 +               } /* end of loop through pages of one conversion iteration */
37287 +
37288 +               if (i > 0) {
37289 +                       result = replace(inode, pages, i, bytes);
37290 +                       release_all_pages(pages, sizeof_array(pages));
37291 +                       if (result)
37292 +                               goto error;
37293 +                       /*
37294 +                        * We have to drop exclusive access to avoid deadlock
37295 +                        * which may happen because called by reiser4_writepages
37296 +                        * capture_unix_file requires to get non-exclusive
37297 +                        * access to a file. It is safe to drop EA in the middle
37298 +                        * of tail2extent conversion because write_unix_file,
37299 +                        * setattr_unix_file(truncate), mmap_unix_file,
37300 +                        * release_unix_file(extent2tail) checks if conversion
37301 +                        * is not in progress (see comments before
37302 +                        * get_exclusive_access_careful()).
37303 +                        * Other processes that acquire non-exclusive access
37304 +                        * (read_unix_file, reiser4_writepages, etc) should work
37305 +                        * on partially converted files.
37306 +                        */
37307 +                       drop_exclusive_access(uf_info);
37308 +                       /* throttle the conversion */
37309 +                       reiser4_throttle_write(inode);
37310 +                       get_exclusive_access(uf_info);
37311 +
37312 +                       /*
37313 +                        * nobody is allowed to complete conversion but a
37314 +                        * process which started it
37315 +                        */
37316 +                       assert("", reiser4_inode_get_flag(inode,
37317 +                                                         REISER4_PART_MIXED));
37318 +               }
37319 +       }
37320 +       if (result == 0) {
37321 +               /* file is converted to extent items */
37322 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37323 +               assert("vs-1697", reiser4_inode_get_flag(inode,
37324 +                                                        REISER4_PART_MIXED));
37325 +
37326 +               uf_info->container = UF_CONTAINER_EXTENTS;
37327 +               complete_conversion(inode);
37328 +       } else {
37329 +               /*
37330 +                * conversion is not complete. Inode was already marked as
37331 +                * REISER4_PART_MIXED and stat-data were updated at the first
37332 +                * iteration of the loop above.
37333 +                */
37334 +       error:
37335 +               release_all_pages(pages, sizeof_array(pages));
37336 +               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37337 +               warning("edward-1548", "Partial conversion of %llu: %i",
37338 +                       (unsigned long long)get_inode_oid(inode), result);
37339 +       }
37340 +
37341 + out:
37342 +       /* this assertion is to make sure get_exclusive_access_careful()
37343 +          won't fall into deadlock loop */
37344 +       assert("edward-1549", !reiser4_inode_get_flag(inode,
37345 +                                                     REISER4_PART_IN_CONV));
37346 +       return result;
37347 +}
37348 +
37349 +static int reserve_extent2tail_iteration(struct inode *inode)
37350 +{
37351 +       reiser4_tree *tree = reiser4_tree_by_inode(inode);
37352 +
37353 +       /*
37354 +        * One iteration of extent->tail conversion needs blocks for, in
37355 +        * this order:
37356 +        *
37357 +        *     1. removing an extent item;
37358 +        *
37359 +        *     2. inserting a tail with insert_flow();
37360 +        *
37361 +        *     3. descending to the leaf level in coord_by_key();
37362 +        *
37363 +        *     4. a possible stat-data update.
37364 +        */
37365 +       grab_space_enable();
37366 +       return reiser4_grab_space(estimate_one_item_removal(tree) +
37367 +                                 estimate_insert_flow(tree->height) +
37368 +                                 1 + estimate_one_insert_item(tree) +
37369 +                                 inode_file_plugin(inode)->estimate.update(inode),
37370 +                                 BA_CAN_COMMIT);
37371 +}
37372 +
37373 +/* Convert extents of a file to tail items. For every page: read the page, cut
37374 +   the part of extent pointing to it, write page data back as tail items */
37375 +int extent2tail(struct file * file, struct unix_file_info *uf_info)
37376 +{
37377 +       int result;
37378 +       struct inode *inode;
37379 +       struct page *page;
37380 +       unsigned long num_pages, i;
37381 +       unsigned long start_page;
37382 +       reiser4_key from;
37383 +       reiser4_key to;
37384 +       unsigned count;
37385 +       __u64 offset;
37386 +
37387 +       assert("nikita-3362", ea_obtained(uf_info));
37388 +       inode = unix_file_info_to_inode(uf_info);
37389 +       assert("nikita-3412", !IS_RDONLY(inode));
37390 +       assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
37391 +       assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
37392 +
37393 +       offset = 0;
37394 +       if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37395 +               /*
37396 +                * file is marked on disk as there was a conversion which did
37397 +                * not complete due to either crash or some error. Find which
37398 +                * offset tail conversion stopped at
37399 +                */
37400 +               result = find_start(inode, EXTENT_POINTER_ID, &offset);
37401 +               if (result == -ENOENT) {
37402 +                       /* no extent found, everything is converted */
37403 +                       uf_info->container = UF_CONTAINER_TAILS;
37404 +                       complete_conversion(inode);
37405 +                       return 0;
37406 +               } else if (result != 0)
37407 +                       /* some other error */
37408 +                       return result;
37409 +       }
37410 +
37411 +       reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
37412 +
37413 +       /* number of pages in the file which are still to be converted */
37414 +       num_pages =
37415 +           (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
37416 +       start_page = offset >> PAGE_CACHE_SHIFT;
37417 +
37418 +       inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
37419 +       to = from;
37420 +
37421 +       result = 0;
37422 +       for (i = 0; i < num_pages; i++) {
37423 +               __u64 start_byte;
37424 +
37425 +               result = reserve_extent2tail_iteration(inode);
37426 +               if (result != 0)
37427 +                       break;
37428 +               if (i == 0 && offset == 0) {
37429 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
37430 +                       reiser4_update_sd(inode);
37431 +               }
37432 +
37433 +               page = read_mapping_page(inode->i_mapping,
37434 +                                        (unsigned)(i + start_page), NULL);
37435 +               if (IS_ERR(page)) {
37436 +                       result = PTR_ERR(page);
37437 +                       break;
37438 +               }
37439 +
37440 +               wait_on_page_locked(page);
37441 +
37442 +               if (!PageUptodate(page)) {
37443 +                       page_cache_release(page);
37444 +                       result = RETERR(-EIO);
37445 +                       break;
37446 +               }
37447 +
37448 +               /* cut part of file we have read; widen before the shift */
37449 +               start_byte = ((__u64) (i + start_page)) << PAGE_CACHE_SHIFT;
37450 +               set_key_offset(&from, start_byte);
37451 +               set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
37452 +               /*
37453 +                * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
37454 +                * commits during over-long truncates. But
37455 +                * extent->tail conversion should be performed in one
37456 +                * transaction.
37457 +                */
37458 +               result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
37459 +                                         &to, inode, 0);
37460 +
37461 +               if (result) {
37462 +                       page_cache_release(page);
37463 +                       break;
37464 +               }
37465 +
37466 +               /* put page data into tree via tail_write */
37467 +               count = PAGE_CACHE_SIZE;
37468 +               if ((i == (num_pages - 1)) &&
37469 +                   (inode->i_size & ~PAGE_CACHE_MASK))
37470 +                       /* last page can be incomplete */
37471 +                       count = (inode->i_size & ~PAGE_CACHE_MASK);
37472 +               while (count) {
37473 +                       loff_t pos = start_byte;
37474 +
37475 +                       assert("edward-1537",
37476 +                              file != NULL && file->f_dentry != NULL);
37477 +                       assert("edward-1538",
37478 +                              file->f_dentry->d_inode == inode);
37479 +
37480 +                       result = reiser4_write_tail(file, inode,
37481 +                                       (char __user *)kmap(page), count, &pos);
37482 +                       kunmap(page);   /* every kmap() needs its kunmap() */
37483 +                       reiser4_free_file_fsdata(file);
37484 +                       if (result <= 0) {
37485 +                               warning("", "reiser4_write_tail failed");
37486 +                               page_cache_release(page);
37487 +                               reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37488 +                               return result;
37489 +                       }
37490 +                       count -= result;
37491 +               }
37492 +
37493 +               /* release page */
37494 +               lock_page(page);
37495 +               /* page is already detached from jnode and mapping. */
37496 +               assert("vs-1086", page->mapping == NULL);
37497 +               assert("nikita-2690",
37498 +                      (!PagePrivate(page) && jprivate(page) == 0));
37499 +               /* waiting for writeback completion with page lock held is
37500 +                * perfectly valid. */
37501 +               wait_on_page_writeback(page);
37502 +               reiser4_drop_page(page);
37503 +               /* release reference taken by read_mapping_page() above */
37504 +               page_cache_release(page);
37505 +
37506 +               drop_exclusive_access(uf_info);
37507 +               /* throttle the conversion */
37508 +               reiser4_throttle_write(inode);
37509 +               get_exclusive_access(uf_info);
37510 +               /*
37511 +                * nobody is allowed to complete conversion but a process which
37512 +                * started it
37513 +                */
37514 +               assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
37515 +       }
37516 +
37517 +       reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
37518 +
37519 +       if (i == num_pages) {
37520 +               /* file is converted to formatted items */
37521 +               assert("vs-1698", reiser4_inode_get_flag(inode,
37522 +                                                        REISER4_PART_MIXED));
37523 +               assert("vs-1260",
37524 +                      inode_has_no_jnodes(reiser4_inode_data(inode)));
37525 +
37526 +               uf_info->container = UF_CONTAINER_TAILS;
37527 +               complete_conversion(inode);
37528 +               return 0;
37529 +       }
37530 +       /*
37531 +        * conversion is not complete. Inode was already marked as
37532 +        * REISER4_PART_MIXED and stat-data were updated at the first
37533 +        * iteration of the loop above.
37534 +        */
37535 +       warning("nikita-2282",
37536 +               "Partial conversion of %llu: %lu of %lu: %i",
37537 +               (unsigned long long)get_inode_oid(inode), i,
37538 +               num_pages, result);
37539 +
37540 +       /* this assertion is to make sure get_exclusive_access_careful()
37541 +          won't fall into deadlock loop */
37542 +       assert("edward-1550", !reiser4_inode_get_flag(inode,
37543 +                                                     REISER4_PART_IN_CONV));
37544 +       return result;
37545 +}
37546 +
37547 +/*
37548 + * Local variables:
37549 + * c-indentation-style: "K&R"
37550 + * mode-name: "LC"
37551 + * c-basic-offset: 8
37552 + * tab-width: 8
37553 + * fill-column: 79
37554 + * scroll-step: 1
37555 + * End:
37556 + */
37557 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file_ops.c linux-2.6.27/fs/reiser4/plugin/file_ops.c
37558 --- linux-2.6.27.orig/fs/reiser4/plugin/file_ops.c      1970-01-01 03:00:00.000000000 +0300
37559 +++ linux-2.6.27/fs/reiser4/plugin/file_ops.c   2008-10-12 18:20:01.000000000 +0400
37560 @@ -0,0 +1,205 @@
37561 +/* Copyright 2005 by Hans Reiser, licensing governed by
37562 +   reiser4/README */
37563 +
37564 +/* this file contains typical implementations for some of methods of
37565 +   struct file_operations and of struct address_space_operations
37566 +*/
37567 +
37568 +#include "../inode.h"
37569 +#include "object.h"
37570 +
37571 +/* file operations */
37572 +
37573 +/* implementation of vfs's llseek method of struct file_operations for
37574 +   typical directory can be found in readdir_common.c
37575 +*/
37576 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
37577 +
37578 +/* implementation of vfs's readdir method of struct file_operations for
37579 +   typical directory can be found in readdir_common.c
37580 +*/
37581 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
37582 +
37583 +/**
37584 + * reiser4_release_dir_common - release method of struct file_operations
37585 + * @inode: inode of released file (its ->i_sb is used to set up the context)
37586 + * @file: file to release
37587 + *
37588 + * Implementation of release method of struct file_operations for typical
37589 + * directory. All it does is free reiser4 specific file data. Returns 0, or
37590 + * PTR_ERR() of the context if reiser4_init_context() failed. */
37591 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
37592 +{
37593 +       reiser4_context *ctx;
37594 +
37595 +       ctx = reiser4_init_context(inode->i_sb);
37596 +       if (IS_ERR(ctx))
37597 +               return PTR_ERR(ctx);
37598 +       reiser4_free_file_fsdata(file);
37599 +       reiser4_exit_context(ctx);
37600 +       return 0;
37601 +}
37602 +
37603 +/* Common implementation of vfs's fsync method of struct file_operations:
37604 +   calls txnmgr_force_commit_all() for the super block and returns its result.
37605 +*/
37606 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
37607 +{
37608 +       reiser4_context *ctx;
37609 +       int result;
37610 +
37611 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
37612 +       if (IS_ERR(ctx))
37613 +               return PTR_ERR(ctx);
37614 +       result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
37615 +
37616 +       context_set_commit_async(ctx);
37617 +       reiser4_exit_context(ctx);
37618 +       return result;
37619 +}
37620 +
37621 +/*
37622 + * Common sync method for regular files.
37623 + *
37624 + * The idea is to avoid committing all atoms (the original solution) and to
37625 + * commit only the atoms that dirty pages of this file are part of.
37626 + * NOTE(review): the body below does not scan pages itself; it grabs space,
37627 + * calls write_sd_by_inode_common() (result ignored) and force-commits via
37628 + * ctx->trans. Returns 0 after that, or RETERR(-ENOSPC) if the grab fails.
37629 + * Anonymous pages (extent-less pages dirtied through mmap) complicate this.
37630 + * Fortunately sys_fsync() first calls filemap_fdatawrite(), which ultimately
37631 + * calls reiser4_writepages() to insert missing extents and capture them.
37632 + */
37633 +int reiser4_sync_file_common(struct file *file,
37634 +                            struct dentry *dentry, int datasync)
37635 +{
37636 +       reiser4_context *ctx;
37637 +       txn_atom *atom;
37638 +       reiser4_block_nr reserve;
37639 +
37640 +       ctx = reiser4_init_context(dentry->d_inode->i_sb);
37641 +       if (IS_ERR(ctx))
37642 +               return PTR_ERR(ctx);
37643 +
37644 +       reserve = estimate_update_common(dentry->d_inode);
37645 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
37646 +               reiser4_exit_context(ctx);
37647 +               return RETERR(-ENOSPC);
37648 +       }
37649 +       write_sd_by_inode_common(dentry->d_inode);
37650 +
37651 +       atom = get_current_atom_locked();
37652 +       spin_lock_txnh(ctx->trans);
37653 +       force_commit_atom(ctx->trans);
37654 +       reiser4_exit_context(ctx);
37655 +       return 0;
37656 +}
37657 +
37658 +/* this is common implementation of vfs's sendfile method of struct
37659 +   file_operations (currently compiled out by the #if 0 below)
37660 +
37661 +   Reads @count bytes from @file and calls @actor for every page read. This is
37662 +   needed for loopback device support.
37663 +*/
37664 +#if 0
37665 +ssize_t
37666 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
37667 +               read_actor_t actor, void *target)
37668 +{
37669 +       reiser4_context *ctx;
37670 +       ssize_t result;
37671 +
37672 +       ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
37673 +       if (IS_ERR(ctx))
37674 +               return PTR_ERR(ctx);
37675 +       result = generic_file_sendfile(file, ppos, count, actor, target);
37676 +       reiser4_exit_context(ctx);
37677 +       return result;
37678 +}
37679 +#endif  /*  0  */
37680 +
37681 +/* address space operations */
37682 +
37683 +/* Common implementation of vfs's prepare_write method of struct
37684 +   address_space_operations. Returns the result of do_prepare_write(), or
37685 +   PTR_ERR() of the context if reiser4_init_context() failed. */
37686 +int
37687 +prepare_write_common(struct file *file, struct page *page, unsigned from,
37688 +                    unsigned to)
37689 +{
37690 +       reiser4_context *ctx;
37691 +       int result;
37692 +
37693 +       ctx = reiser4_init_context(page->mapping->host->i_sb);
37694 +       if (IS_ERR(ctx)) /* do not hand an ERR_PTR context to the calls below */
37695 +               return PTR_ERR(ctx);
37696 +       result = do_prepare_write(file, page, from, to);
37697 +       /* don't commit transaction under inode semaphore */
37698 +       context_set_commit_async(ctx);
37699 +       reiser4_exit_context(ctx);
37700 +       return result;
37701 +}
37702 +
37703 +/* helper for prepare_write_common and prepare_write_unix_file: unless the whole
37704 + * page is being written or it is already uptodate, reads @page via ->readpage() */
37705 +int
37706 +do_prepare_write(struct file *file, struct page *page, unsigned from,
37707 +                unsigned to)
37708 +{
37709 +       int result;
37710 +       file_plugin *fplug;
37711 +       struct inode *inode;
37712 +
37713 +       assert("umka-3099", file != NULL);
37714 +       assert("umka-3100", page != NULL);
37715 +       assert("umka-3095", PageLocked(page));
37716 +
37717 +       if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
37718 +               return 0;
37719 +
37720 +       inode = page->mapping->host;
37721 +       fplug = inode_file_plugin(inode); /* NOTE(review): not used below */
37722 +
37723 +       if (page->mapping->a_ops->readpage == NULL)
37724 +               return RETERR(-EINVAL);
37725 +
37726 +       result = page->mapping->a_ops->readpage(file, page);
37727 +       if (result != 0) {
37728 +               SetPageError(page);
37729 +               ClearPageUptodate(page);
37730 +               /* All reiser4 readpage() implementations should return the
37731 +                * page locked in case of error. */
37732 +               assert("nikita-3472", PageLocked(page));
37733 +       } else {
37734 +               /*
37735 +                * ->readpage() either:
37736 +                *
37737 +                *     1. starts IO against @page. @page is locked for IO in
37738 +                *     this case.
37739 +                *
37740 +                *     2. doesn't start IO. @page is unlocked.
37741 +                *
37742 +                * In either case, take the page lock again before going on.
37743 +                */
37744 +               lock_page(page);
37745 +               /*
37746 +                * IO (if any) is completed at this point. Check for IO
37747 +                * errors.
37748 +                */
37749 +               if (!PageUptodate(page))
37750 +                       result = RETERR(-EIO);
37751 +       }
37752 +       assert("umka-3098", PageLocked(page));
37753 +       return result;
37754 +}
37755 +
37756 +/*
37757 + * Local variables:
37758 + * c-indentation-style: "K&R"
37759 + * mode-name: "LC"
37760 + * c-basic-offset: 8
37761 + * tab-width: 8
37762 + * fill-column: 79
37763 + * scroll-step: 1
37764 + * End:
37765 + */
37766 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file_ops_readdir.c linux-2.6.27/fs/reiser4/plugin/file_ops_readdir.c
37767 --- linux-2.6.27.orig/fs/reiser4/plugin/file_ops_readdir.c      1970-01-01 03:00:00.000000000 +0300
37768 +++ linux-2.6.27/fs/reiser4/plugin/file_ops_readdir.c   2008-10-12 18:20:01.000000000 +0400
37769 @@ -0,0 +1,658 @@
37770 +/* Copyright 2005 by Hans Reiser, licensing governed by
37771 + * reiser4/README */
37772 +
37773 +#include "../inode.h"
37774 +
37775 +/* return true iff the item at @coord belongs to the DIR_ENTRY_ITEM_TYPE group
37776 + * and ->owns_item() of the file plugin of @inode accepts it. */
37777 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
37778 +{
37779 +       return plugin_of_group(item_plugin_by_coord(coord),
37780 +                              DIR_ENTRY_ITEM_TYPE) &&
37781 +              inode_file_plugin(inode)->owns_item(inode, coord);
37782 +}
37783 +
37784 +/* compare two positions within the same directory: by key, then by ->pos */
37785 +static cmp_t dir_pos_cmp(const struct dir_pos * p1, const struct dir_pos * p2)
37786 +{
37787 +       cmp_t result;
37788 +
37789 +       assert("nikita-2534", p1 != NULL);
37790 +       assert("nikita-2535", p2 != NULL);
37791 +
37792 +       result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
37793 +       if (result == EQUAL_TO) {
37794 +               int diff;
37795 +
37796 +               diff = p1->pos - p2->pos;
37797 +               result =
37798 +                   (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
37799 +       }
37800 +       return result;
37801 +}
37802 +
37803 +/* adjust one readdir position after a directory modification; see comment
37804 + * before reiser4_readdir_common() for why "adjustment" is necessary. */
37805 +static void
37806 +adjust_dir_pos(struct file *dir, struct readdir_pos * readdir_spot,
37807 +              const struct dir_pos * mod_point, int adj)
37808 +{
37809 +       struct dir_pos *pos;
37810 +
37811 +       /*
37812 +        * new directory entry was added (adj == +1) or removed (adj == -1) at
37813 +        * the @mod_point. Directory file descriptor @dir is doing readdir and
37814 +        * is currently positioned at @readdir_spot. The latter has to be
37815 +        * updated to maintain stable readdir.
37816 +        */
37817 +       /* directory is positioned at the beginning: nothing to adjust. */
37818 +       if (readdir_spot->entry_no == 0)
37819 +               return;
37820 +
37821 +       pos = &readdir_spot->position;
37822 +       switch (dir_pos_cmp(mod_point, pos)) {
37823 +       case LESS_THAN:
37824 +               /* @mod_point is _before_ @readdir_spot, that is, entry was
37825 +                * added/removed on the left (in key order) of current
37826 +                * position. */
37827 +               /* logical number of directory entry readdir is "looking" at
37828 +                * changes */
37829 +               readdir_spot->entry_no += adj;
37830 +               assert("nikita-2577",
37831 +                      ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
37832 +               if (de_id_cmp(&pos->dir_entry_key,
37833 +                             &mod_point->dir_entry_key) == EQUAL_TO) {
37834 +                       assert("nikita-2575", mod_point->pos < pos->pos);
37835 +                       /*
37836 +                        * if entry added/removed has the same key as current
37837 +                        * for readdir, update counter of duplicate keys in
37838 +                        * @readdir_spot.
37839 +                        */
37840 +                       pos->pos += adj;
37841 +               }
37842 +               break;
37843 +       case GREATER_THAN:
37844 +               /* directory is modified after @pos: nothing to do. */
37845 +               break;
37846 +       case EQUAL_TO:
37847 +               /* cannot insert an entry readdir is looking at, because it
37848 +                  already exists. */
37849 +               assert("nikita-2576", adj < 0);
37850 +               /* directory entry that @pos points to is being
37851 +                  removed.
37852 +
37853 +                  NOTE-NIKITA: Right thing to do is to update @pos to point
37854 +                  to the next entry. This is complex (we are under spin-lock
37855 +                  for one thing). Just rewind it to the beginning. Next
37856 +                  readdir will have to scan the beginning of
37857 +                  directory. Proper solution is to use semaphore in
37858 +                  spin lock's stead and use rewind_right() here.
37859 +
37860 +                  NOTE-NIKITA: now, semaphore is used, so...
37861 +                */
37862 +               memset(readdir_spot, 0, sizeof *readdir_spot);
37863 +       }
37864 +}
37865 +
37866 +/* scan all file-descriptors on the readdir list of this directory and adjust
37867 +   their positions accordingly. Should be used by implementations of
37868 +   add_entry and rem_entry of dir plugin */
37869 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
37870 +                            int offset, int adj)
37871 +{
37872 +       reiser4_file_fsdata *scan;
37873 +       struct dir_pos mod_point;
37874 +
37875 +       assert("nikita-2536", dir != NULL);
37876 +       assert("nikita-2538", de != NULL);
37877 +       assert("nikita-2539", adj != 0);
37878 +
37879 +       build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
37880 +       mod_point.pos = offset;
37881 +
37882 +       spin_lock_inode(dir);
37883 +
37884 +       /*
37885 +        * new entry was added/removed in directory @dir. Scan all file
37886 +        * descriptors for @dir that are currently involved in readdir and
37887 +        * update them.
37888 +        */
37889 +
37890 +       list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
37891 +               adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
37892 +
37893 +       spin_unlock_inode(dir);
37894 +}
37895 +
37896 +/* traverse tree to start/continue readdir from the readdir position @pos:
37897 + * look up the key built by ->build_readdir_key() and, if it is found, call
37898 + * rewind_right() with pos->position.pos. Other lookup results give -EIO. */
37899 +static int dir_go_to(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37900 +{
37901 +       reiser4_key key;
37902 +       int result;
37903 +       struct inode *inode;
37904 +
37905 +       assert("nikita-2554", pos != NULL);
37906 +
37907 +       inode = dir->f_dentry->d_inode;
37908 +       result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
37909 +       if (result != 0)
37910 +               return result;
37911 +       result = reiser4_object_lookup(inode,
37912 +                                      &key,
37913 +                                      tap->coord,
37914 +                                      tap->lh,
37915 +                                      tap->mode,
37916 +                                      FIND_EXACT,
37917 +                                      LEAF_LEVEL, LEAF_LEVEL,
37918 +                                      0, &tap->ra_info);
37919 +       if (result == CBK_COORD_FOUND)
37920 +               result = rewind_right(tap, (int)pos->position.pos);
37921 +       else {
37922 +               tap->coord->node = NULL;
37923 +               done_lh(tap->lh);
37924 +               result = RETERR(-EIO);
37925 +       }
37926 +       return result;
37927 +}
37928 +
37929 +/*
37930 + * handling of non-unique keys: calculate at what ordinal position within
37931 + * sequence of directory entries with identical keys @tap is, by walking a
37932 + * copy of @tap backwards with go_prev_unit(). Fills pos->position. */
37933 +static int set_pos(struct inode *inode, struct readdir_pos * pos, tap_t * tap)
37934 +{
37935 +       int result;
37936 +       coord_t coord;
37937 +       lock_handle lh;
37938 +       tap_t scan;
37939 +       de_id *did;
37940 +       reiser4_key de_key;
37941 +
37942 +       coord_init_zero(&coord);
37943 +       init_lh(&lh);
37944 +       reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
37945 +       reiser4_tap_copy(&scan, tap);
37946 +       reiser4_tap_load(&scan);
37947 +       pos->position.pos = 0;
37948 +
37949 +       did = &pos->position.dir_entry_key;
37950 +
37951 +       if (is_valid_dir_coord(inode, scan.coord)) {
37952 +
37953 +               build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
37954 +
37955 +               while (1) {
37956 +
37957 +                       result = go_prev_unit(&scan);
37958 +                       if (result != 0)
37959 +                               break;
37960 +
37961 +                       if (!is_valid_dir_coord(inode, scan.coord)) {
37962 +                               result = -EINVAL;
37963 +                               break;
37964 +                       }
37965 +
37966 +                       /* get key of the previous directory entry */
37967 +                       unit_key_by_coord(scan.coord, &de_key);
37968 +                       if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
37969 +                               /* duplicate-sequence is over */
37970 +                               break;
37971 +                       }
37972 +                       pos->position.pos++;
37973 +               }
37974 +       } else
37975 +               result = RETERR(-ENOENT);
37976 +       reiser4_tap_relse(&scan);
37977 +       reiser4_tap_done(&scan);
37978 +       return result;
37979 +}
37980 +
37981 +/*
37982 + * "rewind" directory to the position returned by reiser4_get_dir_fpos(@dir),
37983 + * i.e., set @pos and @tap correspondingly. */
37984 +static int dir_rewind(struct file *dir, struct readdir_pos * pos, tap_t * tap)
37985 +{
37986 +       __u64 destination;
37987 +       __s64 shift;
37988 +       int result;
37989 +       struct inode *inode;
37990 +       loff_t dirpos;
37991 +
37992 +       assert("nikita-2553", dir != NULL);
37993 +       assert("nikita-2548", pos != NULL);
37994 +       assert("nikita-2551", tap->coord != NULL);
37995 +       assert("nikita-2552", tap->lh != NULL);
37996 +
37997 +       dirpos = reiser4_get_dir_fpos(dir);
37998 +       shift = dirpos - pos->fpos;
37999 +       /* this is logical directory entry within @dir which we are rewinding
38000 +        * to */
38001 +       destination = pos->entry_no + shift;
38002 +
38003 +       inode = dir->f_dentry->d_inode;
38004 +       if (dirpos < 0)
38005 +               return RETERR(-EINVAL);
38006 +       else if (destination == 0ll || dirpos == 0) {
38007 +               /* rewind to the beginning of directory */
38008 +               memset(pos, 0, sizeof *pos);
38009 +               return dir_go_to(dir, pos, tap);
38010 +       } else if (destination >= inode->i_size)
38011 +               return RETERR(-ENOENT);
38012 +
38013 +       if (shift < 0) {
38014 +               /* I am afraid of negative numbers */
38015 +               shift = -shift;
38016 +               /* rewinding to the left */
38017 +               if (shift <= (int)pos->position.pos) {
38018 +                       /* destination is within sequence of entries with
38019 +                          duplicate keys. */
38020 +                       result = dir_go_to(dir, pos, tap);
38021 +               } else {
38022 +                       shift -= pos->position.pos;
38023 +                       while (1) {
38024 +                               /* repetitions: deadlock is possible when
38025 +                                  going to the left. */
38026 +                               result = dir_go_to(dir, pos, tap);
38027 +                               if (result == 0) {
38028 +                                       result = rewind_left(tap, shift);
38029 +                                       if (result == -E_DEADLOCK) {
38030 +                                               reiser4_tap_done(tap);
38031 +                                               continue;
38032 +                                       }
38033 +                               }
38034 +                               break;
38035 +                       }
38036 +               }
38037 +       } else {
38038 +               /* rewinding to the right */
38039 +               result = dir_go_to(dir, pos, tap);
38040 +               if (result == 0)
38041 +                       result = rewind_right(tap, shift);
38042 +       }
38043 +       if (result == 0) {
38044 +               result = set_pos(inode, pos, tap);
38045 +               if (result == 0) {
38046 +                       /* set_pos() filled pos->position; record the rest */
38047 +                       pos->entry_no = destination;
38048 +                       pos->fpos = dirpos;
38049 +               }
38050 +       }
38051 +       return result;
38052 +}
38053 +
38054 +/*
38055 + * Function that is called by reiser4_readdir_common() on each directory entry
38056 + * while doing readdir. ->filldir callback may block, so we have to release long
38057 + * term lock while calling it; the name is therefore passed to ->filldir from a
38058 + * private copy. To avoid repeating tree traversal, seal is used. If seal is
38059 + * broken, we return -E_REPEAT. Node is unlocked in this case.
38060 + * Whether node is unlocked in case of any other error is undefined. It is
38061 + * guaranteed to be still locked if success (0) is returned.
38062 + *
38063 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
38064 + * unlocked.
38065 + */
38066 +static int
38067 +feed_entry(struct file *f, struct readdir_pos * pos, tap_t * tap,
38068 +          filldir_t filldir, void *dirent)
38069 +{
38070 +       item_plugin *iplug;
38071 +       char *name;
38072 +       reiser4_key sd_key;
38073 +       int result;
38074 +       char buf[DE_NAME_BUF_LEN];
38075 +       char name_buf[32];
38076 +       char *local_name;
38077 +       unsigned file_type;
38078 +       seal_t seal;
38079 +       coord_t *coord;
38080 +       reiser4_key entry_key;
38081 +
38082 +       coord = tap->coord;
38083 +       iplug = item_plugin_by_coord(coord);
38084 +
38085 +       /* pointer to name within the node */
38086 +       name = iplug->s.dir.extract_name(coord, buf);
38087 +       assert("nikita-1371", name != NULL);
38088 +
38089 +       /* key of object the entry points to */
38090 +       if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
38091 +               return RETERR(-EIO);
38092 +
38093 +       /* we must release longterm znode lock before calling filldir to avoid
38094 +          deadlock which may happen if filldir causes page fault. So, copy
38095 +          name to intermediate buffer (heap if it does not fit name_buf) */
38096 +       if (strlen(name) + 1 > sizeof(name_buf)) {
38097 +               local_name = kmalloc(strlen(name) + 1,
38098 +                                    reiser4_ctx_gfp_mask_get());
38099 +               if (local_name == NULL)
38100 +                       return RETERR(-ENOMEM);
38101 +       } else
38102 +               local_name = name_buf;
38103 +
38104 +       strcpy(local_name, name);
38105 +       file_type = iplug->s.dir.extract_file_type(coord);
38106 +
38107 +       unit_key_by_coord(coord, &entry_key);
38108 +       reiser4_seal_init(&seal, coord, &entry_key);
38109 +
38110 +       longterm_unlock_znode(tap->lh);
38111 +
38112 +       /*
38113 +        * send information about directory entry to the ->filldir() filler
38114 +        * supplied to us by caller (VFS). Only @local_name is used from here
38115 +        * on: @name may point into the node that has just been unlocked.
38116 +        * ->filldir is entitled to do weird things. For example, ->filldir
38117 +        * supplied by knfsd re-enters file system. Make sure no locks are
38118 +        * held.
38119 +        */
38120 +       assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
38121 +
38122 +       reiser4_txn_restart_current();
38123 +       result = filldir(dirent, local_name, (int)strlen(local_name),
38124 +                        /* offset of this entry */
38125 +                        f->f_pos,
38126 +                        /* inode number of object bounden by this entry */
38127 +                        oid_to_uino(get_key_objectid(&sd_key)), file_type);
38128 +       if (local_name != name_buf)
38129 +               kfree(local_name);
38130 +       if (result < 0)
38131 +               /* ->filldir() is satisfied. (no space in buffer, IOW) */
38132 +               result = 1;
38133 +       else
38134 +               result = reiser4_seal_validate(&seal, coord, &entry_key,
38135 +                                              tap->lh, tap->mode,
38136 +                                              ZNODE_LOCK_HIPRI);
38137 +       return result;
38138 +}
38139 +
38140 +static void move_entry(struct readdir_pos * pos, coord_t * coord)
38141 +{
38142 +       reiser4_key de_key;
38143 +       de_id *did;
38144 +
38145 +       /* advance @pos by one entry; @coord is the entry it now looks at */
38146 +       ++pos->entry_no;
38147 +       did = &pos->position.dir_entry_key;
38148 +
38149 +       /* get key of directory entry at @coord */
38150 +       unit_key_by_coord(coord, &de_key);
38151 +
38152 +       if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
38153 +               /* we are within sequence of directory entries
38154 +                  with duplicate keys. */
38155 +               ++pos->position.pos;
38156 +       else {
38157 +               pos->position.pos = 0;
38158 +               build_de_id_by_key(&de_key, did);
38159 +       }
38160 +       ++pos->fpos;
38161 +}
38162 +
38163 +/*
38164 + *     STATELESS READDIR
38165 + *
38166 + * readdir support in reiser4 relies on ability to update readdir_pos embedded
38167 + * into reiser4_file_fsdata on each directory modification (name insertion and
38168 + * removal), see reiser4_readdir_common() function below. This obviously doesn't
38169 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
38170 + * across client READDIR requests for the same directory.
38171 + *
38172 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
38173 + * (d_cursor). Whenever NFS readdir request comes, we detect this, and try to
38174 + * find detached reiser4_file_fsdata corresponding to previous readdir
38175 + * request. In other words, additional state is maintained on the
38176 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
38177 + *
38178 + * To efficiently detect when our ->readdir() method is called by NFS server,
38179 + * dentry is marked as "stateless" in reiser4_decode_fh() (this is checked by
38180 + * file_is_stateless() function).
38181 + *
38182 + * To find out d_cursor in the pool, we encode client id (cid) in the highest
38183 + * bits of NFS readdir cookie: when first readdir request comes to the given
38184 + * directory from the given client, cookie is set to 0. This situation is
38185 + * detected, global cid_counter is incremented, and stored in highest bits of
38186 + * all direntry offsets returned to the client, including last one. As the
38187 + * only valid readdir cookie is one obtained as direntry->offset, we are
38188 + * guaranteed that next readdir request (continuing current one) will have
38189 + * current cid in the highest bits of starting readdir cookie. All d_cursors
38190 + * are hashed into per-super-block hash table by (oid, cid) key.
38191 + *
38192 + * In addition d_cursors are placed into per-super-block radix tree where they
38193 + * are keyed by oid alone. This is necessary to efficiently remove them during
38194 + * rmdir.
38195 + *
38196 + * Finally, currently unused d_cursors are linked into a special list. This
38197 + * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
38198 + *
38199 + */
38200 +
38201 +/*
38202 + * prepare for readdir: attach fsdata to @f, put it on the readdir list of the
38203 + * directory inode, store readdir position in *@pos and rewind @tap to it. */
38204 +static int dir_readdir_init(struct file *f, tap_t * tap,
38205 +                           struct readdir_pos ** pos)
38206 +{
38207 +       struct inode *inode;
38208 +       reiser4_file_fsdata *fsdata;
38209 +       int result;
38210 +
38211 +       assert("nikita-1359", f != NULL);
38212 +       inode = f->f_dentry->d_inode;
38213 +       assert("nikita-1360", inode != NULL);
38214 +
38215 +       if (!S_ISDIR(inode->i_mode))
38216 +               return RETERR(-ENOTDIR);
38217 +
38218 +       /* try to find detached readdir state */
38219 +       result = reiser4_attach_fsdata(f, inode);
38220 +       if (result != 0)
38221 +               return result;
38222 +
38223 +       fsdata = reiser4_get_file_fsdata(f);
38224 +       assert("nikita-2571", fsdata != NULL);
38225 +       if (IS_ERR(fsdata))
38226 +               return PTR_ERR(fsdata);
38227 +
38228 +       /* add file descriptor to the readdir list hanging off the directory
38229 +        * inode. This list is used to scan "readdirs-in-progress" while
38230 +        * inserting or removing names in the directory. */
38231 +       spin_lock_inode(inode);
38232 +       if (list_empty_careful(&fsdata->dir.linkage))
38233 +               list_add(&fsdata->dir.linkage, get_readdir_list(inode));
38234 +       *pos = &fsdata->dir.readdir;
38235 +       spin_unlock_inode(inode);
38236 +
38237 +       /* move @tap to the current position */
38238 +       return dir_rewind(f, *pos, tap);
38239 +}
38240 +
38241 +/* this is implementation of vfs's llseek method of struct file_operations for
38242 +   typical directory: default_llseek() under i_mutex, then the readdir position
38243 +   is re-established through dir_readdir_init(). See comment before
38244 +   reiser4_readdir_common() for explanation. */
38245 +loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
38246 +{
38247 +       reiser4_context *ctx;
38248 +       loff_t result;
38249 +       struct inode *inode;
38250 +
38251 +       inode = file->f_dentry->d_inode;
38252 +
38253 +       ctx = reiser4_init_context(inode->i_sb);
38254 +       if (IS_ERR(ctx))
38255 +               return PTR_ERR(ctx);
38256 +
38257 +       mutex_lock(&inode->i_mutex);
38258 +
38259 +       /* update ->f_pos; on failure the error is returned as is */
38260 +       result = default_llseek(file, off, origin);
38261 +       if (result >= 0) {
38262 +               int ff;
38263 +               coord_t coord;
38264 +               lock_handle lh;
38265 +               tap_t tap;
38266 +               struct readdir_pos *pos;
38267 +
38268 +               coord_init_zero(&coord);
38269 +               init_lh(&lh);
38270 +               reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38271 +
38272 +               ff = dir_readdir_init(file, &tap, &pos);
38273 +               reiser4_detach_fsdata(file);
38274 +               if (ff != 0)
38275 +                       result = (loff_t) ff;
38276 +               reiser4_tap_done(&tap);
38277 +       }
38278 +       reiser4_detach_fsdata(file);
38279 +       mutex_unlock(&inode->i_mutex);
38280 +
38281 +       reiser4_exit_context(ctx);
38282 +       return result;
38283 +}
38284 +
38285 +/* this is common implementation of vfs's readdir method of struct
38286 +   file_operations
38287 +
38288 +   readdir problems:
38289 +
38290 +   readdir(2)/getdents(2) interface is based on implicit assumption that
38291 +   readdir can be restarted from any particular point by supplying file system
38292 +   with off_t-full of data. That is, file system fills ->d_off field in struct
38293 +   dirent and later user passes ->d_off to the seekdir(3), which is, actually,
38294 +   implemented by glibc as lseek(2) on directory.
38295 +
38296 +   Reiser4 cannot restart readdir from 64 bits of data, because two last
38297 +   components of the key of directory entry are unknown, which gives 128 bits:
38298 +   locality and type fields in the key of directory entry are always known, to
38299 +   start readdir() from given point objectid and offset fields have to be
38300 +   filled.
38301 +
38302 +   Traditional UNIX API for scanning through directory
38303 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the
38304 +   assumption that directory is structured very much like regular file, in
38305 +   particular, it is implied that each name within given directory (directory
38306 +   entry) can be uniquely identified by scalar offset and that such offset is
38307 +   stable across the life-time of the name it identifies.
38308 +
38309 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
38310 +   identifier for the directory entry is its key that doesn't fit into
38311 +   seekdir/telldir API.
38312 +
38313 +   solution:
38314 +
38315 +   Within each file descriptor participating in readdir-ing of directory
38316 +   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
38317 +   the "current" directory entry that file descriptor looks at. It contains a
38318 +   key of directory entry (plus some additional info to deal with non-unique
38319 +   keys that we wouldn't dwell onto here) and a logical position of this
38320 +   directory entry starting from the beginning of the directory, that is
38321 +   ordinal number of this entry in the readdir order.
38322 +
38323 +   Obviously this logical position is not stable in the face of directory
38324 +   modifications. To work around this, on each addition or removal of directory
38325 +   entry all file descriptors for directory inode are scanned and their
38326 +   readdir_pos are updated accordingly (adjust_dir_pos()).
38327 +*/
38328 +int reiser4_readdir_common(struct file *f /* directory file being read */,
38329 +                          void *dirent /* opaque data passed to us by VFS */,
38330 +                          filldir_t filld /* filler function passed to us
38331 +                                           * by VFS */)
38332 +{
38333 +       reiser4_context *ctx;
38334 +       int result;
38335 +       struct inode *inode;
38336 +       coord_t coord;
38337 +       lock_handle lh;
38338 +       tap_t tap;
38339 +       struct readdir_pos *pos;
38340 +
38341 +       assert("nikita-1359", f != NULL);
38342 +       inode = f->f_dentry->d_inode;
38343 +       assert("nikita-1360", inode != NULL);
38344 +
38345 +       if (!S_ISDIR(inode->i_mode))
38346 +               return RETERR(-ENOTDIR);
38347 +
38348 +       ctx = reiser4_init_context(inode->i_sb);
38349 +       if (IS_ERR(ctx))
38350 +               return PTR_ERR(ctx);
38351 +
38352 +       coord_init_zero(&coord);
38353 +       init_lh(&lh);
38354 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
38355 +
38356 +       reiser4_readdir_readahead_init(inode, &tap);
38357 +
38358 +      repeat:
38359 +       result = dir_readdir_init(f, &tap, &pos);
38360 +       if (result == 0) {
38361 +               result = reiser4_tap_load(&tap);
38362 +               /* scan entries one by one feeding them to @filld */
38363 +               while (result == 0) {
38364 +                       coord_t *coord;
38365 +
38366 +                       coord = tap.coord;
38367 +                       assert("nikita-2572", coord_is_existing_unit(coord));
38368 +                       assert("nikita-3227", is_valid_dir_coord(inode, coord));
38369 +
38370 +                       result = feed_entry(f, pos, &tap, filld, dirent);
38371 +                       if (result > 0) {
38372 +                               break;
38373 +                       } else if (result == 0) {
38374 +                               ++f->f_pos;
38375 +                               result = go_next_unit(&tap);
38376 +                               if (result == -E_NO_NEIGHBOR ||
38377 +                                   result == -ENOENT) {
38378 +                                       result = 0;
38379 +                                       break;
38380 +                               } else if (result == 0) {
38381 +                                       if (is_valid_dir_coord(inode, coord))
38382 +                                               move_entry(pos, coord);
38383 +                                       else
38384 +                                               break;
38385 +                               }
38386 +                       } else if (result == -E_REPEAT) {
38387 +                               /* feed_entry() had to restart: re-init tap */
38388 +                               ++f->f_pos;
38389 +                               reiser4_tap_relse(&tap);
38390 +                               goto repeat;
38391 +                       } else
38392 +                               warning("vs-1617",
38393 +                                       "reiser4_readdir_common: unexpected error %d",
38394 +                                       result);
38395 +               }
38396 +               reiser4_tap_relse(&tap);
38397 +
38398 +               if (result >= 0)
38399 +                       f->f_version = inode->i_version;
38400 +       } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
38401 +               result = 0;
38402 +       reiser4_tap_done(&tap);
38403 +       reiser4_detach_fsdata(f);
38404 +
38405 +       /* try to update directory's atime; failure is only warned about */
38406 +       if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
38407 +                              BA_CAN_COMMIT) != 0)
38408 +               warning("", "failed to update atime on readdir: %llu",
38409 +                       (unsigned long long)get_inode_oid(inode));
38410 +       else
38411 +               file_accessed(f);
38412 +
38413 +       context_set_commit_async(ctx);
38414 +       reiser4_exit_context(ctx);
38415 +
38416 +       return (result <= 0) ? result : 0;      /* positive result maps to 0 */
38417 +}
38418 +
38419 +/*
38420 + * Local variables:
38421 + * c-indentation-style: "K&R"
38422 + * mode-name: "LC"
38423 + * c-basic-offset: 8
38424 + * tab-width: 8
38425 + * fill-column: 79
38426 + * End:
38427 + */
38428 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/file_plugin_common.c linux-2.6.27/fs/reiser4/plugin/file_plugin_common.c
38429 --- linux-2.6.27.orig/fs/reiser4/plugin/file_plugin_common.c    1970-01-01 03:00:00.000000000 +0300
38430 +++ linux-2.6.27/fs/reiser4/plugin/file_plugin_common.c 2008-10-12 18:20:01.000000000 +0400
38431 @@ -0,0 +1,1009 @@
38432 +/* Copyright 2005 by Hans Reiser, licensing governed by
38433 +   reiser4/README */
38434 +
38435 +/* this file contains typical implementations for most of methods of
38436 +   file plugin
38437 +*/
38438 +
38439 +#include "../inode.h"
38440 +#include "object.h"
38441 +#include "../safe_link.h"
38442 +
38443 +#include <linux/quotaops.h>
38444 +
38445 +static int insert_new_sd(struct inode *inode);
38446 +static int update_sd(struct inode *inode);
38447 +
38448 +/* Common implementation of write_sd_by_inode method of file plugin: inserts
38449 +   stat data if inode has REISER4_NO_SD flag set, updates it otherwise.
38450 +   Returns result of that operation; most failures are logged (see below). */
38451 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
38452 +{
38453 +       int result;
38454 +
38455 +       assert("nikita-730", inode != NULL);
38456 +
38457 +       if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38458 +               /* object doesn't have stat-data yet */
38459 +               result = insert_new_sd(inode);
38460 +       else
38461 +               result = update_sd(inode);
38462 +       if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
38463 +               /* Don't issue warnings about "name is too long" or -ENOMEM */
38464 +               warning("nikita-2221", "Failed to save sd for %llu: %i",
38465 +                       (unsigned long long)get_inode_oid(inode), result);
38466 +       return result;
38467 +}
38468 +
38469 +/* Common implementation of key_by_inode method of file plugin: fills @key from
38470 +   @inode's locality, ordering and oid, KEY_BODY_MINOR type and @off */
38471 +int
38472 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
38473 +                              reiser4_key * key)
38474 +{
38475 +       reiser4_key_init(key);
38476 +       set_key_locality(key, reiser4_inode_data(inode)->locality_id);
38477 +       set_key_ordering(key, get_inode_ordering(inode));
38478 +       set_key_objectid(key, get_inode_oid(inode));    /*FIXME: inode->i_ino */
38479 +       set_key_type(key, KEY_BODY_MINOR);
38480 +       set_key_offset(key, (__u64) off);
38481 +       return 0;
38482 +}
38483 +
38484 +/* Common implementation of set_plug_in_inode method of file plugin: fills
38485 +   mode, owner, times and stat-data extension mask of @object; returns 0 */
38486 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
38487 +                            struct inode *parent /* parent object */ ,
38488 +                            reiser4_object_create_data * data  /* creational
38489 +                                                                * data */ )
38490 +{
38491 +       __u64 mask;
38492 +
38493 +       object->i_mode = data->mode;
38494 +       /* this should be plugin decision */
38495 +       object->i_uid = current->fsuid;
38496 +       object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
38497 +
38498 +       /* support for BSD style group-id assignment. See mount's manual page
38499 +          description of the bsdgroups ext2 mount option for more details */
38500 +       if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
38501 +               object->i_gid = parent->i_gid;
38502 +       else if (parent->i_mode & S_ISGID) {
38503 +               /* parent directory has setgid bit */
38504 +               object->i_gid = parent->i_gid;
38505 +               if (S_ISDIR(object->i_mode))
38506 +                       /* setgid bit is inherited by sub-directories */
38507 +                       object->i_mode |= S_ISGID;
38508 +       } else
38509 +               object->i_gid = current->fsgid;
38510 +
38511 +       /* this object doesn't have stat-data yet */
38512 +       reiser4_inode_set_flag(object, REISER4_NO_SD);
38513 +#if 0
38514 +       /* this is now called after all inode plugins are initialized:
38515 +          do_create_vfs_child after adjust_to_parent */
38516 +       /* setup inode and file-operations for this inode */
38517 +       setup_inode_ops(object, data);
38518 +#endif
38519 +       object->i_nlink = 0;
38520 +       reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
38521 +       mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
38522 +       if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
38523 +               mask |= (1 << LARGE_TIMES_STAT);
38524 +
38525 +       reiser4_inode_data(object)->extmask = mask;
38526 +       return 0;
38527 +}
38528 +
38529 +/* Common implementation of adjust_to_parent method of file plugin for regular
38530 +   files: inherits file, sd, formatting and perm plugins from @parent (from
38531 +   @root when @parent is NULL). Always returns 0. */
38532 +int adjust_to_parent_common(struct inode *object /* new object */ ,
38533 +                           struct inode *parent /* parent directory */ ,
38534 +                           struct inode *root /* root directory */ )
38535 +{
38536 +       assert("nikita-2165", object != NULL);
38537 +       if (parent == NULL)
38538 +               parent = root;
38539 +       assert("nikita-2069", parent != NULL);
38540 +
38541 +       /*
38542 +        * inherit missing plugins from parent. NOTE(review): unlike the _dir
38543 +        * variant, results of grab_plugin_pset() are ignored here - confirm */
38544 +
38545 +       grab_plugin_pset(object, parent, PSET_FILE);
38546 +       grab_plugin_pset(object, parent, PSET_SD);
38547 +       grab_plugin_pset(object, parent, PSET_FORMATTING);
38548 +       grab_plugin_pset(object, parent, PSET_PERM);
38549 +       return 0;
38550 +}
38551 +
38552 +/* Common implementation of adjust_to_parent method of file plugin for typical
38553 +   directories: inherits every plugin set member from @parent (@root if @parent
38554 +   is NULL), returning the first non-zero grab_plugin_pset() result, or 0. */
38555 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
38556 +                               struct inode *parent /* parent directory */ ,
38557 +                               struct inode *root /* root directory */ )
38558 +{
38559 +       int result = 0;
38560 +       pset_member memb;
38561 +
38562 +       assert("nikita-2166", object != NULL);
38563 +       if (parent == NULL)
38564 +               parent = root;
38565 +       assert("nikita-2167", parent != NULL);
38566 +
38567 +       /*
38568 +        * inherit missing plugins from parent, stopping at the first failure
38569 +        */
38570 +       for (memb = 0; memb < PSET_LAST; ++memb) {
38571 +               result = grab_plugin_pset(object, parent, memb);
38572 +               if (result != 0)
38573 +                       break;
38574 +       }
38575 +       return result;
38576 +}
38577 +
38578 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
38579 +                                  struct inode *parent /* parent directory */,
38580 +                                  struct inode *root /* root directory */)
38581 +{
38582 +       int result;     /* result of the common (regular file) part */
38583 +       result = adjust_to_parent_common(object, parent, root);
38584 +       if (result)
38585 +               return result;
38586 +       assert("edward-1416", parent != NULL);
38587 +       /* also grab cluster, cipher, digest and compression plugins of parent */
38588 +       grab_plugin_pset(object, parent, PSET_CLUSTER);
38589 +       grab_plugin_pset(object, parent, PSET_CIPHER);
38590 +       grab_plugin_pset(object, parent, PSET_DIGEST);
38591 +       grab_plugin_pset(object, parent, PSET_COMPRESSION);
38592 +       grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
38593 +
38594 +       return 0;
38595 +}
38596 +
38597 +/* Common implementation of create_object method of file plugin: grabs space
38598 +   for one item insertion (-ENOSPC on failure), then writes sd of @object */
38599 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
38600 +                                reiser4_object_create_data * data)
38601 +{
38602 +       reiser4_block_nr reserve;
38603 +       assert("nikita-744", object != NULL);
38604 +       assert("nikita-745", parent != NULL);
38605 +       assert("nikita-747", data != NULL);
38606 +       assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
38607 +
38608 +       reserve = estimate_create_common(object);
38609 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
38610 +               return RETERR(-ENOSPC);
38611 +       return write_sd_by_inode_common(object);
38612 +}
38613 +
38614 +static int common_object_delete_no_reserve(struct inode *inode);
38615 +
38616 +/**
38617 + * reiser4_delete_object_common - delete_object of file_plugin
38618 + * @inode: inode to be deleted
38619 + *
38620 + * This is common implementation of delete_object method of file_plugin. It
38621 + * applies to objects whose deletion consists of removing two items - stat
38622 + * data and safe-link.
38623 + */
38624 +int reiser4_delete_object_common(struct inode *inode)
38625 +{
38626 +       int result;
38627 +
38628 +       assert("nikita-1477", inode != NULL);
38629 +       /* FIXME: if file body deletion failed (i/o error, for instance),
38630 +          inode->i_size can be != 0 here */
38631 +       assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
38632 +       assert("nikita-3421", inode->i_nlink == 0);
38633 +
38634 +       if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
38635 +               reiser4_block_nr reserve;
38636 +
38637 +               /* grab space which is needed to remove 2 items from the tree:
38638 +                  stat data and safe-link */
38639 +               reserve = 2 *
38640 +                 estimate_one_item_removal(reiser4_tree_by_inode(inode));
38641 +               if (reiser4_grab_space_force(reserve,
38642 +                                            BA_RESERVED | BA_CAN_COMMIT))
38643 +                       return RETERR(-ENOSPC);
38644 +               result = common_object_delete_no_reserve(inode);
38645 +       } else
38646 +               result = 0;     /* REISER4_NO_SD is set: nothing to remove */
38647 +       return result;
38648 +}
38649 +
38650 +/**
38651 + * reiser4_delete_dir_common - delete_object of file_plugin
38652 + * @inode: inode to be deleted
38653 + *
38654 + * This is common implementation of delete_object method of file_plugin for
38655 + * typical directory. It calls done method of dir_plugin to remove "." and
38656 + * then, if that succeeded, removes stat data and safe-link.
38657 + */
38658 +int reiser4_delete_dir_common(struct inode *inode)
38659 +{
38660 +       int result;
38661 +       dir_plugin *dplug;
38662 +
38663 +       assert("", (get_current_context() &&
38664 +                   get_current_context()->trans->atom == NULL));
38665 +
38666 +       dplug = inode_dir_plugin(inode);
38667 +       assert("vs-1101", dplug && dplug->done);
38668 +
38669 +       /* kill cursors which might be attached to inode */
38670 +       reiser4_kill_cursors(inode);
38671 +
38672 +       /* grab space enough for removing two items, -ENOSPC if that fails */
38673 +       if (reiser4_grab_space
38674 +           (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
38675 +            BA_RESERVED | BA_CAN_COMMIT))
38676 +               return RETERR(-ENOSPC);
38677 +
38678 +       result = dplug->done(inode);
38679 +       if (!result)
38680 +               result = common_object_delete_no_reserve(inode);
38681 +       return result;
38682 +}
38683 +
38684 +/* Common implementation of add_link method of file plugin. @parent is not
38685 +   used. Always returns 0. */
38686 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
38687 +{
38688 +       /*
38689 +        * increment ->i_nlink and update ->i_ctime
38690 +        */
38691 +
38692 +       INODE_INC_FIELD(object, i_nlink);
38693 +       object->i_ctime = CURRENT_TIME;
38694 +       return 0;
38695 +}
38696 +
38697 +/* Common implementation of rem_link method of file plugin. @parent is not
38698 +   used. Always returns 0. */
38699 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
38700 +{
38701 +       assert("nikita-2021", object != NULL);
38702 +       assert("nikita-2163", object->i_nlink > 0);
38703 +
38704 +       /*
38705 +        * decrement ->i_nlink and update ->i_ctime
38706 +        */
38707 +
38708 +       INODE_DEC_FIELD(object, i_nlink);
38709 +       object->i_ctime = CURRENT_TIME;
38710 +       return 0;
38711 +}
38712 +
38713 +/* Common implementation of rem_link method of file plugin for typical
38714 +   directory. @parent is not used. Always returns 0.
38715 +*/
38716 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
38717 +{
38718 +       assert("nikita-20211", object != NULL);
38719 +       assert("nikita-21631", object->i_nlink > 0);
38720 +
38721 +       /*
38722 +        * decrement ->i_nlink (once more if it drops to 1), update ->i_ctime
38723 +        */
38724 +       INODE_DEC_FIELD(object, i_nlink);
38725 +       if (object->i_nlink == 1)
38726 +               INODE_DEC_FIELD(object, i_nlink);
38727 +       object->i_ctime = CURRENT_TIME;
38728 +       return 0;
38729 +}
38730 +
38731 +/* Common implementation of owns_item method of file plugin: true iff @coord is
38732 +   an existing item whose key has the same objectid as sd key of @inode */
38733 +int owns_item_common(const struct inode *inode,        /* object to check
38734 +                                                * against */
38735 +                    const coord_t * coord /* coord to check */ )
38736 +{
38737 +       reiser4_key item_key;
38738 +       reiser4_key file_key;
38739 +
38740 +       assert("nikita-760", inode != NULL);
38741 +       assert("nikita-761", coord != NULL);
38742 +
38743 +       return coord_is_existing_item(coord) &&
38744 +           (get_key_objectid(build_sd_key(inode, &file_key)) ==
38745 +            get_key_objectid(item_key_by_coord(coord, &item_key)));
38746 +}
38747 +
38748 +/* Common implementation of owns_item method of file plugin for typical
38749 +   directory: a directory entry item is owned if locality of its key equals
38750 +   oid of @inode; any other item is checked by owns_item_common() */
38751 +int owns_item_common_dir(const struct inode *inode,    /* object to check against */
38752 +                        const coord_t * coord /* coord of item to check */ )
38753 +{
38754 +       reiser4_key item_key;
38755 +
38756 +       assert("nikita-1335", inode != NULL);
38757 +       assert("nikita-1334", coord != NULL);
38758 +
38759 +       if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
38760 +               return get_key_locality(item_key_by_coord(coord, &item_key)) ==
38761 +                   get_inode_oid(inode);
38762 +       else
38763 +               return owns_item_common(inode, coord);
38764 +}
38765 +
38766 +/* Common implementation of can_add_link method of file plugin: checks whether
38767 +   yet another hard link to this object can be added
38768 +*/
38769 +int can_add_link_common(const struct inode *object /* object to check */ )
38770 +{
38771 +       assert("nikita-732", object != NULL);
38772 +
38773 +       /* inode->i_nlink is unsigned int, so just check for integer
38774 +          overflow */
38775 +       return object->i_nlink + 1 != 0;
38776 +}
38777 +
38778 +/* Common implementation of can_rem_link method of file plugin for typical
38779 +   directory: non-zero only when is_dir_empty() returns 0 for @inode
38780 +*/
38781 +int can_rem_link_common_dir(const struct inode *inode)
38782 +{
38783 +       /* is_dir_empty() returns 0 if dir is empty */
38784 +       return !is_dir_empty(inode);
38785 +}
38786 +
38787 +/* Common implementation of detach method of file plugin for typical
38788 +   directory: returns what ->detach() of @child's dir plugin returns
38789 +*/
38790 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
38791 +{
38792 +       dir_plugin *dplug;
38793 +
38794 +       dplug = inode_dir_plugin(child);
38795 +       assert("nikita-2883", dplug != NULL);
38796 +       assert("nikita-2884", dplug->detach != NULL);
38797 +       return dplug->detach(child, parent);
38798 +}
38799 +
38800 +/* Common implementation of bind method of file plugin for typical directory:
38801 +   calls ->attach() of @child's dir plugin. NOTE(review): unlike detach above,
38802 +   ->attach is not asserted to be non-NULL - confirm it is always set. */
38803 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
38804 +{
38805 +       dir_plugin *dplug;
38806 +
38807 +       dplug = inode_dir_plugin(child);
38808 +       assert("nikita-2646", dplug != NULL);
38809 +       return dplug->attach(child, parent);
38810 +}
38811 +
38812 +static int process_truncate(struct inode *, __u64 size);
38813 +
38814 +/* Common implementation of safelink method of file plugin: SAFE_TRUNCATE is
38815 +   handed to process_truncate(), unrecognized link types yield -EIO */
38816 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
38817 +{
38818 +       int result;
38819 +
38820 +       assert("vs-1705", get_current_context()->trans->atom == NULL);
38821 +       if (link == SAFE_UNLINK)
38822 +               /* nothing to do. iput() in the caller (process_safelink) will
38823 +                * finish with file */
38824 +               result = 0;
38825 +       else if (link == SAFE_TRUNCATE)
38826 +               result = process_truncate(object, value);
38827 +       else {
38828 +               warning("nikita-3438", "Unrecognized safe-link type: %i", link);
38829 +               result = RETERR(-EIO);
38830 +       }
38831 +       return result;
38832 +}
38833 +
38834 +/* Common implementation of estimate.create method of file plugin.
38835 +   Can be used when object creation involves insertion of one item (usually
38836 +   stat data) into tree.
38837 +*/
38838 +reiser4_block_nr estimate_create_common(const struct inode * object)
38839 +{
38840 +       return estimate_one_insert_item(reiser4_tree_by_inode(object));
38841 +}
38842 +
38843 +/* Common implementation of estimate.create method of file plugin for typical
38844 +   directory.
38845 +   Can be used when directory creation involves insertion of two items (usually
38846 +   stat data and item containing "." and "..") into tree.
38847 +*/
38848 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
38849 +{
38850 +       return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
38851 +}
38852 +
38853 +/* Common implementation of estimate.update method of file plugin.
38854 +   Can be used when stat data update does not do more than inserting a unit
38855 +   into a stat data item, which is probably true for most cases.
38856 +*/
38857 +reiser4_block_nr estimate_update_common(const struct inode * inode)
38858 +{
38859 +       return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
38860 +}
38861 +
38862 +/* Common implementation of estimate.unlink method of file plugin: returns 0,
38863 +   both arguments are unused */
38864 +reiser4_block_nr
38865 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
38866 +                      const struct inode * parent UNUSED_ARG)
38867 +{
38868 +       return 0;
38869 +}
38870 +
38871 +/* Common implementation of estimate.unlink method of file plugin for typical
38872 +   directory: returns what ->estimate.unlink() of @object's dir plugin returns
38873 +*/
38874 +reiser4_block_nr
38875 +estimate_unlink_common_dir(const struct inode * object,
38876 +                          const struct inode * parent)
38877 +{
38878 +       dir_plugin *dplug;
38879 +
38880 +       dplug = inode_dir_plugin(object);
38881 +       assert("nikita-2888", dplug != NULL);
38882 +       assert("nikita-2887", dplug->estimate.unlink != NULL);
38883 +       return dplug->estimate.unlink(object, parent);
38884 +}
38885 +
38886 +char *wire_write_common(struct inode *inode, char *start)
38887 +{
38888 +       return build_inode_onwire(inode, start);        /* plain wrapper */
38889 +}
38890 +
38891 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
38892 +{
38893 +       if (!obj)       /* no @obj to extract key id into */
38894 +               return locate_obj_key_id_onwire(addr);
38895 +       return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
38896 +}
38897 +
38898 +struct dentry *wire_get_common(struct super_block *sb,
38899 +                              reiser4_object_on_wire * obj)
38900 +{
38901 +       struct inode *inode;
38902 +       struct dentry *dentry;
38903 +       reiser4_key key;
38904 +       /* find inode by the key stored in @obj, get anonymous dentry for it */
38905 +       extract_key_from_id(&obj->u.std.key_id, &key);
38906 +       inode = reiser4_iget(sb, &key, 1);
38907 +       if (!IS_ERR(inode)) {
38908 +               reiser4_iget_complete(inode);
38909 +               dentry = d_alloc_anon(inode);
38910 +               if (dentry == NULL) {
38911 +                       iput(inode);
38912 +                       dentry = ERR_PTR(-ENOMEM);
38913 +               } else
38914 +                       dentry->d_op = &get_super_private(sb)->ops.dentry;
38915 +       } else if (PTR_ERR(inode) == -ENOENT)
38916 +               /*
38917 +                * inode wasn't found at the key encoded in the file
38918 +                * handle. Hence, file handle is stale.
38919 +                */
38920 +               dentry = ERR_PTR(RETERR(-ESTALE));
38921 +       else
38922 +               dentry = (void *)inode; /* pass any other error up as is */
38923 +       return dentry;
38924 +}
38925 +
38926 +int wire_size_common(struct inode *inode)
38927 +{
38928 +       return inode_onwire_size(inode);        /* plain wrapper */
38929 +}
38930 +
38931 +void wire_done_common(reiser4_object_on_wire * obj)
38932 +{
38933 +       /* nothing to do: @obj is not touched */
38934 +}
38935 +
38936 +/* helper to print errors: warns about @code and prints @key, unless -ENOMEM */
38937 +static void key_warning(const reiser4_key * key /* key to print */ ,
38938 +                       const struct inode *inode,      /* not used */
38939 +                       int code /* error code to print */ )
38940 +{
38941 +       assert("nikita-716", key != NULL);
38942 +
38943 +       if (code != -ENOMEM) {
38944 +               warning("nikita-717", "Error for inode %llu (%i)",
38945 +                       (unsigned long long)get_key_objectid(key), code);
38946 +               reiser4_print_key("for key", key);
38947 +       }
38948 +}
38949 +
38950 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
38951 +#if REISER4_DEBUG
38952 +static void
38953 +check_inode_seal(const struct inode *inode,
38954 +                const coord_t * coord, const reiser4_key * key)
38955 +{
38956 +       reiser4_key unit_key;
38957 +       /* debugging check: unit at @coord has @key, @key has oid of @inode */
38958 +       unit_key_by_coord(coord, &unit_key);
38959 +       assert("nikita-2752",
38960 +              WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
38961 +       assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
38962 +}
38963 +
38964 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
38965 +{
38966 +       reiser4_key ukey;
38967 +
38968 +       coord_clear_iplug(coord);
38969 +       if (zload(coord->node))
38970 +               return; /* node cannot be loaded: skip the check */
38971 +       /* complain loudly if @coord is not a leaf-level stat data with @key */
38972 +       if (!coord_is_existing_unit(coord) ||
38973 +           !item_plugin_by_coord(coord) ||
38974 +           !keyeq(unit_key_by_coord(coord, &ukey), key) ||
38975 +           (znode_get_level(coord->node) != LEAF_LEVEL) ||
38976 +           !item_is_statdata(coord)) {
38977 +               warning("nikita-1901", "Conspicuous seal");
38978 +               reiser4_print_key("key", key);
38979 +               print_coord("coord", coord, 1);
38980 +               impossible("nikita-2877", "no way");
38981 +       }
38982 +       zrelse(coord->node);
38983 +}
38984 +
38985 +#else /* !REISER4_DEBUG: both checks expand to noop */
38986 +#define check_inode_seal(inode, coord, key) noop
38987 +#define check_sd_coord(coord, key) noop
38988 +#endif /* REISER4_DEBUG */
38989 +
38990 +/* insert new stat-data into tree. Called with inode state
38991 +    locked. Return inode state locked. */
38992 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
38993 +{
38994 +       int result;
38995 +       reiser4_key key;
38996 +       coord_t coord;
38997 +       reiser4_item_data data;
38998 +       char *area;
38999 +       reiser4_inode *ref;
39000 +       lock_handle lh;
39001 +       oid_t oid;
39002 +
39003 +       assert("nikita-723", inode != NULL);
39004 +       assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
39005 +
39006 +       ref = reiser4_inode_data(inode);
39007 +       spin_lock_inode(inode);
39008 +
39009 +       if (ref->plugin_mask != 0)
39010 +               /* inode has non-standard plugins */
39011 +               inode_set_extension(inode, PLUGIN_STAT);
39012 +       /*
39013 +        * prepare specification of new item to be inserted
39014 +        */
39015 +
39016 +       data.iplug = inode_sd_plugin(inode);
39017 +       data.length = data.iplug->s.sd.save_len(inode);
39018 +       spin_unlock_inode(inode);
39019 +
39020 +       data.data = NULL;
39021 +       data.user = 0;
39022 +/* could be optimized for case where there is only one node format in
39023 + * use in the filesystem, probably there are lots of such
39024 + * places we could optimize for only one node layout.... -Hans */
39025 +       if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
39026 +               /* This is a silly check, but we don't know the actual node
39027 +                  the insertion will go into. */
39028 +               return RETERR(-ENAMETOOLONG);
39029 +       }
39030 +       oid = oid_allocate(inode->i_sb);
39031 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
39032 +       if (oid == ABSOLUTE_MAX_OID)
39033 +               return RETERR(-EOVERFLOW);
39034 +
39035 +       set_inode_oid(inode, oid);
39036 +
39037 +       coord_init_zero(&coord);
39038 +       init_lh(&lh);
39039 +
39040 +       result = insert_by_key(reiser4_tree_by_inode(inode),
39041 +                              build_sd_key(inode, &key), &data, &coord, &lh,
39042 +                              /* stat data lives on a leaf level */
39043 +                              LEAF_LEVEL, CBK_UNIQUE);
39044 +
39045 +       /* we don't want to re-check that somebody didn't insert
39046 +          stat-data while we were doing io, because if they did,
39047 +          insert_by_key() returned error. */
39048 +       /* but what _is_ possible is that plugin for inode's stat-data,
39049 +          list of non-standard plugins or their state would change
39050 +          during io, so that stat-data wouldn't fit into sd. To avoid
39051 +          this race we keep inode_state lock. This lock has to be
39052 +          taken each time you access inode in a way that would cause
39053 +          changes in sd size: changing plugins etc.
39054 +        */
39055 +
39056 +       if (result == IBK_INSERT_OK) {
39057 +               coord_clear_iplug(&coord);
39058 +               result = zload(coord.node);
39059 +               if (result == 0) {
39060 +                       /* have we really inserted stat data? */
39061 +                       assert("nikita-725", item_is_statdata(&coord));
39062 +
39063 +                       /* inode was just created. It is inserted into hash
39064 +                          table, but no directory entry was yet inserted into
39065 +                          parent. So, inode is inaccessible through
39066 +                          ->lookup(). All places that directly grab inode
39067 +                          from hash-table (like old knfsd), should check
39068 +                          IMMUTABLE flag that is set by common_create_child.
39069 +                        */
39070 +                       assert("nikita-3240", data.iplug != NULL);
39071 +                       assert("nikita-3241", data.iplug->s.sd.save != NULL);
39072 +                       area = item_body_by_coord(&coord);
39073 +                       result = data.iplug->s.sd.save(inode, &area);
39074 +                       znode_make_dirty(coord.node);
39075 +                       if (result == 0) {
39076 +                               /* object has stat-data now */
39077 +                               reiser4_inode_clr_flag(inode, REISER4_NO_SD);
39078 +                               reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39079 +                               /* initialise stat-data seal */
39080 +                               reiser4_seal_init(&ref->sd_seal, &coord, &key);
39081 +                               ref->sd_coord = coord;
39082 +                               check_inode_seal(inode, &coord, &key);
39083 +                       } else if (result != -ENOMEM)
39084 +                               /*
39085 +                                * convert any other error code to -EIO to
39086 +                                * avoid confusing user level with unexpected
39087 +                                * errors.
39088 +                                */
39089 +                               result = RETERR(-EIO);
39090 +                       zrelse(coord.node);
39091 +               }
39092 +       }
39093 +       done_lh(&lh);
39094 +       /* NOTE(review): on failure the allocated oid is not released - confirm */
39095 +       if (result != 0)
39096 +               key_warning(&key, inode, result);
39097 +       else
39098 +               oid_count_allocated();
39099 +
39100 +       return result;
39101 +}
39102 +
39103 +/* find stat-data (sd) of @inode in a tree, warn on failure unless @silent */
39104 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
39105 +             znode_lock_mode lock_mode /* lock mode */ ,
39106 +             coord_t * coord /* resulting coord */ ,
39107 +             lock_handle * lh /* resulting lock handle */ ,
39108 +             const reiser4_key * key /* key of stat-data to look for */ ,
39109 +             int silent /* if non-zero, do not call key_warning() */)
39110 +{
39111 +       int result;
39112 +       __u32 flags;
39113 +
39114 +       assert("nikita-1692", inode != NULL);
39115 +       assert("nikita-1693", coord != NULL);
39116 +       assert("nikita-1694", key != NULL);
39117 +
39118 +       /* look for the object's stat data in a tree.
39119 +          The outcome is passed back through @coord and @lh, which are
39120 +          handed to coord_by_key() below. NOTE(review): presumably both
39121 +          are only meaningful when 0 is returned - confirm. */
39122 +       flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
39123 +       flags |= CBK_UNIQUE;
39124 +       /*
39125 +        * traverse tree to find stat data. We cannot use vroot here, because
39126 +        * it only covers _body_ of the file, and stat data don't belong
39127 +        * there.
39128 +        */
39129 +       result = coord_by_key(reiser4_tree_by_inode(inode),
39130 +                             key,
39131 +                             coord,
39132 +                             lh,
39133 +                             lock_mode,
39134 +                             FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
39135 +       if (REISER4_DEBUG && result == 0)
39136 +               check_sd_coord(coord, key);
39137 +
39138 +       if (result != 0 && !silent)
39139 +               key_warning(key, inode, result);
39140 +       return result;
39141 +}
39142 +
39143 +static int /* find sd of @inode: try cached seal first, then full lookup */
39144 +locate_inode_sd(struct inode *inode,
39145 +               reiser4_key * key, coord_t * coord, lock_handle * lh)
39146 +{
39147 +       reiser4_inode *state;
39148 +       seal_t seal;
39149 +       int result;
39150 +
39151 +       assert("nikita-3483", inode != NULL);
39152 +       /* copy cached sd coord and seal while holding the inode spin lock */
39153 +       state = reiser4_inode_data(inode);
39154 +       spin_lock_inode(inode);
39155 +       *coord = state->sd_coord;
39156 +       coord_clear_iplug(coord);
39157 +       seal = state->sd_seal;
39158 +       spin_unlock_inode(inode);
39159 +
39160 +       build_sd_key(inode, key);
39161 +       if (reiser4_seal_is_set(&seal)) {
39162 +               /* first, try to validate the copied seal (write lock mode) */
39163 +               result = reiser4_seal_validate(&seal,
39164 +                                              coord,
39165 +                                              key,
39166 +                                              lh, ZNODE_WRITE_LOCK,
39167 +                                              ZNODE_LOCK_LOPRI);
39168 +               if (result == 0)
39169 +                       check_sd_coord(coord, key);
39170 +       } else
39171 +               result = -E_REPEAT;     /* no seal: force the lookup below */
39172 +       /* seal was not set or did not validate: do full lookup by @key */
39173 +       if (result != 0) {
39174 +               coord_init_zero(coord);
39175 +               result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
39176 +       }
39177 +       return result;
39178 +}
39179 +
39180 +#if REISER4_DEBUG
39181 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
39182 +{      /* non-zero iff the keys agree in every field compared below */
39183 +       return (get_key_locality(k1) == get_key_locality(k2) &&
39184 +               get_key_type(k1) == get_key_type(k2) &&
39185 +               get_key_band(k1) == get_key_band(k2) &&
39186 +               get_key_ordering(k1) == get_key_ordering(k2) &&
39187 +               get_key_objectid(k1) == get_key_objectid(k2));
39188 +}
39189 +
39190 +#include "../tree_walk.h"
39191 +
39192 +/* debug check around sd resize; callers assert that the result is zero */
39193 +static int check_sd_resize(struct inode * inode, coord_t * coord,
39194 +                          int length, int progress /* 1 means after resize; not read in the body */)
39195 +{
39196 +       int ret = 0;
39197 +       lock_handle left_lock;
39198 +       coord_t left_coord;
39199 +       reiser4_key left_key;
39200 +       reiser4_key key;
39201 +
39202 +       if (inode_file_plugin(inode) !=
39203 +           file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
39204 +               return 0;       /* only cryptcompress files are checked */
39205 +       if (!length)
39206 +               return 0;       /* item length does not change */
39207 +       if (coord->item_pos != 0)
39208 +               return 0;       /* only the first item of a node is checked */
39209 +       /* compare key at @coord with key of last item of the left neighbor */
39210 +       init_lh(&left_lock);
39211 +       ret = reiser4_get_left_neighbor(&left_lock,
39212 +                                       coord->node,
39213 +                                       ZNODE_WRITE_LOCK,
39214 +                                       GN_CAN_USE_UPPER_LEVELS);
39215 +       if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
39216 +           ret == -ENOENT || ret == -EINVAL
39217 +           || ret == -E_DEADLOCK) {
39218 +               ret = 0;        /* neighbor is not available: skip the check */
39219 +               goto exit;
39220 +       }
39221 +       ret = zload(left_lock.node);    /* NOTE(review): any other non-zero ret is overwritten here unchecked */
39222 +       if (ret)
39223 +               goto exit;
39224 +       coord_init_last_unit(&left_coord, left_lock.node);
39225 +       item_key_by_coord(&left_coord, &left_key);
39226 +       item_key_by_coord(coord, &key);
39227 +
39228 +       if (all_but_offset_key_eq(&key, &left_key))
39229 +               /* corruption occurred */
39230 +               ret = 1;
39231 +       zrelse(left_lock.node);
39232 + exit:
39233 +       done_lh(&left_lock);
39234 +       return ret;
39235 +}
39236 +#endif
39237 +
39238 +/* update stat-data at @coord: resize item if needed, save inode, reseal */
39239 +static int
39240 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
39241 +            lock_handle * lh)
39242 +{
39243 +       int result;
39244 +       reiser4_item_data data;
39245 +       char *area;
39246 +       reiser4_inode *state;
39247 +       znode *loaded;  /* node that is currently zload()-ed by this function */
39248 +
39249 +       state = reiser4_inode_data(inode);
39250 +
39251 +       coord_clear_iplug(coord);
39252 +       result = zload(coord->node);
39253 +       if (result != 0)
39254 +               return result;
39255 +       loaded = coord->node;
39256 +
39257 +       spin_lock_inode(inode);
39258 +       assert("nikita-728", inode_sd_plugin(inode) != NULL);
39259 +       data.iplug = inode_sd_plugin(inode);
39260 +
39261 +       /* keep PLUGIN_STAT extension bit in sync with plugin_mask: clear it
39262 +        * when the mask is zero, set it when the mask is non-zero */
39263 +       if (state->extmask & (1 << PLUGIN_STAT)) {
39264 +               if (state->plugin_mask == 0)
39265 +                       inode_clr_extension(inode, PLUGIN_STAT);
39266 +       } else if (state->plugin_mask != 0)
39267 +               inode_set_extension(inode, PLUGIN_STAT);
39268 +       /* same for HEIR_STAT extension bit and heir_mask */
39269 +       if (state->extmask & (1 << HEIR_STAT)) {
39270 +               if (state->heir_mask == 0)
39271 +                       inode_clr_extension(inode, HEIR_STAT);
39272 +       } else if (state->heir_mask != 0)
39273 +                       inode_set_extension(inode, HEIR_STAT);
39274 +
39275 +       /* data.length is how much space to add to (or remove
39276 +          from if negative) sd; it stays 0 when sd length is known */
39277 +       if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
39278 +               /* recalculate stat-data length */
39279 +               data.length =
39280 +                   data.iplug->s.sd.save_len(inode) -
39281 +                   item_length_by_coord(coord);
39282 +               reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
39283 +       } else
39284 +               data.length = 0;
39285 +       spin_unlock_inode(inode);
39286 +
39287 +       /* if on-disk stat data is of different length than required
39288 +          for this inode, resize it */
39289 +
39290 +       if (data.length != 0) {
39291 +               data.data = NULL;
39292 +               data.user = 0;
39293 +
39294 +               assert("edward-1441",
39295 +                      !check_sd_resize(inode, coord,
39296 +                                       data.length, 0/* before resize */));
39297 +
39298 +               /* insertion code requires that insertion point (coord) was
39299 +                * between units. */
39300 +               coord->between = AFTER_UNIT;
39301 +               result = reiser4_resize_item(coord, &data, key, lh,
39302 +                                            COPI_DONT_SHIFT_LEFT);
39303 +               if (result != 0) {
39304 +                       key_warning(key, inode, result);
39305 +                       zrelse(loaded);
39306 +                       return result;
39307 +               }
39308 +               if (loaded != coord->node) {
39309 +                 /* reiser4_resize_item moved coord to another node.
39310 +                    Release the old node and zload the new one */
39311 +                       zrelse(loaded);
39312 +                       coord_clear_iplug(coord);
39313 +                       result = zload(coord->node);
39314 +                       if (result != 0)
39315 +                               return result;  /* old node is already released */
39316 +                       loaded = coord->node;
39317 +               }
39318 +               assert("edward-1442",
39319 +                      !check_sd_resize(inode, coord,
39320 +                                       data.length, 1/* after resize */));
39321 +       }
39322 +       area = item_body_by_coord(coord);
39323 +       spin_lock_inode(inode);
39324 +       result = data.iplug->s.sd.save(inode, &area);
39325 +       znode_make_dirty(coord->node);
39326 +
39327 +       /* re-initialise stat-data seal (done whatever ->save() returned) */
39328 +
39329 +       /*
39330 +        * coord.between was possibly skewed from AT_UNIT when stat-data size
39331 +        * was changed and new extensions were pasted into item.
39332 +        */
39333 +       coord->between = AT_UNIT;
39334 +       reiser4_seal_init(&state->sd_seal, coord, key);
39335 +       state->sd_coord = *coord;
39336 +       spin_unlock_inode(inode);
39337 +       check_inode_seal(inode, coord, key);
39338 +       zrelse(loaded);
39339 +       return result;
39340 +}
39341 +
39342 +/* Update existing stat-data in a tree: locate it, then rewrite it in place.
39343 +   NOTE(review): locking contract unclear - locate_inode_sd() locks inode itself. */
39344 +static int update_sd(struct inode *inode /* inode to update sd for */ )
39345 +{
39346 +       int result;
39347 +       reiser4_key key;
39348 +       coord_t coord;
39349 +       lock_handle lh;
39350 +
39351 +       assert("nikita-726", inode != NULL);
39352 +
39353 +       /* no stat-data, nothing to update?! */
39354 +       assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
39355 +
39356 +       init_lh(&lh);
39357 +       /* @key, @coord and @lh are filled by locate_inode_sd() */
39358 +       result = locate_inode_sd(inode, &key, &coord, &lh);
39359 +       if (result == 0)
39360 +               result = update_sd_at(inode, &coord, &key, &lh);
39361 +       done_lh(&lh);
39362 +
39363 +       return result;
39364 +}
39365 +
39366 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
39367 +   Removes object stat data, releases its oid and deletes its SAFE_UNLINK
39368 +   safe link. Space for that must be reserved by caller beforehand. */
39369 +static int
39370 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
39371 +{
39372 +       int result;
39373 +
39374 +       assert("nikita-1477", inode != NULL);
39375 +
39376 +       if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
39377 +               reiser4_key sd_key;
39378 +
39379 +               DQUOT_FREE_INODE(inode);
39380 +               DQUOT_DROP(inode);
39381 +               /* cut the key range [sd_key, sd_key] out of the tree */
39382 +               build_sd_key(inode, &sd_key);
39383 +               result =
39384 +                   reiser4_cut_tree(reiser4_tree_by_inode(inode),
39385 +                                    &sd_key, &sd_key, NULL, 0);
39386 +               if (result == 0) {
39387 +                       reiser4_inode_set_flag(inode, REISER4_NO_SD);
39388 +                       result = oid_release(inode->i_sb, get_inode_oid(inode));
39389 +                       if (result == 0) {
39390 +                               oid_count_released();
39391 +                               /* result of the whole call is that of safe_link_del() */
39392 +                               result = safe_link_del(reiser4_tree_by_inode(inode),
39393 +                                                      get_inode_oid(inode),
39394 +                                                      SAFE_UNLINK);
39395 +                       }
39396 +               }
39397 +       } else
39398 +               result = 0;     /* object has no stat data: nothing to remove */
39399 +       return result;
39400 +}
39401 +
39402 +/* helper for safelink_common: set size of @inode to @size via ->setattr() */
39403 +static int process_truncate(struct inode *inode, __u64 size)
39404 +{
39405 +       int result;
39406 +       struct iattr attr;
39407 +       file_plugin *fplug;     /* assigned below, but not otherwise used */
39408 +       reiser4_context *ctx;
39409 +       struct dentry dentry;   /* on-stack; only d_inode is initialised */
39410 +
39411 +       assert("vs-21", is_in_reiser4_context());
39412 +       ctx = reiser4_init_context(inode->i_sb);
39413 +       assert("vs-22", !IS_ERR(ctx));  /* NOTE(review): only asserted, an error ctx is not handled otherwise */
39414 +
39415 +       attr.ia_size = size;
39416 +       attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
39417 +       fplug = inode_file_plugin(inode);
39418 +       /* ->setattr() is called with i_mutex held */
39419 +       mutex_lock(&inode->i_mutex);
39420 +       assert("vs-1704", get_current_context()->trans->atom == NULL);
39421 +       dentry.d_inode = inode;
39422 +       result = inode->i_op->setattr(&dentry, &attr);
39423 +       mutex_unlock(&inode->i_mutex);
39424 +
39425 +       context_set_commit_async(ctx);
39426 +       reiser4_exit_context(ctx);
39427 +
39428 +       return result;
39429 +}
39430 +
39431 +/*
39432 +  Local variables:
39433 +  c-indentation-style: "K&R"
39434 +  mode-name: "LC"
39435 +  c-basic-offset: 8
39436 +  tab-width: 8
39437 +  fill-column: 80
39438 +  scroll-step: 1
39439 +  End:
39440 +*/
39441 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/hash.c linux-2.6.27/fs/reiser4/plugin/hash.c
39442 --- linux-2.6.27.orig/fs/reiser4/plugin/hash.c  1970-01-01 03:00:00.000000000 +0300
39443 +++ linux-2.6.27/fs/reiser4/plugin/hash.c       2008-10-12 18:20:01.000000000 +0400
39444 @@ -0,0 +1,353 @@
39445 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
39446 + * reiser4/README */
39447 +
39448 +/* Hash functions */
39449 +
39450 +#include "../debug.h"
39451 +#include "plugin_header.h"
39452 +#include "plugin.h"
39453 +#include "../super.h"
39454 +#include "../inode.h"
39455 +
39456 +#include <linux/types.h>
39457 +
39458 +/* old rupasov (yura) hash: treats @name as a string of decimal digits */
39459 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
39460 +                         int len /* @name's length */ )
39461 +{
39462 +       int i;
39463 +       int j;
39464 +       int pow;
39465 +       __u64 a;
39466 +       __u64 c;
39467 +
39468 +       assert("nikita-672", name != NULL);
39469 +       assert("nikita-673", len >= 0);
39470 +       /* pow = 10^(len - 1). NOTE(review): int, overflows for len >= 11 */
39471 +       for (pow = 1, i = 1; i < len; ++i)
39472 +               pow = pow * 10;
39473 +       /* first character ('0' is 48); NOTE(review): name[0] is read even if len == 0 */
39474 +       if (len == 1)
39475 +               a = name[0] - 48;
39476 +       else
39477 +               a = (name[0] - 48) * pow;
39478 +
39479 +       for (i = 1; i < len; ++i) {     /* other characters, weight 10^(len - 1 - i) */
39480 +               c = name[i] - 48;
39481 +               for (pow = 1, j = i; j < len - 1; ++j)
39482 +                       pow = pow * 10;
39483 +               a = a + c * pow;
39484 +       }
39485 +       for (; i < 40; ++i) {
39486 +               c = '0' - 48;   /* 0 in ASCII, so this loop adds nothing to a */
39487 +               for (pow = 1, j = i; j < len - 1; ++j)
39488 +                       pow = pow * 10;
39489 +               a = a + c * pow;
39490 +       }
39491 +       /* i >= len here, so the inner loop never runs: pow == 1, i is added */
39492 +       for (; i < 256; ++i) {
39493 +               c = i;
39494 +               for (pow = 1, j = i; j < len - 1; ++j)
39495 +                       pow = pow * 10;
39496 +               a = a + c * pow;
39497 +       }
39498 +
39499 +       a = a << 7;
39500 +       return a;
39501 +}
39502 +
39503 +/* r5 hash; @name is scanned up to its first zero byte, @len is only asserted */
39504 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
39505 +                    int len UNUSED_ARG /* @name's length */ )
39506 +{
39507 +       __u64 a = 0;
39508 +
39509 +       assert("nikita-674", name != NULL);
39510 +       assert("nikita-675", len >= 0);
39511 +       /* per byte: add byte << 4 and byte >> 4, then multiply by 11 */
39512 +       while (*name) {
39513 +               a += *name << 4;
39514 +               a += *name >> 4;
39515 +               a *= 11;
39516 +               name++;
39517 +       }
39518 +       return a;
39519 +}
39520 +
39521 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function
39522 +     H0 = Key
39523 +     Hi = E Mi(Hi-1) + Hi-1
39524 +
39525 +   (see Applied Cryptography, 2nd edition, p448).
39526 +
39527 +   Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
39528 +
39529 +   Jeremy has agreed to the contents of reiserfs/README. -Hans
39530 +
39531 +   This code was blindly upgraded to __u64 by s/__u32/__u64/g.
39532 +*/
39533 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
39534 +                     int len /* @name's length */ )
39535 +{
39536 +       __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
39537 +
39538 +       __u64 h0 = k[0], h1 = k[1];
39539 +       __u64 a, b, c, d;
39540 +       __u64 pad;
39541 +       int i;
39542 +
39543 +       assert("nikita-676", name != NULL);
39544 +       assert("nikita-677", len >= 0);
39545 +
39546 +#define DELTA 0x9E3779B9u
39547 +#define FULLROUNDS 10          /* 32 is overkill, 16 is strong crypto */
39548 +#define PARTROUNDS 6           /* 6 gets complete mixing */
39549 +
39550 +/* a, b, c, d - data; h0, h1 - accumulated hash, updated in place */
39551 +#define TEACORE(rounds)                                                        \
39552 +       do {                                                            \
39553 +               __u64 sum = 0;                                          \
39554 +               int n = rounds;                                         \
39555 +               __u64 b0, b1;                                           \
39556 +                                                                       \
39557 +               b0 = h0;                                                \
39558 +               b1 = h1;                                                \
39559 +                                                                       \
39560 +               do                                                      \
39561 +               {                                                       \
39562 +                       sum += DELTA;                                   \
39563 +                       b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
39564 +                       b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
39565 +               } while(--n);                                           \
39566 +                                                                       \
39567 +               h0 += b0;                                               \
39568 +               h1 += b1;                                               \
39569 +       } while(0)
39570 +
39571 +       pad = (__u64) len | ((__u64) len << 8);
39572 +       pad |= pad << 16;
39573 +       /* consume full 16-byte blocks, each as four little-endian 32-bit words */
39574 +       while (len >= 16) {
39575 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39576 +                   16 | (__u64) name[3] << 24;
39577 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39578 +                   16 | (__u64) name[7] << 24;
39579 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39580 +                   16 | (__u64) name[11] << 24;
39581 +               d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
39582 +                   << 16 | (__u64) name[15] << 24;
39583 +
39584 +               TEACORE(PARTROUNDS);
39585 +
39586 +               len -= 16;
39587 +               name += 16;
39588 +       }
39589 +       /* tail of fewer than 16 bytes: whole words first, the rest is shifted into pad */
39590 +       if (len >= 12) {
39591 +               //assert(len < 16);
39592 +               if (len >= 16)  /* cannot be true after the loop above */
39593 +                       *(int *)0 = 0;  /* store to NULL in place of the assert */
39594 +
39595 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39596 +                   16 | (__u64) name[3] << 24;
39597 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39598 +                   16 | (__u64) name[7] << 24;
39599 +               c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
39600 +                   16 | (__u64) name[11] << 24;
39601 +
39602 +               d = pad;
39603 +               for (i = 12; i < len; i++) {
39604 +                       d <<= 8;
39605 +                       d |= name[i];
39606 +               }
39607 +       } else if (len >= 8) {
39608 +               //assert(len < 12);
39609 +               if (len >= 12)  /* excluded by the preceding branch */
39610 +                       *(int *)0 = 0;
39611 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39612 +                   16 | (__u64) name[3] << 24;
39613 +               b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
39614 +                   16 | (__u64) name[7] << 24;
39615 +
39616 +               c = d = pad;
39617 +               for (i = 8; i < len; i++) {
39618 +                       c <<= 8;
39619 +                       c |= name[i];
39620 +               }
39621 +       } else if (len >= 4) {
39622 +               //assert(len < 8);
39623 +               if (len >= 8)   /* excluded by the preceding branch */
39624 +                       *(int *)0 = 0;
39625 +               a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
39626 +                   16 | (__u64) name[3] << 24;
39627 +
39628 +               b = c = d = pad;
39629 +               for (i = 4; i < len; i++) {
39630 +                       b <<= 8;
39631 +                       b |= name[i];
39632 +               }
39633 +       } else {
39634 +               //assert(len < 4);
39635 +               if (len >= 4)   /* excluded by the preceding branch */
39636 +                       *(int *)0 = 0;
39637 +               a = b = c = d = pad;
39638 +               for (i = 0; i < len; i++) {
39639 +                       a <<= 8;
39640 +                       a |= name[i];
39641 +               }
39642 +       }
39643 +
39644 +       TEACORE(FULLROUNDS);
39645 +
39646 +/*     return 0;*/
39647 +       return h0 ^ h1;
39648 +
39649 +}
39650 +
39651 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
39652 +
39653 +   See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
39654 +
39655 +   Excerpts:
39656 +
39657 +     FNV hashes are designed to be fast while maintaining a low collision
39658 +     rate.
39659 +
39660 +     [This version also seems to preserve lexicographical order locally.]
39661 +
39662 +     FNV hash algorithms and source code have been released into the public
39663 +     domain.
39664 +
39665 +*/
39666 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
39667 +                      int len UNUSED_ARG /* @name's length */ )
39668 +{
39669 +       unsigned long long a = 0xcbf29ce484222325ull;
39670 +       const unsigned long long fnv_64_prime = 0x100000001b3ull;
39671 +
39672 +       assert("nikita-678", name != NULL);
39673 +       assert("nikita-679", len >= 0);
39674 +
39675 +       /* FNV-1 hash each octet up to the first zero one; @len is not used */
39676 +       for (; *name; ++name) {
39677 +               /* multiply by the 64 bit FNV prime mod 2^64 */
39678 +               a *= fnv_64_prime;
39679 +               /* xor the bottom with the current octet */
39680 +               a ^= (unsigned long long)(*name);
39681 +       }
39682 +       /* return our new hash value */
39683 +       return a;
39684 +}
39685 +
39686 +/* degenerate hash function used to simplify testing of non-unique key
39687 +   handling: returns the same constant whatever @name and @len are */
39688 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
39689 +                     int len UNUSED_ARG /* @name's length */ )
39690 +{
39691 +       return 0xc0c0c0c010101010ull;
39692 +}
39693 +
39694 +static int change_hash(struct inode *inode,
39695 +                      reiser4_plugin * plugin,
39696 +                      pset_member memb /* not read: PSET_HASH is always used */)
39697 +{
39698 +       int result;
39699 +
39700 +       assert("nikita-3503", inode != NULL);
39701 +       assert("nikita-3504", plugin != NULL);
39702 +
39703 +       assert("nikita-3505", is_reiser4_inode(inode));
39704 +       assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
39705 +       /* hash plugin can only be set on an object of the directory group */
39706 +       if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
39707 +               return RETERR(-EINVAL);
39708 +       /* nothing to do (result 0) if @plugin is already the inode's hash */
39709 +       result = 0;
39710 +       if (inode_hash_plugin(inode) == NULL ||
39711 +           inode_hash_plugin(inode)->h.id != plugin->h.id) {
39712 +               if (is_dir_empty(inode) == 0)
39713 +                       result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
39714 +                                                PSET_HASH, plugin);
39715 +               else
39716 +                       result = RETERR(-ENOTEMPTY);
39717 +
39718 +       }
39719 +       return result;
39720 +}
39721 +
39722 +static reiser4_plugin_ops hash_plugin_ops = {
39723 +       .init = NULL,
39724 +       .load = NULL,
39725 +       .save_len = NULL,
39726 +       .save = NULL,
39727 +       .change = change_hash   /* the only operation that is provided */
39728 +};
39729 +
39730 +/* hash plugins: one entry per hash id, all sharing hash_plugin_ops */
39731 +hash_plugin hash_plugins[LAST_HASH_ID] = {
39732 +       [RUPASOV_HASH_ID] = {
39733 +               .h = {
39734 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39735 +                       .id = RUPASOV_HASH_ID,
39736 +                       .pops = &hash_plugin_ops,
39737 +                       .label = "rupasov",
39738 +                       .desc = "Original Yura's hash",
39739 +                       .linkage = {NULL, NULL}
39740 +               },
39741 +               .hash = hash_rupasov
39742 +       },
39743 +       [R5_HASH_ID] = {
39744 +               .h = {
39745 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39746 +                       .id = R5_HASH_ID,
39747 +                       .pops = &hash_plugin_ops,
39748 +                       .label = "r5",
39749 +                       .desc = "r5 hash",
39750 +                       .linkage = {NULL, NULL}
39751 +               },
39752 +               .hash = hash_r5
39753 +       },
39754 +       [TEA_HASH_ID] = {
39755 +               .h = {
39756 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39757 +                       .id = TEA_HASH_ID,
39758 +                       .pops = &hash_plugin_ops,
39759 +                       .label = "tea",
39760 +                       .desc = "tea hash",
39761 +                       .linkage = {NULL, NULL}
39762 +               },
39763 +               .hash = hash_tea
39764 +       },
39765 +       [FNV1_HASH_ID] = {
39766 +               .h = {
39767 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39768 +                       .id = FNV1_HASH_ID,
39769 +                       .pops = &hash_plugin_ops,
39770 +                       .label = "fnv1",
39771 +                       .desc = "fnv1 hash",
39772 +                       .linkage = {NULL, NULL}
39773 +               },
39774 +               .hash = hash_fnv1
39775 +       },
39776 +       [DEGENERATE_HASH_ID] = {
39777 +               .h = {
39778 +                       .type_id = REISER4_HASH_PLUGIN_TYPE,
39779 +                       .id = DEGENERATE_HASH_ID,
39780 +                       .pops = &hash_plugin_ops,
39781 +                       .label = "degenerate hash",
39782 +                       .desc = "Degenerate hash: only for testing",
39783 +                       .linkage = {NULL, NULL}
39784 +               },
39785 +               .hash = hash_deg
39786 +       }
39787 +};
39788 +
39789 +/* Make Linus happy.
39790 +   Local variables:
39791 +   c-indentation-style: "K&R"
39792 +   mode-name: "LC"
39793 +   c-basic-offset: 8
39794 +   tab-width: 8
39795 +   fill-column: 120
39796 +   End:
39797 +*/
39798 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/inode_ops.c linux-2.6.27/fs/reiser4/plugin/inode_ops.c
39799 --- linux-2.6.27.orig/fs/reiser4/plugin/inode_ops.c     1970-01-01 03:00:00.000000000 +0300
39800 +++ linux-2.6.27/fs/reiser4/plugin/inode_ops.c  2008-10-12 18:20:01.000000000 +0400
39801 @@ -0,0 +1,896 @@
39802 +/*
39803 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
39804 + */
39805 +
39806 +/*
39807 + * this file contains typical implementations for most of methods of struct
39808 + * inode_operations
39809 + */
39810 +
39811 +#include "../inode.h"
39812 +#include "../safe_link.h"
39813 +
39814 +#include <linux/quotaops.h>
39815 +#include <linux/namei.h>
39816 +
39817 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
39818 +                     reiser4_object_create_data *data);
39819 +
39820 +/**
39821 + * reiser4_create_common - create of inode operations
39822 + * @parent: inode of parent directory
39823 + * @dentry: dentry of new object to create
39824 + * @mode: the permissions to use; S_IFREG is or-ed into them
39825 + * @nameidata: not referenced by this implementation
39826 + *
39827 + * Common implementation of vfs's create method of struct inode_operations.
39828 + * Creates regular file using child_create_plugin() of @parent, or else its
39829 + * inode_create_plugin(); returns -EIO if that is not a regular file plugin.
39830 + */
39831 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
39832 +                         int mode, struct nameidata *nameidata)
39833 +{
39834 +       reiser4_object_create_data data;
39835 +       file_plugin *fplug;
39836 +
39837 +       memset(&data, 0, sizeof data);
39838 +       data.mode = S_IFREG | mode;
39839 +       fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
39840 +       if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
39841 +               warning("vpf-1900", "'%s' is not a regular file plugin.",
39842 +                       fplug->h.label);
39843 +               return RETERR(-EIO);
39844 +       }
39845 +       data.id = fplug->h.id;
39846 +       return create_vfs_object(parent, dentry, &data);
39847 +}
39848 +
39849 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
39850 +void check_light_weight(struct inode *inode, struct inode *parent);
39851 +
39852 +/**
39853 + * reiser4_lookup_common - lookup of inode operations
39854 + * @parent: inode of directory to lookup into
39855 + * @dentry: name to look for
39856 + * @nameidata: not referenced by this implementation
39857 + *
39858 + * Common implementation of vfs's lookup method of struct inode_operations.
39859 + * Returns NULL on -ENOENT, ERR_PTR() on error, else d_splice_alias() result.
39860 + */
39861 +struct dentry *reiser4_lookup_common(struct inode *parent,
39862 +                                    struct dentry *dentry,
39863 +                                    struct nameidata *nameidata)
39864 +{
39865 +       reiser4_context *ctx;
39866 +       int result;
39867 +       struct dentry *new;
39868 +       struct inode *inode;
39869 +       reiser4_dir_entry_desc entry;
39870 +
39871 +       ctx = reiser4_init_context(parent->i_sb);
39872 +       if (IS_ERR(ctx))
39873 +               return (struct dentry *)ctx;
39874 +
39875 +       /* set up operations on dentry. */
39876 +       dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
39877 +       /* on success the key of the found object is stored in entry.key */
39878 +       result = reiser4_lookup_name(parent, dentry, &entry.key);
39879 +       if (result) {
39880 +               context_set_commit_async(ctx);
39881 +               reiser4_exit_context(ctx);
39882 +               if (result == -ENOENT) {
39883 +                       /* object not found: add negative dentry unless
39884 +                        * IS_DEADDIR(parent) */
39885 +                       if (!IS_DEADDIR(parent))
39886 +                               d_add(dentry, NULL);
39887 +                       return NULL;
39888 +               }
39889 +               return ERR_PTR(result);
39890 +       }
39891 +       /* get inode of the object by the key that was found */
39892 +       inode = reiser4_iget(parent->i_sb, &entry.key, 0);
39893 +       if (IS_ERR(inode)) {
39894 +               context_set_commit_async(ctx);
39895 +               reiser4_exit_context(ctx);
39896 +               return ERR_PTR(PTR_ERR(inode));
39897 +       }
39898 +
39899 +       /* success */
39900 +       check_light_weight(inode, parent);
39901 +       new = d_splice_alias(inode, dentry);
39902 +       reiser4_iget_complete(inode);
39903 +
39904 +       /* prevent balance_dirty_pages() from being called: we don't want to
39905 +        * do this under directory i_mutex. */
39906 +       context_set_commit_async(ctx);
39907 +       reiser4_exit_context(ctx);
39908 +       return new;
39909 +}
39909 +
39910 +static reiser4_block_nr common_estimate_link(struct inode *parent,
39911 +                                            struct inode *object);
39912 +int reiser4_update_dir(struct inode *);
39913 +
39914 +/**
39915 + * reiser4_link_common - link of inode operations
39916 + * @existing: dentry of object which is to get new name
39917 + * @parent: directory where new name is to be created
39918 + * @newname: new name
39919 + *
39920 + * This is common implementation of vfs's link method of struct inode_operations.
39921 + * NOTE(review): the local create data reaches ->add_entry() with only ->mode and ->id set -- confirm.
39922 + */
39923 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
39924 +                       struct dentry *newname)
39925 +{
39926 +       reiser4_context *ctx;
39927 +       int result;
39928 +       struct inode *object;
39929 +       dir_plugin *parent_dplug;
39930 +       reiser4_dir_entry_desc entry;
39931 +       reiser4_object_create_data data;
39932 +       reiser4_block_nr reserve;
39933 +
39934 +       ctx = reiser4_init_context(parent->i_sb);
39935 +       if (IS_ERR(ctx))
39936 +               return PTR_ERR(ctx);
39937 +
39938 +       assert("nikita-1431", existing != NULL);
39939 +       assert("nikita-1432", parent != NULL);
39940 +       assert("nikita-1433", newname != NULL);
39941 +
39942 +       object = existing->d_inode;
39943 +       assert("nikita-1434", object != NULL);
39944 +
39945 +       /* check for race with create_object(): the inode is still marked immutable */
39946 +       if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
39947 +               context_set_commit_async(ctx);
39948 +               reiser4_exit_context(ctx);
39949 +               return RETERR(-E_REPEAT);
39950 +       }
39951 +
39952 +       parent_dplug = inode_dir_plugin(parent);
39953 +
39954 +       memset(&entry, 0, sizeof entry);
39955 +       entry.obj = object;
39956 +
39957 +       data.mode = object->i_mode;
39958 +       data.id = inode_file_plugin(object)->h.id;
39959 +
39960 +       reserve = common_estimate_link(parent, existing->d_inode);
39961 +       if ((__s64) reserve < 0) {
39962 +               context_set_commit_async(ctx);
39963 +               reiser4_exit_context(ctx);
39964 +               return reserve;
39965 +       }
39966 +
39967 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
39968 +               context_set_commit_async(ctx);
39969 +               reiser4_exit_context(ctx);
39970 +               return RETERR(-ENOSPC);
39971 +       }
39972 +
39973 +       /*
39974 +        * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
39975 +        * means that link(2) can race against unlink(2) or rename(2), and
39976 +        * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
39977 +        *
39978 +        * For such inode we have to undo special processing done in
39979 +        * reiser4_unlink() viz. creation of safe-link.
39980 +        */
39981 +       if (unlikely(object->i_nlink == 0)) {
39982 +               result = safe_link_del(reiser4_tree_by_inode(object),
39983 +                                      get_inode_oid(object), SAFE_UNLINK);
39984 +               if (result != 0) {
39985 +                       context_set_commit_async(ctx);
39986 +                       reiser4_exit_context(ctx);
39987 +                       return result;
39988 +               }
39989 +       }
39990 +
39991 +       /* increment nlink of @existing and update its stat data */
39992 +       result = reiser4_add_nlink(object, parent, 1);
39993 +       if (result == 0) {
39994 +               /* add entry to the parent */
39995 +               result =
39996 +                   parent_dplug->add_entry(parent, newname, &data, &entry);
39997 +               if (result != 0) {
39998 +                       /* failed to add entry to the parent, decrement nlink
39999 +                          of @existing */
40000 +                       reiser4_del_nlink(object, parent, 1);
40001 +                       /*
40002 +                        * now, if that failed, we have a file with too big
40003 +                        * nlink---space leak, much better than directory
40004 +                        * entry pointing to nowhere
40005 +                        */
40006 +               }
40007 +       }
40008 +       if (result == 0) {
40009 +               atomic_inc(&object->i_count);
40010 +               /*
40011 +                * Upon successful completion, link() shall mark for update the st_ctime
40012 +                * field of the file. Also, the st_ctime and st_mtime fields of the directory
40013 +                * that contains the new entry shall be marked for update. --SUS
40014 +                * NOTE(review): if this update fails, the i_count taken above looks leaked -- confirm.
40015 +                */
40016 +               result = reiser4_update_dir(parent);
40017 +       }
40018 +       if (result == 0)
40019 +               d_instantiate(newname, existing->d_inode);
40020 +
40021 +       context_set_commit_async(ctx);
40022 +       reiser4_exit_context(ctx);
40023 +       return result;
40024 +}
40025 +
40026 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
40027 +
40028 +/**
40029 + * reiser4_unlink_common - unlink of inode operations
40030 + * @parent: inode of directory to remove name from
40031 + * @victim: name to be removed
40032 + *
40033 + * This is common implementation of vfs's unlink method of struct inode_operations.
40034 + * Returns 0 once the name has been removed, otherwise the error that stopped the unlink.
40035 + */
40036 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
40037 +{
40038 +       reiser4_context *ctx;
40039 +       int result;
40040 +       struct inode *object;
40041 +       file_plugin *fplug;
40042 +
40043 +       ctx = reiser4_init_context(parent->i_sb);
40044 +       if (IS_ERR(ctx))
40045 +               return PTR_ERR(ctx);
40046 +
40047 +       object = victim->d_inode;
40048 +       fplug = inode_file_plugin(object);
40049 +       assert("nikita-2882", fplug->detach != NULL);
40050 +
40051 +       result = unlink_check_and_grab(parent, victim);
40052 +       if (result != 0) {
40053 +               context_set_commit_async(ctx);
40054 +               reiser4_exit_context(ctx);
40055 +               return result;
40056 +       }
40057 +
40058 +       result = fplug->detach(object, parent);
40059 +       if (result == 0) {
40060 +               dir_plugin *parent_dplug;
40061 +               reiser4_dir_entry_desc entry;
40062 +
40063 +               parent_dplug = inode_dir_plugin(parent);
40064 +               memset(&entry, 0, sizeof entry);
40065 +
40066 +               /* first, delete directory entry */
40067 +               result = parent_dplug->rem_entry(parent, victim, &entry);
40068 +               if (result == 0) {
40069 +                       /*
40070 +                        * if name was removed successfully, we _have_ to
40071 +                        * return 0 from this function, because upper level
40072 +                        * caller (vfs_{rmdir,unlink}) expect this.
40073 +                        *
40074 +                        * now that directory entry is removed, update
40075 +                        * stat-data
40076 +                        */
40077 +                       reiser4_del_nlink(object, parent, 1);
40078 +                       /*
40079 +                        * Upon successful completion, unlink() shall mark for
40080 +                        * update the st_ctime and st_mtime fields of the
40081 +                        * parent directory. Also, if the file's link count is
40082 +                        * not 0, the st_ctime field of the file shall be
40083 +                        * marked for update. --SUS
40084 +                        */
40085 +                       reiser4_update_dir(parent);
40086 +                       /* add safe-link for this file once its last name is gone */
40087 +                       if (object->i_nlink == 0)
40088 +                               safe_link_add(object, SAFE_UNLINK);
40089 +               }
40090 +       }
40091 +
40092 +       if (unlikely(result != 0)) {
40093 +               if (result != -ENOMEM)
40094 +                       warning("nikita-3398", "Cannot unlink %llu (%i)",
40095 +                               (unsigned long long)get_inode_oid(object),
40096 +                               result);
40097 +               /* if operation failed commit pending inode modifications to
40098 +                * the stat-data */
40099 +               reiser4_update_sd(object);
40100 +               reiser4_update_sd(parent);
40101 +       }
40102 +
40103 +       reiser4_release_reserved(object->i_sb);
40104 +
40105 +       /* @object's i_ctime was updated by ->rem_link() method. */
40106 +
40107 +       /* @victim can be already removed from the disk by this time. Inode is
40108 +          then marked so that iput() wouldn't try to remove stat data. But
40109 +          inode itself is still there.
40110 +        */
40111 +
40112 +       /*
40113 +        * we cannot release directory semaphore here, because name has
40114 +        * already been deleted, but dentry (@victim) still exists.  Prevent
40115 +        * balance_dirty_pages() from being called on exiting this context: we
40116 +        * don't want to do this under directory i_mutex.
40117 +        */
40118 +       context_set_commit_async(ctx);
40119 +       reiser4_exit_context(ctx);
40120 +       return result;
40121 +}
40122 +
40123 +/**
40124 + * reiser4_symlink_common - symlink of inode operations
40125 + * @parent: inode of parent directory
40126 + * @dentry: dentry of object to be created
40127 + * @linkname: string symlink is to contain
40128 + *
40129 + * This is common implementation of vfs's symlink method of struct
40130 + * inode_operations. The mode is fixed to S_IFLNK | S_IRWXUGO.
40131 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID; returns the result of create_vfs_object().
40132 + */
40133 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
40134 +                          const char *linkname)
40135 +{
40136 +       reiser4_object_create_data data;
40137 +
40138 +       memset(&data, 0, sizeof data);
40139 +       data.name = linkname;
40140 +       data.id = SYMLINK_FILE_PLUGIN_ID;
40141 +       data.mode = S_IFLNK | S_IRWXUGO;
40142 +       return create_vfs_object(parent, dentry, &data);
40143 +}
40144 +
40145 +/**
40146 + * reiser4_mkdir_common - mkdir of inode operations
40147 + * @parent: inode of parent directory
40148 + * @dentry: dentry of object to be created
40149 + * @mode: the permissions to use
40150 + *
40151 + * This is common implementation of vfs's mkdir method of struct
40152 + * inode_operations. S_IFDIR is or-ed into @mode before the object is created.
40153 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID; returns the result of create_vfs_object().
40154 + */
40155 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
40156 +{
40157 +       reiser4_object_create_data data;
40158 +
40159 +       memset(&data, 0, sizeof data);
40160 +       data.mode = S_IFDIR | mode;
40161 +       data.id = DIRECTORY_FILE_PLUGIN_ID;
40162 +       return create_vfs_object(parent, dentry, &data);
40163 +}
40164 +
40165 +/**
40166 + * reiser4_mknod_common - mknod of inode operations
40167 + * @parent: inode of parent directory
40168 + * @dentry: dentry of object to be created
40169 + * @mode: the permissions to use and file type
40170 + * @rdev: minor and major of new device file
40171 + *
40172 + * This is common implementation of vfs's mknod method of struct
40173 + * inode_operations. @mode and @rdev are stored in the create data unchanged.
40174 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID; returns the result of create_vfs_object().
40175 + */
40176 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
40177 +                        int mode, dev_t rdev)
40178 +{
40179 +       reiser4_object_create_data data;
40180 +
40181 +       memset(&data, 0, sizeof data);
40182 +       data.mode = mode;
40183 +       data.rdev = rdev;
40184 +       data.id = SPECIAL_FILE_PLUGIN_ID;
40185 +       return create_vfs_object(parent, dentry, &data);
40186 +}
40187 +
40188 +/*
40189 + * implementation of vfs's rename method of struct inode_operations for typical
40190 + * directory is in inode_ops_rename.c
40191 + */
40192 +
40193 +/**
40194 + * reiser4_follow_link_common - follow_link of inode operations
40195 + * @dentry: dentry of symlink
40196 + * @nd: nameidata that receives the link body through nd_set_link()
40197 + *
40198 + * This is common implementation of vfs's follow_link method of struct
40199 + * inode_operations. Returns NULL on success, an -EINVAL error pointer otherwise.
40200 + * Assumes that inode's i_private points to the content of symbolic link.
40201 + */
40202 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
40203 +{
40204 +       assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
40205 +
40206 +       if (!dentry->d_inode->i_private
40207 +           || !reiser4_inode_get_flag(dentry->d_inode,
40208 +                                      REISER4_GENERIC_PTR_USED))
40209 +               return ERR_PTR(RETERR(-EINVAL));
40210 +       nd_set_link(nd, dentry->d_inode->i_private);
40211 +       return NULL;
40212 +}
40213 +
40214 +/**
40215 + * reiser4_permission_common - permission of inode operations
40216 + * @inode: inode to check permissions for
40217 + * @mask: mode bits to check permissions for
40218 + *
40219 + * Uses generic function to check for rwx permissions: returns the result
40220 + * of generic_permission(@inode, @mask, NULL).
40221 + */
40222 +int reiser4_permission_common(struct inode *inode, int mask)
40223 +{
40224 +       return generic_permission(inode, mask, NULL);
40225 +}
40226 +
40227 +static int setattr_reserve(reiser4_tree *);
40228 +
40229 +/* this is common implementation of vfs's setattr method of struct
40230 +   inode_operations. Size changes (ATTR_SIZE) are not handled here, see the assertion below.
40231 +*/
40232 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
40233 +{
40234 +       reiser4_context *ctx;
40235 +       struct inode *inode;
40236 +       int result;
40237 +
40238 +       inode = dentry->d_inode;
40239 +       result = inode_change_ok(inode, attr);
40240 +       if (result)
40241 +               return result;
40242 +
40243 +       ctx = reiser4_init_context(inode->i_sb);
40244 +       if (IS_ERR(ctx))
40245 +               return PTR_ERR(ctx);
40246 +
40247 +       assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
40248 +
40249 +       /*
40250 +        * grab disk space, transfer quota if uid or gid changes, then call standard inode_setattr().
40251 +        */
40252 +       result = setattr_reserve(reiser4_tree_by_inode(inode));
40253 +       if (!result) {
40254 +               if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
40255 +                   || (attr->ia_valid & ATTR_GID
40256 +                       && attr->ia_gid != inode->i_gid)) {
40257 +                       result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
40258 +                       if (result) {
40259 +                               context_set_commit_async(ctx);
40260 +                               reiser4_exit_context(ctx);
40261 +                               return result;
40262 +                       }
40263 +               }
40264 +               result = inode_setattr(inode, attr);
40265 +               if (!result)
40266 +                       reiser4_update_sd(inode);
40267 +       }
40268 +
40269 +       context_set_commit_async(ctx);
40270 +       reiser4_exit_context(ctx);
40271 +       return result;
40272 +}
40273 +
40274 +/* this is common implementation of vfs's getattr method of struct
40275 +   inode_operations. Fills @stat from the fields of the dentry's inode and always returns 0.
40276 +*/
40277 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
40278 +                          struct dentry *dentry, struct kstat *stat)
40279 +{
40280 +       struct inode *obj;
40281 +
40282 +       assert("nikita-2298", dentry != NULL);
40283 +       assert("nikita-2299", stat != NULL);
40284 +       assert("nikita-2300", dentry->d_inode != NULL);
40285 +
40286 +       obj = dentry->d_inode;
40287 +
40288 +       stat->dev = obj->i_sb->s_dev;
40289 +       stat->ino = oid_to_uino(get_inode_oid(obj));
40290 +       stat->mode = obj->i_mode;
40291 +       /* don't confuse userland with huge nlink: clamp it to 0x7fff. This is not
40292 +        * entirely correct, because nlink_t is not necessarily 16 bit signed. */
40293 +       stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
40294 +       stat->uid = obj->i_uid;
40295 +       stat->gid = obj->i_gid;
40296 +       stat->rdev = obj->i_rdev;
40297 +       stat->atime = obj->i_atime;
40298 +       stat->mtime = obj->i_mtime;
40299 +       stat->ctime = obj->i_ctime;
40300 +       stat->size = obj->i_size;
40301 +       stat->blocks =
40302 +           (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
40303 +       /* "preferred" blocksize for efficient file system I/O */
40304 +       stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
40305 +
40306 +       return 0;
40307 +}
40308 +
40309 +/* Estimate the maximum number of nodes which might be allocated or changed on
40310 +   typical new object creation. Typical creation consists of calling create
40311 +   method of file plugin, adding directory entry to parent and update parent
40312 +   directory's stat data. The estimate also covers removing the entry again on failure.
40313 +*/
40314 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent,       /* parent object */
40315 +                                                  struct inode *object
40316 +                                                  /* object */ )
40317 +{
40318 +       assert("vpf-309", parent != NULL);
40319 +       assert("vpf-307", object != NULL);
40320 +
40321 +       return
40322 +           /* object creation estimation */
40323 +           inode_file_plugin(object)->estimate.create(object) +
40324 +           /* stat data of parent directory estimation */
40325 +           inode_file_plugin(parent)->estimate.update(parent) +
40326 +           /* adding entry estimation */
40327 +           inode_dir_plugin(parent)->estimate.add_entry(parent) +
40328 +           /* to undo in the case of failure */
40329 +           inode_dir_plugin(parent)->estimate.rem_entry(parent);
40330 +}
40331 +
40332 +/* Create child in directory.
40333 +
40334 +   . get object's plugin
40335 +   . get fresh inode
40336 +   . initialize inode
40337 +   . add object's stat-data
40338 +   . initialize object's directory
40339 +   . add entry to the parent
40340 +   . instantiate dentry
40341 +   On error *retobj may already point to the new inode, so that the caller can iput() it.
40342 +*/
40343 +static int do_create_vfs_child(reiser4_object_create_data * data,      /* parameters of new
40344 +                                                                          object */
40345 +                              struct inode **retobj)
40346 +{
40347 +       int result;
40348 +
40349 +       struct dentry *dentry;  /* new name */
40350 +       struct inode *parent;   /* parent object */
40351 +
40352 +       dir_plugin *par_dir;    /* directory plugin on the parent */
40353 +       dir_plugin *obj_dir;    /* directory plugin on the new object */
40354 +       file_plugin *obj_plug;  /* object plugin on the new object */
40355 +       struct inode *object;   /* new object */
40356 +       reiser4_block_nr reserve;
40357 +
40358 +       reiser4_dir_entry_desc entry;   /* new directory entry */
40359 +
40360 +       assert("nikita-1420", data != NULL);
40361 +       parent = data->parent;
40362 +       dentry = data->dentry;
40363 +
40364 +       assert("nikita-1418", parent != NULL);
40365 +       assert("nikita-1419", dentry != NULL);
40366 +
40367 +       /* check, that name is acceptable for parent */
40368 +       par_dir = inode_dir_plugin(parent);
40369 +       if (par_dir->is_name_acceptable &&
40370 +           !par_dir->is_name_acceptable(parent,
40371 +                                        dentry->d_name.name,
40372 +                                        (int)dentry->d_name.len))
40373 +               return RETERR(-ENAMETOOLONG);
40374 +
40375 +       result = 0;
40376 +       obj_plug = file_plugin_by_id((int)data->id);
40377 +       if (obj_plug == NULL) {
40378 +               warning("nikita-430", "Cannot find plugin %i", data->id);
40379 +               return RETERR(-ENOENT);
40380 +       }
40381 +       object = new_inode(parent->i_sb);
40382 +       if (object == NULL)
40383 +               return RETERR(-ENOMEM);
40384 +       /* we'll update i_nlink below */
40385 +       object->i_nlink = 0;
40386 +       /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
40387 +        * to simplify error handling: if some error occurs before i_ino is
40388 +        * initialized with oid, i_ino should already be set to some
40389 +        * distinguished value. */
40390 +       object->i_ino = 0;
40391 +
40392 +       /* So that on error iput will be called. */
40393 +       *retobj = object;
40394 +
40395 +       if (DQUOT_ALLOC_INODE(object)) {
40396 +               DQUOT_DROP(object);
40397 +               object->i_flags |= S_NOQUOTA;
40398 +               return RETERR(-EDQUOT);
40399 +       }
40400 +
40401 +       memset(&entry, 0, sizeof entry);
40402 +       entry.obj = object;
40403 +
40404 +       set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
40405 +                  file_plugin_to_plugin(obj_plug));
40406 +       result = obj_plug->set_plug_in_inode(object, parent, data);
40407 +       if (result) {
40408 +               warning("nikita-431", "Cannot install plugin %i on %llx",
40409 +                       data->id, (unsigned long long)get_inode_oid(object));
40410 +               DQUOT_FREE_INODE(object);
40411 +               object->i_flags |= S_NOQUOTA;
40412 +               return result;
40413 +       }
40414 +
40415 +       /* reget plugin after installation */
40416 +       obj_plug = inode_file_plugin(object);
40417 +
40418 +       if (obj_plug->create_object == NULL) {
40419 +               DQUOT_FREE_INODE(object);
40420 +               object->i_flags |= S_NOQUOTA;
40421 +               return RETERR(-EPERM);
40422 +       }
40423 +
40424 +       /* if any of hash, tail, sd or permission plugins for newly created
40425 +          object are not set yet set them here inheriting them from parent
40426 +          directory
40427 +        */
40428 +       assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
40429 +       result = obj_plug->adjust_to_parent(object,
40430 +                                           parent,
40431 +                                           object->i_sb->s_root->d_inode);
40432 +       if (result == 0)
40433 +               result = finish_pset(object);
40434 +       if (result != 0) {
40435 +               warning("nikita-432", "Cannot inherit from %llx to %llx",
40436 +                       (unsigned long long)get_inode_oid(parent),
40437 +                       (unsigned long long)get_inode_oid(object));
40438 +               DQUOT_FREE_INODE(object);
40439 +               object->i_flags |= S_NOQUOTA;
40440 +               return result;
40441 +       }
40442 +
40443 +       /* setup inode and file-operations for this inode */
40444 +       setup_inode_ops(object, data);
40445 +
40446 +       /* call file plugin's method to initialize plugin specific part of
40447 +        * inode */
40448 +       if (obj_plug->init_inode_data)
40449 +               obj_plug->init_inode_data(object, data, 1 /*create */ );
40450 +
40451 +       /* obtain directory plugin (if any) for new object. */
40452 +       obj_dir = inode_dir_plugin(object);
40453 +       if (obj_dir != NULL && obj_dir->init == NULL) {
40454 +               DQUOT_FREE_INODE(object);
40455 +               object->i_flags |= S_NOQUOTA;
40456 +               return RETERR(-EPERM);
40457 +       }
40458 +
40459 +       reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
40460 +
40461 +       reserve = estimate_create_vfs_object(parent, object);
40462 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
40463 +               DQUOT_FREE_INODE(object);
40464 +               object->i_flags |= S_NOQUOTA;
40465 +               return RETERR(-ENOSPC);
40466 +       }
40467 +
40468 +       /* mark inode `immutable'. We disable changes to the file being
40469 +          created until valid directory entry for it is inserted. Otherwise,
40470 +          if file were expanded and insertion of directory entry fails, we
40471 +          have to remove file, but we only allotted enough space in
40472 +          transaction to remove _empty_ file. 3.x code used to remove stat
40473 +          data in different transaction thus possibly leaking disk space on
40474 +          crash. This all only matters if it's possible to access file
40475 +          without name, for example, by inode number
40476 +        */
40477 +       reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
40478 +
40479 +       /* create empty object, this includes allocation of new objectid. For
40480 +          directories this implies creation of dot and dotdot  */
40481 +       assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
40482 +
40483 +       /* mark inode as `loaded'. From this point onward
40484 +          reiser4_delete_inode() will try to remove its stat-data. */
40485 +       reiser4_inode_set_flag(object, REISER4_LOADED);
40486 +
40487 +       result = obj_plug->create_object(object, parent, data);
40488 +       if (result != 0) {
40489 +               reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40490 +               if (result != -ENAMETOOLONG && result != -ENOMEM)
40491 +                       warning("nikita-2219",
40492 +                               "Failed to create sd for %llu",
40493 +                               (unsigned long long)get_inode_oid(object));
40494 +               DQUOT_FREE_INODE(object);
40495 +               object->i_flags |= S_NOQUOTA;
40496 +               return result;
40497 +       }
40498 +
40499 +       if (obj_dir != NULL)
40500 +               result = obj_dir->init(object, parent, data);
40501 +       if (result == 0) {
40502 +               assert("nikita-434", !reiser4_inode_get_flag(object,
40503 +                                                            REISER4_NO_SD));
40504 +               /* insert inode into VFS hash table */
40505 +               insert_inode_hash(object);
40506 +               /* create entry */
40507 +               result = par_dir->add_entry(parent, dentry, data, &entry);
40508 +               if (result == 0) {
40509 +                       result = reiser4_add_nlink(object, parent, 0);
40510 +                       /* If O_CREAT is set and the file did not previously
40511 +                          exist, upon successful completion, open() shall
40512 +                          mark for update the st_atime, st_ctime, and
40513 +                          st_mtime fields of the file and the st_ctime and
40514 +                          st_mtime fields of the parent directory. --SUS
40515 +                        */
40516 +                       /* @object times are already updated by
40517 +                          reiser4_add_nlink() */
40518 +                       if (result == 0)
40519 +                               reiser4_update_dir(parent);
40520 +                       if (result != 0)
40521 +                               /* cleanup failure to add nlink */
40522 +                               par_dir->rem_entry(parent, dentry, &entry);
40523 +               }
40524 +               if (result != 0)
40525 +                       /* cleanup failure to add entry */
40526 +                       obj_plug->detach(object, parent);
40527 +       } else if (result != -ENOMEM)
40528 +               warning("nikita-2219", "Failed to initialize dir for %llu: %i",
40529 +                       (unsigned long long)get_inode_oid(object), result);
40530 +
40531 +       /*
40532 +        * update stat-data, committing all pending modifications to the inode
40533 +        * fields.
40534 +        */
40535 +       reiser4_update_sd(object);
40536 +       if (result != 0) {
40537 +               DQUOT_FREE_INODE(object);
40538 +               object->i_flags |= S_NOQUOTA;
40539 +               /* if everything was ok (result == 0), parent stat-data is
40540 +                * already updated above (reiser4_update_dir()) */
40541 +               reiser4_update_sd(parent);
40542 +               /* failure to create entry, remove object */
40543 +               obj_plug->delete_object(object);
40544 +       }
40545 +
40546 +       /* file has name now, clear immutable flag */
40547 +       reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
40548 +
40549 +       /* on error, iput() will call ->delete_inode(). We should keep track
40550 +          of the existence of stat-data for this inode and avoid attempt to
40551 +          remove it in reiser4_delete_inode(). This is accomplished through
40552 +          REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
40553 +        */
40554 +       return result;
40555 +}
40556 +
40557 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
40558 +   reiser4_mknod and reiser4_symlink. On success @dentry is instantiated with the new inode;
40559 +   on failure an inode that was already allocated is made bad and released with iput(). */
40560 +static int
40561 +create_vfs_object(struct inode *parent,
40562 +                 struct dentry *dentry, reiser4_object_create_data * data)
40563 +{
40564 +       reiser4_context *ctx;
40565 +       int result;
40566 +       struct inode *child;
40567 +
40568 +       ctx = reiser4_init_context(parent->i_sb);
40569 +       if (IS_ERR(ctx))
40570 +               return PTR_ERR(ctx);
40571 +       context_set_commit_async(ctx);
40572 +
40573 +       data->parent = parent;
40574 +       data->dentry = dentry;
40575 +       child = NULL;
40576 +       result = do_create_vfs_child(data, &child);
40577 +       if (unlikely(result != 0)) {
40578 +               if (child != NULL) {
40579 +                       reiser4_make_bad_inode(child);
40580 +                       iput(child);
40581 +               }
40582 +       } else
40583 +               d_instantiate(dentry, child);
40584 +
40585 +       reiser4_exit_context(ctx);
40586 +       return result;
40587 +}
40588 +
40589 +/* helper for link_common. Estimate disk space necessary to add a link
40590 +   from @parent to @object. Returns the sum of the per-step estimates listed below.
40591 +*/
40592 +static reiser4_block_nr common_estimate_link(struct inode *parent,     /* parent directory */
40593 +                                            struct inode *object
40594 +                                            /* object to which new link is being created */
40595 +                                            )
40596 +{
40597 +       reiser4_block_nr res = 0;
40598 +       file_plugin *fplug;
40599 +       dir_plugin *dplug;
40600 +
40601 +       assert("vpf-317", object != NULL);
40602 +       assert("vpf-318", parent != NULL);
40603 +
40604 +       fplug = inode_file_plugin(object);
40605 +       dplug = inode_dir_plugin(parent);
40606 +       /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
40607 +       /* reiser4_add_nlink(object) */
40608 +       res += fplug->estimate.update(object);
40609 +       /* add_entry(parent) */
40610 +       res += dplug->estimate.add_entry(parent);
40611 +       /* reiser4_del_nlink(object) */
40612 +       res += fplug->estimate.update(object);
40613 +       /* update_dir(parent) */
40614 +       res += inode_file_plugin(parent)->estimate.update(parent);
40615 +       /* safe-link */
40616 +       res += estimate_one_item_removal(reiser4_tree_by_inode(object));
40617 +
40618 +       return res;
40619 +}
40620 +
40621 +/* Estimate disk space necessary to remove a link between @parent and
40622 +   @object: the sum of the per-step estimates below.
40623 +*/
40624 +static reiser4_block_nr estimate_unlink(struct inode *parent,  /* parent directory */
40625 +                                       struct inode *object
40626 +                                       /* object whose link from @parent is being removed */
40627 +                                       )
40628 +{
40629 +       reiser4_block_nr res = 0;
40630 +       file_plugin *fplug;
40631 +       dir_plugin *dplug;
40632 +
40633 +       assert("vpf-317", object != NULL);
40634 +       assert("vpf-318", parent != NULL);
40635 +
40636 +       fplug = inode_file_plugin(object);
40637 +       dplug = inode_dir_plugin(parent);
40638 +
40639 +       /* rem_entry(parent) */
40640 +       res += dplug->estimate.rem_entry(parent);
40641 +       /* reiser4_del_nlink(object) */
40642 +       res += fplug->estimate.update(object);
40643 +       /* update_dir(parent) */
40644 +       res += inode_file_plugin(parent)->estimate.update(parent);
40645 +       /* fplug->unlink */
40646 +       res += fplug->estimate.unlink(object, parent);
40647 +       /* safe-link */
40648 +       res += estimate_one_insert_item(reiser4_tree_by_inode(object));
40649 +
40650 +       return res;
40651 +}
40652 +
40653 +/* helper for reiser4_unlink_common. Check that @victim may be unlinked from @parent, then estimate and grab space for unlink. */
40654 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
40655 +{
40656 +       file_plugin *fplug;
40657 +       struct inode *child;
40658 +       int result;
40659 +
40660 +       result = 0;
40661 +       child = victim->d_inode;
40662 +       fplug = inode_file_plugin(child);
40663 +
40664 +       /* check for race with create_object() */
40665 +       if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
40666 +               return RETERR(-E_REPEAT);
40667 +       /* object being deleted should have stat data */
40668 +       assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
40669 +
40670 +       /* ask object plugin whether one more link may be removed */
40671 +       if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
40672 +               return RETERR(-ENOTEMPTY);
40673 +
40674 +       result = (int)estimate_unlink(parent, child); /* NOTE(review): reiser4_block_nr narrowed to int - confirm it cannot overflow */
40675 +       if (result < 0)
40676 +               return result;
40677 +
40678 +       return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
40679 +}
40680 +
40681 +/* helper for reiser4_setattr_common: grab the space given by estimate_one_insert_into_item(@tree) */
40682 +static int setattr_reserve(reiser4_tree * tree)
40683 +{
40684 +       assert("vs-1096", is_grab_enabled(get_current_context()));
40685 +       return reiser4_grab_space(estimate_one_insert_into_item(tree),
40686 +                                 BA_CAN_COMMIT);
40687 +}
40688 +
40689 +/* helper function. Standards require that for many file-system operations
40690 +   on success ctime and mtime of parent directory are to be updated. Sets both to CURRENT_TIME, then returns the result of reiser4_update_sd(@dir). */
40691 +int reiser4_update_dir(struct inode *dir)
40692 +{
40693 +       assert("nikita-2525", dir != NULL);
40694 +
40695 +       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
40696 +       return reiser4_update_sd(dir);
40697 +}
40698 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/inode_ops_rename.c linux-2.6.27/fs/reiser4/plugin/inode_ops_rename.c
40699 --- linux-2.6.27.orig/fs/reiser4/plugin/inode_ops_rename.c      1970-01-01 03:00:00.000000000 +0300
40700 +++ linux-2.6.27/fs/reiser4/plugin/inode_ops_rename.c   2008-10-12 18:20:01.000000000 +0400
40701 @@ -0,0 +1,912 @@
40702 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
40703 + * reiser4/README */
40704 +
40705 +#include "../inode.h"
40706 +#include "../safe_link.h"
40707 +
40708 +static const char *possible_leak = "Possible disk space leak."; /* suffix for warnings issued when a link cannot be dropped or detached */
40709 +
40710 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
40711 +
40712 +   Helper function called from reiser4_rename_common() */
40713 +static int replace_name(struct inode *to_inode,        /* inode where @from_coord is
40714 +                                                * to be re-targeted at */
40715 +                       struct inode *from_dir, /* directory where @from_coord
40716 +                                                * lives */
40717 +                       struct inode *from_inode,       /* inode @from_coord
40718 +                                                        * originally pointed to */
40719 +                       coord_t * from_coord,   /* where directory entry is in
40720 +                                                * the tree */
40721 +                       lock_handle * from_lh /* lock handle on @from_coord */ )
40722 +{
40723 +       item_plugin *from_item;
40724 +       int result;
40725 +       znode *node;
40726 +
40727 +       coord_clear_iplug(from_coord);
40728 +       node = from_coord->node;
40729 +       result = zload(node);
40730 +       if (result != 0)
40731 +               return result;
40732 +       from_item = item_plugin_by_coord(from_coord);
40733 +       if (plugin_of_group(item_plugin_by_coord(from_coord),
40734 +                           DIR_ENTRY_ITEM_TYPE))
40735 +       {
40736 +               reiser4_key to_key;
40737 +
40738 +               build_sd_key(to_inode, &to_key);
40739 +
40740 +               /* everything is found and prepared to change directory entry
40741 +                  at @from_coord to point to @to_inode.
40742 +
40743 +                  @to_inode is just about to get new name, so bump its link
40744 +                  counter.
40745 +
40746 +                */
40747 +               result = reiser4_add_nlink(to_inode, from_dir, 0);
40748 +               if (result != 0) {
40749 +                       /* Don't issue warning: this may be plain -EMLINK */
40750 +                       zrelse(node);
40751 +                       return result;
40752 +               }
40753 +
40754 +               result =
40755 +                   from_item->s.dir.update_key(from_coord, &to_key, from_lh);
40756 +               if (result != 0) {
40757 +                       reiser4_del_nlink(to_inode, from_dir, 0);
40758 +                       zrelse(node);
40759 +                       return result;
40760 +               }
40761 +
40762 +               /* @from_inode just lost its name, he-he.
40763 +
40764 +                  If @from_inode was directory, it contained dotdot pointing
40765 +                  to @from_dir. @from_dir i_nlink will be decreased when
40766 +                  iput() will be called on @from_inode.
40767 +
40768 +                  If file-system is not ADG (hard-links are
40769 +                  supported on directories), iput(from_inode) will not remove
40770 +                  @from_inode, and thus above is incorrect, but hard-links on
40771 +                  directories are problematic in many other respects.
40772 +                */
40773 +               result = reiser4_del_nlink(from_inode, from_dir, 0);
40774 +               if (result != 0) {
40775 +                       warning("nikita-2330",
40776 +                               "Cannot remove link from source: %i. %s",
40777 +                               result, possible_leak);
40778 +               }
40779 +               /* Has to return success, because entry is already
40780 +                * modified. */
40781 +               result = 0;
40782 +
40783 +               /* NOTE-NIKITA consider calling plugin method instead of
40784 +                  accessing inode fields directly. */
40785 +               from_dir->i_mtime = CURRENT_TIME;
40786 +       } else {
40787 +               warning("nikita-2326", "Unexpected item type");
40788 +               result = RETERR(-EIO);
40789 +       }
40790 +       zrelse(node);
40791 +       return result;
40792 +}
40793 +
40794 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
40795 +
40796 +   Helper function used by reiser4_rename_common(). Returns 0 or an error. */
40797 +static int add_name(struct inode *inode,       /* inode where @coord is to be
40798 +                                                * re-targeted at */
40799 +                   struct inode *dir,  /* directory where @coord lives */
40800 +                   struct dentry *name,        /* new name */
40801 +                   coord_t * coord,    /* where directory entry is in the tree */
40802 +                   lock_handle * lh,   /* lock handle on @coord */
40803 +                   int is_dir /* true, if @inode is directory */ )
40804 +{
40805 +       int result;
40806 +       reiser4_dir_entry_desc entry;
40807 +
40808 +       assert("nikita-2333", lh->node == coord->node);
40809 +       assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
40810 +
40811 +       memset(&entry, 0, sizeof entry);
40812 +       entry.obj = inode;
40813 +       /* build key of directory entry description */
40814 +       inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
40815 +
40816 +       /* ext2 does this in different order: first inserts new entry,
40817 +          then increases directory nlink. We don't want do this,
40818 +          because reiser4_add_nlink() calls ->add_link() plugin
40819 +          method that can fail for whatever reason, leaving us with
40820 +          cleanup problems. So bail out before touching the directory. */
40821 +       result = reiser4_add_nlink(inode, dir, 0);
40822 +       if (result != 0)
40823 +               return result;
40824 +       /* @inode got its new link; now create the entry @name in @dir
40825 +          pointing to it */
40826 +       result = WITH_COORD(coord,
40827 +                           inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
40828 +                                                                       coord,
40829 +                                                                       lh,
40830 +                                                                       name,
40831 +                                                                       &entry));
40832 +       if (result != 0) {
40833 +               int result2;
40834 +               result2 = reiser4_del_nlink(inode, dir, 0);
40835 +               if (result2 != 0) {
40836 +                       warning("nikita-2327",
40837 +                               "Cannot drop link on %lli %i. %s",
40838 +                               (unsigned long long)get_inode_oid(inode),
40839 +                               result2, possible_leak);
40840 +               }
40841 +       } else
40842 +               INODE_INC_FIELD(dir, i_size);
40843 +       return result;
40844 +}
40845 +
40846 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
40847 +                                       struct dentry *old_name,        /* old name */
40848 +                                       struct inode *new_dir,  /* directory where @new is located */
40849 +                                       struct dentry *new_name /* new name */ )
40850 +{
40851 +       reiser4_block_nr res1, res2;
40852 +       dir_plugin *p_parent_old, *p_parent_new;
40853 +       file_plugin *p_child_old, *p_child_new;
40854 +
40855 +       assert("vpf-311", old_dir != NULL);
40856 +       assert("vpf-312", new_dir != NULL);
40857 +       assert("vpf-313", old_name != NULL);
40858 +       assert("vpf-314", new_name != NULL);
40859 +
40860 +       p_parent_old = inode_dir_plugin(old_dir);
40861 +       p_parent_new = inode_dir_plugin(new_dir);
40862 +       p_child_old = inode_file_plugin(old_name->d_inode);
40863 +       if (new_name->d_inode)
40864 +               p_child_new = inode_file_plugin(new_name->d_inode);
40865 +       else
40866 +               p_child_new = NULL;
40867 +
40868 +       /* find_entry - can insert one leaf. */
40869 +       res1 = res2 = 1;
40870 +
40871 +       /* replace_name */
40872 +       {
40873 +               /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
40874 +               res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
40875 +               /* update key */
40876 +               res1 += 1;
40877 +               /* reiser4_del_nlink(p_child_new) */
40878 +               if (p_child_new)
40879 +                       res1 += p_child_new->estimate.update(new_name->d_inode);
40880 +       }
40881 +
40882 +       /* else add_name */
40883 +       {
40884 +               /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
40885 +               res2 +=
40886 +                   2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
40887 +               /* reiser4_add_nlink(p_child_old) */
40888 +               res2 += p_child_old->estimate.update(old_name->d_inode);
40889 +               /* add_entry(p_parent_new) */
40890 +               res2 += p_parent_new->estimate.add_entry(new_dir);
40891 +               /* reiser4_del_nlink(p_child_old) */
40892 +               res2 += p_child_old->estimate.update(old_name->d_inode);
40893 +       }
40894 +
40895 +       res1 = res1 < res2 ? res2 : res1; /* the two alternatives above: keep the larger estimate */
40896 +
40897 +       /* reiser4_write_sd(p_parent_new) */
40898 +       res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40899 +
40900 +       /* reiser4_write_sd(p_child_new) */
40901 +       if (p_child_new)
40902 +               res1 += p_child_new->estimate.update(new_name->d_inode);
40903 +
40904 +       /* hashed_rem_entry(p_parent_old) */
40905 +       res1 += p_parent_old->estimate.rem_entry(old_dir);
40906 +
40907 +       /* reiser4_del_nlink(p_child_old) */
40908 +       res1 += p_child_old->estimate.update(old_name->d_inode);
40909 +
40910 +       /* replace_name */
40911 +       {
40912 +               /* reiser4_add_nlink(p_parent_dir_new) */
40913 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40914 +               /* update_key */
40915 +               res1 += 1;
40916 +               /* reiser4_del_nlink(p_parent_new) */
40917 +               res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
40918 +               /* reiser4_del_nlink(p_parent_old) */
40919 +               res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40920 +       }
40921 +
40922 +       /* reiser4_write_sd(p_parent_old) */
40923 +       res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
40924 +
40925 +       /* reiser4_write_sd(p_child_old) */
40926 +       res1 += p_child_old->estimate.update(old_name->d_inode);
40927 +
40928 +       return res1;
40929 +}
40930 +
40931 +static int hashed_rename_estimate_and_grab(struct inode *old_dir,      /* directory where @old is located */
40932 +                                          struct dentry *old_name,     /* old name */
40933 +                                          struct inode *new_dir,       /* directory where @new is located */
40934 +                                          struct dentry *new_name
40935 +                                          /* new name */ )
40936 +{
40937 +       reiser4_block_nr reserve;
40938 +
40939 +       reserve = estimate_rename(old_dir, old_name, new_dir, new_name); /* space needed by the whole rename */
40940 +
40941 +       if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
40942 +               return RETERR(-ENOSPC); /* any non-zero grab result is reported as -ENOSPC */
40943 +
40944 +       return 0;
40945 +}
40946 +
40947 +/* check whether @old_inode and @new_inode can be moved within file system
40948 + * tree. This singles out attempts to rename pseudo-files, for example. Returns 0 if the rename may proceed, negative error otherwise. */
40949 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
40950 +                     struct inode *new_dir, struct inode *new_inode)
40951 +{
40952 +       file_plugin *fplug;
40953 +       dir_plugin *dplug;
40954 +
40955 +       assert("nikita-3370", old_inode != NULL);
40956 +
40957 +       dplug = inode_dir_plugin(new_dir);
40958 +       fplug = inode_file_plugin(old_inode);
40959 +
40960 +       if (dplug == NULL)
40961 +               return RETERR(-ENOTDIR);
40962 +       else if (new_dir->i_op->create == NULL)
40963 +               return RETERR(-EPERM);
40964 +       else if (!fplug->can_add_link(old_inode)) /* NOTE(review): unlike can_rem_link below, not NULL-checked - confirm it is always set */
40965 +               return RETERR(-EMLINK);
40966 +       else if (new_inode != NULL) {
40967 +               fplug = inode_file_plugin(new_inode); /* from here on @fplug is the target's plugin */
40968 +               if (fplug->can_rem_link != NULL &&
40969 +                   !fplug->can_rem_link(new_inode))
40970 +                       return RETERR(-EBUSY);
40971 +       }
40972 +       return 0;
40973 +}
40974 +
40975 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
40976 +              znode_lock_mode, reiser4_dir_entry_desc *);
40977 +int reiser4_update_dir(struct inode *);
40978 +
40979 +/* this is common implementation of vfs's rename method of struct
40980 +   inode_operations
40981 +   See comments in the body.
40982 +
40983 +   It is arguable that this function can be made generic so that it
40984 +   will be applicable to any kind of directory plugin that deals with
40985 +   directories composed out of directory entries. The only obstacle
40986 +   here is that we don't have any data-type to represent directory
40987 +   entry. This should be re-considered when more than one different
40988 +   directory plugin will be implemented.
40989 +*/
40990 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
40991 +                                                * is located */ ,
40992 +                         struct dentry *old_name /* old name */ ,
40993 +                         struct inode *new_dir /* directory where @new
40994 +                                                * is located */ ,
40995 +                         struct dentry *new_name /* new name */ )
40996 +{
40997 +       /* From `The Open Group Base Specifications Issue 6'
40998 +
40999 +          If either the old or new argument names a symbolic link, rename()
41000 +          shall operate on the symbolic link itself, and shall not resolve
41001 +          the last component of the argument. If the old argument and the new
41002 +          argument resolve to the same existing file, rename() shall return
41003 +          successfully and perform no other action.
41004 +
41005 +          [this is done by VFS: vfs_rename()]
41006 +
41007 +          If the old argument points to the pathname of a file that is not a
41008 +          directory, the new argument shall not point to the pathname of a
41009 +          directory.
41010 +
41011 +          [checked by VFS: vfs_rename->may_delete()]
41012 +
41013 +          If the link named by the new argument exists, it shall
41014 +          be removed and old renamed to new. In this case, a link named new
41015 +          shall remain visible to other processes throughout the renaming
41016 +          operation and refer either to the file referred to by new or old
41017 +          before the operation began.
41018 +
41019 +          [we should assure this]
41020 +
41021 +          Write access permission is required for
41022 +          both the directory containing old and the directory containing new.
41023 +
41024 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
41025 +
41026 +          If the old argument points to the pathname of a directory, the new
41027 +          argument shall not point to the pathname of a file that is not a
41028 +          directory.
41029 +
41030 +          [checked by VFS: vfs_rename->may_delete()]
41031 +
41032 +          If the directory named by the new argument exists, it
41033 +          shall be removed and old renamed to new. In this case, a link named
41034 +          new shall exist throughout the renaming operation and shall refer
41035 +          either to the directory referred to by new or old before the
41036 +          operation began.
41037 +
41038 +          [we should assure this]
41039 +
41040 +          If new names an existing directory, it shall be
41041 +          required to be an empty directory.
41042 +
41043 +          [we should check this]
41044 +
41045 +          If the old argument points to a pathname of a symbolic link, the
41046 +          symbolic link shall be renamed. If the new argument points to a
41047 +          pathname of a symbolic link, the symbolic link shall be removed.
41048 +
41049 +          The new pathname shall not contain a path prefix that names
41050 +          old. Write access permission is required for the directory
41051 +          containing old and the directory containing new. If the old
41052 +          argument points to the pathname of a directory, write access
41053 +          permission may be required for the directory named by old, and, if
41054 +          it exists, the directory named by new.
41055 +
41056 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
41057 +
41058 +          If the link named by the new argument exists and the file's link
41059 +          count becomes 0 when it is removed and no process has the file
41060 +          open, the space occupied by the file shall be freed and the file
41061 +          shall no longer be accessible. If one or more processes have the
41062 +          file open when the last link is removed, the link shall be removed
41063 +          before rename() returns, but the removal of the file contents shall
41064 +          be postponed until all references to the file are closed.
41065 +
41066 +          [iput() handles this, but we can do this manually, a la
41067 +          reiser4_unlink()]
41068 +
41069 +          Upon successful completion, rename() shall mark for update the
41070 +          st_ctime and st_mtime fields of the parent directory of each file.
41071 +
41072 +          [N/A]
41073 +
41074 +        */
41075 +       reiser4_context *ctx;
41076 +       int result;
41077 +       int is_dir;             /* is @old_name directory */
41078 +
41079 +       struct inode *old_inode;
41080 +       struct inode *new_inode;
41081 +       coord_t *new_coord;
41082 +
41083 +       struct reiser4_dentry_fsdata *new_fsdata;
41084 +       dir_plugin *dplug;
41085 +       file_plugin *fplug;
41086 +
41087 +       reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
41088 +       lock_handle *new_lh, *dotdot_lh;
41089 +       struct dentry *dotdot_name;
41090 +       struct reiser4_dentry_fsdata *dataonstack;
41091 +
41092 +       ctx = reiser4_init_context(old_dir->i_sb);
41093 +       if (IS_ERR(ctx))
41094 +               return PTR_ERR(ctx);
41095 +
41096 +       old_entry = kzalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
41097 +                           sizeof(*dotdot_name) + sizeof(*dataonstack),
41098 +                           reiser4_ctx_gfp_mask_get());
41099 +       if (!old_entry) {
41100 +               context_set_commit_async(ctx);
41101 +               reiser4_exit_context(ctx);
41102 +               return RETERR(-ENOMEM);
41103 +       }
41104 +
41105 +       new_entry = old_entry + 1;
41106 +       dotdot_entry = old_entry + 2;
41107 +       new_lh = (lock_handle *)(old_entry + 3);
41108 +       dotdot_lh = new_lh + 1;
41109 +       dotdot_name = (struct dentry *)(new_lh + 2);
41110 +       dataonstack = (struct reiser4_dentry_fsdata *)(dotdot_name + 1);
41111 +
41112 +       assert("nikita-2318", old_dir != NULL);
41113 +       assert("nikita-2319", new_dir != NULL);
41114 +       assert("nikita-2320", old_name != NULL);
41115 +       assert("nikita-2321", new_name != NULL);
41116 +
41117 +       old_inode = old_name->d_inode;
41118 +       new_inode = new_name->d_inode;
41119 +
41120 +       dplug = inode_dir_plugin(old_dir);
41121 +       fplug = NULL;
41122 +
41123 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
41124 +       if (IS_ERR(new_fsdata)) {
41125 +               kfree(old_entry);
41126 +               context_set_commit_async(ctx);
41127 +               reiser4_exit_context(ctx);
41128 +               return PTR_ERR(new_fsdata);
41129 +       }
41130 +
41131 +       new_coord = &new_fsdata->dec.entry_coord;
41132 +       coord_clear_iplug(new_coord);
41133 +
41134 +       is_dir = S_ISDIR(old_inode->i_mode);
41135 +
41136 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41137 +
41138 +       /* if target is existing directory and it's not empty---return error.
41139 +
41140 +          This check is done specifically, because is_dir_empty() requires
41141 +          tree traversal and have to be done before locks are taken.
41142 +        */
41143 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
41144 +               kfree(old_entry);
41145 +               context_set_commit_async(ctx);
41146 +               reiser4_exit_context(ctx);
41147 +               return RETERR(-ENOTEMPTY);
41148 +       }
41149 +
41150 +       result = can_rename(old_dir, old_inode, new_dir, new_inode);
41151 +       if (result != 0) {
41152 +               kfree(old_entry);
41153 +               context_set_commit_async(ctx);
41154 +               reiser4_exit_context(ctx);
41155 +               return result;
41156 +       }
41157 +
41158 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
41159 +                                                new_dir, new_name);
41160 +       if (result != 0) {
41161 +               kfree(old_entry);
41162 +               context_set_commit_async(ctx);
41163 +               reiser4_exit_context(ctx);
41164 +               return result;
41165 +       }
41166 +
41167 +       init_lh(new_lh);
41168 +
41169 +       /* find entry for @new_name */
41170 +       result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
41171 +                                   new_entry);
41172 +
41173 +       if (IS_CBKERR(result)) {
41174 +               done_lh(new_lh);
41175 +               kfree(old_entry);
41176 +               context_set_commit_async(ctx);
41177 +               reiser4_exit_context(ctx);
41178 +               return result;
41179 +       }
41180 +
41181 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
41182 +
41183 +       /* add or replace name for @old_inode as @new_name */
41184 +       if (new_inode != NULL) {
41185 +               /* target (@new_name) exists. */
41186 +               /* Not clear what to do with objects that are
41187 +                  both directories and files at the same time. */
41188 +               if (result == CBK_COORD_FOUND) {
41189 +                       result = replace_name(old_inode,
41190 +                                             new_dir,
41191 +                                             new_inode, new_coord, new_lh);
41192 +                       if (result == 0)
41193 +                               fplug = inode_file_plugin(new_inode);
41194 +               } else if (result == CBK_COORD_NOTFOUND) {
41195 +                       /* VFS told us that @new_name is bound to existing
41196 +                          inode, but we failed to find directory entry. */
41197 +                       warning("nikita-2324", "Target not found");
41198 +                       result = RETERR(-ENOENT);
41199 +               }
41200 +       } else {
41201 +               /* target (@new_name) doesn't exists. */
41202 +               if (result == CBK_COORD_NOTFOUND)
41203 +                       result = add_name(old_inode,
41204 +                                         new_dir,
41205 +                                         new_name, new_coord, new_lh, is_dir);
41206 +               else if (result == CBK_COORD_FOUND) {
41207 +                       /* VFS told us that @new_name is "negative" dentry,
41208 +                          but we found directory entry. */
41209 +                       warning("nikita-2331", "Target found unexpectedly");
41210 +                       result = RETERR(-EIO);
41211 +               }
41212 +       }
41213 +
41214 +       assert("nikita-3462", ergo(result == 0,
41215 +                                  old_inode->i_nlink >= 2 + !!is_dir));
41216 +
41217 +       /* We are done with all modifications to the @new_dir, release lock on
41218 +          node. */
41219 +       done_lh(new_lh);
41220 +
41221 +       if (fplug != NULL) {
41222 +               /* detach @new_inode from name-space */
41223 +               result = fplug->detach(new_inode, new_dir);
41224 +               if (result != 0)
41225 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
41226 +                               (unsigned long long)get_inode_oid(new_inode),
41227 +                               result, possible_leak);
41228 +       }
41229 +
41230 +       if (new_inode != NULL)
41231 +               reiser4_update_sd(new_inode);
41232 +
41233 +       if (result == 0) {
41234 +               old_entry->obj = old_inode;
41235 +
41236 +               dplug->build_entry_key(old_dir,
41237 +                                      &old_name->d_name, &old_entry->key);
41238 +
41239 +               /* At this stage new name was introduced for
41240 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41241 +                  counters were updated.
41242 +
41243 +                  We want to remove @old_name now. If @old_inode wasn't
41244 +                  directory this is simple.
41245 +                */
41246 +               result = dplug->rem_entry(old_dir, old_name, old_entry);
41247 +               if (result != 0 && result != -ENOMEM) {
41248 +                       warning("nikita-2335",
41249 +                               "Cannot remove old name: %i", result);
41250 +               } else {
41251 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
41252 +                       if (result != 0 && result != -ENOMEM) {
41253 +                               warning("nikita-2337",
41254 +                                       "Cannot drop link on old: %i", result);
41255 +                       }
41256 +               }
41257 +
41258 +               if (result == 0 && is_dir) {
41259 +                       /* @old_inode is directory. We also have to update
41260 +                          dotdot entry. */
41261 +                       coord_t *dotdot_coord;
41262 +
41263 +                       memset(dataonstack, 0, sizeof *dataonstack);
41264 +                       memset(dotdot_entry, 0, sizeof *dotdot_entry);
41265 +                       dotdot_entry->obj = old_dir;
41266 +                       memset(dotdot_name, 0, sizeof *dotdot_name);
41267 +                       dotdot_name->d_name.name = "..";
41268 +                       dotdot_name->d_name.len = 2;
41269 +                       /*
41270 +                        * take ->d_fsdata from the buffer allocated above to
41271 +                        * avoid reiser4_get_dentry_fsdata(). Locking is not needed,
41272 +                        * because dentry is private to the current thread.
41273 +                        */
41274 +                       dotdot_name->d_fsdata = dataonstack;
41275 +                       init_lh(dotdot_lh);
41276 +
41277 +                       dotdot_coord = &dataonstack->dec.entry_coord;
41278 +                       coord_clear_iplug(dotdot_coord);
41279 +
41280 +                       result = reiser4_find_entry(old_inode, dotdot_name,
41281 +                                                   dotdot_lh, ZNODE_WRITE_LOCK,
41282 +                                                   dotdot_entry);
41283 +                       if (result == 0) {
41284 +                               /* replace_name() decreases i_nlink on
41285 +                                * @old_dir */
41286 +                               result = replace_name(new_dir,
41287 +                                                     old_inode,
41288 +                                                     old_dir,
41289 +                                                     dotdot_coord, dotdot_lh);
41290 +                       } else
41291 +                               result = RETERR(-EIO);
41292 +                       done_lh(dotdot_lh);
41293 +               }
41294 +       }
41295 +       reiser4_update_dir(new_dir);
41296 +       reiser4_update_dir(old_dir);
41297 +       reiser4_update_sd(old_inode);
41298 +       if (result == 0) {
41299 +               file_plugin *fplug;
41300 +
41301 +               if (new_inode != NULL) {
41302 +                       /* add safe-link for target file (in case we removed
41303 +                        * last reference to the poor fellow */
41304 +                       fplug = inode_file_plugin(new_inode);
41305 +                       if (new_inode->i_nlink == 0)
41306 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
41307 +               }
41308 +       }
41309 +       kfree(old_entry);
41310 +       context_set_commit_async(ctx);
41311 +       reiser4_exit_context(ctx);
41312 +       return result;
41313 +}
41314 +
41315 +#if 0
41316 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
41317 +                                                * is located */ ,
41318 +                         struct dentry *old_name /* old name */ ,
41319 +                         struct inode *new_dir /* directory where @new
41320 +                                                * is located */ ,
41321 +                         struct dentry *new_name /* new name */ )
41322 +{
41323 +       /* From `The Open Group Base Specifications Issue 6'
41324 +
41325 +          If either the old or new argument names a symbolic link, rename()
41326 +          shall operate on the symbolic link itself, and shall not resolve
41327 +          the last component of the argument. If the old argument and the new
41328 +          argument resolve to the same existing file, rename() shall return
41329 +          successfully and perform no other action.
41330 +
41331 +          [this is done by VFS: vfs_rename()]
41332 +
41333 +          If the old argument points to the pathname of a file that is not a
41334 +          directory, the new argument shall not point to the pathname of a
41335 +          directory.
41336 +
41337 +          [checked by VFS: vfs_rename->may_delete()]
41338 +
41339 +          If the link named by the new argument exists, it shall
41340 +          be removed and old renamed to new. In this case, a link named new
41341 +          shall remain visible to other processes throughout the renaming
41342 +          operation and refer either to the file referred to by new or old
41343 +          before the operation began.
41344 +
41345 +          [we should assure this]
41346 +
41347 +          Write access permission is required for
41348 +          both the directory containing old and the directory containing new.
41349 +
41350 +          [checked by VFS: vfs_rename->may_delete(), may_create()]
41351 +
41352 +          If the old argument points to the pathname of a directory, the new
41353 +          argument shall not point to the pathname of a file that is not a
41354 +          directory.
41355 +
41356 +          [checked by VFS: vfs_rename->may_delete()]
41357 +
41358 +          If the directory named by the new argument exists, it
41359 +          shall be removed and old renamed to new. In this case, a link named
41360 +          new shall exist throughout the renaming operation and shall refer
41361 +          either to the directory referred to by new or old before the
41362 +          operation began.
41363 +
41364 +          [we should assure this]
41365 +
41366 +          If new names an existing directory, it shall be
41367 +          required to be an empty directory.
41368 +
41369 +          [we should check this]
41370 +
41371 +          If the old argument points to a pathname of a symbolic link, the
41372 +          symbolic link shall be renamed. If the new argument points to a
41373 +          pathname of a symbolic link, the symbolic link shall be removed.
41374 +
41375 +          The new pathname shall not contain a path prefix that names
41376 +          old. Write access permission is required for the directory
41377 +          containing old and the directory containing new. If the old
41378 +          argument points to the pathname of a directory, write access
41379 +          permission may be required for the directory named by old, and, if
41380 +          it exists, the directory named by new.
41381 +
41382 +          [checked by VFS: vfs_rename(), vfs_rename_dir()]
41383 +
41384 +          If the link named by the new argument exists and the file's link
41385 +          count becomes 0 when it is removed and no process has the file
41386 +          open, the space occupied by the file shall be freed and the file
41387 +          shall no longer be accessible. If one or more processes have the
41388 +          file open when the last link is removed, the link shall be removed
41389 +          before rename() returns, but the removal of the file contents shall
41390 +          be postponed until all references to the file are closed.
41391 +
41392 +          [iput() handles this, but we can do this manually, a la
41393 +          reiser4_unlink()]
41394 +
41395 +          Upon successful completion, rename() shall mark for update the
41396 +          st_ctime and st_mtime fields of the parent directory of each file.
41397 +
41398 +          [N/A]
41399 +
41400 +        */
41401 +       reiser4_context *ctx;
41402 +       int result;             /* 0 or negative error, returned to caller */
41403 +       int is_dir;             /* is @old_name directory */
41404 +       struct inode *old_inode;
41405 +       struct inode *new_inode;
41406 +       reiser4_dir_entry_desc old_entry;
41407 +       reiser4_dir_entry_desc new_entry;
41408 +       coord_t *new_coord;
41409 +       struct reiser4_dentry_fsdata *new_fsdata;
41410 +       lock_handle new_lh;
41411 +       dir_plugin *dplug;
41412 +       file_plugin *fplug;
41413 +
41414 +       ctx = reiser4_init_context(old_dir->i_sb);
41415 +       if (IS_ERR(ctx))
41416 +               return PTR_ERR(ctx);
41417 +
41418 +       assert("nikita-2318", old_dir != NULL);
41419 +       assert("nikita-2319", new_dir != NULL);
41420 +       assert("nikita-2320", old_name != NULL);
41421 +       assert("nikita-2321", new_name != NULL);
41422 +
41423 +       old_inode = old_name->d_inode;
41424 +       new_inode = new_name->d_inode;
41425 +
41426 +       dplug = inode_dir_plugin(old_dir);
41427 +       fplug = NULL;
41428 +
41429 +       new_fsdata = reiser4_get_dentry_fsdata(new_name);
41430 +       if (IS_ERR(new_fsdata)) {
41431 +               result = PTR_ERR(new_fsdata);
41432 +               goto exit;
41433 +       }
41434 +
41435 +       new_coord = &new_fsdata->dec.entry_coord;
41436 +       coord_clear_iplug(new_coord);
41437 +
41438 +       is_dir = S_ISDIR(old_inode->i_mode);
41439 +
41440 +       assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
41441 +
41442 +       /* if target is existing directory and it's not empty---return error.
41443 +
41444 +          This check is done specifically, because is_dir_empty() requires
41445 +          tree traversal and has to be done before locks are taken. Leave
41446 +          via "exit" so that the context entered above is released. */
41447 +       if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
41448 +               result = RETERR(-ENOTEMPTY);
41449 +       else
41450 +               result = can_rename(old_dir, old_inode, new_dir, new_inode);
41451 +       if (result != 0)
41452 +               goto exit;
41453 +
41454 +       result = hashed_rename_estimate_and_grab(old_dir, old_name,
41455 +                                                new_dir, new_name);
41456 +       if (result != 0)
41457 +               goto exit;
41458 +
41459 +       init_lh(&new_lh);
41460 +
41461 +       /* find entry for @new_name */
41462 +       result = reiser4_find_entry(new_dir, new_name, &new_lh,
41463 +                                   ZNODE_WRITE_LOCK, &new_entry);
41464 +
41465 +       if (IS_CBKERR(result)) {
41466 +               done_lh(&new_lh);
41467 +               goto exit;
41468 +       }
41469 +
41470 +       reiser4_seal_done(&new_fsdata->dec.entry_seal);
41471 +
41472 +       /* add or replace name for @old_inode as @new_name */
41473 +       if (new_inode != NULL) {
41474 +               /* target (@new_name) exists. */
41475 +               /* Not clear what to do with objects that are
41476 +                  both directories and files at the same time. */
41477 +               if (result == CBK_COORD_FOUND) {
41478 +                       result = replace_name(old_inode,
41479 +                                             new_dir,
41480 +                                             new_inode, new_coord, &new_lh);
41481 +                       if (result == 0)
41482 +                               fplug = inode_file_plugin(new_inode);
41483 +               } else if (result == CBK_COORD_NOTFOUND) {
41484 +                       /* VFS told us that @new_name is bound to existing
41485 +                          inode, but we failed to find directory entry. */
41486 +                       warning("nikita-2324", "Target not found");
41487 +                       result = RETERR(-ENOENT);
41488 +               }
41489 +       } else {
41490 +               /* target (@new_name) doesn't exist. */
41491 +               if (result == CBK_COORD_NOTFOUND)
41492 +                       result = add_name(old_inode,
41493 +                                         new_dir,
41494 +                                         new_name, new_coord, &new_lh, is_dir);
41495 +               else if (result == CBK_COORD_FOUND) {
41496 +                       /* VFS told us that @new_name is "negative" dentry,
41497 +                          but we found directory entry. */
41498 +                       warning("nikita-2331", "Target found unexpectedly");
41499 +                       result = RETERR(-EIO);
41500 +               }
41501 +       }
41502 +
41503 +       assert("nikita-3462", ergo(result == 0,
41504 +                                  old_inode->i_nlink >= 2 + !!is_dir));
41505 +
41506 +       /* We are done with all modifications to the @new_dir, release lock on
41507 +          node. */
41508 +       done_lh(&new_lh);
41509 +
41510 +       if (fplug != NULL) {
41511 +               /* detach @new_inode from name-space */
41512 +               result = fplug->detach(new_inode, new_dir);
41513 +               if (result != 0)
41514 +                       warning("nikita-2330", "Cannot detach %lli: %i. %s",
41515 +                               (unsigned long long)get_inode_oid(new_inode),
41516 +                               result, possible_leak);
41517 +       }
41518 +
41519 +       if (new_inode != NULL)
41520 +               reiser4_update_sd(new_inode);
41521 +
41522 +       if (result == 0) {
41523 +               memset(&old_entry, 0, sizeof old_entry);
41524 +               old_entry.obj = old_inode;
41525 +
41526 +               dplug->build_entry_key(old_dir,
41527 +                                      &old_name->d_name, &old_entry.key);
41528 +
41529 +               /* At this stage new name was introduced for
41530 +                  @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
41531 +                  counters were updated.
41532 +
41533 +                  We want to remove @old_name now. If @old_inode wasn't
41534 +                  directory this is simple.
41535 +                */
41536 +               result = dplug->rem_entry(old_dir, old_name, &old_entry);
41537 +               /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
41538 +               if (result != 0 && result != -ENOMEM) {
41539 +                       warning("nikita-2335",
41540 +                               "Cannot remove old name: %i", result);
41541 +               } else {
41542 +                       result = reiser4_del_nlink(old_inode, old_dir, 0);
41543 +                       if (result != 0 && result != -ENOMEM) {
41544 +                               warning("nikita-2337",
41545 +                                       "Cannot drop link on old: %i", result);
41546 +                       }
41547 +               }
41548 +
41549 +               if (result == 0 && is_dir) {
41550 +                       /* @old_inode is directory. We also have to update
41551 +                          dotdot entry. */
41552 +                       coord_t *dotdot_coord;
41553 +                       lock_handle dotdot_lh;
41554 +                       struct dentry dotdot_name;
41555 +                       reiser4_dir_entry_desc dotdot_entry;
41556 +                       struct reiser4_dentry_fsdata dataonstack;
41557 +                       struct reiser4_dentry_fsdata *fsdata;
41558 +
41559 +                       memset(&dataonstack, 0, sizeof dataonstack);
41560 +                       memset(&dotdot_entry, 0, sizeof dotdot_entry);
41561 +                       dotdot_entry.obj = old_dir;
41562 +                       memset(&dotdot_name, 0, sizeof dotdot_name);
41563 +                       dotdot_name.d_name.name = "..";
41564 +                       dotdot_name.d_name.len = 2;
41565 +                       /*
41566 +                        * allocate ->d_fsdata on the stack to avoid using
41567 +                        * reiser4_get_dentry_fsdata(). Locking is not needed,
41568 +                        * because dentry is private to the current thread.
41569 +                        */
41570 +                       dotdot_name.d_fsdata = &dataonstack;
41571 +                       init_lh(&dotdot_lh);
41572 +
41573 +                       fsdata = &dataonstack;
41574 +                       dotdot_coord = &fsdata->dec.entry_coord;
41575 +                       coord_clear_iplug(dotdot_coord);
41576 +
41577 +                       result = reiser4_find_entry(old_inode,
41578 +                                                   &dotdot_name,
41579 +                                                   &dotdot_lh,
41580 +                                                   ZNODE_WRITE_LOCK,
41581 +                                                   &dotdot_entry);
41582 +                       if (result == 0) {
41583 +                               /* replace_name() decreases i_nlink on
41584 +                                * @old_dir */
41585 +                               result = replace_name(new_dir,
41586 +                                                     old_inode,
41587 +                                                     old_dir,
41588 +                                                     dotdot_coord, &dotdot_lh);
41589 +                       } else
41590 +                               result = RETERR(-EIO);
41591 +                       done_lh(&dotdot_lh);
41592 +               }
41593 +       }
41594 +       reiser4_update_dir(new_dir);
41595 +       reiser4_update_dir(old_dir);
41596 +       reiser4_update_sd(old_inode);
41597 +       if (result == 0) {
41598 +               file_plugin *fplug;
41599 +
41600 +               if (new_inode != NULL) {
41601 +                       /* add safe-link for target file (in case we removed
41602 +                        * last reference to the poor fellow */
41603 +                       fplug = inode_file_plugin(new_inode);
41604 +                       if (new_inode->i_nlink == 0)
41605 +                               result = safe_link_add(new_inode, SAFE_UNLINK);
41606 +               }
41607 +       }
41608 +      exit:
41609 +       context_set_commit_async(ctx);
41610 +       reiser4_exit_context(ctx);
41611 +       return result;
41612 +}
41613 +#endif
41614 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/acl.h linux-2.6.27/fs/reiser4/plugin/item/acl.h
41615 --- linux-2.6.27.orig/fs/reiser4/plugin/item/acl.h      1970-01-01 03:00:00.000000000 +0300
41616 +++ linux-2.6.27/fs/reiser4/plugin/item/acl.h   2008-10-12 18:20:01.000000000 +0400
41617 @@ -0,0 +1,66 @@
41618 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41619 +
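41620 +/* Directory entry format. NOTE(review): file is acl.h, yet guard and contents are directory-entry ones; confirm vs sde.h */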
41621 +
41622 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
41623 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
41624 +
41625 +#include "../../forward.h"
41626 +#include "../../dformat.h"
41627 +#include "../../kassign.h"
41628 +#include "../../key.h"
41629 +
41630 +#include <linux/fs.h>
41631 +#include <linux/dcache.h>      /* for struct dentry */
41632 +
41633 +typedef struct directory_entry_format {  /* body of one directory entry */
41634 +       /* key of object stat-data. It's not necessary to store whole
41635 +          key here, because it's always key of stat-data, so minor
41636 +          packing locality and offset can be omitted here. But this
41637 +          relies on particular key allocation scheme for stat-data, so,
41638 +          for extensibility's sake, whole key can be stored here.
41639 +
41640 +          We store key as array of bytes, because we don't want 8-byte
41641 +          alignment of dir entries.
41642 +        */
41643 +       obj_key_id id;
41644 +       /* file name: NUL-terminated string starting right after @id (zero-length array) */
41645 +       d8 name[0];
41646 +} directory_entry_format;
41647 +
41648 +void print_de(const char *prefix, coord_t * coord);
41649 +int extract_key_de(const coord_t * coord, reiser4_key * key);
41650 +int update_key_de(const coord_t * coord, const reiser4_key * key,
41651 +                 lock_handle * lh);
41652 +char *extract_name_de(const coord_t * coord, char *buf);
41653 +unsigned extract_file_type_de(const coord_t * coord);
41654 +int add_entry_de(struct inode *dir, coord_t * coord,
41655 +                lock_handle * lh, const struct dentry *name,
41656 +                reiser4_dir_entry_desc * entry);
41657 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
41658 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
41659 +int max_name_len_de(const struct inode *dir);
41660 +
41661 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
41662 +
41663 +char *extract_dent_name(const coord_t * coord,
41664 +                       directory_entry_format * dent, char *buf);
41665 +
41666 +#if REISER4_LARGE_KEY
41667 +#define DE_NAME_BUF_LEN (24)
41668 +#else
41669 +#define DE_NAME_BUF_LEN (16)
41670 +#endif
41671 +
41672 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
41673 +#endif
41674 +
41675 +/* Make Linus happy.
41676 +   Local variables:
41677 +   c-indentation-style: "K&R"
41678 +   mode-name: "LC"
41679 +   c-basic-offset: 8
41680 +   tab-width: 8
41681 +   fill-column: 120
41682 +   End:
41683 +*/
41684 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/blackbox.c linux-2.6.27/fs/reiser4/plugin/item/blackbox.c
41685 --- linux-2.6.27.orig/fs/reiser4/plugin/item/blackbox.c 1970-01-01 03:00:00.000000000 +0300
41686 +++ linux-2.6.27/fs/reiser4/plugin/item/blackbox.c      2008-10-12 18:20:01.000000000 +0400
41687 @@ -0,0 +1,142 @@
41688 +/* Copyright 2003 by Hans Reiser, licensing governed by
41689 + * reiser4/README */
41690 +
41691 +/* Black box item implementation */
41692 +
41693 +#include "../../forward.h"
41694 +#include "../../debug.h"
41695 +#include "../../dformat.h"
41696 +#include "../../kassign.h"
41697 +#include "../../coord.h"
41698 +#include "../../tree.h"
41699 +#include "../../lock.h"
41700 +
41701 +#include "blackbox.h"
41702 +#include "item.h"
41703 +#include "../plugin.h"
41704 +/* Insert @length bytes at @data into @tree as a BLACK_BOX_ID item with key @key. */
41705 +int
41706 +store_black_box(reiser4_tree * tree,
41707 +               const reiser4_key * key, void *data, int length)
41708 +{
41709 +       int result;     /* status of insert_by_key(), returned as is */
41710 +       reiser4_item_data idata;
41711 +       coord_t coord;
41712 +       lock_handle lh;
41713 +
41714 +       memset(&idata, 0, sizeof idata);
41715 +       /* describe the item: caller's buffer and length, black box item plugin */
41716 +       idata.data = data;
41717 +       idata.user = 0;
41718 +       idata.length = length;
41719 +       idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
41720 +
41721 +       init_lh(&lh);
41722 +       result = insert_by_key(tree, key,
41723 +                              &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
41724 +       /* sanity check: after a successful insert the item is exactly @length bytes */
41725 +       assert("nikita-3413",
41726 +              ergo(result == 0,
41727 +                   WITH_COORD(&coord,
41728 +                              item_length_by_coord(&coord) == length)));
41729 +
41730 +       done_lh(&lh);
41731 +       return result;
41732 +}
41733 +/* Look up the item for @key in @tree and copy its body into @data if it fits into @length bytes. */
41734 +int
41735 +load_black_box(reiser4_tree * tree,
41736 +              reiser4_key * key, void *data, int length, int exact)
41737 +{
41738 +       int result;
41739 +       coord_t coord;
41740 +       lock_handle lh;
41741 +
41742 +       init_lh(&lh);
41743 +       result = coord_by_key(tree, key,
41744 +                             &coord, &lh, ZNODE_READ_LOCK,
41745 +                             exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
41746 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41747 +       /* @exact != 0 selects FIND_EXACT above, otherwise FIND_MAX_NOT_MORE_THAN */
41748 +       if (result == 0) {
41749 +               int ilen;
41750 +
41751 +               result = zload(coord.node);
41752 +               if (result == 0) {
41753 +                       ilen = item_length_by_coord(&coord);
41754 +                       if (ilen <= length) {
41755 +                               memcpy(data, item_body_by_coord(&coord), ilen);
41756 +                               unit_key_by_coord(&coord, key); /* presumably stores found key in @key */
41757 +                       } else if (exact) {
41758 +                               /*
41759 +                                * item is larger than buffer provided by the
41760 +                                * user. Only issue a warning if @exact is
41761 +                                * set. If @exact is false, we are iterating
41762 +                                * over all safe-links and here we are reaching
41763 +                                * the end of the iteration.
41764 +                                */
41765 +                               warning("nikita-3415",
41766 +                                       "Wrong black box length: %i > %i",
41767 +                                       ilen, length);
41768 +                               result = RETERR(-EIO);
41769 +                       }
41770 +                       zrelse(coord.node);
41771 +               }
41772 +       }
41773 +       /* NOTE: an oversized item with @exact == 0 leaves @data untouched and result 0 */
41774 +       done_lh(&lh);
41775 +       return result;
41776 +
41777 +}
41778 +/* Overwrite the first @length bytes of the item found by exact lookup of @key with @data. */
41779 +int
41780 +update_black_box(reiser4_tree * tree,
41781 +                const reiser4_key * key, void *data, int length)
41782 +{
41783 +       int result;
41784 +       coord_t coord;
41785 +       lock_handle lh;
41786 +       /* NOTE(review): ZNODE_READ_LOCK is requested although the item body is written below---confirm */
41787 +       init_lh(&lh);
41788 +       result = coord_by_key(tree, key,
41789 +                             &coord, &lh, ZNODE_READ_LOCK,
41790 +                             FIND_EXACT,
41791 +                             LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
41792 +       if (result == 0) {
41793 +               int ilen;
41794 +               /* item must be at least @length bytes long, else warn and fail with RETERR(-EIO) */
41795 +               result = zload(coord.node);
41796 +               if (result == 0) {
41797 +                       ilen = item_length_by_coord(&coord);
41798 +                       if (length <= ilen) {
41799 +                               memcpy(item_body_by_coord(&coord), data,
41800 +                                      length);
41801 +                       } else {
41802 +                               warning("nikita-3437",
41803 +                                       "Wrong black box length: %i < %i",
41804 +                                       ilen, length);
41805 +                               result = RETERR(-EIO);
41806 +                       }
41807 +                       zrelse(coord.node);
41808 +               }
41809 +       }
41810 +
41811 +       done_lh(&lh);
41812 +       return result;
41813 +
41814 +}
41815 +/* Remove the black box at @key: passes @key as both ends of the range to reiser4_cut_tree(). */
41816 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
41817 +{
41818 +       return reiser4_cut_tree(tree, key, key, NULL, 1);
41819 +}
41820 +
41821 +/* Make Linus happy.
41822 +   Local variables:
41823 +   c-indentation-style: "K&R"
41824 +   mode-name: "LC"
41825 +   c-basic-offset: 8
41826 +   tab-width: 8
41827 +   fill-column: 120
41828 +   End:
41829 +*/
41830 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/blackbox.h linux-2.6.27/fs/reiser4/plugin/item/blackbox.h
41831 --- linux-2.6.27.orig/fs/reiser4/plugin/item/blackbox.h 1970-01-01 03:00:00.000000000 +0300
41832 +++ linux-2.6.27/fs/reiser4/plugin/item/blackbox.h      2008-10-12 18:20:01.000000000 +0400
41833 @@ -0,0 +1,33 @@
41834 +/* Copyright 2003 by Hans Reiser, licensing governed by
41835 + * reiser4/README */
41836 +
41837 +/* "Black box" entry to contain fixed-width user supplied data */
41838 +
41839 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
41840 +#define __FS_REISER4_BLACK_BOX_H__
41841 +
41842 +#include "../../forward.h"
41843 +#include "../../dformat.h"
41844 +#include "../../kassign.h"
41845 +#include "../../key.h"
41846 +
41847 +extern int store_black_box(reiser4_tree * tree,
41848 +                          const reiser4_key * key, void *data, int length);
41849 +extern int load_black_box(reiser4_tree * tree,
41850 +                         reiser4_key * key, void *data, int length, int exact);
41851 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
41852 +extern int update_black_box(reiser4_tree * tree,
41853 +                           const reiser4_key * key, void *data, int length);
41854 +
41855 +/* __FS_REISER4_BLACK_BOX_H__ */
41856 +#endif
41857 +
41858 +/* Make Linus happy.
41859 +   Local variables:
41860 +   c-indentation-style: "K&R"
41861 +   mode-name: "LC"
41862 +   c-basic-offset: 8
41863 +   tab-width: 8
41864 +   fill-column: 120
41865 +   End:
41866 +*/
41867 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/cde.c linux-2.6.27/fs/reiser4/plugin/item/cde.c
41868 --- linux-2.6.27.orig/fs/reiser4/plugin/item/cde.c      1970-01-01 03:00:00.000000000 +0300
41869 +++ linux-2.6.27/fs/reiser4/plugin/item/cde.c   2008-10-12 18:20:01.000000000 +0400
41870 @@ -0,0 +1,1008 @@
41871 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
41872 +
41873 +/* Directory entry implementation */
41874 +
41875 +/* DESCRIPTION:
41876 +
41877 +   This is "compound" directory item plugin implementation. This directory
41878 +   item type is compound (as opposed to the "simple directory item" in
41879 +   fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
41880 +   entries.
41881 +
41882 +   The reason behind this decision is disk space efficiency: all directory
41883 +   entries inside the same directory have identical fragment in their
41884 +   keys. This, of course, depends on key assignment policy. In our default key
41885 +   assignment policy, all directory entries have the same locality which is
41886 +   equal to the object id of their directory.
41887 +
41888 +   Composing directory item out of several directory entries for the same
41889 +   directory allows us to store said key fragment only once. That is, this is
41890 +   some ad hoc form of key compression (stem compression) that is implemented
41891 +   here, because general key compression is not supposed to be implemented in
41892 +   v4.0.
41893 +
41894 +   Another decision that was made regarding all directory item plugins, is
41895 +   that they will store entry keys unaligned. This is for the sake of disk
41896 +   space efficiency again.
41897 +
41898 +   It should be noted that storing keys unaligned increases CPU consumption,
41899 +   at least on some architectures.
41900 +
41901 +   Internal on-disk structure of the compound directory item is the following:
41902 +
41903 +        HEADER          cde_item_format.        Here number of entries is stored.
41904 +        ENTRY_HEADER_0  cde_unit_header.        Here part of entry key and
41905 +        ENTRY_HEADER_1                          offset of entry body are stored.
41906 +        ENTRY_HEADER_2                         (basically two last parts of key)
41907 +        ...
41908 +        ENTRY_HEADER_N
41909 +        ENTRY_BODY_0    directory_entry_format. Here part of stat data key and
41910 +        ENTRY_BODY_1                            NUL-terminated name are stored.
41911 +        ENTRY_BODY_2                           (part of stat-data key in the
41912 +                                                sense that since all SDs have
41913 +                                                zero offset, this offset is not
41914 +                                                stored on disk).
41915 +        ...
41916 +        ENTRY_BODY_N
41917 +
41918 +   When it comes to the balancing, each directory entry in compound directory
41919 +   item is unit, that is, something that can be cut from one item and pasted
41920 +   into another item of the same type. Handling of unit cut and paste is major
41921 +   reason for the complexity of code below.
41922 +
41923 +*/
41924 +
41925 +#include "../../forward.h"
41926 +#include "../../debug.h"
41927 +#include "../../dformat.h"
41928 +#include "../../kassign.h"
41929 +#include "../../key.h"
41930 +#include "../../coord.h"
41931 +#include "sde.h"
41932 +#include "cde.h"
41933 +#include "item.h"
41934 +#include "../node/node.h"
41935 +#include "../plugin.h"
41936 +#include "../../znode.h"
41937 +#include "../../carry.h"
41938 +#include "../../tree.h"
41939 +#include "../../inode.h"
41940 +
41941 +#include <linux/fs.h>          /* for struct inode */
41942 +#include <linux/dcache.h>      /* for struct dentry */
41943 +#include <linux/quotaops.h>
41944 +
41945 +#if 0
41946 +#define CHECKME(coord)                                         \
41947 +({                                                             \
41948 +       const char *message;                                    \
41949 +       coord_t dup;                                            \
41950 +                                                               \
41951 +       coord_dup_nocheck(&dup, (coord));                       \
41952 +       dup.unit_pos = 0;                                       \
41953 +       assert("nikita-2871", cde_check(&dup, &message) == 0);  \
41954 +})
41955 +#else
41956 +#define CHECKME(coord) noop
41957 +#endif
41958 +
41959 +/* return body of compound directory item at @coord, typed as its cde_item_format header */
41960 +static inline cde_item_format *formatted_at(const coord_t * coord)
41961 +{
41962 +       assert("nikita-1282", coord != NULL);
41963 +       return item_body_by_coord(coord);
41964 +}
41965 +
41966 +/* return header of the @idx-th entry of the item at @coord; @idx is not range-checked here */
41967 +static inline cde_unit_header *header_at(const coord_t *
41968 +                                        coord /* coord of item */ ,
41969 +                                        int idx /* index of unit */ )
41970 +{
41971 +       assert("nikita-1283", coord != NULL);
41972 +       return &formatted_at(coord)->entry[idx];
41973 +}
41974 +
41975 +/* return number of units in the item at @coord: ->num_of_entries, read unaligned as little-endian 16 bit */
41976 +static int units(const coord_t * coord /* coord of item */ )
41977 +{
41978 +       return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
41979 +}
41980 +
41981 +/* return offset of the body of @idx-th entry in @coord; @idx == units() yields the item length */
41982 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
41983 +                             int idx /* index of unit */ )
41984 +{
41985 +       if (idx < units(coord))
41986 +               return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
41987 +       else if (idx == units(coord))
41988 +               return item_length_by_coord(coord);
41989 +       else
41990 +               impossible("nikita-1308", "Wrong idx");
41991 +       return 0;       /* reached only for idx > units(), if impossible() returns */
41992 +}
41993 +
41994 +/* set offset of the body of @idx-th entry in @coord; @offset is truncated to 16 bits when stored */
41995 +static void set_offset(const coord_t * coord /* coord of item */ ,
41996 +                      int idx /* index of unit */ ,
41997 +                      unsigned int offset /* new offset */ )
41998 +{
41999 +       put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
42000 +}
42001 +
42002 +static void adj_offset(const coord_t * coord /* coord of item */ ,
42003 +                      int idx /* index of unit */ ,
42004 +                      int delta /* offset change */ )
42005 +{
42006 +       /* read-modify-write of the little-endian 16-bit body offset kept in
42007 +          the header of unit @idx; the sum is truncated to 16 bits */
42008 +       d16 *slot = &header_at(coord, idx)->offset;
42009 +       __u16 moved = le16_to_cpu(get_unaligned(slot));
42010 +
42011 +       moved += delta;
42012 +       put_unaligned(cpu_to_le16(moved), slot);
42013 +}
42014 +
42015 +/* return pointer to @offset-th byte from the beginning of the item body at @coord; @offset is not checked against item length */
42016 +static char *address(const coord_t * coord /* coord of item */ ,
42017 +                    int offset)
42018 +{
42019 +       return ((char *)item_body_by_coord(coord)) + offset;
42020 +}
42021 +
42022 +/* return pointer to the body of @idx-th entry in @coord, located through offset_of() */
42023 +static directory_entry_format *entry_at(const coord_t * coord  /* coord of
42024 +                                                                * item */ ,
42025 +                                       int idx /* index of unit; units(coord) yields end of item */ )
42026 +{
42027 +       return (directory_entry_format *) address(coord,
42028 +                                                 (int)offset_of(coord, idx));
42029 +}
42030 +
42031 +/* return number of unit referenced by @coord, i.e. coord->unit_pos */
42032 +static int idx_of(const coord_t * coord /* coord of item */ )
42033 +{
42034 +       assert("nikita-1285", coord != NULL);
42035 +       return coord->unit_pos;
42036 +}
42037 +
42038 +/* find position where entry with @entry_key would be inserted into @coord: binary search narrows the range, then a sequential scan returns index of the first unit that does not compare LESS_THAN @entry_key (the first one of a run of equal units); -ENOENT if every unit compares less. Result of the last comparison made is stored in *@last. Assumes units are sorted by hash. */
42039 +static int find(const coord_t * coord /* coord of item */ ,
42040 +               const reiser4_key * entry_key /* key to look for */ ,
42041 +               cmp_t * last /* result of last comparison */ )
42042 +{
42043 +       int entries;
42044 +
42045 +       int left;
42046 +       int right;
42047 +
42048 +       cde_unit_header *header;
42049 +
42050 +       assert("nikita-1295", coord != NULL);
42051 +       assert("nikita-1296", entry_key != NULL);
42052 +       assert("nikita-1297", last != NULL);
42053 +
42054 +       entries = units(coord);
42055 +       left = 0;
42056 +       right = entries - 1;
42057 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
42058 +               int median;
42059 +
42060 +               median = (left + right) >> 1;
42061 +
42062 +               header = header_at(coord, median);
42063 +               *last = de_id_key_cmp(&header->hash, entry_key);
42064 +               switch (*last) {
42065 +               case LESS_THAN:
42066 +                       left = median;
42067 +                       break;
42068 +               case GREATER_THAN:
42069 +                       right = median;
42070 +                       break;
42071 +               case EQUAL_TO:{
42072 +                               do {
42073 +                                       median--;
42074 +                                       header--;
42075 +                               } while (median >= 0 &&
42076 +                                        de_id_key_cmp(&header->hash,
42077 +                                                      entry_key) == EQUAL_TO);
42078 +                               return median + 1;
42079 +                       }
42080 +               }
42081 +       }
42082 +       header = header_at(coord, left);
42083 +       for (; left < entries; ++left, ++header) {
42084 +               prefetch(header + 1);
42085 +               *last = de_id_key_cmp(&header->hash, entry_key);
42086 +               if (*last != LESS_THAN)
42087 +                       break;
42088 +       }
42089 +       if (left < entries)
42090 +               return left;
42091 +       else
42092 +               return RETERR(-ENOENT);
42093 +
42094 +}
42095 +
42096 +/* open room in item @coord for @no new entries starting from @pos: @size
42097 +   bytes of the item are in use, new bodies take @data_size bytes in total. */
42098 +static int expand_item(const coord_t * coord /* coord of item */ ,
42099 +                      int pos /* unit position */ , int no     /* number of new
42100 +                                                                * units*/ ,
42101 +                      int size /* bytes of the item already in use */ ,
42102 +                      unsigned int data_size   /* total size of bodies of
42103 +                                                * the new units */ )
42104 +{
42105 +       int entries;
42106 +       cde_unit_header *header;
42107 +       char *dent;
42108 +       int i;
42109 +
42110 +       assert("nikita-1310", coord != NULL);
42111 +       assert("nikita-1311", pos >= 0);
42112 +       assert("nikita-1312", no > 0);
42113 +       assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
42114 +       assert("nikita-1343",
42115 +              item_length_by_coord(coord) >=
42116 +              (int)(size + data_size + no * sizeof *header));
42117 +
42118 +       entries = units(coord);
42119 +
42120 +       if (pos == entries)
42121 +               dent = address(coord, size);
42122 +       else
42123 +               dent = (char *)entry_at(coord, pos);
42124 +       /* place where new header will be in */
42125 +       header = header_at(coord, pos);
42126 +       /* free space for new entry headers: everything from header @pos up
42127 +          to the end of used part of the item moves by @no headers */
42128 +       memmove(header + no, header,
42129 +               (unsigned)(address(coord, size) - (char *)header));
42130 +       /* if adding to the end initialise first new header */
42131 +       if (pos == entries) {
42132 +               set_offset(coord, pos, (unsigned)size);
42133 +       }
42134 +
42135 +       /* adjust entry pointer and size */
42136 +       dent = dent + no * sizeof *header;
42137 +       size += no * sizeof *header;
42138 +       /* free space for new entries */
42139 +       memmove(dent + data_size, dent,
42140 +               (unsigned)(address(coord, size) - dent));
42141 +
42142 +       /* increase counter */
42143 +       entries += no;
42144 +       put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
42145 +
42146 +       /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
42147 +          bytes.  */
42148 +       for (i = 0; i <= pos; ++i)
42149 +               adj_offset(coord, i, no * sizeof *header);
42150 +       /* [ pos + no ... +\infty ) entries were shifted by ( no *
42151 +          sizeof *header + data_size ) bytes */
42152 +       for (i = pos + no; i < entries; ++i)
42153 +               adj_offset(coord, i, no * sizeof *header + data_size);
42154 +       return 0;
42155 +}
42155 +
42156 +/* make room in item for new @entry: find its position (returned via @pos; all units smaller means append) and open a slot for one header and body there. Always returns 0. */
42157 +static int expand(const coord_t * coord /* coord of item */ ,
42158 +                 struct cde_entry * entry /* entry to insert */ ,
42159 +                 int len /* bytes at the end of item not yet in use */ ,
42160 +                 int *pos /* out: position of new unit */ ,
42161 +                 reiser4_dir_entry_desc * dir_entry    /* parameters for new
42162 +                                                        * entry */ )
42163 +{
42164 +       cmp_t cmp_res;
42165 +       int datasize;
42166 +
42167 +       *pos = find(coord, &dir_entry->key, &cmp_res);
42168 +       if (*pos < 0)
42169 +               *pos = units(coord);
42170 +
42171 +       datasize = sizeof(directory_entry_format);
42172 +       if (is_longname(entry->name->name, entry->name->len))
42173 +               datasize += entry->name->len + 1;
42174 +
42175 +       expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
42176 +                   datasize);
42177 +       return 0;
42178 +}
42179 +
42180 +/* paste body of @entry into the slot of unit @pos: fill in hash and object id, and the name if it is a long one. Always returns 0. */
42181 +static int paste_entry(const coord_t * coord /* coord of item */ ,
42182 +                      struct cde_entry * entry /* new entry */ ,
42183 +                      int pos /* position to insert */ ,
42184 +                      reiser4_dir_entry_desc * dir_entry       /* parameters for
42185 +                                                                * new entry */ )
42186 +{
42187 +       cde_unit_header *header;
42188 +       directory_entry_format *dent;
42189 +       const char *name;
42190 +       int len;
42191 +
42192 +       header = header_at(coord, pos);
42193 +       dent = entry_at(coord, pos);
42194 +
42195 +       build_de_id_by_key(&dir_entry->key, &header->hash);
42196 +       build_inode_key_id(entry->obj, &dent->id);
42197 +       /* Only long names are stored in the entry body. Exactly
42198 +          entry->name->len bytes are copied and the terminating NUL is
42199 +          written separately: expand() reserves len + 1 bytes for a long
42200 +          name, so, unlike strcpy(), this bounded copy cannot run past the
42201 +          reserved space should the source string be longer than @len.
42202 +          It is still assumed that @pos is the slot that was opened for
42203 +          this very entry. */
42204 +       name = entry->name->name;
42205 +       len = entry->name->len;
42206 +       if (is_longname(name, len)) {
42207 +               memcpy(dent->name, name, len);
42208 +               put_unaligned(0, &dent->name[len]);
42209 +       }
42210 +       return 0;
42211 +}
42212 +
42213 +/* estimate how much space is necessary in item to insert (@coord == NULL) or
42214 +   paste set of entries described in @data; also stored into @data->length. */
42215 +int estimate_cde(const coord_t * coord /* coord of item */ ,
42216 +                const reiser4_item_data * data /* parameters for new item */ )
42217 +{
42218 +       struct cde_entry_data *e;
42219 +       int result;
42220 +       int i;
42221 +
42222 +       e = (struct cde_entry_data *) data->data;
42223 +
42224 +       assert("nikita-1288", e != NULL);
42225 +       assert("nikita-1289", e->num_of_entries >= 0);
42226 +
42227 +       if (coord == NULL)
42228 +               /* insert: new item also needs its cde_item_format header */
42229 +               result = sizeof(cde_item_format);
42230 +       else
42231 +               /* paste */
42232 +               result = 0;
42233 +
42234 +       result += e->num_of_entries *
42235 +           (sizeof(cde_unit_header) + sizeof(directory_entry_format));
42236 +       for (i = 0; i < e->num_of_entries; ++i) {
42237 +               const char *name;
42238 +               int len;
42239 +
42240 +               name = e->entry[i].name->name;
42241 +               len = e->entry[i].name->len;
42242 +               assert("nikita-2054", strlen(name) == len);
42243 +               if (is_longname(name, len))
42244 +                       result += len + 1;
42245 +       }
42246 +       ((reiser4_item_data *) data)->length = result;
42247 +       return result;
42248 +}
42249 +
42250 +/* ->nr_units() method for this item plugin: number of entries in the item. */
42251 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
42252 +{
42253 +       return units(coord);
42254 +}
42255 +
42256 +/* ->unit_key() method for this item plugin: build in @key the key of the unit at coord->unit_pos from directory id of the item key and the hash kept in the unit header; returns @key. */
42257 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
42258 +                         reiser4_key * key /* resulting key */ )
42259 +{
42260 +       assert("nikita-1452", coord != NULL);
42261 +       assert("nikita-1345", idx_of(coord) < units(coord));
42262 +       assert("nikita-1346", key != NULL);
42263 +
42264 +       item_key_by_coord(coord, key);
42265 +       extract_key_from_de_id(extract_dir_id_from_key(key),
42266 +                              &header_at(coord, idx_of(coord))->hash, key);
42267 +       return key;
42268 +}
42269 +
42270 +/* mergeable_cde(): implementation of ->mergeable() item method.
42271 +
42272 +   Two directory items are mergeable iff they have the same item plugin
42273 +   and their keys carry the same directory id.
42274 +
42275 +*/
42276 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
42277 +                 const coord_t * p2 /* coord of second item */ )
42278 +{
42279 +       reiser4_key k1;
42280 +       reiser4_key k2;
42281 +
42282 +       assert("nikita-1339", p1 != NULL);
42283 +       assert("nikita-1340", p2 != NULL);
42284 +
42285 +       return
42286 +           (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
42287 +           (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
42288 +            extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
42289 +
42290 +}
42291 +
42292 +/* ->max_key_inside() method for this item plugin: key of the item with ordering, fulloid and offset replaced by those of reiser4_max_key(). */
42293 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
42294 +                               reiser4_key * result /* resulting key */ )
42295 +{
42296 +       assert("nikita-1342", coord != NULL);
42297 +
42298 +       item_key_by_coord(coord, result);
42299 +       set_key_ordering(result, get_key_ordering(reiser4_max_key()));
42300 +       set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
42301 +       set_key_offset(result, get_key_offset(reiser4_max_key()));
42302 +       return result;
42303 +}
42304 +
42305 +/* check whether unit with @key described by @data may be put into item at @coord: true iff the item has plugin @data->iplug and @key carries the same directory id as the item key */
42306 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
42307 +                       const reiser4_key * key /* key to check */ ,
42308 +                       const reiser4_item_data * data  /* parameters of new
42309 +                                                        * item/unit being
42310 +                                                        * created */ )
42311 +{
42312 +       reiser4_key item_key;
42313 +
42314 +       /* FIXME-VS: do not rely on anything but iplug field of @data. Only
42315 +          data->iplug is initialized */
42316 +       assert("vs-457", data && data->iplug);
42317 +/*     assert( "vs-553", data -> user == 0 );*/
42318 +       item_key_by_coord(coord, &item_key);
42319 +
42320 +       return (item_plugin_by_coord(coord) == data->iplug) &&
42321 +           (extract_dir_id_from_key(&item_key) ==
42322 +            extract_dir_id_from_key(key));
42323 +}
42324 +
42325 +#if REISER4_DEBUG
42326 +/* reiser4_check_cde(): ->check() method for compound directory items
42327 +
42328 +   used for debugging: verifies that unit headers and entry bodies lie
42329 +   within the item; on failure returns -1 and stores a message describing
42330 +   the first problem found in *@error, otherwise returns 0
42331 +*/
42332 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
42333 +                     const char **error /* where to store error message */)
42334 +{
42335 +       int i;
42336 +       int result;
42337 +       char *item_start;
42338 +       char *item_end;
42339 +       reiser4_key key;
42340 +
42341 +       coord_t c;
42342 +
42343 +       assert("nikita-1357", coord != NULL);
42344 +       assert("nikita-1358", error != NULL);
42345 +
42346 +       if (!ergo(coord->item_pos != 0,
42347 +                 is_dot_key(item_key_by_coord(coord, &key)))) {
42348 +               *error = "CDE doesn't start with dot";
42349 +               return -1;
42350 +       }
42351 +       item_start = item_body_by_coord(coord);
42352 +       item_end = item_start + item_length_by_coord(coord);
42353 +
42354 +       coord_dup(&c, coord);
42355 +       result = 0;
42356 +       for (i = 0; i < units(coord); ++i) {
42357 +               directory_entry_format *entry;
42358 +
42359 +               if ((char *)(header_at(coord, i) + 1) >
42360 +                   item_end - units(coord) * sizeof *entry) {
42361 +                       *error = "CDE header is out of bounds";
42362 +                       result = -1;
42363 +                       break;
42364 +               }
42365 +               entry = entry_at(coord, i);
42366 +               if ((char *)entry < item_start + sizeof(cde_item_format)) {
42367 +                       *error = "CDE entry is too low";
42368 +                       result = -1;
42369 +                       break;
42370 +               }
42371 +               if ((char *)(entry + 1) > item_end) {
42372 +                       *error = "CDE entry is too high";
42373 +                       result = -1;
42374 +                       break;
42375 +               }
42376 +       }
42377 +
42378 +       return result;
42379 +}
42380 +#endif
42381 +
42382 +/* ->init() method for this item plugin: start the item with zero entries. */
42383 +int init_cde(coord_t * coord /* coord of item */ ,
42384 +            coord_t * from UNUSED_ARG, reiser4_item_data * data        /* structure used for insertion */
42385 +            UNUSED_ARG)
42386 +{
42387 +       put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
42388 +       return 0;
42389 +}
42390 +
42391 +/* ->lookup() method for this item plugin: position @coord at the unit with @key (AT_UNIT, found), before the first greater unit (-ENOENT), or after the last unit when all units are smaller, in which case @bias decides between found and not found. A @key smaller than the item key positions @coord before unit 0. */
42392 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
42393 +                        lookup_bias bias /* search bias */ ,
42394 +                        coord_t * coord /* coord of item to lookup in */ )
42395 +{
42396 +       cmp_t last_comp;
42397 +       int pos;
42398 +
42399 +       reiser4_key utmost_key;
42400 +
42401 +       assert("nikita-1293", coord != NULL);
42402 +       assert("nikita-1294", key != NULL);
42403 +
42404 +       CHECKME(coord);
42405 +
42406 +       if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
42407 +               coord->unit_pos = 0;
42408 +               coord->between = BEFORE_UNIT;
42409 +               return CBK_COORD_NOTFOUND;
42410 +       }
42411 +       pos = find(coord, key, &last_comp);
42412 +       if (pos >= 0) {
42413 +               coord->unit_pos = (int)pos;
42414 +               switch (last_comp) {
42415 +               case EQUAL_TO:
42416 +                       coord->between = AT_UNIT;
42417 +                       return CBK_COORD_FOUND;
42418 +               case GREATER_THAN:
42419 +                       coord->between = BEFORE_UNIT;
42420 +                       return RETERR(-ENOENT);
42421 +               case LESS_THAN:
42422 +               default:
42423 +                       impossible("nikita-1298", "Broken find");
42424 +                       return RETERR(-EIO);
42425 +               }
42426 +       } else {
42427 +               coord->unit_pos = units(coord) - 1;
42428 +               coord->between = AFTER_UNIT;
42429 +               return (bias ==
42430 +                       FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
42431 +                   CBK_COORD_NOTFOUND;
42432 +       }
42433 +}
42434 +
42435 +/* ->paste() method for this item plugin: for every entry described by @data open a slot with expand() and fill it with paste_entry(); stops at the first non-zero result and returns it. */
42436 +int paste_cde(coord_t * coord /* coord of item */ ,
42437 +             reiser4_item_data * data  /* parameters of new unit being
42438 +                                        * inserted */ ,
42439 +             carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
42440 +{
42441 +       struct cde_entry_data *e;
42442 +       int result;
42443 +       int i;
42444 +
42445 +       CHECKME(coord);
42446 +       e = (struct cde_entry_data *) data->data;
42447 +
42448 +       result = 0;
42449 +       for (i = 0; i < e->num_of_entries; ++i) {
42450 +               int pos;
42451 +               int phantom_size;
42452 +
42453 +               phantom_size = data->length;
42454 +               if (units(coord) == 0)
42455 +                       phantom_size -= sizeof(cde_item_format);
42456 +
42457 +               result =
42458 +                   expand(coord, e->entry + i, phantom_size, &pos, data->arg);
42459 +               if (result != 0)
42460 +                       break;
42461 +               result = paste_entry(coord, e->entry + i, pos, data->arg);
42462 +               if (result != 0)
42463 +                       break;
42464 +       }
42465 +       CHECKME(coord);
42466 +       return result;
42467 +}
42468 +
42469 +/* amount of space occupied by item header and by units [0 .. @idx], both
42470 +   their headers and bodies. */
42471 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
42472 +                             int idx /* index of unit */ )
42473 +{
42474 +       assert("nikita-1299", coord != NULL);
42475 +       assert("nikita-1300", idx < (int)units(coord));
42476 +
42477 +       return sizeof(cde_item_format) +
42478 +           (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
42479 +                                                           idx + 1) -
42480 +           offset_of(coord, 0);
42481 +}
42482 +
42483 +/* how many but not more than @want units of source item at @coord can be
42484 +   merged with item in @target node. If @pend == SHIFT_LEFT we try to append
42485 +   first units of source to the last item of @target, otherwise
42486 +   (SHIFT_RIGHT) to "prepend" its last units to the first item of @target.
42487 +   @target node has @free_space bytes of free space. Total size of those
42488 +   units is returned via @size. @want is clamped to the number of units. */
42489 +int can_shift_cde(unsigned free_space /* free space in target node */ ,
42490 +                 coord_t * coord /* coord of source item */ ,
42491 +                 znode * target /* target node */ ,
42492 +                 shift_direction pend /* shift direction */ ,
42493 +                 unsigned *size /* resulting number of shifted bytes */ ,
42494 +                 unsigned want /* maximal number of units to shift */ )
42495 +{
42496 +       int shift;
42497 +
42498 +       CHECKME(coord);
42499 +       if (want == 0) {
42500 +               *size = 0;
42501 +               return 0;
42502 +       }
42503 +
42504 +       /* pend == SHIFT_LEFT <==> shifting to the left */
42505 +       if (pend == SHIFT_LEFT) {
42506 +               for (shift = min((int)want, units(coord)) - 1; shift >= 0;
42507 +                    --shift) {
42508 +                       *size = part_size(coord, shift);
42509 +                       if (target != NULL)
42510 +                               *size -= sizeof(cde_item_format);
42511 +                       if (*size <= free_space)
42512 +                               break;
42513 +               }
42514 +               shift = shift + 1;
42515 +       } else {
42516 +               int total_size;
42517 +
42518 +               assert("nikita-1301", pend == SHIFT_RIGHT);
42519 +
42520 +               total_size = item_length_by_coord(coord);
42521 +               for (shift = max(units(coord) - (int)want, 0) - 1;
42522 +                    shift < units(coord) - 1; ++shift) {
42523 +                       *size = total_size - part_size(coord, shift);
42524 +                       if (target == NULL)
42525 +                               *size += sizeof(cde_item_format);
42526 +                       if (*size <= free_space)
42527 +                               break;
42528 +               }
42529 +               shift = units(coord) - shift - 1;
42530 +       }
42531 +       if (shift == 0)
42532 +               *size = 0;
42533 +       CHECKME(coord);
42534 +       return shift;
42535 +}
42536 +
42537 +/* ->copy_units() method for this item plugin: copy @count units of @source starting from @from into @target, after its last unit (SHIFT_LEFT) or before its first unit (SHIFT_RIGHT). */
42538 +void copy_units_cde(coord_t * target /* coord of target item */ ,
42539 +                   coord_t * source /* coord of source item */ ,
42540 +                   unsigned from /* starting unit */ ,
42541 +                   unsigned count /* how many units to copy */ ,
42542 +                   shift_direction where_is_free_space /* shift direction */ ,
42543 +                   unsigned free_space /* free space in item */ )
42544 +{
42545 +       char *header_from;
42546 +       char *header_to;
42547 +
42548 +       char *entry_from;
42549 +       char *entry_to;
42550 +
42551 +       int pos_in_target;
42552 +       int data_size;
42553 +       int data_delta;
42554 +       int i;
42555 +
42556 +       assert("nikita-1303", target != NULL);
42557 +       assert("nikita-1304", source != NULL);
42558 +       assert("nikita-1305", (int)from < units(source));
42559 +       assert("nikita-1307", (int)(from + count) <= units(source));
42560 +
42561 +       if (where_is_free_space == SHIFT_LEFT) {
42562 +               assert("nikita-1453", from == 0);
42563 +               pos_in_target = units(target);
42564 +       } else {
42565 +               assert("nikita-1309", (int)(from + count) == units(source));
42566 +               pos_in_target = 0;
42567 +               memmove(item_body_by_coord(target),
42568 +                       (char *)item_body_by_coord(target) + free_space,
42569 +                       item_length_by_coord(target) - free_space);
42570 +       }
42571 +
42572 +       CHECKME(target);
42573 +       CHECKME(source);
42574 +
42575 +       /* expand @target: total size of bodies of units being copied */
42576 +       data_size =
42577 +           offset_of(source, (int)(from + count)) - offset_of(source,
42578 +                                                              (int)from);
42579 +
42580 +       if (units(target) == 0)
42581 +               free_space -= sizeof(cde_item_format);
42582 +
42583 +       expand_item(target, pos_in_target, (int)count,
42584 +                   (int)(item_length_by_coord(target) - free_space),
42585 +                   (unsigned)data_size);
42586 +
42587 +       /* difference between body offsets in @target and in @source */
42588 +       data_delta =
42589 +           offset_of(target, pos_in_target) - offset_of(source, (int)from);
42590 +
42591 +       /* copy entries */
42592 +       entry_from = (char *)entry_at(source, (int)from);
42593 +       entry_to = (char *)entry_at(source, (int)(from + count));
42594 +       memmove(entry_at(target, pos_in_target), entry_from,
42595 +               (unsigned)(entry_to - entry_from));
42596 +
42597 +       /* copy headers */
42598 +       header_from = (char *)header_at(source, (int)from);
42599 +       header_to = (char *)header_at(source, (int)(from + count));
42600 +       memmove(header_at(target, pos_in_target), header_from,
42601 +               (unsigned)(header_to - header_from));
42602 +
42603 +       /* copied headers carry @source offsets: rebase them to @target */
42604 +       for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
42605 +               adj_offset(target, i, data_delta);
42606 +       CHECKME(target);
42607 +       CHECKME(source);
42608 +}
42609 +
42610 +/* ->cut_units() method for this item plugin: remove units [@from, @to] and return number of bytes freed (length of the whole item, untouched, when all units are cut). Keys are taken from coord->unit_pos: @smallest_removed gets key of that unit, @new_first key of the next one. */
42611 +int cut_units_cde(coord_t * coord /* coord of item */ ,
42612 +                 pos_in_node_t from /* start unit pos */ ,
42613 +                 pos_in_node_t to /* stop unit pos */ ,
42614 +                 struct carry_cut_data *cdata UNUSED_ARG,
42615 +                 reiser4_key * smallest_removed, reiser4_key * new_first)
42616 +{
42617 +       char *header_from;
42618 +       char *header_to;
42619 +
42620 +       char *entry_from;
42621 +       char *entry_to;
42622 +
42623 +       int size;
42624 +       int entry_delta;
42625 +       int header_delta;
42626 +       int i;
42627 +
42628 +       unsigned count;
42629 +
42630 +       CHECKME(coord);
42631 +
42632 +       count = to - from + 1;
42633 +
42634 +       assert("nikita-1454", coord != NULL);
42635 +       assert("nikita-1455", (int)(from + count) <= units(coord));
42636 +
42637 +       if (smallest_removed)
42638 +               unit_key_by_coord(coord, smallest_removed);
42639 +
42640 +       if (new_first) {
42641 +               coord_t next;
42642 +
42643 +               /* not everything is cut from item head */
42644 +               assert("vs-1527", from == 0);
42645 +               assert("vs-1528", to < units(coord) - 1);
42646 +
42647 +               coord_dup(&next, coord);
42648 +               next.unit_pos++;
42649 +               unit_key_by_coord(&next, new_first);
42650 +       }
42651 +
42652 +       size = item_length_by_coord(coord);
42653 +       if (count == (unsigned)units(coord)) {
42654 +               return size;
42655 +       }
42656 +
42657 +       header_from = (char *)header_at(coord, (int)from);
42658 +       header_to = (char *)header_at(coord, (int)(from + count));
42659 +
42660 +       entry_from = (char *)entry_at(coord, (int)from);
42661 +       entry_to = (char *)entry_at(coord, (int)(from + count));
42662 +
42663 +       /* move headers */
42664 +       memmove(header_from, header_to,
42665 +               (unsigned)(address(coord, size) - header_to));
42666 +
42667 +       header_delta = header_to - header_from;
42668 +
42669 +       entry_from -= header_delta;
42670 +       entry_to -= header_delta;
42671 +       size -= header_delta;
42672 +
42673 +       /* move entries */
42674 +       memmove(entry_from, entry_to,
42675 +               (unsigned)(address(coord, size) - entry_to));
42676 +
42677 +       entry_delta = entry_to - entry_from;
42678 +       size -= entry_delta;
42679 +
42680 +       /* update offsets */
42681 +
42682 +       for (i = 0; i < (int)from; ++i)
42683 +               adj_offset(coord, i, -header_delta);
42684 +
42685 +       for (i = from; i < units(coord) - (int)count; ++i)
42686 +               adj_offset(coord, i, -header_delta - entry_delta);
42687 +
42688 +       put_unaligned(cpu_to_le16((__u16) units(coord) - count),
42689 +                     &formatted_at(coord)->num_of_entries);
42690 +
42691 +       if (from == 0) {
42692 +               /* entries from head were removed - move remaining to right */
42693 +               memmove((char *)item_body_by_coord(coord) +
42694 +                       header_delta + entry_delta, item_body_by_coord(coord),
42695 +                       (unsigned)size);
42696 +               if (REISER4_DEBUG)
42697 +                       memset(item_body_by_coord(coord), 0,
42698 +                              (unsigned)header_delta + entry_delta);
42699 +       } else {
42700 +               /* freed space is already at the end of item */
42701 +               if (REISER4_DEBUG)
42702 +                       memset((char *)item_body_by_coord(coord) + size, 0,
42703 +                              (unsigned)header_delta + entry_delta);
42704 +       }
42705 +
42706 +       return header_delta + entry_delta;
42707 +}
42708 +
42709 +int kill_units_cde(coord_t * coord /* coord of item; units are removed exactly as by cut_units_cde() */ ,
42710 +                  pos_in_node_t from /* start unit pos */ ,
42711 +                  pos_in_node_t to /* stop unit pos (inclusive) */ ,
42712 +                  struct carry_kill_data *kdata UNUSED_ARG,
42713 +                  reiser4_key * smallest_removed, reiser4_key * new_first)
42714 +{
42715 +       return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
42716 +}
42717 +
42718 +/* ->s.dir.extract_key() method for this item plugin: build @key from the object id stored in the entry at coord->unit_pos. */
42719 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
42720 +                   reiser4_key * key /* resulting key */ )
42721 +{
42722 +       directory_entry_format *dent;
42723 +
42724 +       assert("nikita-1155", coord != NULL);
42725 +       assert("nikita-1156", key != NULL);
42726 +
42727 +       dent = entry_at(coord, idx_of(coord));
42728 +       return extract_key_from_id(&dent->id, key);
42729 +}
42730 +
42731 +int
42732 +update_key_cde(const coord_t * coord, const reiser4_key * key,
42733 +              lock_handle * lh UNUSED_ARG)
42734 +{
42735 +       directory_entry_format *dent;
42736 +       obj_key_id obj_id;
42737 +       int result;
42738 +
42739 +       assert("nikita-2344", coord != NULL);
42740 +       assert("nikita-2345", key != NULL);
42741 +
42742 +       dent = entry_at(coord, idx_of(coord));
42743 +       result = build_obj_key_id(key, &obj_id);
42744 +       if (result == 0) {
42745 +               dent->id = obj_id;
42746 +               znode_make_dirty(coord->node);
42747 +       }
42748 +       return 0;
42749 +}
42750 +
42751 +/* ->s.dir.extract_name() method for this item plugin: name of the entry at coord->unit_pos, as produced by extract_dent_name() with @buf. */
42752 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
42753 +{
42754 +       directory_entry_format *dent;
42755 +
42756 +       assert("nikita-1157", coord != NULL);
42757 +
42758 +       dent = entry_at(coord, idx_of(coord));
42759 +       return extract_dent_name(coord, dent, buf);
42760 +}
42761 +
42762 +static int cde_bytes(int pasting, const reiser4_item_data * data)
42763 +{
42764 +       int bytes = data->length;
42765 +
42766 +       /* unless pasting, leave the item header (cde_item_format) out */
42767 +       if (pasting)
42768 +               return bytes;
42769 +       return bytes - sizeof(cde_item_format);
42770 +}
42771 +
42772 +/* ->s.dir.add_entry() method for this item plugin: charges quota, then adds one entry for @name */
42773 +int add_entry_cde(struct inode *dir /* directory object */ ,
42774 +                 coord_t * coord /* coord of item */ ,
42775 +                 lock_handle * lh /* lock handle for insertion */ ,
42776 +                 const struct dentry *name /* name to insert */ ,
42777 +                 reiser4_dir_entry_desc * dir_entry    /* parameters of new
42778 +                                                        * directory entry */ )
42779 +{
42780 +       reiser4_item_data data;
42781 +       struct cde_entry entry;
42782 +       struct cde_entry_data edata;
42783 +       int result;
42784 +
42785 +       assert("nikita-1656", coord->node == lh->node);
42786 +       assert("nikita-1657", znode_is_write_locked(coord->node));
42787 +
42788 +       edata.num_of_entries = 1;       /* exactly one entry is described per call */
42789 +       edata.entry = &entry;
42790 +
42791 +       entry.dir = dir;
42792 +       entry.obj = dir_entry->obj;
42793 +       entry.name = &name->d_name;
42794 +
42795 +       data.data = (char *)&edata;
42796 +       data.user = 0;          /* &edata is not user space */
42797 +       data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
42798 +       data.arg = dir_entry;
42799 +       assert("nikita-1302", data.iplug != NULL);
42800 +
42801 +       result = is_dot_key(&dir_entry->key);   /* nonzero: insert_by_coord() below; zero: reiser4_resize_item() */
42802 +       data.length = estimate_cde(result ? coord : NULL, &data);
42803 +
42804 +       /* NOTE-NIKITA quota plugin? NOTE(review): this charge is not undone here if the call below fails - confirm */
42805 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
42806 +               return RETERR(-EDQUOT);
42807 +
42808 +       if (result)
42809 +               result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
42810 +       else
42811 +               result = reiser4_resize_item(coord, &data, &dir_entry->key,
42812 +                                            lh, 0);
42813 +       return result;
42814 +}
42815 +
42816 +/* ->s.dir.rem_entry(): remove the entry at @coord and, on success, release its bytes from @dir's quota */
42817 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
42818 +                 const struct qstr *name, coord_t * coord /* coord of item */ ,
42819 +                 lock_handle * lh UNUSED_ARG   /* lock handle for
42820 +                                                * removal */ ,
42821 +                 reiser4_dir_entry_desc * entry UNUSED_ARG     /* parameters of
42822 +                                                                * directory entry
42823 +                                                                * being removed */ )
42824 +{
42825 +       coord_t shadow;
42826 +       int result;
42827 +       int length;
42828 +       ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
42829 +
42830 +       assert("nikita-2870", strlen(name->name) == name->len);
42831 +       assert("nikita-2869",
42832 +              !strcmp(name->name, extract_name_cde(coord, buf)));
42833 +
42834 +       length = sizeof(directory_entry_format) + sizeof(cde_unit_header);      /* bytes accounted for one entry */
42835 +       if (is_longname(name->name, name->len))
42836 +               length += name->len + 1;        /* long names add name->len + 1 bytes to the accounted size */
42837 +
42838 +       if (inode_get_bytes(dir) < length) {    /* directory accounts for fewer bytes than this entry needs */
42839 +               warning("nikita-2628", "Dir is broke: %llu: %llu",
42840 +                       (unsigned long long)get_inode_oid(dir),
42841 +                       inode_get_bytes(dir));
42842 +
42843 +               return RETERR(-EIO);
42844 +       }
42845 +
42846 +       /* cut_node() is supposed to take pointers to _different_ coords,
42847 +          because it will modify them without respect to possible
42848 +          aliasing. NOTE(review): the call below is kill_node_content();
42849 +          presumably the same applies. To work around this, pass a
42850 +          temporary copy of @coord. */
42851 +       coord_dup(&shadow, coord);
42852 +       result =
42853 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
42854 +       if (result == 0) {
42855 +               /* NOTE-NIKITA quota plugin? */
42856 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
42857 +       }
42858 +       return result;
42859 +}
42860 +
42861 +/* ->s.dir.max_name_len() method for this item plugin */
42862 +int max_name_len_cde(const struct inode *dir /* directory */ )
42863 +{
42864 +       return
42865 +               reiser4_tree_by_inode(dir)->nplug->max_item_size() -
42866 +               sizeof(directory_entry_format) - sizeof(cde_item_format) -
42867 +               sizeof(cde_unit_header) - 2;
42868 +}
42869 +
42870 +/* Make Linus happy.
42871 +   Local variables:
42872 +   c-indentation-style: "K&R"
42873 +   mode-name: "LC"
42874 +   c-basic-offset: 8
42875 +   tab-width: 8
42876 +   fill-column: 120
42877 +   End:
42878 +*/
42879 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/cde.h linux-2.6.27/fs/reiser4/plugin/item/cde.h
42880 --- linux-2.6.27.orig/fs/reiser4/plugin/item/cde.h      1970-01-01 03:00:00.000000000 +0300
42881 +++ linux-2.6.27/fs/reiser4/plugin/item/cde.h   2008-10-12 18:20:01.000000000 +0400
42882 @@ -0,0 +1,87 @@
42883 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42884 +
42885 +/* Compound directory item. See cde.c for description. */
42886 +
42887 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
42888 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
42889 +
42890 +#include "../../forward.h"
42891 +#include "../../kassign.h"
42892 +#include "../../dformat.h"
42893 +
42894 +#include <linux/fs.h>          /* for struct inode */
42895 +#include <linux/dcache.h>      /* for struct dentry, etc  */
42896 +
42897 +typedef struct cde_unit_header {
42898 +       de_id hash;
42899 +       d16 offset;
42900 +} cde_unit_header;
42901 +
42902 +typedef struct cde_item_format {
42903 +       d16 num_of_entries;
42904 +       cde_unit_header entry[0];
42905 +} cde_item_format;
42906 +
42907 +struct cde_entry {
42908 +       const struct inode *dir;
42909 +       const struct inode *obj;
42910 +       const struct qstr *name;
42911 +};
42912 +
42913 +struct cde_entry_data {
42914 +       int num_of_entries;
42915 +       struct cde_entry *entry;
42916 +};
42917 +
42918 +/* plugin->item.b.* */
42919 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
42920 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
42921 +                       const reiser4_item_data *);
42922 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
42923 +pos_in_node_t nr_units_cde(const coord_t * coord);
42924 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
42925 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
42926 +void print_cde(const char *prefix, coord_t * coord);
42927 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
42928 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
42929 +                        coord_t * coord);
42930 +int paste_cde(coord_t * coord, reiser4_item_data * data,
42931 +             carry_plugin_info * info UNUSED_ARG);
42932 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
42933 +                 shift_direction pend, unsigned *size, unsigned want);
42934 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
42935 +                   unsigned count, shift_direction where_is_free_space,
42936 +                   unsigned free_space);
42937 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42938 +                 struct carry_cut_data *, reiser4_key * smallest_removed,
42939 +                 reiser4_key * new_first);
42940 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
42941 +                  struct carry_kill_data *, reiser4_key * smallest_removed,
42942 +                  reiser4_key * new_first);
42943 +void print_cde(const char *prefix, coord_t * coord);    /* NOTE(review): repeats the print_cde() declaration earlier in this header */
42944 +int reiser4_check_cde(const coord_t * coord, const char **error);
42945 +
42946 +/* plugin->u.item.s.dir.* */
42947 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
42948 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
42949 +                  lock_handle * lh);
42950 +char *extract_name_cde(const coord_t * coord, char *buf);
42951 +int add_entry_cde(struct inode *dir, coord_t * coord,
42952 +                 lock_handle * lh, const struct dentry *name,
42953 +                 reiser4_dir_entry_desc * entry);
42954 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
42955 +                 lock_handle * lh, reiser4_dir_entry_desc * entry);
42956 +int max_name_len_cde(const struct inode *dir);
42957 +
42958 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
42959 +#endif
42960 +
42961 +/* Make Linus happy.
42962 +   Local variables:
42963 +   c-indentation-style: "K&R"
42964 +   mode-name: "LC"
42965 +   c-basic-offset: 8
42966 +   tab-width: 8
42967 +   fill-column: 120
42968 +   End:
42969 +*/
42970 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/ctail.c linux-2.6.27/fs/reiser4/plugin/item/ctail.c
42971 --- linux-2.6.27.orig/fs/reiser4/plugin/item/ctail.c    1970-01-01 03:00:00.000000000 +0300
42972 +++ linux-2.6.27/fs/reiser4/plugin/item/ctail.c 2008-10-12 18:20:01.000000000 +0400
42973 @@ -0,0 +1,1613 @@
42974 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
42975 +
42976 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
42977 +
42978 +/* DESCRIPTION:
42979 +
42980 +Each cryptcompress object is stored on disk as a set of clusters sliced
42981 +into ctails.
42982 +
42983 +Internal on-disk structure:
42984 +
42985 +        HEADER   (1)  The disk cluster shift is stored here
42986 +       BODY
42987 +*/
42988 +
42989 +#include "../../forward.h"
42990 +#include "../../debug.h"
42991 +#include "../../dformat.h"
42992 +#include "../../kassign.h"
42993 +#include "../../key.h"
42994 +#include "../../coord.h"
42995 +#include "item.h"
42996 +#include "../node/node.h"
42997 +#include "../plugin.h"
42998 +#include "../object.h"
42999 +#include "../../znode.h"
43000 +#include "../../carry.h"
43001 +#include "../../tree.h"
43002 +#include "../../inode.h"
43003 +#include "../../super.h"
43004 +#include "../../context.h"
43005 +#include "../../page_cache.h"
43006 +#include "../cluster.h"
43007 +#include "../../flush.h"
43008 +#include "../../tree_walk.h"
43009 +
43010 +#include <linux/pagevec.h>
43011 +#include <linux/swap.h>
43012 +#include <linux/fs.h>
43013 +
43014 +/* return body of ctail item at @coord */
43015 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
43016 +{
43017 +       assert("edward-60", coord != NULL);
43018 +       return item_body_by_coord(coord);
43019 +}
43020 +
43021 +static int cluster_shift_by_coord(const coord_t * coord)
43022 +{
43023 +       return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
43024 +}
43025 +
43026 +static inline void dclust_set_extension_shift(hint_t * hint)
43027 +{
43028 +       assert("edward-1270",
43029 +              item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
43030 +       hint->ext_coord.extension.ctail.shift =
43031 +           cluster_shift_by_coord(&hint->ext_coord.coord);
43032 +}
43033 +
43034 +static loff_t off_by_coord(const coord_t * coord)
43035 +{
43036 +       reiser4_key key;
43037 +       return get_key_offset(item_key_by_coord(coord, &key));
43038 +}
43039 +
43040 +int coord_is_unprepped_ctail(const coord_t * coord)
43041 +{
43042 +       assert("edward-1233", coord != NULL);
43043 +       assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
43044 +       assert("edward-1235",
43045 +              ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
43046 +                   nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
43047 +
43048 +       return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
43049 +}
43050 +
43051 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
43052 +{
43053 +       int shift;
43054 +
43055 +       if (inode != NULL) {
43056 +               shift = inode_cluster_shift(inode);
43057 +               assert("edward-1236",
43058 +                      ergo(!coord_is_unprepped_ctail(coord),
43059 +                           shift == cluster_shift_by_coord(coord)));
43060 +       } else {
43061 +               assert("edward-1237", !coord_is_unprepped_ctail(coord));
43062 +               shift = cluster_shift_by_coord(coord);
43063 +       }
43064 +       return off_by_coord(coord) >> shift;
43065 +}
43066 +
43067 +static int disk_cluster_size(const coord_t * coord)
43068 +{
43069 +       assert("edward-1156",
43070 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
43071 +       /* calculation of disk cluster size
43072 +          is meaningless if ctail is unprepped */
43073 +       assert("edward-1238", !coord_is_unprepped_ctail(coord));
43074 +
43075 +       return 1 << cluster_shift_by_coord(coord);      /* 2^shift, with the shift read from the item */
43076 +}
43077 +
43078 +/* true if the key is of first disk cluster item */
43079 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
43080 +{
43081 +       assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
43082 +
43083 +       return coord_is_unprepped_ctail(coord) ||
43084 +           ((get_key_offset(key) &
43085 +             ((loff_t) disk_cluster_size(coord) - 1)) == 0);
43086 +}
43087 +
43088 +static char *first_unit(coord_t * coord)
43089 +{
43090 +       /* FIXME: warning: pointer of type `void *' used in arithmetic */
43091 +       return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
43092 +}
43093 +
43094 +/* plugin->u.item.b.max_key_inside :
43095 +   tail_max_key_inside */
43096 +
43097 +/* plugin->u.item.b.can_contain_key */
43098 +int
43099 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
43100 +                     const reiser4_item_data * data)
43101 +{
43102 +       reiser4_key item_key;
43103 +
43104 +       if (item_plugin_by_coord(coord) != data->iplug)
43105 +               return 0;
43106 +
43107 +       item_key_by_coord(coord, &item_key);
43108 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
43109 +           get_key_objectid(key) != get_key_objectid(&item_key))
43110 +               return 0;
43111 +       if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
43112 +           get_key_offset(key))
43113 +               return 0;
43114 +       if (is_disk_cluster_key(key, coord))
43115 +               return 0;
43116 +       return 1;
43117 +}
43118 +
43119 +/* plugin->u.item.b.mergeable */
43120 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
43121 +{
43122 +       reiser4_key key1, key2;
43123 +
43124 +       assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
43125 +       assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
43126 +                                           UNIX_FILE_METADATA_ITEM_TYPE));
43127 +
43128 +       if (item_id_by_coord(p2) != CTAIL_ID) {
43129 +               /* second item is of another type */
43130 +               return 0;
43131 +       }
43132 +
43133 +       item_key_by_coord(p1, &key1);
43134 +       item_key_by_coord(p2, &key2);
43135 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
43136 +           get_key_objectid(&key1) != get_key_objectid(&key2) ||
43137 +           get_key_type(&key1) != get_key_type(&key2)) {
43138 +               /* items of different objects */
43139 +               return 0;
43140 +       }
43141 +       if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
43142 +               /*  not adjacent items */
43143 +               return 0;
43144 +       if (is_disk_cluster_key(&key2, p2))
43145 +               return 0;
43146 +       return 1;
43147 +}
43148 +
43149 +/* plugin->u.item.b.nr_units */
43150 +pos_in_node_t nr_units_ctail(const coord_t * coord)
43151 +{
43152 +       return (item_length_by_coord(coord) -
43153 +               sizeof(ctail_formatted_at(coord)->cluster_shift));
43154 +}
43155 +
43156 +/* plugin->u.item.b.estimate:
43157 +   estimate how much space is needed to insert/paste @data->length bytes
43158 +   into ctail at @coord */
43159 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
43160 +                  const reiser4_item_data *
43161 +                  data /* parameters for new item */ )
43162 +{
43163 +       if (coord == NULL)
43164 +               /* insert */
43165 +               return (sizeof(ctail_item_format) + data->length);
43166 +       else
43167 +               /* paste */
43168 +               return data->length;
43169 +}
43170 +
43171 +/* ->init() method for this item plugin. */
43172 +int init_ctail(coord_t * to /* coord of item */ ,
43173 +              coord_t * from /* old_item */ ,
43174 +              reiser4_item_data * data /* structure used for insertion */ )
43175 +{
43176 +       int cluster_shift;      /* cpu value to convert */
43177 +
43178 +       if (data) {
43179 +               assert("edward-463", data->length > sizeof(ctail_item_format));
43180 +               cluster_shift = *((int *)(data->arg));
43181 +               data->length -= sizeof(ctail_item_format);
43182 +       } else {
43183 +               assert("edward-464", from != NULL);
43184 +               assert("edward-855", ctail_ok(from));
43185 +               cluster_shift = (int)(cluster_shift_by_coord(from));
43186 +       }
43187 +       put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
43188 +       assert("edward-856", ctail_ok(to));
43189 +       return 0;
43190 +}
43191 +
43192 +/* plugin->u.item.b.lookup:
43193 +   NULL: We are looking for item keys only */
43194 +
43195 +#if REISER4_DEBUG
43196 +int ctail_ok(const coord_t * coord)
43197 +{
43198 +       return coord_is_unprepped_ctail(coord) ||
43199 +           cluster_shift_ok(cluster_shift_by_coord(coord));
43200 +}
43201 +
43202 +/* plugin->u.item.b.check */
43203 +int check_ctail(const coord_t * coord, const char **error)
43204 +{
43205 +       if (!ctail_ok(coord)) {
43206 +               if (error)
43207 +                       *error = "bad cluster shift in ctail";
43208 +               return 1;
43209 +       }
43210 +       return 0;
43211 +}
43212 +#endif
43213 +
43214 +/* plugin->u.item.b.paste: copy @data->length bytes from @data->data into the item at @coord; returns 0 */
43215 +int
43216 +paste_ctail(coord_t * coord, reiser4_item_data * data,
43217 +           carry_plugin_info * info UNUSED_ARG)
43218 +{
43219 +       unsigned old_nr_units;
43220 +
43221 +       assert("edward-268", data->data != NULL);
43222 +       /* copy only from kernel space */
43223 +       assert("edward-66", data->user == 0);
43224 +
43225 +       old_nr_units =
43226 +           item_length_by_coord(coord) - sizeof(ctail_item_format) -
43227 +           data->length;
43228 +
43229 +       /* ctail items never get pasted in the middle; @old_nr_units is the
43230 +          item length minus the header and minus @data->length */
43231 +       if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
43232 +
43233 +               /* paste at the beginning when creating a new item */
43234 +               assert("edward-450",
43235 +                      item_length_by_coord(coord) ==
43236 +                      data->length + sizeof(ctail_item_format));
43237 +               assert("edward-451", old_nr_units == 0);
43238 +       } else if (coord->unit_pos == old_nr_units - 1
43239 +                  && coord->between == AFTER_UNIT) {
43240 +
43241 +               /* paste at the end: step past the last old unit */
43242 +               coord->unit_pos++;
43243 +       } else
43244 +               impossible("edward-453", "bad paste position");
43245 +
43246 +       memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
43247 +
43248 +       assert("edward-857", ctail_ok(coord));
43249 +
43250 +       return 0;
43251 +}
43252 +
43253 +/* plugin->u.item.b.fast_paste */
43254 +
43255 +/* plugin->u.item.b.can_shift
43256 +   number of units is returned via return value, number of bytes via @size. For ctail items they
43257 +   coincide, except that without @target sizeof(ctail_item_format) of the bytes is not counted as units */
43258 +int
43259 +can_shift_ctail(unsigned free_space, coord_t * source,
43260 +               znode * target, shift_direction direction UNUSED_ARG,
43261 +               unsigned *size /* number of bytes */ , unsigned want)
43262 +{
43263 +       /* make sure that we do not want to shift more than we have */
43264 +       assert("edward-68", want > 0 && want <= nr_units_ctail(source));
43265 +
43266 +       *size = min(want, free_space);
43267 +
43268 +       if (!target) {
43269 +               /* new item will be created */
43270 +               if (*size <= sizeof(ctail_item_format)) {
43271 +                       *size = 0;      /* nothing but (at most) a header would fit */
43272 +                       return 0;
43273 +               }
43274 +               return *size - sizeof(ctail_item_format);
43275 +       }
43276 +       return *size;
43277 +}
43278 +
43279 +/* plugin->u.item.b.copy_units
43280 +   cooperates with ->can_shift() */
43281 +void
43282 +copy_units_ctail(coord_t * target, coord_t * source,
43283 +                unsigned from, unsigned count /* units */ ,
43284 +                shift_direction where_is_free_space,
43285 +                unsigned free_space /* bytes */ )
43286 +{
43287 +       /* make sure that item @target is expanded already */
43288 +       assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
43289 +       assert("edward-70", free_space == count || free_space == count + 1);
43290 +
43291 +       assert("edward-858", ctail_ok(source));
43292 +
43293 +       if (where_is_free_space == SHIFT_LEFT) {
43294 +               /* append item @target with @count first bytes of @source:
43295 +                  this restriction came from ordinary tails */
43296 +               assert("edward-71", from == 0);
43297 +               assert("edward-860", ctail_ok(target));
43298 +
43299 +               memcpy(first_unit(target) + nr_units_ctail(target) - count,
43300 +                      first_unit(source), count);
43301 +       } else {
43302 +               /* target item is moved to right already */
43303 +               reiser4_key key;
43304 +
43305 +               assert("edward-72", nr_units_ctail(source) == from + count);
43306 +
43307 +               if (free_space == count) {
43308 +                       init_ctail(target, source, NULL);
43309 +               } else {
43310 +                       /* new item has been created */
43311 +                       assert("edward-862", ctail_ok(target));
43312 +               }
43313 +               memcpy(first_unit(target), first_unit(source) + from, count);
43314 +
43315 +               assert("edward-863", ctail_ok(target));
43316 +
43317 +               /* new units are inserted before first unit in an item,
43318 +                  therefore, we have to update item key */
43319 +               item_key_by_coord(source, &key);
43320 +               set_key_offset(&key, get_key_offset(&key) + from);
43321 +
43322 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
43323 +                                                                  NULL /*info */);
43324 +       }
43325 +}
43326 +
43327 +/* plugin->u.item.b.create_hook */
43328 +int create_hook_ctail(const coord_t * coord, void *arg)
43329 +{
43330 +       assert("edward-864", znode_is_loaded(coord->node));
43331 +
43332 +       znode_set_convertible(coord->node);
43333 +       return 0;
43334 +}
43335 +
43336 +/* plugin->u.item.b.kill_hook */
43337 +int kill_hook_ctail(const coord_t * coord, pos_in_node_t from,
43338 +                   pos_in_node_t count, carry_kill_data * kdata)
43339 +{
43340 +       struct inode *inode;
43341 +
43342 +       assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
43343 +       assert("edward-291", znode_is_write_locked(coord->node));
43344 +
43345 +       inode = kdata->inode;
43346 +       if (inode) {
43347 +               reiser4_key key;
43348 +               struct cryptcompress_info * info;
43349 +               cloff_t index;
43350 +
43351 +               item_key_by_coord(coord, &key);
43352 +               info = cryptcompress_inode_data(inode);
43353 +               index = off_to_clust(get_key_offset(&key), inode);
43354 +
43355 +               if (from == 0) {
43356 +                       info->trunc_index = index;
43357 +                       if (is_disk_cluster_key(&key, coord)) {
43358 +                               /*
43359 +                                * first item of disk cluster is to be killed
43360 +                                */
43361 +                               truncate_complete_page_cluster(
43362 +                                       inode, index, kdata->params.truncate);
43363 +                               inode_sub_bytes(inode,
43364 +                                               inode_cluster_size(inode));
43365 +                       }
43366 +               }
43367 +       }
43368 +       return 0;
43369 +}
43370 +
43371 +/* for shift_hook_ctail(),
43372 +   return true if the first disk cluster item has dirty child
43373 +*/
43374 +static int ctail_convertible(const coord_t * coord)
43375 +{
43376 +       int result;
43377 +       reiser4_key key;
43378 +       jnode *child = NULL;
43379 +
43380 +       assert("edward-477", coord != NULL);
43381 +       assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
43382 +
43383 +       if (coord_is_unprepped_ctail(coord))
43384 +               /* unprepped ctail should be converted */
43385 +               return 1;
43386 +
43387 +       item_key_by_coord(coord, &key);
43388 +       child = jlookup(current_tree,
43389 +                       get_key_objectid(&key),
43390 +                       off_to_pg(off_by_coord(coord)));
43391 +       if (!child)
43392 +               return 0;
43393 +       result = JF_ISSET(child, JNODE_DIRTY);
43394 +       jput(child);
43395 +       return result;
43396 +}
43397 +
43398 +/* FIXME-EDWARD */
43399 +/* plugin->u.item.b.shift_hook */
43400 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
43401 +                    unsigned from UNUSED_ARG /* start unit */ ,
43402 +                    unsigned count UNUSED_ARG /* stop unit */ ,
43403 +                    znode * old_node /* old parent */ )
43404 +{
43405 +       assert("edward-479", item != NULL);
43406 +       assert("edward-480", item->node != old_node);
43407 +
43408 +       if (!znode_convertible(old_node) || znode_convertible(item->node))
43409 +               return 0;
43410 +       if (ctail_convertible(item))
43411 +               znode_set_convertible(item->node);
43412 +       return 0;
43413 +}
43414 +
43415 +static int     /* shared body of cut_units_ctail() (@cut != 0) and kill_units_ctail() (@cut == 0) */
43416 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
43417 +                       int cut, void *p, reiser4_key * smallest_removed,
43418 +                       reiser4_key * new_first)
43419 +{
43420 +       pos_in_node_t count;    /* number of units to cut */
43421 +       char *item;
43422 +
43423 +       count = to - from + 1;
43424 +       item = item_body_by_coord(coord);
43425 +
43426 +       assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
43427 +
43428 +       if (smallest_removed) {
43429 +               /* store smallest key removed */
43430 +               item_key_by_coord(coord, smallest_removed);
43431 +               set_key_offset(smallest_removed,
43432 +                              get_key_offset(smallest_removed) + from);
43433 +       }
43434 +
43435 +       if (new_first) {
43436 +               assert("vs-1531", from == 0);
43437 +
43438 +               item_key_by_coord(coord, new_first);
43439 +               set_key_offset(new_first,
43440 +                              get_key_offset(new_first) + from + count);
43441 +       }
43442 +
43443 +       if (!cut)       /* kill path: @p is the carry_kill_data handed in by kill_units_ctail() */
43444 +               kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
43445 +
43446 +       if (from == 0) {
43447 +               if (count != nr_units_ctail(coord)) {
43448 +                       /* part of item is removed, so move free space at the beginning of the item
43449 +                          (the header is copied to sit right before the first kept unit) and update item key */
43450 +                       reiser4_key key;
43451 +                       memcpy(item + to + 1, item, sizeof(ctail_item_format));
43452 +                       item_key_by_coord(coord, &key);
43453 +                       set_key_offset(&key, get_key_offset(&key) + count);
43454 +                       node_plugin_by_node(coord->node)->update_item_key(coord,
43455 +                                                                         &key,
43456 +                                                                         NULL);
43457 +               } else {
43458 +                       /* cut_units should not be called to cut everything */
43459 +                       assert("vs-1532", ergo(cut, 0));
43460 +                       /* whole item is cut, so more than the amount of space occupied
43461 +                          by units got freed */
43462 +                       count += sizeof(ctail_item_format);
43463 +               }
43464 +               if (REISER4_DEBUG)
43465 +                       memset(item, 0, count);
43466 +       } else if (REISER4_DEBUG)
43467 +               memset(item + sizeof(ctail_item_format) + from, 0, count);
43468 +       return count;   /* @count, plus the header size when the whole item was removed */
43469 +}
43470 +
43471 +/* plugin->u.item.b.cut_units */
43472 +int
43473 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43474 +               carry_cut_data * cdata, reiser4_key * smallest_removed,
43475 +               reiser4_key * new_first)
43476 +{
43477 +       return cut_or_kill_ctail_units(item, from, to, 1, NULL,
43478 +                                      smallest_removed, new_first);
43479 +}
43480 +
43481 +/* plugin->u.item.b.kill_units */
43482 +int
43483 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
43484 +                struct carry_kill_data *kdata, reiser4_key * smallest_removed,
43485 +                reiser4_key * new_first)
43486 +{
43487 +       return cut_or_kill_ctail_units(item, from, to, 0, kdata,
43488 +                                      smallest_removed, new_first);
43489 +}
43490 +
43491 +/* plugin->u.item.s.file.read */
43492 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
43493 +{
43494 +       uf_coord_t *uf_coord;
43495 +       coord_t *coord;
43496 +
43497 +       uf_coord = &hint->ext_coord;
43498 +       coord = &uf_coord->coord;
43499 +       assert("edward-127", f->user == 0);
43500 +       assert("edward-129", coord && coord->node);
43501 +       assert("edward-130", coord_is_existing_unit(coord));
43502 +       assert("edward-132", znode_is_loaded(coord->node));
43503 +
43504 +       /* start read only from the beginning of ctail */
43505 +       assert("edward-133", coord->unit_pos == 0);
43506 +       /* read only whole ctails */
43507 +       assert("edward-135", nr_units_ctail(coord) <= f->length);
43508 +
43509 +       assert("edward-136", reiser4_schedulable());
43510 +       assert("edward-886", ctail_ok(coord));
43511 +
43512 +       if (f->data)
43513 +               memcpy(f->data, (char *)first_unit(coord),
43514 +                      (size_t) nr_units_ctail(coord));
43515 +
43516 +       dclust_set_extension_shift(hint);
43517 +       mark_page_accessed(znode_page(coord->node));
43518 +       move_flow_forward(f, nr_units_ctail(coord));
43519 +
43520 +       return 0;
43521 +}
43522 +
43523 +/**
43524 + * Read the disk cluster of @page and inflate it into the transform stream.
43525 + * @page is unlocked around the tree lookup and re-locked before returning.
43526 + */
43527 +static int ctail_read_disk_cluster(struct cluster_handle * clust,
43528 +                                  struct inode * inode, struct page * page,
43529 +                                  znode_lock_mode mode)
43530 +{
43531 +       int result;
43532 +
43533 +       assert("edward-1450",
43534 +              mode == ZNODE_READ_LOCK || mode == ZNODE_WRITE_LOCK);
43535 +       assert("edward-671", clust->hint != NULL);
43536 +       assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
43537 +       assert("edward-672", cryptcompress_inode_ok(inode));
43538 +       assert("edward-1527", PageLocked(page));
43539 +
43540 +       /* the page stays unlocked while we look up the disk cluster */
43541 +       unlock_page(page);
43542 +
43543 +       /* set input stream */
43544 +       result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
43545 +       if (result) {
43546 +               lock_page(page);
43547 +               return result;
43548 +       }
43549 +       result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
43550 +       lock_page(page);
43551 +       if (result)
43552 +               return result;
43553 +       /* at this point we have locked position in the tree */
43554 +       assert("edward-1528", znode_is_any_locked(clust->hint->lh.node));
43555 +
43556 +       if (page->mapping != inode->i_mapping) {
43557 +               /* page was truncated */
43558 +               reiser4_unset_hint(clust->hint);
43559 +               reset_cluster_params(clust);
43560 +               return AOP_TRUNCATED_PAGE;
43561 +       }
43562 +       if (PageUptodate(page)) {
43563 +               /* disk cluster can be obsolete, don't use it! */
43564 +               reiser4_unset_hint(clust->hint);
43565 +               reset_cluster_params(clust);
43566 +               return 0;
43567 +       }
43568 +       if (clust->dstat == FAKE_DISK_CLUSTER ||
43569 +           clust->dstat == UNPR_DISK_CLUSTER ||
43570 +           clust->dstat == TRNC_DISK_CLUSTER) {
43571 +               /*
43572 +                * this information about disk cluster will be valid
43573 +                * as long as we keep the position in the tree locked
43574 +                */
43575 +               tfm_cluster_set_uptodate(&clust->tc);
43576 +               return 0;
43577 +       }
43578 +       /* now prepare output stream.. */
43579 +       result = grab_coa(&clust->tc, inode_compression_plugin(inode));
43580 +       if (result)
43581 +               return result;
43582 +       /* ..and fill this with plain text */
43583 +       result = reiser4_inflate_cluster(clust, inode);
43584 +       if (result)
43585 +               return result;
43586 +       /*
43587 +        * The stream is ready! It won't be obsolete as
43588 +        * long as we keep last disk cluster item locked.
43589 +        */
43590 +       tfm_cluster_set_uptodate(&clust->tc);
43591 +       return 0;
43592 +}
43593 +
43594 +/*
43595 + * Bring the locked @page up to date with plain text of its logical cluster.
43596 + */
43597 +int do_readpage_ctail(struct inode * inode, struct cluster_handle * clust,
43598 +                     struct page *page, znode_lock_mode mode)
43599 +{
43600 +       int ret;
43601 +       unsigned cloff;
43602 +       char *data;
43603 +       size_t to_page;
43604 +       struct tfm_cluster * tc = &clust->tc;
43605 +
43606 +       assert("edward-212", PageLocked(page));
43607 +
43608 +       if (unlikely(page->mapping != inode->i_mapping))
43609 +               return AOP_TRUNCATED_PAGE;
43610 +       if (PageUptodate(page))
43611 +               goto exit;
43612 +       to_page = pbytes(page_index(page), inode);
43613 +       if (to_page == 0) {
43614 +               zero_user(page, 0, PAGE_CACHE_SIZE);
43615 +               SetPageUptodate(page);
43616 +               goto exit;
43617 +       }
43618 +       if (!tfm_cluster_is_uptodate(&clust->tc)) {
43619 +               clust->index = pg_to_clust(page->index, inode);
43620 +
43621 +               /* this will unlock/lock the page */
43622 +               ret = ctail_read_disk_cluster(clust, inode, page, mode);
43623 +
43624 +               assert("edward-212", PageLocked(page));
43625 +               if (ret)
43626 +                       return ret;
43627 +
43628 +               /* page was unlocked meanwhile: refresh the byte count */
43629 +               to_page = pbytes(page_index(page), inode);
43630 +               if (to_page == 0) {
43631 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
43632 +                       SetPageUptodate(page);
43633 +                       goto exit;
43634 +               }
43635 +       }
43636 +       if (PageUptodate(page))
43637 +               /* somebody else has filled it already */
43638 +               goto exit;
43639 +
43640 +       assert("edward-119", tfm_cluster_is_uptodate(tc));
43641 +       assert("edward-1529", znode_is_any_locked(clust->hint->lh.node));
43642 +
43643 +       switch (clust->dstat) {
43644 +       case UNPR_DISK_CLUSTER:
43645 +               BUG_ON(1);
43646 +       case TRNC_DISK_CLUSTER:
43647 +               /*
43648 +                * Race with truncate!
43649 +                * We resolve it in favour of the last one (the only way,
43650 +                * as in this case plain text is unrecoverable).
43651 +                * Falls through: the page is filled with zeroes.
43652 +                */
43653 +       case FAKE_DISK_CLUSTER:
43654 +               zero_user(page, 0, PAGE_CACHE_SIZE);
43655 +               SetPageUptodate(page);
43656 +               break;
43657 +       case PREP_DISK_CLUSTER:
43658 +               /* fill page from transformed stream with plain text */
43659 +               assert("edward-1058", !PageUptodate(page));
43660 +               assert("edward-120", tc->len <= inode_cluster_size(inode));
43661 +
43662 +               /* offset of this page's data in the output stream */
43663 +               cloff = pg_to_off_to_cloff(page->index, inode);
43664 +
43665 +               data = kmap(page);
43666 +               memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, to_page);
43667 +               memset(data + to_page, 0, (size_t) PAGE_CACHE_SIZE - to_page);
43668 +               flush_dcache_page(page);
43669 +               kunmap(page);
43670 +               SetPageUptodate(page);
43671 +               break;
43672 +       default:
43673 +               impossible("edward-1169", "bad disk cluster state");
43674 +       }
43675 +      exit:
43676 +       return 0;
43677 +}
43678 +
43679 +/* plugin->u.item.s.file.readpage; @page is unlocked on every return path */
43680 +int readpage_ctail(void *vp, struct page *page)
43681 +{
43682 +       int result;
43683 +       hint_t * hint;
43684 +       struct cluster_handle * clust = vp;
43685 +
43686 +       assert("edward-114", clust != NULL);
43687 +       assert("edward-115", PageLocked(page));
43688 +       assert("edward-116", !PageUptodate(page));
43689 +       assert("edward-118", page->mapping && page->mapping->host);
43690 +       assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
43691 +
43692 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43693 +       if (hint == NULL) {
43694 +               unlock_page(page);
43695 +               return RETERR(-ENOMEM);
43696 +       }
43697 +       clust->hint = hint;
43698 +       result = load_file_hint(clust->file, hint);
43699 +       if (result) {
43700 +               kfree(hint);
43701 +               unlock_page(page);
43702 +               return result;
43703 +       }
43704 +       assert("vs-25", hint->ext_coord.lh == &hint->lh);
43705 +
43706 +       result = do_readpage_ctail(page->mapping->host, clust, page,
43707 +                                  ZNODE_READ_LOCK);
43708 +       assert("edward-213", PageLocked(page));
43709 +       assert("edward-1163", ergo(!result, PageUptodate(page)));
43710 +
43711 +       unlock_page(page);
43712 +       done_lh(&hint->lh);
43713 +       hint->ext_coord.valid = 0;
43714 +       save_file_hint(clust->file, hint);
43715 +       kfree(hint);
43716 +       tfm_cluster_clr_uptodate(&clust->tc);
43717 +
43718 +       return result;
43719 +}
43720 +
43721 +/* Helper for ->readpages(): read every page of the page cluster @clust */
43722 +static int ctail_read_page_cluster(struct cluster_handle * clust,
43723 +                                  struct inode *inode)
43724 +{
43725 +       int i;
43726 +       int result;
43727 +       assert("edward-779", clust != NULL);
43728 +       assert("edward-1059", clust->win == NULL);
43729 +       assert("edward-780", inode != NULL);
43730 +
43731 +       result = prepare_page_cluster(inode, clust, READ_OP);
43732 +       if (result)
43733 +               return result;
43734 +
43735 +       assert("edward-781", !tfm_cluster_is_uptodate(&clust->tc));
43736 +       /* pages are read one by one; the first failure stops the loop */
43737 +       for (i = 0; i < clust->nr_pages; i++) {
43738 +               struct page *page = clust->pages[i];
43739 +               lock_page(page);
43740 +               result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
43741 +               unlock_page(page);
43742 +               if (result)
43743 +                       break;
43744 +       }
43745 +       tfm_cluster_clr_uptodate(&clust->tc);
43746 +       put_page_cluster(clust, inode, READ_OP);
43747 +       return result;
43748 +}
43749 +
43750 +/* filler for read_cache_pages(); @page is unlocked on every return path */
43751 +static int ctail_readpages_filler(void * data, struct page * page)
43752 +{
43753 +       int ret = 0;
43754 +       struct cluster_handle * clust = data;
43755 +       struct inode * inode = clust->file->f_dentry->d_inode;
43756 +
43757 +       assert("edward-1525", page->mapping == inode->i_mapping);
43758 +
43759 +       if (PageUptodate(page)) {
43760 +               unlock_page(page);
43761 +               return 0;
43762 +       }
43763 +       if (pbytes(page_index(page), inode) == 0) {
43764 +               zero_user(page, 0, PAGE_CACHE_SIZE);
43765 +               SetPageUptodate(page);
43766 +               unlock_page(page);
43767 +               return 0;
43768 +       }
43769 +       move_cluster_forward(clust, inode, page->index);
43770 +       unlock_page(page);
43771 +       /*
43772 +        * read the whole page cluster this page belongs to
43773 +        */
43774 +       ret = ctail_read_page_cluster(clust, inode);
43775 +
43776 +       assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
43777 +       return ret;
43778 +}
43779 +
43780 +/*
43781 + * We populate a bit more than upper readahead suggests:
43782 + * with each nominated page we read the whole page cluster
43783 + * this page belongs to.
43784 + */
43785 +int readpages_ctail(struct file *file, struct address_space *mapping,
43786 +                   struct list_head *pages)
43787 +{
43788 +       int ret = 0;
43789 +       hint_t *hint;
43790 +       struct cluster_handle clust;
43791 +       struct inode *inode = mapping->host;
43792 +
43793 +       assert("edward-1521", inode == file->f_dentry->d_inode);
43794 +
43795 +       cluster_init_read(&clust, NULL);
43796 +       clust.file = file;
43797 +       hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
43798 +       if (hint == NULL) {
43799 +               warning("vs-28", "failed to allocate hint");
43800 +               ret = RETERR(-ENOMEM);
43801 +               goto exit1;
43802 +       }
43803 +       clust.hint = hint;
43804 +       ret = load_file_hint(clust.file, hint);
43805 +       if (ret) {
43806 +               warning("edward-1522", "failed to load hint");
43807 +               goto exit2;
43808 +       }
43809 +       assert("vs-26", hint->ext_coord.lh == &hint->lh);
43810 +       ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
43811 +       if (ret) {
43812 +               warning("edward-1523", "failed to alloc pgset");
43813 +               goto exit3;
43814 +       }
43815 +       ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
43816 +
43817 +       assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
43818 + exit3:
43819 +       done_lh(&hint->lh);
43820 +       save_file_hint(file, hint);
43821 +       hint->ext_coord.valid = 0;
43822 + exit2:
43823 +       kfree(hint);
43824 + exit1:
43825 +       put_cluster_handle(&clust);
43826 +       return ret;
43827 +}
43828 +
43829 +/*
43830 +   plugin->u.item.s.file.append_key
43831 +   builds in @key the key of the first item of the next disk cluster
43832 +*/
43833 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
43834 +{
43835 +       assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
43836 +       assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
43837 +
43838 +       item_key_by_coord(coord, key);
43839 +       set_key_offset(key, ((__u64) (clust_by_coord(coord, NULL)) + 1)
43840 +                      << cluster_shift_by_coord(coord));
43841 +       return key;
43842 +}
43843 +
43844 +static int insert_unprepped_ctail(struct cluster_handle * clust,
43845 +                                 struct inode *inode)
43846 +{
43847 +       int result;
43848 +       char buf[UCTAIL_NR_UNITS];
43849 +       reiser4_item_data data;
43850 +       reiser4_key key;
43851 +       int shift = (int)UCTAIL_SHIFT;
43852 +
43853 +       memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
43854 +       result = key_by_inode_cryptcompress(inode,
43855 +                                           clust_to_off(clust->index, inode),
43856 +                                           &key);
43857 +       if (result)
43858 +               return result;
43859 +       data.user = 0;
43860 +       data.iplug = item_plugin_by_id(CTAIL_ID);
43861 +       data.arg = &shift;
43862 +       data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
43863 +       data.data = buf;
43864 +       /* insert the zero-filled item at the coord kept in the hint */
43865 +       result = insert_by_coord(&clust->hint->ext_coord.coord,
43866 +                                &data, &key, clust->hint->ext_coord.lh, 0);
43867 +       return result;
43868 +}
43869 +
43870 +static int
43871 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
43872 +                         int cluster_shift)
43873 +{
43874 +       int result;
43875 +       carry_pool *pool;
43876 +       carry_level *lowest_level;
43877 +       reiser4_item_data *data;
43878 +       carry_op *op;
43879 +
43880 +       pool =
43881 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
43882 +                           sizeof(*data));
43883 +       if (IS_ERR(pool))
43884 +               return PTR_ERR(pool);
43885 +       lowest_level = (carry_level *) (pool + 1);
43886 +       init_carry_level(lowest_level, pool);
43887 +       data = (reiser4_item_data *) (lowest_level + 3);
43888 +
43889 +       assert("edward-466", coord->between == AFTER_ITEM
43890 +              || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
43891 +              || coord->between == EMPTY_NODE
43892 +              || coord->between == BEFORE_UNIT);
43893 +
43894 +       if (coord->between == AFTER_UNIT) {
43895 +               coord->unit_pos = 0;
43896 +               coord->between = AFTER_ITEM;
43897 +       }
43898 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
43899 +                               0 /* operate directly on coord -> node */);
43900 +       if (IS_ERR(op) || (op == NULL)) {
43901 +               done_carry_pool(pool);
43902 +               return RETERR(op ? PTR_ERR(op) : -EIO);
43903 +       }
43904 +       data->user = 0;
43905 +       data->iplug = item_plugin_by_id(CTAIL_ID);
43906 +       data->arg = &cluster_shift;
43907 +       /* no item body is passed here; flow @f is attached to the op below */
43908 +       data->length = 0;
43909 +       data->data = NULL;
43910 +
43911 +       op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
43912 +       op->u.insert_flow.insert_point = coord;
43913 +       op->u.insert_flow.flow = f;
43914 +       op->u.insert_flow.data = data;
43915 +       op->u.insert_flow.new_nodes = 0;
43916 +
43917 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
43918 +       lowest_level->tracked = lh;
43919 +
43920 +       result = reiser4_carry(lowest_level, NULL);
43921 +       done_carry_pool(pool);
43922 +
43923 +       return result;
43924 +}
43925 +
43926 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
43927 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
43928 +                                             lock_handle * lh, flow_t * f,
43929 +                                             int cluster_shift)
43930 +{
43931 +       int ret;
43932 +       coord_t pos;
43933 +       lock_handle lock;
43934 +
43935 +       assert("edward-484",
43936 +              coord->between == AT_UNIT || coord->between == AFTER_ITEM);
43937 +       assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
43938 +       /* work on a copy of @coord placed after the item, and a copy of @lh */
43939 +       coord_dup(&pos, coord);
43940 +       pos.unit_pos = 0;
43941 +       pos.between = AFTER_ITEM;
43942 +
43943 +       init_lh(&lock);
43944 +       copy_lh(&lock, lh);
43945 +
43946 +       ret = insert_cryptcompress_flow(&pos, &lock, f, cluster_shift);
43947 +       done_lh(&lock);
43948 +       assert("edward-1347", znode_is_write_locked(lh->node));
43949 +       assert("edward-1228", !ret);
43950 +       return ret;
43951 +}
43952 +
43953 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
43954 +static int overwrite_ctail(coord_t * coord, flow_t * f)
43955 +{
43956 +       unsigned count;
43957 +
43958 +       assert("edward-269", f->user == 0);
43959 +       assert("edward-270", f->data != NULL);
43960 +       assert("edward-271", f->length > 0);
43961 +       assert("edward-272", coord_is_existing_unit(coord));
43962 +       assert("edward-273", coord->unit_pos == 0);
43963 +       assert("edward-274", znode_is_write_locked(coord->node));
43964 +       assert("edward-275", reiser4_schedulable());
43965 +       assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
43966 +       assert("edward-1243", ctail_ok(coord));
43967 +
43968 +       count = nr_units_ctail(coord);
43969 +       /* never copy more than the flow still holds */
43970 +       if (count > f->length)
43971 +               count = f->length;
43972 +       memcpy(first_unit(coord), f->data, count);
43973 +       move_flow_forward(f, count);
43974 +       coord->unit_pos += count;
43975 +       return 0;
43976 +}
43977 +
43978 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
43979 +   cut ctail units from @coord->unit_pos through the last unit, if any */
43980 +static int cut_ctail(coord_t * coord)
43981 +{
43982 +       coord_t stop;
43983 +
43984 +       assert("edward-435", coord->between == AT_UNIT &&
43985 +              coord->item_pos < coord_num_items(coord) &&
43986 +              coord->unit_pos <= coord_num_units(coord));
43987 +
43988 +       if (coord->unit_pos == coord_num_units(coord))
43989 +               /* nothing to cut */
43990 +               return 0;
43991 +       coord_dup(&stop, coord);
43992 +       stop.unit_pos = coord_last_unit_pos(coord);
43993 +
43994 +       return cut_node_content(coord, &stop, NULL, NULL, NULL);
43995 +}
43996 +
43997 +int ctail_insert_unprepped_cluster(struct cluster_handle * clust,
43998 +                                  struct inode * inode)
43999 +{
44000 +       int result;
44001 +       assert("edward-1244", inode != NULL);
44002 +       assert("edward-1245", clust->hint != NULL);
44003 +       assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
44004 +       assert("edward-1247", clust->reserved == 1);
44005 +       /* write-lock the insertion point; the cluster must not exist yet */
44006 +       result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
44007 +       if (cbk_errored(result))
44008 +               return result;
44009 +       assert("edward-1249", result == CBK_COORD_NOTFOUND);
44010 +       assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
44011 +
44012 +       assert("edward-1295",
44013 +              clust->hint->ext_coord.lh->node ==
44014 +              clust->hint->ext_coord.coord.node);
44015 +
44016 +       coord_set_between_clusters(&clust->hint->ext_coord.coord);
44017 +
44018 +       result = insert_unprepped_ctail(clust, inode);
44019 +       all_grabbed2free();
44020 +
44021 +       assert("edward-1251", !result);
44022 +       assert("edward-1252", cryptcompress_inode_ok(inode));
44023 +       assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
44024 +       assert("edward-1254",
44025 +              reiser4_clustered_blocks(reiser4_get_current_sb()));
44026 +       assert("edward-1255",
44027 +              znode_convertible(clust->hint->ext_coord.coord.node));
44028 +
44029 +       return result;
44030 +}
44031 +
44032 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
44033 +{
44034 +       int result = 0;
44035 +       struct convert_item_info * info;
44036 +
44037 +       assert("edward-468", pos != NULL);
44038 +       assert("edward-469", pos->sq != NULL);
44039 +       assert("edward-845", item_convert_data(pos) != NULL);
44040 +
44041 +       info = item_convert_data(pos);
44042 +       assert("edward-679", info->flow.data != NULL);
44043 +       /* CRC_OVERWRITE_ITEM falls through to the cut once the flow is drained */
44044 +       switch (mode) {
44045 +       case CRC_APPEND_ITEM:
44046 +               assert("edward-1229", info->flow.length != 0);
44047 +               assert("edward-1256",
44048 +                      cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
44049 +               result =
44050 +                   insert_cryptcompress_flow_in_place(&pos->coord,
44051 +                                                      &pos->lock,
44052 +                                                      &info->flow,
44053 +                                                      info->cluster_shift);
44054 +               break;
44055 +       case CRC_OVERWRITE_ITEM:
44056 +               assert("edward-1230", info->flow.length != 0);
44057 +               overwrite_ctail(&pos->coord, &info->flow);
44058 +               if (info->flow.length != 0)
44059 +                       break;
44060 +       case CRC_CUT_ITEM:
44061 +               assert("edward-1231", info->flow.length == 0);
44062 +               result = cut_ctail(&pos->coord);
44063 +               break;
44064 +       default:
44065 +               result = RETERR(-EIO);
44066 +               impossible("edward-244", "bad convert mode");
44067 +       }
44068 +       return result;
44069 +}
44070 +
44071 +/* plugin->u.item.f.scan: left scan makes the parent dirty and convertible */
44072 +int scan_ctail(flush_scan * scan)
44073 +{
44074 +       int result = 0;
44075 +       struct page *page;
44076 +       struct inode *inode;
44077 +       jnode *node = scan->node;
44078 +
44079 +       assert("edward-227", scan->node != NULL);
44080 +       assert("edward-228", jnode_is_cluster_page(scan->node));
44081 +       assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
44082 +
44083 +       page = jnode_page(node);
44084 +       inode = page->mapping->host;
44085 +       /* NOTE(review): @inode is computed here but not used below */
44086 +       if (!reiser4_scanning_left(scan))
44087 +               return result;
44088 +       if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
44089 +               znode_make_dirty(scan->parent_lock.node);
44090 +
44091 +       if (!znode_convertible(scan->parent_lock.node)) {
44092 +               if (JF_ISSET(scan->node, JNODE_DIRTY))
44093 +                       znode_set_convertible(scan->parent_lock.node);
44094 +               else {
44095 +                       warning("edward-681",
44096 +                               "cluster page is already processed");
44097 +                       return -EAGAIN;
44098 +               }
44099 +       }
44100 +       return result;
44101 +}
44102 +
44103 +/* If true, pos->child holds the leftmost child to attach; else it is NULL */
44104 +static int should_attach_convert_idata(flush_pos_t * pos)
44105 +{
44106 +       int result;
44107 +       assert("edward-431", pos != NULL);
44108 +       assert("edward-432", pos->child == NULL);
44109 +       assert("edward-619", znode_is_write_locked(pos->coord.node));
44110 +       assert("edward-470",
44111 +              item_plugin_by_coord(&pos->coord) ==
44112 +              item_plugin_by_id(CTAIL_ID));
44113 +
44114 +       /* check for leftmost child */
44115 +       utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
44116 +
44117 +       if (!pos->child)
44118 +               return 0;
44119 +       spin_lock_jnode(pos->child);
44120 +       result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
44121 +                 pos->child->atom == ZJNODE(pos->coord.node)->atom);
44122 +       spin_unlock_jnode(pos->child);
44123 +       if (!result && pos->child) {
44124 +               /* existing child isn't to attach, clear up this one */
44125 +               jput(pos->child);
44126 +               pos->child = NULL;
44127 +       }
44128 +       return result;
44129 +}
44130 +
44131 +/**
44132 + * Collect in @idata everything conversion needs from @inode now,
44133 + * as in-memory inode can be evicted from memory before
44134 + * disk update completion.
44135 + */
44136 +static int init_convert_data_ctail(struct convert_item_info * idata,
44137 +                                  struct inode *inode)
44138 +{
44139 +       assert("edward-813", idata != NULL);
44140 +       assert("edward-814", inode != NULL);
44141 +
44142 +       idata->cluster_shift = inode_cluster_shift(inode);
44143 +       idata->d_cur = DC_FIRST_ITEM;
44144 +       idata->d_next = DC_INVALID_STATE;
44145 +
44146 +       return 0;
44147 +}
44148 +
44149 +static int alloc_item_convert_data(struct convert_info * sq)
44150 +{
44151 +       assert("edward-816", sq != NULL);
44152 +       assert("edward-817", sq->itm == NULL);
44153 +       /* the caller owns sq->itm; free_item_convert_data() releases it */
44154 +       sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
44155 +       if (sq->itm == NULL)
44156 +               return RETERR(-ENOMEM);
44157 +       return 0;
44158 +}
44159 +
44160 +static void free_item_convert_data(struct convert_info * sq)
44161 +{
44162 +       assert("edward-818", sq != NULL);
44163 +       assert("edward-819", sq->itm != NULL);
44164 +       assert("edward-820", sq->iplug != NULL);
44165 +       /* clear the pointer: alloc_item_convert_data() asserts it is NULL */
44166 +       kfree(sq->itm);
44167 +       sq->itm = NULL;
44168 +       return;
44169 +}
44170 +
44171 +/* Allocate zeroed convert info for @pos and set up its cluster handle */
44172 +static int alloc_convert_data(flush_pos_t * pos)
44173 +{
44174 +       assert("edward-821", pos != NULL);
44175 +       assert("edward-822", pos->sq == NULL);
44176 +
44177 +       pos->sq = kzalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
44178 +       if (!pos->sq)
44179 +               return RETERR(-ENOMEM);
44180 +       cluster_init_write(&pos->sq->clust, NULL);
44181 +       return 0;
44182 +}
44183 +
44184 +void free_convert_data(flush_pos_t * pos)
44185 +{
44186 +       struct convert_info *sq;
44187 +
44188 +       assert("edward-823", pos != NULL);
44189 +       assert("edward-824", pos->sq != NULL);
44190 +       /* free item data (if any), then the cluster handle, then sq itself */
44191 +       sq = pos->sq;
44192 +       if (sq->itm)
44193 +               free_item_convert_data(sq);
44194 +       put_cluster_handle(&sq->clust);
44195 +       kfree(pos->sq);
44196 +       pos->sq = NULL;
44197 +       return;
44198 +}
44199 +
44200 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
44201 +{
44202 +       struct convert_info *sq;
44203 +
44204 +       assert("edward-825", pos != NULL);
44205 +       assert("edward-826", pos->sq != NULL);
44206 +       assert("edward-827", item_convert_data(pos) != NULL);
44207 +       assert("edward-828", inode != NULL);
44208 +
44209 +       sq = pos->sq;
44210 +       /* start from zeroed item data */
44211 +       memset(sq->itm, 0, sizeof(*sq->itm));
44212 +
44213 +       /* direct call in place of iplug->init_convert_data() */
44214 +       return init_convert_data_ctail(sq->itm, inode);
44215 +}
44216 +
44217 +/* create and attach disk cluster info used by 'convert' phase of the flush
44218 +   squalloc(); the reference on pos->child is dropped on every return path */
44219 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
44220 +{
44221 +       int ret = 0;
44222 +       struct convert_item_info *info;
44223 +       struct cluster_handle *clust;
44224 +       file_plugin *fplug = inode_file_plugin(inode);
44225 +       compression_plugin *cplug = inode_compression_plugin(inode);
44226 +
44227 +       assert("edward-248", pos != NULL);
44228 +       assert("edward-249", pos->child != NULL);
44229 +       assert("edward-251", inode != NULL);
44230 +       assert("edward-682", cryptcompress_inode_ok(inode));
44231 +       assert("edward-252",
44232 +              fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
44233 +       assert("edward-473",
44234 +              item_plugin_by_coord(&pos->coord) ==
44235 +              item_plugin_by_id(CTAIL_ID));
44236 +
44237 +       if (!pos->sq) {
44238 +               ret = alloc_convert_data(pos);
44239 +               if (ret)
44240 +                       return ret;
44241 +       }
44242 +       clust = &pos->sq->clust;
44243 +       ret = grab_coa(&clust->tc, cplug);
44244 +       if (ret)
44245 +               goto err;
44246 +       ret = set_cluster_by_page(clust,
44247 +                                 jnode_page(pos->child),
44248 +                                 MAX_CLUSTER_NRPAGES);
44249 +       if (ret)
44250 +               goto err;
44251 +
44252 +       assert("edward-829", pos->sq != NULL);
44253 +       assert("edward-250", item_convert_data(pos) == NULL);
44254 +
44255 +       pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
44256 +
44257 +       ret = alloc_item_convert_data(pos->sq);
44258 +       if (ret)
44259 +               goto err;
44260 +       ret = init_item_convert_data(pos, inode);
44261 +       if (ret)
44262 +               goto err;
44263 +       info = item_convert_data(pos);
44264 +
44265 +       ret = checkout_logical_cluster(clust, pos->child, inode);
44266 +       if (ret)
44267 +               goto err;
44268 +       /* NOTE(review): no error check on deflate - confirm it cannot fail */
44269 +       reiser4_deflate_cluster(clust, inode);
44270 +       inc_item_convert_count(pos);
44271 +
44272 +       /* prepare flow for insertion */
44273 +       fplug->flow_by_inode(inode,
44274 +                            (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
44275 +                            0 /* kernel space */ ,
44276 +                            clust->tc.len,
44277 +                            clust_to_off(clust->index, inode),
44278 +                            WRITE_OP, &info->flow);
44279 +       jput(pos->child);
44280 +       return 0;
44281 +      err:
44282 +       jput(pos->child);
44283 +       free_convert_data(pos);
44284 +       return ret;
44285 +}
44286 +
44287 +/* clear up disk cluster info; the flow must be fully consumed by now */
44288 +static void detach_convert_idata(struct convert_info * sq)
44289 +{
44290 +       struct convert_item_info *info;
44291 +
44292 +       assert("edward-253", sq != NULL);
44293 +       assert("edward-840", sq->itm != NULL);
44294 +
44295 +       info = sq->itm;
44296 +       assert("edward-1212", info->flow.length == 0);
44297 +
44298 +       free_item_convert_data(sq);
44299 +       return;
44300 +}
44301 +
44302 +/* plugin->u.item.f.utmost_child */
44303 +
44304 +/* This function sets leftmost child for a first cluster item,
44305 +   if the child exists, and NULL in other cases. Always returns 0.
44306 +   NOTE-EDWARD: Do not call this for RIGHT_SIDE */
44307 +
44308 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
44309 +{
44310 +       reiser4_key key;
44311 +
44312 +       assert("edward-257", coord != NULL);
44313 +       assert("edward-258", child != NULL);
44314 +       assert("edward-259", side == LEFT_SIDE);
44315 +       assert("edward-260",
44316 +              item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
44317 +
44318 +       /* read the key only after @coord has been checked */
44319 +       item_key_by_coord(coord, &key);
44320 +       if (!is_disk_cluster_key(&key, coord))
44321 +               *child = NULL;
44322 +       else
44323 +               *child = jlookup(current_tree,
44324 +                                get_key_objectid(item_key_by_coord
44325 +                                                 (coord, &key)),
44326 +                                off_to_pg(get_key_offset(&key)));
44327 +       return 0;
44328 +}
44329 +
44330 +/* Returns true if @p2 is the next item to @p1
44331 +   in the _same_ disk cluster.
44332 +   Disk cluster is a set of items. If ->clustered() != NULL,
44333 +   with each item the whole disk cluster should be read/modified
44334 +*/
44335 +
44336 +/* Set d_next of the item convert data at @pos. It becomes DC_CHAINED_ITEM
44337 + * only if d_cur is not DC_AFTER_CLUSTER, the current item is the last one
44338 + * in its node, and the nearest non-empty right neighbor passes
44339 + * same_disk_cluster(); otherwise it is DC_AFTER_CLUSTER. Right neighbors
44340 + * may be not in the slum because of races, so empty and chained ones are
44341 + * made dirty and convertible. -E_NO_NEIGHBOR is returned as 0.
44342 + */
44343 +static int next_item_dc_stat(flush_pos_t * pos)
44344 +{
44345 +       int ret = 0;
44346 +       int stop = 0;
44347 +       znode *cur;
44348 +       coord_t coord;
44349 +       lock_handle lh;
44350 +       lock_handle right_lock;
44351 +
44352 +       assert("edward-1232", !node_is_empty(pos->coord.node));
44353 +       assert("edward-1014",
44354 +              pos->coord.item_pos < coord_num_items(&pos->coord));
44355 +       assert("edward-1015", chaining_data_present(pos));
44356 +       assert("edward-1017",
44357 +              item_convert_data(pos)->d_next == DC_INVALID_STATE);
44358 +       /* default answer; upgraded below only if a chained item is found */
44359 +       item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
44360 +
44361 +       if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
44362 +               return ret;
44363 +       if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
44364 +               return ret;
44365 +
44366 +       /* Current item is the last one in its node: check next slum item.
44367 +        * Note, that it can not be killed by concurrent truncate,
44368 +        * as the last one will want the lock held by us.
44369 +        */
44370 +       init_lh(&right_lock);
44371 +       cur = pos->coord.node;
44372 +
44373 +       while (!stop) {
44374 +               init_lh(&lh);
44375 +               ret = reiser4_get_right_neighbor(&lh,
44376 +                                                cur,
44377 +                                                ZNODE_WRITE_LOCK,
44378 +                                                GN_CAN_USE_UPPER_LEVELS);
44379 +               if (ret)
44380 +                       break;
44381 +               ret = zload(lh.node);
44382 +               if (ret) {
44383 +                       done_lh(&lh);
44384 +                       break;
44385 +               }
44386 +               coord_init_before_first_item(&coord, lh.node);
44387 +
44388 +               if (node_is_empty(lh.node)) {
44389 +                       znode_make_dirty(lh.node);
44390 +                       znode_set_convertible(lh.node);
44391 +                       stop = 0; /* empty node: keep going rightward */
44392 +               } else if (same_disk_cluster(&pos->coord, &coord)) {
44393 +
44394 +                       item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
44395 +
44396 +                       if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
44397 +                               /*
44398 +                                  warning("edward-1024",
44399 +                                  "next slum item mergeable, "
44400 +                                  "but znode %p isn't dirty\n",
44401 +                                  lh.node);
44402 +                                */
44403 +                               znode_make_dirty(lh.node);
44404 +                       }
44405 +                       if (!znode_convertible(lh.node)) {
44406 +                               /*
44407 +                                  warning("edward-1272",
44408 +                                  "next slum item mergeable, "
44409 +                                  "but znode %p isn't convertible\n",
44410 +                                  lh.node);
44411 +                                */
44412 +                               znode_set_convertible(lh.node);
44413 +                       }
44414 +                       stop = 1;
44415 +               } else
44416 +                       stop = 1; /* first item of another disk cluster */
44417 +               zrelse(lh.node);
44418 +               done_lh(&right_lock);
44419 +               copy_lh(&right_lock, &lh); /* keep the lock, free lh for reuse */
44420 +               done_lh(&lh);
44421 +               cur = right_lock.node;
44422 +       }
44423 +       done_lh(&right_lock);
44424 +       /* running out of right neighbors is not an error */
44425 +       if (ret == -E_NO_NEIGHBOR)
44426 +               ret = 0;
44427 +       return ret;
44428 +}
44429 +
44430 +static int
44431 +assign_convert_mode(struct convert_item_info * idata,
44432 +                   cryptcompress_write_mode_t * mode)
44433 +{
44434 +       int result = 0;
44435 +       /* returns 1 (leaving *mode untouched) if nothing is to be done, else 0 */
44436 +       assert("edward-1025", idata != NULL);
44437 +
44438 +       if (idata->flow.length) {
44439 +               /* flow is not empty: append or overwrite */
44440 +               switch (idata->d_cur) {
44441 +               case DC_FIRST_ITEM:
44442 +               case DC_CHAINED_ITEM:
44443 +                       *mode = CRC_OVERWRITE_ITEM;
44444 +                       break;
44445 +               case DC_AFTER_CLUSTER:
44446 +                       *mode = CRC_APPEND_ITEM;
44447 +                       break;
44448 +               default:
44449 +                       impossible("edward-1018", "wrong current item state");
44450 +               }
44451 +       } else {
44452 +               /* flow is empty: cut the item, or report that we are done */
44453 +               switch (idata->d_cur) {
44454 +               case DC_FIRST_ITEM:
44455 +               case DC_CHAINED_ITEM:
44456 +                       *mode = CRC_CUT_ITEM;
44457 +                       break;
44458 +               case DC_AFTER_CLUSTER:
44459 +                       result = 1; /* no flow left and no item to cut */
44460 +                       break;
44461 +               default:
44462 +                       impossible("edward-1019", "wrong current item state");
44463 +               }
44464 +       }
44465 +       return result;
44466 +}
44467 +
44468 +/* plugin->u.item.f.convert */
44469 +/* write ctail in guessed mode; returns 0 on success or an error code */
44470 +int convert_ctail(flush_pos_t * pos)
44471 +{
44472 +       int result;
44473 +       int nr_items;
44474 +       cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
44475 +
44476 +       assert("edward-1020", pos != NULL);
44477 +       assert("edward-261", pos->coord.node != NULL);
44478 +       assert("edward-1213", coord_num_items(&pos->coord) != 0);
44479 +       assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
44480 +       assert("edward-1258", ctail_ok(&pos->coord));
44481 +
44482 +       nr_items = coord_num_items(&pos->coord);
44483 +       if (!chaining_data_present(pos)) {
44484 +               if (should_attach_convert_idata(pos)) {
44485 +                       /* attach convert item info */
44486 +                       struct inode *inode;
44487 +
44488 +                       assert("edward-264", pos->child != NULL);
44489 +                       assert("edward-265", jnode_page(pos->child) != NULL);
44490 +                       assert("edward-266",
44491 +                              jnode_page(pos->child)->mapping != NULL);
44492 +
44493 +                       inode = jnode_page(pos->child)->mapping->host;
44494 +
44495 +                       assert("edward-267", inode != NULL);
44496 +
44497 +                       /* attach item convert info by child and put the last one */
44498 +                       result = attach_convert_idata(pos, inode);
44499 +                       pos->child = NULL;
44500 +                       if (result == -E_REPEAT) {
44501 +                               /* jnode became clean, or there is no dirty
44502 +                                  pages (nothing to update in disk cluster) */
44503 +                               warning("edward-1021",
44504 +                                       "convert_ctail: nothing to attach");
44505 +                               return 0;
44506 +                       }
44507 +                       if (result != 0)
44508 +                               return result;
44509 +               } else
44510 +                       /* unconvertible */
44511 +                       return 0;
44512 +       } else {
44513 +               /* use old convert info */
44514 +
44515 +               struct convert_item_info *idata;
44516 +
44517 +               idata = item_convert_data(pos);
44518 +
44519 +               result = assign_convert_mode(idata, &mode);
44520 +               if (result) {
44521 +                       /* disk cluster is over,
44522 +                          nothing to update anymore */
44523 +                       detach_convert_idata(pos->sq);
44524 +                       return 0;
44525 +               }
44526 +       }
44527 +
44528 +       assert("edward-433", chaining_data_present(pos));
44529 +       assert("edward-1022",
44530 +              pos->coord.item_pos < coord_num_items(&pos->coord));
44531 +
44532 +       /* check if next item is of current disk cluster */
44533 +       result = next_item_dc_stat(pos);
44534 +       if (result) {
44535 +               detach_convert_idata(pos->sq);
44536 +               return result;
44537 +       }
44538 +       result = do_convert_ctail(pos, mode);
44539 +       if (result) {
44540 +               detach_convert_idata(pos->sq);
44541 +               return result;
44542 +       }
44543 +       switch (mode) {
44544 +       case CRC_CUT_ITEM:
44545 +               assert("edward-1214", item_convert_data(pos)->flow.length == 0);
44546 +               assert("edward-1215",
44547 +                      coord_num_items(&pos->coord) == nr_items ||
44548 +                      coord_num_items(&pos->coord) == nr_items - 1);
44549 +               if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
44550 +                       break;
44551 +               if (coord_num_items(&pos->coord) != nr_items) {
44552 +                       /* the item was killed, no more chained items */
44553 +                       detach_convert_idata(pos->sq);
44554 +                       if (!node_is_empty(pos->coord.node))
44555 +                               /* make sure the next item will be scanned */
44556 +                               coord_init_before_item(&pos->coord);
44557 +                       break;
44558 +               } /* fall through: item survived the cut, cluster is over */
44559 +       case CRC_APPEND_ITEM:
44560 +               assert("edward-434", item_convert_data(pos)->flow.length == 0);
44561 +               detach_convert_idata(pos->sq);
44562 +               break;
44563 +       case CRC_OVERWRITE_ITEM:
44564 +               if (coord_is_unprepped_ctail(&pos->coord)) {
44565 +                       /* convert unprepped ctail to prepped one */
44566 +                       assert("edward-1259",
44567 +                              cluster_shift_ok(item_convert_data(pos)->
44568 +                                               cluster_shift));
44569 +                       put_unaligned((d8)item_convert_data(pos)->cluster_shift,
44570 +                                     &ctail_formatted_at(&pos->coord)->
44571 +                                     cluster_shift);
44572 +               }
44573 +               break;
44574 +       }
44575 +       return result;
44576 +}
44577 +
44578 +/* Make Linus happy.
44579 +   Local variables:
44580 +   c-indentation-style: "K&R"
44581 +   mode-name: "LC"
44582 +   c-basic-offset: 8
44583 +   tab-width: 8
44584 +   fill-column: 120
44585 +   End:
44586 +*/
44587 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/ctail.h linux-2.6.27/fs/reiser4/plugin/item/ctail.h
44588 --- linux-2.6.27.orig/fs/reiser4/plugin/item/ctail.h    1970-01-01 03:00:00.000000000 +0300
44589 +++ linux-2.6.27/fs/reiser4/plugin/item/ctail.h 2008-10-12 18:20:01.000000000 +0400
44590 @@ -0,0 +1,102 @@
44591 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44592 +
44593 +/* Ctail items are fragments (or bodies) of a special type to provide
44594 +   optimal storage of encrypted and/or compressed files. */
44595 +
44596 +
44597 +#if !defined( __FS_REISER4_CTAIL_H__ )
44598 +#define __FS_REISER4_CTAIL_H__
44599 +
44600 +/* Disk format of a ctail item: cluster_shift followed by the item body */
44601 +typedef struct ctail_item_format {
44602 +       /* packed cluster shift;
44603 +          unless its value equals UCTAIL_SHIFT (see below), the
44604 +          size of disk cluster is calculated as (1 << cluster_shift) */
44605 +       d8 cluster_shift;
44606 +       /* ctail body (zero-length array: the data follows cluster_shift) */
44607 +       d8 body[0];
44608 +} __attribute__ ((packed)) ctail_item_format;
44609 +
44610 +/* "Unprepped" disk cluster is represented by a single ctail item
44611 +   with the following "magic" attributes: */
44612 +/* "magic" cluster_shift */
44613 +#define UCTAIL_SHIFT 0xff
44614 +/* How many units unprepped ctail item has */
44615 +#define UCTAIL_NR_UNITS 1
44616 +
44617 +/* The following is a set of various item states in a disk cluster.
44618 +   Disk cluster is a set of items whose keys belong to the interval
44619 +   [dc_key , dc_key + disk_cluster_size - 1] */
44620 +typedef enum {
44621 +       DC_INVALID_STATE = 0, /* presumably "not determined yet" - confirm */
44622 +       DC_FIRST_ITEM = 1, /* NOTE(review): looks like first item of a cluster */
44623 +       DC_CHAINED_ITEM = 2, /* item continues the same disk cluster */
44624 +       DC_AFTER_CLUSTER = 3 /* the disk cluster is over at this position */
44625 +} dc_item_stat;
44626 +
44627 +/* ctail-specific coord extension.
44628 +   It describes parameters of the disk cluster an item belongs to */
44629 +struct ctail_coord_extension {
44630 +       int shift; /* cluster_shift extracted from ctail_item_format
44631 +                     (above); equals UCTAIL_SHIFT, the "magic" value,
44632 +                     for unprepped disk clusters */
44633 +       int dsize; /* size of a prepped disk cluster */
44634 +       int ncount; /* count of nodes occupied by a disk cluster */
44635 +};
44636 +
44637 +struct cut_list;
44638 +
44639 +/* plugin->u.item.b.* */
44640 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
44641 +                         const reiser4_item_data *);
44642 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
44643 +pos_in_node_t nr_units_ctail(const coord_t * coord);
44644 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
44645 +void print_ctail(const char *prefix, coord_t * coord);
44646 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
44647 +
44648 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
44649 +               carry_plugin_info * info UNUSED_ARG);
44650 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
44651 +int can_shift_ctail(unsigned free_space, coord_t * coord,
44652 +                   znode * target, shift_direction pend, unsigned *size,
44653 +                   unsigned want);
44654 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
44655 +                     unsigned count, shift_direction where_is_free_space,
44656 +                     unsigned free_space);
44657 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44658 +                   carry_cut_data *, reiser4_key * smallest_removed,
44659 +                   reiser4_key * new_first);
44660 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
44661 +                    carry_kill_data *, reiser4_key * smallest_removed,
44662 +                    reiser4_key * new_first);
44663 +int ctail_ok(const coord_t * coord);
44664 +int check_ctail(const coord_t * coord, const char **error);
44665 +
44666 +/* plugin->u.item.s.* */
44667 +int read_ctail(struct file *, flow_t *, hint_t *);
44668 +int readpage_ctail(void *, struct page *);
44669 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
44670 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
44671 +int create_hook_ctail(const coord_t * coord, void *arg);
44672 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
44673 +                   carry_kill_data *);
44674 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
44675 +
44676 +/* plugin->u.item.f */
44677 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
44678 +int scan_ctail(flush_scan *);
44679 +int convert_ctail(flush_pos_t *);
44680 +size_t inode_scaled_cluster_size(struct inode *);
44681 +
44682 +#endif                         /* __FS_REISER4_CTAIL_H__ */
44683 +
44684 +/* Make Linus happy.
44685 +   Local variables:
44686 +   c-indentation-style: "K&R"
44687 +   mode-name: "LC"
44688 +   c-basic-offset: 8
44689 +   tab-width: 8
44690 +   fill-column: 120
44691 +   End:
44692 +*/
44693 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/extent.c linux-2.6.27/fs/reiser4/plugin/item/extent.c
44694 --- linux-2.6.27.orig/fs/reiser4/plugin/item/extent.c   1970-01-01 03:00:00.000000000 +0300
44695 +++ linux-2.6.27/fs/reiser4/plugin/item/extent.c        2008-10-12 18:20:01.000000000 +0400
44696 @@ -0,0 +1,197 @@
44697 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44698 +
44699 +#include "item.h"
44700 +#include "../../key.h"
44701 +#include "../../super.h"
44702 +#include "../../carry.h"
44703 +#include "../../inode.h"
44704 +#include "../../page_cache.h"
44705 +#include "../../flush.h"
44706 +#include "../object.h"
44707 +
44708 +/* fill @data to describe @nr_extents extent units at @ext_unit; returns @data */
44709 +/* Audited by: green(2002.06.13) */
44710 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
44711 +                                  int nr_extents)
44712 +{
44713 +       data->data = ext_unit;
44714 +       /* data->data is kernel space */
44715 +       data->user = 0;
44716 +       data->length = sizeof(reiser4_extent) * nr_extents;
44717 +       data->arg = NULL;
44718 +       data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
44719 +       return data;
44720 +}
44721 +
44722 +/* number of bytes addressed by the first @nr extent units of the item */
44723 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
44724 +{
44725 +       reiser4_extent *unit = item_body_by_coord(coord);
44726 +       reiser4_block_nr total_width = 0;
44727 +       pos_in_node_t left;
44728 +
44729 +       assert("vs-263", nr <= nr_units_extent(coord));
44730 +
44731 +       /* sum up the widths (in blocks) of the leading @nr units */
44732 +       for (left = nr; left > 0; left--) {
44733 +               total_width += extent_get_width(unit);
44734 +               unit++;
44735 +       }
44736 +
44737 +       return total_width * current_blocksize;
44738 +}
44739 +
44740 +extent_state state_of_extent(reiser4_extent * ext)
44741 +{
44742 +       /* 0 marks a hole, 1 an unallocated extent. Compare the full-width
44743 +          start: narrowing to int misreads starts whose low bits are 0 or 1 */
44744 +       reiser4_block_nr start = extent_get_start(ext);
44745 +
44746 +       if (start == 0)
44747 +               return HOLE_EXTENT;
44748 +       if (start == 1)
44749 +               return UNALLOCATED_EXTENT;
44750 +       return ALLOCATED_EXTENT;
44751 +}
44752 +
44753 +int extent_is_unallocated(const coord_t * item)
44754 +{
44755 +       assert("jmacd-5133", item_is_extent(item));
44756 +       /* nonzero iff the extent unit at @item is in UNALLOCATED_EXTENT state */
44757 +       return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
44758 +}
44759 +
44760 +/* store @start and @width in the extent unit @ext */
44761 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
44762 +                       reiser4_block_nr width)
44763 +{
44764 +       extent_set_start(ext, start);
44765 +       extent_set_width(ext, width);
44766 +}
44767 +
44768 +/**
44769 + * reiser4_replace_extent - replace extent and paste 1 or 2 after it
44770 + * @h: replace handle; the fields used here are:
44771 + *   @h->coord, @h->lh: write-locked position of the extent to overwrite
44772 + *   @h->overwrite: new contents for the overwritten extent unit
44773 + *   @h->new_extents, @h->nr_new_extents: units to paste after it
44774 + *   @h->paste_key, @h->flags: key and flags passed to insert_into_item()
44775 + * @return_inserted_position: selects where @h->coord and @h->lh point on
44776 + *   successful return
44777 + *
44778 + * Overwrites one extent, pastes 1 or 2 more ones after overwritten one.  If
44779 + * @return_inserted_position is 1 - @h->coord and @h->lh are returned set to
44780 + * first of newly inserted units, if it is 0 - @h->coord and @h->lh are
44781 + * returned set to extent which was overwritten. Returns 0 or an error code.
44782 + */
44783 +int reiser4_replace_extent(struct replace_handle *h,
44784 +                          int return_inserted_position)
44785 +{
44786 +       int result;
44787 +       znode *orig_znode;
44788 +       /*ON_DEBUG(reiser4_extent orig_ext);*/  /* this is for debugging */
44789 +
44790 +       assert("vs-990", coord_is_existing_unit(h->coord));
44791 +       assert("vs-1375", znode_is_write_locked(h->coord->node));
44792 +       assert("vs-1426", extent_get_width(&h->overwrite) != 0);
44793 +       assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
44794 +       assert("vs-1427", ergo(h->nr_new_extents == 2,
44795 +                              extent_get_width(&h->new_extents[1]) != 0));
44796 +
44797 +       /* compose structure for paste */
44798 +       init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
44799 +       /* duplicate coord and lock; a tap watches the unit to be overwritten */
44800 +       coord_dup(&h->coord_after, h->coord);
44801 +       init_lh(&h->lh_after);
44802 +       copy_lh(&h->lh_after, h->lh);
44803 +       reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
44804 +       reiser4_tap_monitor(&h->watch);
44805 +
44806 +       ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
44807 +       orig_znode = h->coord->node;
44808 +
44809 +#if REISER4_DEBUG
44810 +       /* make sure that key is set properly */
44811 +       unit_key_by_coord(h->coord, &h->tmp);
44812 +       set_key_offset(&h->tmp,
44813 +                      get_key_offset(&h->tmp) +
44814 +                      extent_get_width(&h->overwrite) * current_blocksize);
44815 +       assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
44816 +#endif
44817 +
44818 +       /* set insert point after unit to be replaced */
44819 +       h->coord->between = AFTER_UNIT;
44820 +
44821 +       result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
44822 +                                 &h->paste_key, &h->item, h->flags);
44823 +       if (!result) {
44824 +               /* now we have to replace the unit after which new units were
44825 +                  inserted. Its position is tracked by @watch */
44826 +               reiser4_extent *ext;
44827 +               znode *node;
44828 +
44829 +               node = h->coord_after.node;
44830 +               if (node != orig_znode) { /* tracked unit is in another node */
44831 +                       coord_clear_iplug(&h->coord_after);
44832 +                       result = zload(node);
44833 +               }
44834 +
44835 +               if (likely(!result)) {
44836 +                       ext = extent_by_coord(&h->coord_after);
44837 +
44838 +                       assert("vs-987", znode_is_loaded(node));
44839 +                       assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
44840 +
44841 +                       /* overwrite extent unit */
44842 +                       memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
44843 +                       znode_make_dirty(node);
44844 +
44845 +                       if (node != orig_znode)
44846 +                               zrelse(node);
44847 +
44848 +                       if (return_inserted_position == 0) {
44849 +                               /* coord and lh are to be set to overwritten
44850 +                                  extent */
44851 +                               assert("vs-1662",
44852 +                                      WITH_DATA(node, !memcmp(&h->overwrite,
44853 +                                                              extent_by_coord(
44854 +                                                                      &h->coord_after),
44855 +                                                              sizeof(reiser4_extent))));
44856 +
44857 +                               *h->coord = h->coord_after;
44858 +                               done_lh(h->lh);
44859 +                               copy_lh(h->lh, &h->lh_after);
44860 +                       } else {
44861 +                               /* h->coord and h->lh are to be set to first of
44862 +                                  inserted units */
44863 +                               assert("vs-1663",
44864 +                                      WITH_DATA(h->coord->node,
44865 +                                                !memcmp(&h->new_extents[0],
44866 +                                                        extent_by_coord(h->coord),
44867 +                                                        sizeof(reiser4_extent))));
44868 +                               assert("vs-1664", h->lh->node == h->coord->node);
44869 +                       }
44870 +               }
44871 +       }
44872 +       reiser4_tap_done(&h->watch);
44873 +
44874 +       return result;
44875 +}
44876 +
44877 +lock_handle *znode_lh(znode *node)
44878 +{      /* return the first lock handle on the owners list of @node's lock */
44879 +       assert("vs-1371", znode_is_write_locked(node));
44880 +       assert("vs-1372", znode_is_wlocked_once(node));
44881 +       return list_entry(node->lock.owners.next, lock_handle, owners_link);
44882 +}
44883 +
44884 +/*
44885 + * Local variables:
44886 + * c-indentation-style: "K&R"
44887 + * mode-name: "LC"
44888 + * c-basic-offset: 8
44889 + * tab-width: 8
44890 + * fill-column: 79
44891 + * scroll-step: 1
44892 + * End:
44893 + */
44894 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/extent_file_ops.c linux-2.6.27/fs/reiser4/plugin/item/extent_file_ops.c
44895 --- linux-2.6.27.orig/fs/reiser4/plugin/item/extent_file_ops.c  1970-01-01 03:00:00.000000000 +0300
44896 +++ linux-2.6.27/fs/reiser4/plugin/item/extent_file_ops.c       2008-10-12 18:20:01.000000000 +0400
44897 @@ -0,0 +1,1450 @@
44898 +/* COPYRIGHT 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44899 +
44900 +#include "item.h"
44901 +#include "../../inode.h"
44902 +#include "../../page_cache.h"
44903 +#include "../object.h"
44904 +
44905 +#include <linux/quotaops.h>
44906 +#include <linux/swap.h>
44907 +
44908 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
44909 +{
44910 +       reiser4_extent *ext;
44911 +       /* extent unit located at zdata(@node) + @offset */
44912 +       ext = (reiser4_extent *) (zdata(node) + offset);
44913 +       return ext;
44914 +}
44915 +
44916 +/**
44917 + * check_uf_coord - verify coord extension
44918 + * @uf_coord: coord with extent extension to be checked
44919 + * @key: key the coord is expected to match, or NULL to skip that check
44920 + *
44921 + * Debug-only (REISER4_DEBUG): asserts that all fields of @uf_coord are set
44922 + * properly and, if @key is specified, that @uf_coord corresponds to it.
44923 + */
44924 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
44925 +{
44926 +#if REISER4_DEBUG
44927 +       const coord_t *coord;
44928 +       const struct extent_coord_extension *ext_coord;
44929 +       reiser4_extent *ext;
44930 +
44931 +       coord = &uf_coord->coord;
44932 +       ext_coord = &uf_coord->extension.extent;
44933 +       ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
44934 +       /* the extension must mirror the extent unit found in the node */
44935 +       assert("",
44936 +              WITH_DATA(coord->node,
44937 +                        (uf_coord->valid == 1 &&
44938 +                         coord_is_iplug_set(coord) &&
44939 +                         item_is_extent(coord) &&
44940 +                         ext_coord->nr_units == nr_units_extent(coord) &&
44941 +                         ext == extent_by_coord(coord) &&
44942 +                         ext_coord->width == extent_get_width(ext) &&
44943 +                         coord->unit_pos < ext_coord->nr_units &&
44944 +                         ext_coord->pos_in_unit < ext_coord->width &&
44945 +                         memcmp(ext, &ext_coord->extent,
44946 +                                sizeof(reiser4_extent)) == 0)));
44947 +       if (key) {
44948 +               reiser4_key coord_key;
44949 +               /* unit key advanced by pos_in_unit pages must equal @key */
44950 +               unit_key_by_coord(&uf_coord->coord, &coord_key);
44951 +               set_key_offset(&coord_key,
44952 +                              get_key_offset(&coord_key) +
44953 +                              (uf_coord->extension.extent.
44954 +                               pos_in_unit << PAGE_CACHE_SHIFT));
44955 +               assert("", keyeq(key, &coord_key));
44956 +       }
44957 +#endif
44958 +}
44959 +
44960 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
44961 +{
44962 +       check_uf_coord(uf_coord, NULL);
44963 +       /* extent unit at the offset stored in the coord extension */
44964 +       return ext_by_offset(uf_coord->coord.node,
44965 +                            uf_coord->extension.extent.ext_offset);
44966 +}
44967 +
44968 +#if REISER4_DEBUG
44969 +
44970 +/**
44971 + * offset_is_in_unit - check whether an offset falls into an extent unit
44972 + * @coord: coord of the extent unit
44973 + * @off: offset to test
44974 + *
44975 + */
44976 +/* return 1 if offset @off is inside of extent unit pointed to by @coord,
44977 +   0 otherwise. The unit covers width * blocksize bytes from its key offset */
44978 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
44979 +{
44980 +       reiser4_key unit_key;
44981 +       __u64 unit_off;
44982 +       reiser4_extent *ext;
44983 +
44984 +       ext = extent_by_coord(coord);
44985 +
44986 +       unit_key_extent(coord, &unit_key);
44987 +       unit_off = get_key_offset(&unit_key);
44988 +       if (off < unit_off)
44989 +               return 0;
44990 +       if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
44991 +               return 0;
44992 +       return 1;
44993 +}
44994 +
44995 +static int
44996 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
44997 +{
44998 +       reiser4_key item_key;
44999 +       /* debug helper: 1 if the offset of @key is inside the unit at @coord */
45000 +       assert("vs-771", coord_is_existing_unit(coord));
45001 +       assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
45002 +       assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
45003 +
45004 +       return offset_is_in_unit(coord, get_key_offset(key));
45005 +}
45006 +
45007 +#endif
45008 +
45009 +/**
45010 + * can_append - check whether a key continues an extent item
45011 + * @key: key to test
45012 + * @coord: coord set to the extent item
45013 + *
45014 + * Returns 1 if @key is equal to an append key of item @coord is set to
45015 + */
45016 +static int can_append(const reiser4_key *key, const coord_t *coord)
45017 +{
45018 +       reiser4_key append_key;
45019 +
45020 +       return keyeq(key, append_key_extent(coord, &append_key));
45021 +}
45022 +
45023 +/**
45024 + * append_hole - append an extent item with a hole reaching up to @key
45025 + * @coord: coord set to an extent item on the twig level
45026 + * @lh: lock handle passed to insert_into_item()
45027 + * @key: key the hole has to reach; not less than the item's append key
45028 + *
45029 + * Returns 0 or the result of insert_into_item().
45030 +static int append_hole(coord_t *coord, lock_handle *lh,
45031 +                      const reiser4_key *key)
45032 +{
45033 +       reiser4_key append_key;
45034 +       reiser4_block_nr hole_width;
45035 +       reiser4_extent *ext, new_ext;
45036 +       reiser4_item_data idata;
45037 +
45038 +       /* last item of file may have to be appended with hole */
45039 +       assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
45040 +       assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
45041 +
45042 +       /* key of first byte which is not addressed by this extent */
45043 +       append_key_extent(coord, &append_key);
45044 +
45045 +       assert("", keyle(&append_key, key));
45046 +
45047 +       /*
45048 +        * extent item has to be appended with hole. Calculate length of that
45049 +        * hole in blocks, rounding the byte distance up
45050 +        */
45051 +       hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
45052 +                      current_blocksize - 1) >> current_blocksize_bits);
45053 +       assert("vs-954", hole_width > 0);
45054 +
45055 +       /* set coord after last unit */
45056 +       coord_init_after_item_end(coord);
45057 +
45058 +       /* get last extent in the item */
45059 +       ext = extent_by_coord(coord);
45060 +       if (state_of_extent(ext) == HOLE_EXTENT) {
45061 +               /*
45062 +                * last extent of a file is hole extent. Widen that extent by
45063 +                * @hole_width blocks. Note that we do not worry about
45064 +                * overflowing - extent width is 64 bits
45065 +                */
45066 +               reiser4_set_extent(ext, HOLE_EXTENT_START,
45067 +                                  extent_get_width(ext) + hole_width);
45068 +               znode_make_dirty(coord->node);
45069 +               return 0;
45070 +       }
45071 +
45072 +       /* append last item of the file with hole extent unit */
45073 +       assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
45074 +                         state_of_extent(ext) == UNALLOCATED_EXTENT));
45075 +
45076 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45077 +       init_new_extent(&idata, &new_ext, 1);
45078 +       return insert_into_item(coord, lh, &append_key, &idata, 0);
45079 +}
45080 +
45081 +/**
45082 + * check_jnodes - debug check that a page range is covered by @twig
45083 + * @twig: longterm locked twig node
45084 + * @key: key of the first byte of the range
45085 + * @count: number of PAGE_CACHE_SIZE pages in the range
45086 + */
45087 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
45088 +{
45089 +#if REISER4_DEBUG
45090 +       coord_t c;
45091 +       reiser4_key node_key, jnode_key;
45092 +
45093 +       jnode_key = *key;
45094 +
45095 +       assert("", twig != NULL);
45096 +       assert("", znode_get_level(twig) == TWIG_LEVEL);
45097 +       assert("", znode_is_write_locked(twig));
45098 +
45099 +       if (zload(twig) != 0)
45100 +               return; /* node data is not available: nothing to check */
45101 +       coord_init_first_unit(&c, twig); /* smallest key in twig node */
45102 +       unit_key_by_coord(&c, &node_key);
45103 +       assert("", keyle(&node_key, &jnode_key));
45104 +
45105 +       coord_init_last_unit(&c, twig);
45106 +       unit_key_by_coord(&c, &node_key);
45107 +       if (item_plugin_by_coord(&c)->s.file.append_key)
45108 +               item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
45109 +       set_key_offset(&jnode_key,
45110 +                      get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
45111 +       assert("", keylt(&jnode_key, &node_key));
45112 +       zrelse(twig);
45113 +#endif
45114 +}
45115 +
45116 +/**
45117 + * append_last_extent - append last file item
45118 + * @uf_coord: coord to start insertion from
45119 + * @key: key of position to write at (offset of jnodes[0] if @count != 0)
45120 + * @jnodes: array of jnodes
45121 + * @count: number of jnodes in the array
45122 + * The file already has an extent item. If can_append() fails for @key, only
45123 + * a hole is appended and append_hole() result is returned. Else add unallocated
45124 + * extent of width @count, give jnodes fake blocks. Returns @count, 0 or error.
45125 + */
45126 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45127 +                             jnode **jnodes, int count)
45128 +{
45129 +       int result;
45130 +       reiser4_extent new_ext;
45131 +       reiser4_item_data idata;
45132 +       coord_t *coord;
45133 +       struct extent_coord_extension *ext_coord;
45134 +       reiser4_extent *ext;
45135 +       reiser4_block_nr block;
45136 +       jnode *node;
45137 +       int i;
45138 +
45139 +       coord = &uf_coord->coord;
45140 +       ext_coord = &uf_coord->extension.extent;
45141 +       ext = ext_by_ext_coord(uf_coord);
45142 +
45143 +       /* check correctness of position in the item */
45144 +       assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
45145 +       assert("vs-1311", coord->between == AFTER_UNIT);
45146 +       assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
45147 +
45148 +       if (!can_append(key, coord)) {
45149 +               /* hole extent has to be inserted */
45150 +               result = append_hole(coord, uf_coord->lh, key);
45151 +               uf_coord->valid = 0;
45152 +               return result;
45153 +       }
45154 +
45155 +       if (count == 0)
45156 +               return 0;
45157 +
45158 +       assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
45159 +
45160 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
45161 +                                          count);
45162 +       BUG_ON(result != 0); /* NOTE(review): any nonzero result is fatal here */
45163 +
45164 +       switch (state_of_extent(ext)) {
45165 +       case UNALLOCATED_EXTENT:
45166 +               /*
45167 +                * last extent unit of the file is unallocated one. Increase
45168 +                * its width by @count
45169 +                */
45170 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
45171 +                                  extent_get_width(ext) + count);
45172 +               znode_make_dirty(coord->node);
45173 +
45174 +               /* update coord extension */
45175 +               ext_coord->width += count;
45176 +               ON_DEBUG(extent_set_width
45177 +                        (&uf_coord->extension.extent.extent,
45178 +                         ext_coord->width));
45179 +               break;
45180 +
45181 +       case HOLE_EXTENT:
45182 +       case ALLOCATED_EXTENT:
45183 +               /*
45184 +                * last extent unit of the file is either hole or allocated
45185 +                * one. Append one unallocated extent of width @count
45186 +                */
45187 +               reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45188 +               init_new_extent(&idata, &new_ext, 1);
45189 +               result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
45190 +               uf_coord->valid = 0;
45191 +               if (result)
45192 +                       return result;
45193 +               break;
45194 +
45195 +       default:
45196 +               return RETERR(-EIO);
45197 +       }
45198 +
45199 +       /*
45200 +        * make sure that we hold long term locked twig node containing all
45201 +        * jnodes we are about to capture
45202 +        */
45203 +       check_jnodes(uf_coord->lh->node, key, count);
45204 +
45205 +       /*
45206 +        * assign fake block numbers to all jnodes. FIXME: make sure whether
45207 +        * twig node containing inserted extent item is locked
45208 +        */
45209 +       block = fake_blocknr_unformatted(count);
45210 +       for (i = 0; i < count; i ++, block ++) {
45211 +               node = jnodes[i];
45212 +               spin_lock_jnode(node);
45213 +               JF_SET(node, JNODE_CREATED);
45214 +               jnode_set_block(node, &block);
45215 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45216 +               BUG_ON(result != 0);
45217 +               jnode_make_dirty_locked(node);
45218 +               spin_unlock_jnode(node);
45219 +       }
45220 +       return count;
45221 +}
45222 +
45223 +/**
45224 + * insert_first_hole - insert hole extent into tree
45225 + * @coord: coord between items where the new extent item is to be inserted
45226 + * @lh: lock handle, passed on to insert_extent_by_coord()
45227 + * @key: key of the position which the hole has to reach
45228 + *
45229 + * Inserts an item keyed at offset 0 made of one hole extent whose width is the
45230 + * offset of @key rounded up to blocks. Returns insert_extent_by_coord() result.
45231 + */
45232 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
45233 +                            const reiser4_key *key)
45234 +{
45235 +       reiser4_extent new_ext;
45236 +       reiser4_item_data idata;
45237 +       reiser4_key item_key;
45238 +       reiser4_block_nr hole_width;
45239 +
45240 +       /* @coord must be set for inserting of new item */
45241 +       assert("vs-711", coord_is_between_items(coord));
45242 +
45243 +       item_key = *key;
45244 +       set_key_offset(&item_key, 0ull);
45245 +
45246 +       hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
45247 +                     current_blocksize_bits);
45248 +       assert("vs-710", hole_width > 0);
45249 +
45250 +       /* compose body of hole extent and insert item into tree */
45251 +       reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
45252 +       init_new_extent(&idata, &new_ext, 1);
45253 +       return insert_extent_by_coord(coord, &idata, &item_key, lh);
45254 +}
45254 +
45255 +
45256 +/**
45257 + * insert_first_extent - insert first file item
45258 + * @uf_coord: coord to start insertion from
45259 + * @key: key of the position to write at
45260 + * @jnodes: array of jnodes
45261 + * @count: number of jnodes in the array
45262 + * @inode: inode of file
45263 + *
45264 + * There are no items of file @inode in the tree yet. If @key is not at offset
45265 + * 0, only a hole extent is inserted and the result of that is returned. Else
45266 + * insert unallocated extent of width @count and assign fake block numbers to
45267 + * the jnodes. Returns number of jnodes, 0 if @count is 0, or error code.
45268 + */
45269 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45270 +                              jnode **jnodes, int count,
45271 +                              struct inode *inode)
45272 +{
45273 +       int result;
45274 +       int i;
45275 +       reiser4_extent new_ext;
45276 +       reiser4_item_data idata;
45277 +       reiser4_block_nr block;
45278 +       struct unix_file_info *uf_info;
45279 +       jnode *node;
45280 +
45281 +       /* first extent insertion starts at leaf level */
45282 +       assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
45283 +       assert("vs-711", coord_is_between_items(&uf_coord->coord));
45284 +
45285 +       if (get_key_offset(key) != 0) {
45286 +               result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
45287 +               uf_coord->valid = 0;
45288 +               uf_info = unix_file_inode_data(inode);
45289 +
45290 +               /*
45291 +                * first item insertion is only possible when writing to empty
45292 +                * file or performing tail conversion
45293 +                */
45294 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
45295 +                           (reiser4_inode_get_flag(inode,
45296 +                                                   REISER4_PART_MIXED) &&
45297 +                            reiser4_inode_get_flag(inode,
45298 +                                                   REISER4_PART_IN_CONV))));
45299 +               /* if file was empty - update its state */
45300 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
45301 +                       uf_info->container = UF_CONTAINER_EXTENTS;
45302 +               return result;
45303 +       }
45304 +
45305 +       if (count == 0)
45306 +               return 0;
45307 +
45308 +       result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
45309 +       BUG_ON(result != 0);
45310 +
45311 +       /*
45312 +        * prepare for tree modification: compose body of item and item data
45313 +        * structure needed for insertion
45314 +        */
45315 +       reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
45316 +       init_new_extent(&idata, &new_ext, 1);
45317 +
45318 +       /* insert extent item into the tree */
45319 +       result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
45320 +                                       uf_coord->lh);
45321 +       if (result)
45322 +               return result;
45323 +
45324 +       /*
45325 +        * make sure that we hold long term locked twig node containing all
45326 +        * jnodes we are about to capture
45327 +        */
45328 +       check_jnodes(uf_coord->lh->node, key, count);
45329 +       /*
45330 +        * assign fake block numbers to all jnodes, capture and mark them dirty
45331 +        */
45332 +       block = fake_blocknr_unformatted(count);
45333 +       for (i = 0; i < count; i ++, block ++) {
45334 +               node = jnodes[i];
45335 +               spin_lock_jnode(node);
45336 +               JF_SET(node, JNODE_CREATED);
45337 +               jnode_set_block(node, &block);
45338 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45339 +               BUG_ON(result != 0);
45340 +               jnode_make_dirty_locked(node);
45341 +               spin_unlock_jnode(node);
45342 +       }
45343 +
45344 +       /*
45345 +        * invalidate coordinate, search must be performed again to continue
45346 +        * because write will continue on twig level
45347 +        */
45348 +       uf_coord->valid = 0;
45349 +       return count;
45350 +}
45351 +
45352 +/**
45353 + * plug_hole - replace hole extent with unallocated and holes
45354 + * @uf_coord: coord set to the block of a hole extent which is to be plugged
45355 + * @key: key of that block, only passed to check_uf_coord()
45356 + * @how: set to 1..6 to tell which of the cases below was taken
45357 + *
45358 + * Creates an unallocated extent of width 1 within a hole. In worst case two
45359 + * additional extents can be created. @uf_coord is invalidated whenever
45360 + * reiser4_replace_extent() is used. Returns 0 or the result of that call.
45361 + */
45362 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
45363 +{
45364 +       struct replace_handle rh;
45365 +       reiser4_extent *ext;
45366 +       reiser4_block_nr width, pos_in_unit;
45367 +       coord_t *coord;
45368 +       struct extent_coord_extension *ext_coord;
45369 +       int return_inserted_position;
45370 +
45371 +       check_uf_coord(uf_coord, key);
45372 +
45373 +       rh.coord = coord_by_uf_coord(uf_coord);
45374 +       rh.lh = uf_coord->lh;
45375 +       rh.flags = 0;
45376 +
45377 +       coord = coord_by_uf_coord(uf_coord);
45378 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
45379 +       ext = ext_by_ext_coord(uf_coord);
45380 +
45381 +       width = ext_coord->width;
45382 +       pos_in_unit = ext_coord->pos_in_unit;
45383 +
45384 +       *how = 0;
45385 +       if (width == 1) {
45386 +               reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
45387 +               znode_make_dirty(coord->node);
45388 +               /* update uf_coord */
45389 +               ON_DEBUG(ext_coord->extent = *ext);
45390 +               *how = 1;
45391 +               return 0;
45392 +       } else if (pos_in_unit == 0) {
45393 +               /* we deal with first element of extent */
45394 +               if (coord->unit_pos) {
45395 +                       /* there is an extent to the left */
45396 +                       if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
45397 +                               /*
45398 +                                * left neighboring unit is an unallocated
45399 +                                * extent. Increase its width and decrease
45400 +                                * width of hole
45401 +                                */
45402 +                               extent_set_width(ext - 1,
45403 +                                                extent_get_width(ext - 1) + 1);
45404 +                               extent_set_width(ext, width - 1);
45405 +                               znode_make_dirty(coord->node);
45406 +
45407 +                               /* update coord extension */
45408 +                               coord->unit_pos--;
45409 +                               ext_coord->width = extent_get_width(ext - 1);
45410 +                               ext_coord->pos_in_unit = ext_coord->width - 1;
45411 +                               ext_coord->ext_offset -= sizeof(reiser4_extent);
45412 +                               ON_DEBUG(ext_coord->extent =
45413 +                                        *extent_by_coord(coord));
45414 +                               *how = 2;
45415 +                               return 0;
45416 +                       }
45417 +               }
45418 +               /* extent for replace */
45419 +               reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
45420 +               /* extent to be inserted */
45421 +               reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
45422 +                                  width - 1);
45423 +               rh.nr_new_extents = 1;
45424 +
45425 +               /* have reiser4_replace_extent to return with @coord and
45426 +                  @uf_coord->lh set to unit which was replaced */
45427 +               return_inserted_position = 0;
45428 +               *how = 3;
45429 +       } else if (pos_in_unit == width - 1) {
45430 +               /* we deal with last element of extent */
45431 +               if (coord->unit_pos < nr_units_extent(coord) - 1) {
45432 +                       /* there is an extent unit to the right */
45433 +                       if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
45434 +                               /*
45435 +                                * right neighboring unit is an unallocated
45436 +                                * extent. Increase its width and decrease
45437 +                                * width of hole
45438 +                                */
45439 +                               extent_set_width(ext + 1,
45440 +                                                extent_get_width(ext + 1) + 1);
45441 +                               extent_set_width(ext, width - 1);
45442 +                               znode_make_dirty(coord->node);
45443 +
45444 +                               /* update coord extension */
45445 +                               coord->unit_pos++;
45446 +                               ext_coord->width = extent_get_width(ext + 1);
45447 +                               ext_coord->pos_in_unit = 0;
45448 +                               ext_coord->ext_offset += sizeof(reiser4_extent);
45449 +                               ON_DEBUG(ext_coord->extent =
45450 +                                        *extent_by_coord(coord));
45451 +                               *how = 4;
45452 +                               return 0;
45453 +                       }
45454 +               }
45455 +               /* extent for replace */
45456 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
45457 +               /* extent to be inserted */
45458 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45459 +                                  1);
45460 +               rh.nr_new_extents = 1;
45461 +
45462 +               /* have reiser4_replace_extent to return with @coord and
45463 +                  @uf_coord->lh set to unit which was inserted */
45464 +               return_inserted_position = 1;
45465 +               *how = 5;
45466 +       } else {
45467 +               /* block is in the middle of the hole: hole is split in two */
45468 +               reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
45469 +                                  pos_in_unit);
45470 +               /* extents to be inserted */
45471 +               reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
45472 +                                  1);
45473 +               reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
45474 +                                  width - pos_in_unit - 1);
45475 +               rh.nr_new_extents = 2;
45476 +
45477 +               /* have reiser4_replace_extent to return with @coord and
45478 +                  @uf_coord->lh set to first of units which were inserted */
45479 +               return_inserted_position = 1;
45480 +               *how = 6;
45481 +       }
45482 +       unit_key_by_coord(coord, &rh.paste_key);
45483 +       set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
45484 +                      extent_get_width(&rh.overwrite) * current_blocksize);
45485 +
45486 +       uf_coord->valid = 0;
45487 +       return reiser4_replace_extent(&rh, return_inserted_position);
45488 +}
45489 +
45490 +/**
45491 + * overwrite_one_block - give block number to jnode lying over existing extent
45492 + * @uf_coord: coord set at the extent unit and position @node corresponds to
45493 + * @key: key of @node's position, passed to plug_hole()
45494 + * @node: jnode to assign block number to
45495 + * @hole_plugged: if not NULL, set to 1 when a hole was plugged
45496 + * If @node corresponds to hole extent - create unallocated extent for it and
45497 + * assign fake block number. If @node corresponds to allocated extent - assign
45498 + * block number taken from it. Returns 0, error of plug_hole() or -EIO.
45499 + */
45500 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
45501 +                              jnode *node, int *hole_plugged)
45502 +{
45503 +       int result;
45504 +       struct extent_coord_extension *ext_coord;
45505 +       reiser4_extent *ext;
45506 +       reiser4_block_nr block;
45507 +       int how;
45508 +
45509 +       assert("vs-1312", uf_coord->coord.between == AT_UNIT);
45510 +
45511 +       result = 0;
45512 +       ext_coord = ext_coord_by_uf_coord(uf_coord);
45513 +       ext = ext_by_ext_coord(uf_coord);
45514 +       assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
45515 +
45516 +       switch (state_of_extent(ext)) {
45517 +       case ALLOCATED_EXTENT:
45518 +               block = extent_get_start(ext) + ext_coord->pos_in_unit;
45519 +               break;
45520 +
45521 +       case HOLE_EXTENT:
45522 +               result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
45523 +               BUG_ON(result != 0);
45524 +               result = plug_hole(uf_coord, key, &how);
45525 +               if (result)
45526 +                       return result;
45527 +               block = fake_blocknr_unformatted(1);
45528 +               if (hole_plugged)
45529 +                       *hole_plugged = 1;
45530 +               JF_SET(node, JNODE_CREATED);
45531 +               break;
45532 +
45533 +       default:
45534 +               return RETERR(-EIO);
45535 +       }
45536 +
45537 +       jnode_set_block(node, &block);
45538 +       return 0;
45539 +}
45540 +
45541 +/**
45542 + * move_coord - move coordinate forward
45543 + * @uf_coord: coord with extent extension which is to be advanced
45544 + *
45545 + * Move coordinate one data block pointer forward. Return 1 if coord is set to
45546 + * the last one already or is invalid (coord is left invalid then), else 0.
45547 + */
45548 +static int move_coord(uf_coord_t *uf_coord)
45549 +{
45550 +       struct extent_coord_extension *ext_coord;
45551 +
45552 +       if (uf_coord->valid == 0)
45553 +               return 1;
45554 +       ext_coord = &uf_coord->extension.extent;
45555 +       ext_coord->pos_in_unit ++;
45556 +       if (ext_coord->pos_in_unit < ext_coord->width)
45557 +               /* coordinate moved within the unit */
45558 +               return 0;
45559 +
45560 +       /* end of unit is reached. Try to move to next unit */
45561 +       ext_coord->pos_in_unit = 0;
45562 +       uf_coord->coord.unit_pos ++;
45563 +       if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
45564 +               /* coordinate moved to next unit */
45565 +               ext_coord->ext_offset += sizeof(reiser4_extent);
45566 +               ext_coord->width =
45567 +                       extent_get_width(ext_by_offset
45568 +                                        (uf_coord->coord.node,
45569 +                                         ext_coord->ext_offset));
45570 +               ON_DEBUG(ext_coord->extent =
45571 +                        *ext_by_offset(uf_coord->coord.node,
45572 +                                       ext_coord->ext_offset));
45573 +               return 0;
45574 +       }
45575 +       /* end of item is reached */
45576 +       uf_coord->valid = 0;
45577 +       return 1;
45578 +}
45579 +
45580 +/**
45581 + * overwrite_extent - capture and dirty jnodes lying over existing extents
45582 + * @uf_coord: coord of first jnode; @key: its key; @jnodes, @count: jnode array
45583 + * @plugged_hole: passed to overwrite_one_block(), may be NULL
45584 + * Returns number of handled jnodes (may be less than @count) or error code.
45585 + */
45586 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
45587 +                           jnode **jnodes, int count, int *plugged_hole)
45588 +{
45589 +       int result;
45590 +       reiser4_key k;
45591 +       int i;
45592 +       jnode *node;
45593 +
45594 +       k = *key;
45595 +       for (i = 0; i < count; i ++) {
45596 +               node = jnodes[i];
45597 +               if (*jnode_get_block(node) == 0) {
45598 +                       result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
45599 +                       if (result)
45600 +                               return result;
45601 +               }
45602 +               /*
45603 +                * make sure that we hold long term locked twig node containing
45604 +                * all jnodes we are about to capture
45605 +                */
45606 +               check_jnodes(uf_coord->lh->node, &k, 1);
45607 +               /*
45608 +                * block number is set by now: capture the jnode and mark
45609 +                * it dirty
45610 +                */
45611 +               spin_lock_jnode(node);
45612 +               result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
45613 +               BUG_ON(result != 0);
45614 +               jnode_make_dirty_locked(node);
45615 +               spin_unlock_jnode(node);
45616 +
45617 +               if (uf_coord->valid == 0)
45618 +                       return i + 1;
45619 +
45620 +               check_uf_coord(uf_coord, &k);
45621 +
45622 +               if (move_coord(uf_coord)) {
45623 +                       /*
45624 +                        * failed to move to the next node pointer. Either end
45625 +                        * of file or end of twig node is reached. In the latter
45626 +                        * case we might go to the right neighbor.
45627 +                        */
45628 +                       uf_coord->valid = 0;
45629 +                       return i + 1;
45630 +               }
45631 +               set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
45632 +       }
45633 +
45634 +       return count;
45635 +}
45636 +
45637 +/**
45638 + * reiser4_update_extent - update extent items of file for one jnode
45639 + * @inode: inode of file
45640 + * @node: jnode to be captured, corresponds to offset @pos
45641 + * @pos: offset in file, used to build the key to look up
45642 + * @plugged_hole: passed to overwrite_extent(), see overwrite_one_block()
45643 + * Returns 0 if the jnode was handled, error code otherwise.
45644 + */
45645 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
45646 +                 int *plugged_hole)
45647 +{
45648 +       int result;
45649 +       znode *loaded;
45650 +       uf_coord_t uf_coord;
45651 +       coord_t *coord;
45652 +       lock_handle lh;
45653 +       reiser4_key key;
45654 +
45655 +       assert("", reiser4_lock_counters()->d_refs == 0);
45656 +
45657 +       key_by_inode_and_offset_common(inode, pos, &key);
45658 +
45659 +       init_uf_coord(&uf_coord, &lh);
45660 +       coord = &uf_coord.coord;
45661 +       result = find_file_item_nohint(coord, &lh, &key,
45662 +                                      ZNODE_WRITE_LOCK, inode);
45663 +       if (IS_CBKERR(result)) {
45664 +               assert("", reiser4_lock_counters()->d_refs == 0);
45665 +               return result;
45666 +       }
45667 +
45668 +       result = zload(coord->node);
45669 +       BUG_ON(result != 0);
45670 +       loaded = coord->node;
45671 +
45672 +       if (coord->between == AFTER_UNIT) {
45673 +               /*
45674 +                * append existing extent item with unallocated extent of width
45675 +                * 1
45676 +                */
45677 +               init_coord_extension_extent(&uf_coord,
45678 +                                           get_key_offset(&key));
45679 +               result = append_last_extent(&uf_coord, &key,
45680 +                                           &node, 1);
45681 +       } else if (coord->between == AT_UNIT) {
45682 +               /*
45683 +                * overwrite
45684 +                * not optimal yet. Will be optimized if new write will show
45685 +                * performance win.
45686 +                */
45687 +               init_coord_extension_extent(&uf_coord,
45688 +                                           get_key_offset(&key));
45689 +               result = overwrite_extent(&uf_coord, &key,
45690 +                                         &node, 1, plugged_hole);
45691 +       } else {
45692 +               /*
45693 +                * there are no items of this file in the tree yet. Create
45694 +                * first item of the file inserting one unallocated extent of
45695 +                * width 1
45696 +                */
45697 +               result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
45698 +       }
45699 +       assert("", result == 1 || result < 0);
45700 +       zrelse(loaded);
45701 +       done_lh(&lh);
45702 +       assert("", reiser4_lock_counters()->d_refs == 0);
45703 +       return (result == 1) ? 0 : result;
45704 +}
45705 +
45706 +/**
45707 + * update_extents - update extent items of file for @jnodes and capture them
45708 + * @file: file the hint is loaded from and saved to
45709 + * @inode: inode of file
45710 + * @jnodes: array of jnodes
45711 + * @count: number of jnodes in the array, 0 is special case: expanding truncate
45712 + * @pos: file offset, used only if @count is 0 (else taken from jnodes[0])
45713 + */
45714 +static int update_extents(struct file *file, struct inode *inode,
45715 +                         jnode **jnodes, int count, loff_t pos)
45716 +{
45717 +       struct hint hint;
45718 +       reiser4_key key;
45719 +       int result;
45720 +       znode *loaded;
45721 +
45722 +       result = load_file_hint(file, &hint);
45723 +       BUG_ON(result != 0);
45724 +
45725 +       if (count != 0)
45726 +               /*
45727 +                * count == 0 is special case: expanding truncate
45728 +                */
45729 +               pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
45730 +       key_by_inode_and_offset_common(inode, pos, &key);
45731 +
45732 +       assert("", reiser4_lock_counters()->d_refs == 0);
45733 +
45734 +       do {
45735 +               result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
45736 +               if (IS_CBKERR(result)) {
45737 +                       assert("", reiser4_lock_counters()->d_refs == 0);
45738 +                       return result;
45739 +               }
45740 +
45741 +               result = zload(hint.ext_coord.coord.node);
45742 +               BUG_ON(result != 0);
45743 +               loaded = hint.ext_coord.coord.node;
45744 +
45745 +               if (hint.ext_coord.coord.between == AFTER_UNIT) {
45746 +                       /*
45747 +                        * append existing extent item with unallocated extent
45748 +                        * of width nr_jnodes
45749 +                        */
45750 +                       if (hint.ext_coord.valid == 0)
45751 +                               /* NOTE: get statistics on this */
45752 +                               init_coord_extension_extent(&hint.ext_coord,
45753 +                                                           get_key_offset(&key));
45754 +                       result = append_last_extent(&hint.ext_coord, &key,
45755 +                                                   jnodes, count);
45756 +               } else if (hint.ext_coord.coord.between == AT_UNIT) {
45757 +                       /*
45758 +                        * overwrite
45759 +                        * not optimal yet. Will be optimized if new write will
45760 +                        * show performance win.
45761 +                        */
45762 +                       if (hint.ext_coord.valid == 0)
45763 +                               /* NOTE: get statistics on this */
45764 +                               init_coord_extension_extent(&hint.ext_coord,
45765 +                                                           get_key_offset(&key));
45766 +                       result = overwrite_extent(&hint.ext_coord, &key,
45767 +                                                 jnodes, count, NULL);
45768 +               } else {
45769 +                       /*
45770 +                        * there are no items of this file in the tree
45771 +                        * yet. Create first item of the file inserting one
45772 +                        * unallocated extent of width nr_jnodes
45773 +                        */
45774 +                       result = insert_first_extent(&hint.ext_coord, &key,
45775 +                                                    jnodes, count, inode);
45776 +               }
45777 +               zrelse(loaded);
45778 +               if (result < 0) {
45779 +                       done_lh(hint.ext_coord.lh);
45780 +                       break;
45781 +               }
45782 +
45783 +               jnodes += result;
45784 +               count -= result;
45785 +               set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
45786 +
45787 +               /* seal and unlock znode */
45788 +               if (hint.ext_coord.valid)
45789 +                       reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
45790 +               else
45791 +                       reiser4_unset_hint(&hint);
45792 +
45793 +       } while (count > 0);
45794 +
45795 +       save_file_hint(file, &hint);
45796 +       assert("", reiser4_lock_counters()->d_refs == 0);
45797 +       return result; /* negative error or result of the last step */
45798 +}
45799 +
45800 +/**
45801 + * write_extent_reserve_space - reserve space for extent write operation
45802 + * @inode: inode of file, used to find the tree to make estimations for
45803 + *
45804 + * Estimates and reserves space which may be required for writing
45805 + * WRITE_GRANULARITY pages of file. Returns result of reiser4_grab_space().
45806 + */
45807 +static int write_extent_reserve_space(struct inode *inode)
45808 +{
45809 +       __u64 count;
45810 +       reiser4_tree *tree;
45811 +
45812 +       /*
45813 +        * to write WRITE_GRANULARITY pages to a file by extents we have to
45814 +        * reserve disk space for:
45815 +
45816 +        * 1. find_file_item may have to insert empty node to the tree (empty
45817 +        * leaf node between two extent items). This requires 1 block and
45818 +        * number of blocks which are necessary to perform insertion of an
45819 +        * internal item into twig level.
45820 +
45821 +        * 2. for each of written pages there might be needed 1 block and
45822 +        * number of blocks which might be necessary to perform insertion of or
45823 +        * paste to an extent item.
45824 +
45825 +        * 3. stat data update
45826 +        */
45827 +       tree = reiser4_tree_by_inode(inode);
45828 +       count = estimate_one_insert_item(tree) +
45829 +               WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
45830 +               estimate_one_insert_item(tree);
45831 +       grab_space_enable();
45832 +       return reiser4_grab_space(count, 0 /* flags */);
45833 +}
45834 +
45835 +/*
45836 + * Generic code dropped filemap_copy_from_user because it is deadlocky
45837 + * (copying from user space while holding the page lock is bad). reiser4
45838 + * still calls it, so a private copy is defined here as a temporary fix.
45839 + */
45840 +static inline size_t
45841 +filemap_copy_from_user(struct page *page, unsigned long offset,
45842 +                       const char __user *buf, unsigned bytes)
45843 +{
45844 +       int rest;
45845 +       char *vaddr = kmap_atomic(page, KM_USER0);
45846 +
45847 +       /* fast path: copy through an atomic mapping of the page */
45848 +       rest = __copy_from_user_inatomic_nocache(vaddr + offset, buf, bytes);
45849 +       kunmap_atomic(vaddr, KM_USER0);
45850 +       if (rest == 0)
45851 +               return bytes;
45852 +
45853 +       /* Do it the slow way: redo the whole copy through a regular kmap */
45854 +       vaddr = kmap(page);
45855 +       rest = __copy_from_user_nocache(vaddr + offset, buf, bytes);
45856 +       kunmap(page);
45857 +       return bytes - rest;
45858 +}
45859 +
45860 +/**
45861 + * reiser4_write_extent - write method of extent item plugin
45862 + * @file: file to write to
45863 + * @inode: inode of the file; pages are looked up in its i_mapping
45864 + * @buf: address of user-space buffer
45865 + * @count: number of bytes to write; 0 is the truncate case
45866 + * @pos: position in file to write to; only read here, never advanced
45867 + */
45868 +ssize_t reiser4_write_extent(struct file *file, struct inode * inode,
45869 +                            const char __user *buf, size_t count, loff_t *pos)
45870 +{
45871 +       int have_to_update_extent;
45872 +       int nr_pages, nr_dirty;
45873 +       struct page *page;
45874 +       jnode *jnodes[WRITE_GRANULARITY + 1];
45875 +       unsigned long index;
45876 +       unsigned long end;
45877 +       int i;
45878 +       int to_page, page_off;
45879 +       size_t left, written;
45880 +       int result = 0;
45881 +
45882 +       if (write_extent_reserve_space(inode))
45883 +               return RETERR(-ENOSPC);
45884 +
45885 +       if (count == 0) {
45886 +               /* truncate case: result of update_extents is not checked */
45887 +               update_extents(file, inode, jnodes, 0, *pos);
45888 +               return 0;
45889 +       }
45890 +
45891 +       BUG_ON(get_current_context()->trans->atom != NULL);
45892 +
45893 +       left = count;
45894 +       index = *pos >> PAGE_CACHE_SHIFT;
45895 +       /* calculate number of pages which are to be written */
45896 +       end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
45897 +       nr_pages = end - index + 1;
45898 +       nr_dirty = 0;
45899 +       assert("", nr_pages <= WRITE_GRANULARITY + 1);
45900 +
45901 +       /* get pages and jnodes; on failure only the first i are released */
45902 +       for (i = 0; i < nr_pages; i ++) {
45903 +               page = find_or_create_page(inode->i_mapping, index + i,
45904 +                                          reiser4_ctx_gfp_mask_get());
45905 +               if (page == NULL) {
45906 +                       nr_pages = i;
45907 +                       result = RETERR(-ENOMEM);
45908 +                       goto out;
45909 +               }
45910 +
45911 +               jnodes[i] = jnode_of_page(page);
45912 +               if (IS_ERR(jnodes[i])) {
45913 +                       unlock_page(page);
45914 +                       page_cache_release(page);
45915 +                       nr_pages = i;
45916 +                       result = RETERR(-ENOMEM);
45917 +                       goto out;
45918 +               }
45919 +               /* prevent jnode and page from disconnecting */
45920 +               JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
45921 +               unlock_page(page);
45922 +       }
45923 +
45924 +       BUG_ON(get_current_context()->trans->atom != NULL);
45925 +
45926 +       have_to_update_extent = 0;
45927 +
45928 +       page_off = (*pos & (PAGE_CACHE_SIZE - 1));
45929 +       for (i = 0; i < nr_pages; i ++) {
45930 +               to_page = PAGE_CACHE_SIZE - page_off;
45931 +               if (to_page > left)
45932 +                       to_page = left;
45933 +               page = jnode_page(jnodes[i]);
45934 +               if (page_offset(page) < inode->i_size &&
45935 +                   !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
45936 +                       /*
45937 +                        * the above is not optimal for partial write to last
45938 +                        * page of file when file size is not at boundary of
45939 +                        * page
45940 +                        */
45941 +                       lock_page(page);
45942 +                       if (!PageUptodate(page)) {
45943 +                               result = readpage_unix_file(NULL, page);
45944 +                               BUG_ON(result != 0);
45945 +                               /* wait for read completion */
45946 +                               lock_page(page);
45947 +                               BUG_ON(!PageUptodate(page));
45948 +                       } else
45949 +                               result = 0;
45950 +                       unlock_page(page);
45951 +               }
45952 +
45953 +               BUG_ON(get_current_context()->trans->atom != NULL);
45954 +               fault_in_pages_readable(buf, to_page);
45955 +               BUG_ON(get_current_context()->trans->atom != NULL);
45956 +
45957 +               lock_page(page);
45958 +               if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE)
45959 +                       simple_prepare_write(file, page, page_off,
45960 +                                            page_off + to_page);
45961 +
45962 +               written = filemap_copy_from_user(page, page_off, buf, to_page);
45963 +               if (unlikely(written != to_page)) {
45964 +                       unlock_page(page);
45965 +                       result = RETERR(-EFAULT);
45966 +                       break;
45967 +               }
45968 +
45969 +               flush_dcache_page(page);
45970 +               reiser4_set_page_dirty_internal(page);
45971 +               unlock_page(page);
45972 +               nr_dirty++;
45973 +
45974 +               mark_page_accessed(page);
45975 +               SetPageUptodate(page);
45976 +               /* a jnode without block number makes update_extents run */
45977 +               if (jnodes[i]->blocknr == 0)
45978 +                       have_to_update_extent ++;
45979 +
45980 +               page_off = 0;
45981 +               buf += to_page;
45982 +               left -= to_page;
45983 +               BUG_ON(get_current_context()->trans->atom != NULL);
45984 +       }
45985 +
45986 +       if (have_to_update_extent) {
45987 +               update_extents(file, inode, jnodes, nr_dirty, *pos);
45988 +       } else {
45989 +               for (i = 0; i < nr_dirty; i ++) {
45990 +                       int ret;
45991 +                       spin_lock_jnode(jnodes[i]);
45992 +                       ret = reiser4_try_capture(jnodes[i],
45993 +                                                    ZNODE_WRITE_LOCK, 0);
45994 +                       BUG_ON(ret != 0);
45995 +                       jnode_make_dirty_locked(jnodes[i]);
45996 +                       spin_unlock_jnode(jnodes[i]);
45997 +               }
45998 +       }
45999 +out:
46000 +       for (i = 0; i < nr_pages; i ++) {
46001 +               page_cache_release(jnode_page(jnodes[i]));
46002 +               JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
46003 +               jput(jnodes[i]);
46004 +       }
46005 +
46006 +       /* the only errors handled so far are ENOMEM and EFAULT on
46007 +          copy_from_user; result is returned only if nothing was copied */
46008 +
46009 +       return (count - left) ? (count - left) : result;
46010 +}
46011 +/* Fill @page from extent @ext: zero it if there is no block, else read it */
46012 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
46013 +                              struct page *page)
46014 +{
46015 +       jnode *j;
46016 +       struct address_space *mapping;
46017 +       unsigned long index;
46018 +       oid_t oid;
46019 +       reiser4_block_nr block;
46020 +
46021 +       mapping = page->mapping;
46022 +       oid = get_inode_oid(mapping->host);
46023 +       index = page->index;
46024 +       /* NOTE(review): error returns below leave @page locked - check callers */
46025 +       switch (state_of_extent(ext)) {
46026 +       case HOLE_EXTENT:
46027 +               /*
46028 +                * it is possible to have hole page with jnode, if page was
46029 +                * eflushed previously.
46030 +                */
46031 +               j = jfind(mapping, index);
46032 +               if (j == NULL) {
46033 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
46034 +                       SetPageUptodate(page);
46035 +                       unlock_page(page);
46036 +                       return 0;
46037 +               }
46038 +               spin_lock_jnode(j);
46039 +               if (!jnode_page(j)) {
46040 +                       jnode_attach_page(j, page);
46041 +               } else {
46042 +                       BUG_ON(jnode_page(j) != page);
46043 +                       assert("vs-1504", jnode_page(j) == page);
46044 +               }
46045 +               block = *jnode_get_io_block(j);
46046 +               spin_unlock_jnode(j);
46047 +               if (block == 0) {
46048 +                       zero_user(page, 0, PAGE_CACHE_SIZE);
46049 +                       SetPageUptodate(page);
46050 +                       unlock_page(page);
46051 +                       jput(j);
46052 +                       return 0;
46053 +               }
46054 +               break;
46055 +
46056 +       case ALLOCATED_EXTENT:
46057 +               j = jnode_of_page(page);
46058 +               if (IS_ERR(j))
46059 +                       return PTR_ERR(j);
46060 +               if (*jnode_get_block(j) == 0) {
46061 +                       reiser4_block_nr blocknr;
46062 +
46063 +                       blocknr = extent_get_start(ext) + pos;
46064 +                       jnode_set_block(j, &blocknr);
46065 +               } else
46066 +                       assert("vs-1403",
46067 +                              j->blocknr == extent_get_start(ext) + pos);
46068 +               break;
46069 +
46070 +       case UNALLOCATED_EXTENT:
46071 +               j = jfind(mapping, index);
46072 +               assert("nikita-2688", j);
46073 +               assert("vs-1426", jnode_page(j) == NULL);
46074 +
46075 +               spin_lock_jnode(j);
46076 +               jnode_attach_page(j, page);
46077 +               spin_unlock_jnode(j);
46078 +               break;
46079 +
46080 +       default:
46081 +               warning("vs-957", "wrong extent\n");
46082 +               return RETERR(-EIO);
46083 +       }
46084 +       /* start the read; the return value of reiser4_page_io is ignored */
46085 +       BUG_ON(j == 0);
46086 +       reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
46087 +       jput(j);
46088 +       return 0;
46089 +}
46090 +
46091 +/*
46092 + * Implements plugin->u.item.s.file.read operation for extent items.
46093 + *
46094 + * Copies flow->length bytes, starting at the file offset held in flow->key,
46095 + * from page cache pages to the user buffer flow->data, one page at a time.
46096 + * Returns 0 when the loop has consumed the flow, negative error otherwise.
46097 + * The reference taken on each page is dropped on every exit path.
46098 + */
46099 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
46100 +{
46101 +       int result;
46102 +       struct page *page;
46103 +       unsigned long cur_page;
46104 +       unsigned long page_off, count;
46105 +       struct address_space *mapping;
46106 +       loff_t file_off;
46107 +       uf_coord_t *uf_coord;
46108 +       coord_t *coord;
46109 +       char *kaddr;
46110 +
46111 +       assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
46112 +       assert("vs-572", flow->user == 1);
46113 +       assert("vs-1351", flow->length > 0);
46114 +
46115 +       uf_coord = &hint->ext_coord;
46116 +
46117 +       check_uf_coord(uf_coord, NULL);
46118 +       assert("vs-33", uf_coord->lh == &hint->lh);
46119 +
46120 +       coord = &uf_coord->coord;
46121 +       assert("vs-1119", znode_is_rlocked(coord->node));
46122 +       assert("vs-1120", znode_is_loaded(coord->node));
46123 +       assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
46124 +
46125 +       mapping = file->f_dentry->d_inode->i_mapping;
46126 +
46127 +       /* offset in a file to start read from */
46128 +       file_off = get_key_offset(&flow->key);
46129 +       /* offset within the page to start read from */
46130 +       page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
46131 +       /* bytes which can be read from the page which contains file_off */
46132 +       count = PAGE_CACHE_SIZE - page_off;
46133 +
46134 +       /* index of page containing offset read is to start from */
46135 +       cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
46136 +
46137 +       /* we start having twig node read locked. However, we do not want to
46138 +          keep that lock all the time readahead works. So, set a seal and
46139 +          release twig node. */
46140 +       reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
46141 +       /* &hint->lh is done-ed */
46142 +
46143 +       do {
46144 +               reiser4_txn_restart_current();
46145 +               page = read_mapping_page(mapping, cur_page, file);
46146 +               if (IS_ERR(page))
46147 +                       return PTR_ERR(page);
46148 +               lock_page(page);
46149 +               if (!PageUptodate(page)) {
46150 +                       unlock_page(page);
46151 +                       page_cache_release(page);
46152 +                       warning("jmacd-97178", "extent_read: page is not up to date");
46153 +                       return RETERR(-EIO);
46154 +               }
46155 +               mark_page_accessed(page);
46156 +               unlock_page(page);
46157 +
46158 +               /* If users can be writing to this page using arbitrary virtual
46159 +                  addresses, take care about potential aliasing before reading
46160 +                  the page on the kernel side.
46161 +                */
46162 +               if (mapping_writably_mapped(mapping))
46163 +                       flush_dcache_page(page);
46164 +
46165 +               assert("nikita-3034", reiser4_schedulable());
46166 +
46167 +               /* number of bytes which are to be read from the page */
46168 +               if (count > flow->length)
46169 +                       count = flow->length;
46170 +
46171 +               result = fault_in_pages_writeable(flow->data, count);
46172 +               if (result) {
46173 +                       page_cache_release(page);
46174 +                       return RETERR(-EFAULT);
46175 +               }
46176 +
46177 +               kaddr = kmap_atomic(page, KM_USER0);
46178 +               result = __copy_to_user_inatomic(flow->data,
46179 +                                              kaddr + page_off, count);
46180 +               kunmap_atomic(kaddr, KM_USER0);
46181 +               if (result != 0) {
46182 +                       /* atomic copy fell short: retry through a regular kmap */
46183 +                       kaddr = kmap(page);
46184 +                       result = __copy_to_user(flow->data, kaddr + page_off, count);
46185 +                       kunmap(page);
46186 +               }
46187 +
46188 +               /* drop the page reference before the EFAULT return as well */
46189 +               page_cache_release(page);
46190 +               if (unlikely(result))
46191 +                       return RETERR(-EFAULT);
46192 +
46193 +               /* increase key (flow->key), update user area pointer (flow->data) */
46194 +               move_flow_forward(flow, count);
46195 +
46196 +               page_off = 0;
46197 +               cur_page ++;
46198 +               count = PAGE_CACHE_SIZE;
46199 +       } while (flow->length);
46200 +
46201 +       return 0;
46202 +}
46203 +
46204 +/*
46205 + * plugin->s.file.readpage; @vp is a uf_coord_t *. Reached through
46206 + * reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->
46207 + * unix_file_readpage->extent_readpage or through
46208 + * filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent.
46209 + * On entry (asserted below): page is locked and not up to date, coord->node
46210 + * is read locked and zloaded, coord is set to an existing unit of an extent
46211 + * item (it is not necessary that coord matches page->index).
46212 + */
46213 +int reiser4_readpage_extent(void *vp, struct page *page)
46214 +{
46215 +       uf_coord_t *uf_coord = vp;
46216 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
46217 +       ON_DEBUG(reiser4_key key);
46218 +
46219 +       assert("vs-1040", PageLocked(page));
46220 +       assert("vs-1050", !PageUptodate(page));
46221 +       assert("vs-1039", page->mapping && page->mapping->host);
46222 +
46223 +       assert("vs-1044", znode_is_loaded(coord->node));
46224 +       assert("vs-758", item_is_extent(coord));
46225 +       assert("vs-1046", coord_is_existing_unit(coord));
46226 +       assert("vs-1045", znode_is_rlocked(coord->node));
46227 +       assert("vs-1047",
46228 +              page->mapping->host->i_ino ==
46229 +              get_key_objectid(item_key_by_coord(coord, &key)));
46230 +       check_uf_coord(uf_coord, NULL);
46231 +
46232 +       return reiser4_do_readpage_extent(
46233 +               ext_by_ext_coord(uf_coord),
46234 +               uf_coord->extension.extent.pos_in_unit, page);
46235 +}
46236 +
46237 +/**
46238 + * get_block_address_extent - translate a file block into a disk block
46239 + * @coord: coord which has to be set to an existing extent unit
46240 + * @block: block number within the file, expected to fall into that unit
46241 + * @result: where the disk block number is stored
46242 + *
46243 + * Stores 0 in @result unless the unit is an allocated extent. Returns 0, or
46244 + * -EINVAL when @coord is not set to an existing unit.
46245 + */
46246 +int get_block_address_extent(const coord_t *coord, sector_t block,
46247 +                            sector_t *result)
46248 +{
46249 +       reiser4_extent *ext;
46250 +       reiser4_key key;
46251 +       __u64 first;
46252 +
46253 +       if (!coord_is_existing_unit(coord))
46254 +               return RETERR(-EINVAL);
46255 +
46256 +       ext = extent_by_coord(coord);
46257 +       if (state_of_extent(ext) != ALLOCATED_EXTENT) {
46258 +               /* FIXME: bad things may happen if it is unallocated extent */
46259 +               *result = 0;
46260 +               return 0;
46261 +       }
46262 +
46263 +       unit_key_by_coord(coord, &key);
46264 +       assert("vs-1645",
46265 +              block >= get_key_offset(&key) >> current_blocksize_bits);
46266 +       assert("vs-1646",
46267 +              block <
46268 +              (get_key_offset(&key) >> current_blocksize_bits) +
46269 +              extent_get_width(ext));
46270 +       /* disk block = start of extent + distance from first block of unit */
46271 +       first = get_key_offset(&key) >> current_blocksize_bits;
46272 +       *result = extent_get_start(ext) + (block - first);
46273 +       return 0;
46274 +}
46275 +
46276 +/*
46277 + * plugin->u.item.s.file.append_key: set @key to the key of the first byte
46278 + * following the last byte addressed by the extent item at @coord; return @key
46279 + */
46280 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
46281 +{
46282 +       __u64 end;
46283 +
46284 +       item_key_by_coord(coord, key);
46285 +       end = get_key_offset(key) +
46286 +           reiser4_extent_size(coord, nr_units_extent(coord));
46287 +       set_key_offset(key, end);
46288 +       assert("vs-610", get_key_offset(key)
46289 +              && (get_key_offset(key) & (current_blocksize - 1)) == 0);
46290 +       return key;
46291 +}
46292 +
46293 +/*
46294 + * plugin->u.item.s.file.init_coord_extension: set up the extent part of
46295 + * @uf_coord and mark it valid if the coord is AT_UNIT or AFTER_UNIT
46296 + */
46297 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
46298 +{
46299 +       coord_t *coord = &uf_coord->coord;
46300 +       struct extent_coord_extension *ext_coord;
46301 +       reiser4_extent *ext;
46302 +       reiser4_key key;
46303 +       loff_t offset;
46304 +
46305 +       assert("vs-1295", uf_coord->valid == 0);
46306 +       assert("vs-1288", coord_is_iplug_set(coord));
46307 +       assert("vs-1327", znode_is_loaded(coord->node));
46308 +
46309 +       if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
46310 +               return;
46311 +
46312 +       ext = extent_by_coord(coord);
46313 +       ext_coord = &uf_coord->extension.extent;
46314 +       ext_coord->nr_units = nr_units_extent(coord);
46315 +       ext_coord->ext_offset = (char *)ext - zdata(coord->node);
46316 +       ext_coord->width = extent_get_width(ext);
46317 +       ON_DEBUG(ext_coord->extent = *ext);
46318 +       uf_coord->valid = 1;
46319 +
46320 +       /* pos_in_unit is the only field of the extension left to set */
46321 +       if (coord->between == AT_UNIT) {
46322 +               unit_key_by_coord(coord, &key);
46323 +               offset = get_key_offset(&key);
46324 +               assert("vs-1328", offset <= lookuped);
46325 +               assert("vs-1329",
46326 +                      lookuped <
46327 +                      offset + ext_coord->width * current_blocksize);
46328 +               ext_coord->pos_in_unit =
46329 +                   ((lookuped - offset) >> current_blocksize_bits);
46330 +       } else {
46331 +               /* AFTER_UNIT: last block of the last unit */
46332 +               assert("vs-1330",
46333 +                      coord->unit_pos == nr_units_extent(coord) - 1);
46334 +               ext_coord->pos_in_unit = ext_coord->width - 1;
46335 +       }
46336 +}
46337 +
46338 +/*
46339 + * Local variables:
46340 + * c-indentation-style: "K&R"
46341 + * mode-name: "LC"
46342 + * c-basic-offset: 8
46343 + * tab-width: 8
46344 + * fill-column: 79
46345 + * scroll-step: 1
46346 + * End:
46347 + */
46348 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/extent_flush_ops.c linux-2.6.27/fs/reiser4/plugin/item/extent_flush_ops.c
46349 --- linux-2.6.27.orig/fs/reiser4/plugin/item/extent_flush_ops.c 1970-01-01 03:00:00.000000000 +0300
46350 +++ linux-2.6.27/fs/reiser4/plugin/item/extent_flush_ops.c      2008-10-12 18:20:01.000000000 +0400
46351 @@ -0,0 +1,1028 @@
46352 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46353 +
46354 +#include "item.h"
46355 +#include "../../tree.h"
46356 +#include "../../jnode.h"
46357 +#include "../../super.h"
46358 +#include "../../flush.h"
46359 +#include "../../carry.h"
46360 +#include "../object.h"
46361 +
46362 +#include <linux/pagemap.h>
46363 +
46364 +static reiser4_block_nr extent_unit_start(const coord_t * item);
46365 +
46366 +/*
46367 + * Return the first (@side == LEFT_SIDE) or the last extent of the item
46368 + * @coord is set to; *@pos_in_unit gets its first resp. last block position.
46369 + */
46370 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
46371 +                                        reiser4_block_nr * pos_in_unit)
46372 +{
46373 +       reiser4_extent *utmost = extent_item(coord);
46374 +
46375 +       if (side == LEFT_SIDE) {
46376 +               /* first extent of the item, first block of it */
46377 +               *pos_in_unit = 0;
46378 +               return utmost;
46379 +       }
46380 +
46381 +       /* last extent of the item, last block of it */
46382 +       assert("vs-363", side == RIGHT_SIDE);
46383 +       utmost += coord_last_unit_pos(coord);
46384 +       *pos_in_unit = extent_get_width(utmost) - 1;
46385 +       return utmost;
46386 +}
46387 +
46388 +/*
46389 + * item_plugin->f.utmost_child
46390 + *
46391 + * @coord is set to an extent item. Look up the jnode of the first or, if
46392 + * @side is RIGHT_SIDE, the last unformatted node the item points to and
46393 + * store it in *@childp (NULL for a hole, else whatever jlookup returns).
46394 + * Always returns 0.
46395 + */
46396 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
46397 +{
46398 +       reiser4_key key;
46399 +       unsigned long index;
46400 +       reiser4_block_nr pos_in_unit;
46401 +       reiser4_extent *utmost;
46402 +
46403 +       utmost = extent_utmost_ext(coord, side, &pos_in_unit);
46404 +       switch (state_of_extent(utmost)) {
46405 +       case HOLE_EXTENT:
46406 +               *childp = NULL;
46407 +               return 0;
46408 +       case ALLOCATED_EXTENT:
46409 +       case UNALLOCATED_EXTENT:
46410 +               break;
46411 +       default:
46412 +               /* this should never happen */
46413 +               assert("vs-1417", 0);
46414 +       }
46415 +
46416 +       /*
46417 +        * key of the first byte addressed by the item, or of the byte
46418 +        * right after the last one addressed by it
46419 +        */
46420 +       if (side == LEFT_SIDE)
46421 +               item_key_by_coord(coord, &key);
46422 +       else
46423 +               append_key_extent(coord, &key);
46424 +
46425 +       assert("vs-544",
46426 +              (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
46427 +
46428 +       /* index of first or last (depending on @side) page of the item */
46429 +       index = (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
46430 +       if (side == RIGHT_SIDE)
46431 +               index--;
46432 +
46433 +       *childp = jlookup(coord->node->zjnode.tree,
46434 +                         get_key_objectid(&key), index);
46435 +
46436 +       return 0;
46437 +}
46438 +
46439 +/*
46440 + * item_plugin->f.utmost_child_real_block
46441 + * Store in *@block the first or, for RIGHT_SIDE, the last disk block of the
46442 + * extent unit at @coord if it is allocated, 0 otherwise. Always returns 0.
46443 + */
46444 +int
46445 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
46446 +                              reiser4_block_nr * block)
46447 +{
46448 +       reiser4_extent *unit = extent_by_coord(coord);
46449 +
46450 +       switch (state_of_extent(unit)) {
46451 +       case HOLE_EXTENT:
46452 +       case UNALLOCATED_EXTENT:
46453 +               *block = 0;
46454 +               break;
46455 +       case ALLOCATED_EXTENT:
46456 +               *block = extent_get_start(unit) +
46457 +                   (side == RIGHT_SIDE ? extent_get_width(unit) - 1 : 0);
46458 +               break;
46459 +       default:
46460 +               /* this should never happen */
46461 +               assert("vs-1418", 0);
46462 +       }
46463 +
46464 +       return 0;
46465 +}
46466 +
46467 +/* item_plugin->f.scan */
46468 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
46469 +   This scan continues, advancing the parent coordinate, until either it encounters a
46470 +   formatted child or it finishes scanning this node.
46471 +
46472 +   If unallocated, the entire extent must be dirty and in the same atom.  (Actually, I'm
46473 +   not sure this last property (same atom) is enforced, but it should be the case since
46474 +   one atom must write the parent and the others must read the parent, thus fusing?).  In
46475 +   any case, the code below asserts this case for unallocated extents.  Unallocated
46476 +   extents are thus optimized because we can skip to the endpoint when scanning.
46477 +
46478 +   It returns control to reiser4_scan_extent, handles these terminating conditions,
46479 +   e.g., by loading the next twig.
46480 +*/
46481 +int reiser4_scan_extent(flush_scan * scan)
46482 +{
46483 +       coord_t coord;
46484 +       jnode *neighbor;
46485 +       unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
46486 +       reiser4_block_nr unit_start;
46487 +       __u64 oid;
46488 +       reiser4_key key;
46489 +       int ret = 0, allocated, incr;
46490 +       reiser4_tree *tree;
46491 +
46492 +       if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
46493 +               scan->stop = 1;
46494 +               return 0;       /* Race with truncate, this node is already
46495 +                                * truncated. */
46496 +       }
46497 +
46498 +       coord_dup(&coord, &scan->parent_coord);
46499 +
46500 +       assert("jmacd-1404", !reiser4_scan_finished(scan));
46501 +       assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
46502 +       assert("jmacd-1406", jnode_is_unformatted(scan->node));
46503 +
46504 +       /* The scan_index variable corresponds to the current page index of the
46505 +          unformatted block scan position. */
46506 +       scan_index = index_jnode(scan->node);
46507 +
46508 +       assert("jmacd-7889", item_is_extent(&coord));
46509 +
46510 +      repeat:
46511 +       /* objectid of file */
46512 +       oid = get_key_objectid(item_key_by_coord(&coord, &key));
46513 +
46514 +       allocated = !extent_is_unallocated(&coord);
46515 +       /* Get the values of this extent unit: */
46516 +       unit_index = extent_unit_index(&coord);
46517 +       unit_width = extent_unit_width(&coord);
46518 +       unit_start = extent_unit_start(&coord);
46519 +
46520 +       assert("jmacd-7187", unit_width > 0);
46521 +       assert("jmacd-7188", scan_index >= unit_index);
46522 +       assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
46523 +
46524 +       /* Depending on the scan direction, we set different maximum values for scan_index
46525 +          (scan_max) and the number of nodes that would be passed if the scan goes the
46526 +          entire way (scan_dist).  Incr is an integer reflecting the incremental
46527 +          direction of scan_index. */
46528 +       if (reiser4_scanning_left(scan)) {
46529 +               scan_max = unit_index;
46530 +               scan_dist = scan_index - unit_index;
46531 +               incr = -1;
46532 +       } else {
46533 +               scan_max = unit_index + unit_width - 1;
46534 +               scan_dist = scan_max - unit_index;
46535 +               incr = +1;
46536 +       }
46537 +
46538 +       tree = coord.node->zjnode.tree;
46539 +
46540 +       /* If the extent is allocated we have to check each of its blocks.  If the extent
46541 +          is unallocated we can skip to the scan_max. */
46542 +       if (allocated) {
46543 +               do {
46544 +                       neighbor = jlookup(tree, oid, scan_index);
46545 +                       if (neighbor == NULL)
46546 +                               goto stop_same_parent;
46547 +
46548 +                       if (scan->node != neighbor
46549 +                           && !reiser4_scan_goto(scan, neighbor)) {
46550 +                               /* @neighbor was jput() by reiser4_scan_goto */
46551 +                               goto stop_same_parent;
46552 +                       }
46553 +
46554 +                       ret = scan_set_current(scan, neighbor, 1, &coord);
46555 +                       if (ret != 0) {
46556 +                               goto exit;
46557 +                       }
46558 +
46559 +                       /* reference to @neighbor is stored in @scan, no need
46560 +                          to jput(). */
46561 +                       scan_index += incr;
46562 +
46563 +               } while (incr + scan_max != scan_index);
46564 +
46565 +       } else {
46566 +               /* Optimized case for unallocated extents, skip to the end. */
46567 +               neighbor = jlookup(tree, oid, scan_max /*index */ );
46568 +               if (neighbor == NULL) {
46569 +                       /* Race with truncate */
46570 +                       scan->stop = 1;
46571 +                       ret = 0;
46572 +                       goto exit;
46573 +               }
46574 +
46575 +               assert("zam-1043",
46576 +                      reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
46577 +
46578 +               ret = scan_set_current(scan, neighbor, scan_dist, &coord);
46579 +               if (ret != 0) {
46580 +                       goto exit;
46581 +               }
46582 +       }
46583 +
46584 +       if (coord_sideof_unit(&coord, scan->direction) == 0
46585 +           && item_is_extent(&coord)) {
46586 +               /* Continue as long as there are more extent units. */
46587 +
46588 +               scan_index =
46589 +                   extent_unit_index(&coord) +
46590 +                   (reiser4_scanning_left(scan) ?
46591 +                    extent_unit_width(&coord) - 1 : 0);
46592 +               goto repeat;
46593 +       }
46594 +
46595 +       if (0) {
46596 +             stop_same_parent:
46597 +
46598 +               /* If we are scanning left and we stop in the middle of an allocated
46599 +                  extent, we know the preceder immediately.. */
46600 +               /* middle of extent is (scan_index - unit_index) != 0. */
46601 +               if (reiser4_scanning_left(scan) &&
46602 +                   (scan_index - unit_index) != 0) {
46603 +                       /* FIXME(B): Someone should step-through and verify that this preceder
46604 +                          calculation is indeed correct. */
46605 +                       /* @unit_start is starting block (number) of extent
46606 +                          unit. Flush stopped at the @scan_index block from
46607 +                          the beginning of the file, which is (scan_index -
46608 +                          unit_index) block within extent.
46609 +                        */
46610 +                       if (unit_start) {
46611 +                               /* skip preceder update when we are at hole */
46612 +                               scan->preceder_blk =
46613 +                                   unit_start + scan_index - unit_index;
46614 +                               check_preceder(scan->preceder_blk);
46615 +                       }
46616 +               }
46617 +
46618 +               /* In this case, we leave coord set to the parent of scan->node. */
46619 +               scan->stop = 1;
46620 +
46621 +       } else {
46622 +               /* In this case, we are still scanning, coord is set to the next item which is
46623 +                  either off-the-end of the node or not an extent. */
46624 +               assert("jmacd-8912", scan->stop == 0);
46625 +               assert("jmacd-7812",
46626 +                      (coord_is_after_sideof_unit(&coord, scan->direction)
46627 +                       || !item_is_extent(&coord)));
46628 +       }
46629 +
46630 +       ret = 0;
46631 +      exit:
46632 +       return ret;
46633 +}
46634 +
46635 +/* Ask the block allocator for @wanted_count blocks using hint @preceder; start of the range is stored in *@first_allocated, its length in *@allocated */
46636 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
46637 +                                  reiser4_block_nr wanted_count,
46638 +                                  reiser4_block_nr *first_allocated,
46639 +                                  reiser4_block_nr *allocated,
46640 +                                  block_stage_t block_stage)
46641 +{
46642 +       *allocated = wanted_count;
46643 +       preceder->max_dist = 0; /* scan whole disk, if needed */
46644 +
46645 +       /* stage the @wanted_count blocks are presumably accounted in; visible callers pass BLOCK_UNALLOCATED or BLOCK_FLUSH_RESERVED */
46646 +       preceder->block_stage = block_stage;
46647 +
46648 +       /* FIXME: we do not handle errors here now */
46649 +       check_me("vs-420",
46650 +                reiser4_alloc_blocks(preceder, first_allocated, allocated,
46651 +                                     BA_PERMANENT) == 0);
46652 +       /* move the hint (flush_pos's preceder in the visible callers) to the last allocated block number */
46653 +       preceder->blk = *first_allocated + *allocated - 1;
46654 +}
46655 +
46656 +/* When, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one unallocated
46657 +   extent has to be replaced with a set of allocated extents. In this case insert_into_item will be called, which may have
46658 +   to add new nodes into the tree. Space for that is taken from the inviolable reserve (5%). Returns the context's grabbed_blocks as sampled before the reservation. */
46659 +static reiser4_block_nr reserve_replace(void)
46660 +{
46661 +       reiser4_block_nr grabbed, needed;
46662 +
46663 +       grabbed = get_current_context()->grabbed_blocks;
46664 +       needed = estimate_one_insert_into_item(current_tree);
46665 +       check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
46666 +       return grabbed;
46667 +}
46668 +
46669 +static void free_replace_reserved(reiser4_block_nr grabbed)
46670 +{
46671 +       reiser4_context *ctx;
46672 +       /* @grabbed is the count saved by reserve_replace(); pass whatever was grabbed on top of it to grabbed2free() */
46673 +       ctx = get_current_context();
46674 +       grabbed2free(ctx, get_super_private(ctx->super),
46675 +                    ctx->grabbed_blocks - grabbed);
46676 +}
46677 +
46678 +/* Offset, in blocks, of the first block addressed by the unit @item is set to (unit key offset >> current_blocksize_bits) */
46679 +__u64 extent_unit_index(const coord_t * item)
46680 +{
46681 +       reiser4_key key;
46682 +
46683 +       assert("vs-648", coord_is_existing_unit(item));
46684 +       unit_key_by_coord(item, &key);
46685 +       return get_key_offset(&key) >> current_blocksize_bits;
46686 +}
46687 +
46688 +/* Width of the extent unit @item is set to, as given by width_by_coord(). AUDIT: shouldn't the return value be of reiser4_block_nr type?
46689 +   Josh's answer: who knows?  Is a "number of blocks" the same type as "block offset"? */
46690 +__u64 extent_unit_width(const coord_t * item)
46691 +{
46692 +       assert("vs-649", coord_is_existing_unit(item));
46693 +       return width_by_coord(item);
46694 +}
46695 +
46696 +/* Starting block location of the extent unit @item is set to, as stored in the extent's start field */
46697 +static reiser4_block_nr extent_unit_start(const coord_t * item)
46698 +{
46699 +       return extent_get_start(extent_by_coord(item));
46700 +}
46701 +
46702 +/**
46703 + * split_allocated_extent - split an allocated extent unit in two
46704 + * @coord: coord of the allocated extent unit to split
46705 + * @pos_in_unit: width, in blocks, of the first resulting extent; asserted to be less than the unit's width
46706 + *
46707 + * Replaces the allocated extent with two allocated extents. Returns -ENOMEM if the replace handle cannot be allocated, otherwise the result of reiser4_replace_extent().
46708 + */
46709 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
46710 +{
46711 +       int result;
46712 +       struct replace_handle *h;
46713 +       reiser4_extent *ext;
46714 +       reiser4_block_nr grabbed;
46715 +
46716 +       ext = extent_by_coord(coord);
46717 +       assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
46718 +       assert("vs-1411", extent_get_width(ext) > pos_in_unit);
46719 +
46720 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46721 +       if (h == NULL)
46722 +               return RETERR(-ENOMEM);
46723 +       h->coord = coord;
46724 +       h->lh = znode_lh(coord->node);
46725 +       h->pkey = &h->key;
46726 +       unit_key_by_coord(coord, h->pkey);
46727 +       set_key_offset(h->pkey,
46728 +                      (get_key_offset(h->pkey) +
46729 +                       pos_in_unit * current_blocksize));
46730 +       reiser4_set_extent(&h->overwrite, extent_get_start(ext),
46731 +                          pos_in_unit);
46732 +       reiser4_set_extent(&h->new_extents[0],
46733 +                          extent_get_start(ext) + pos_in_unit,
46734 +                          extent_get_width(ext) - pos_in_unit);
46735 +       h->nr_new_extents = 1;
46736 +       h->flags = COPI_DONT_SHIFT_LEFT;
46737 +       h->paste_key = h->key;
46738 +
46739 +       /* reserve space for extent unit paste; @grabbed is the amount grabbed before this reservation */
46740 +       grabbed = reserve_replace();
46741 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46742 +                                               extent */);
46743 +       /* release what was grabbed on top of @grabbed */
46744 +       free_replace_reserved(grabbed);
46745 +       kfree(h);
46746 +       return result;
46747 +}
46748 +
46749 +/* Replace the beginning of extent @ext with extent @replace by merging @replace into the previous extent of the item (if there is
46750 +   one, it is allocated, and @replace starts right after its last block). Return 1 if merged (@coord is moved to the previous unit), 0 otherwise */
46751 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
46752 +                      reiser4_extent *replace)
46753 +{
46754 +       assert("vs-1415", extent_by_coord(coord) == ext);
46755 +
46756 +       if (coord->unit_pos == 0
46757 +           || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
46758 +               /* left neighbor of @ext either does not exist or is not an allocated extent */
46759 +               return 0;
46760 +       if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
46761 +           extent_get_start(replace))
46762 +               return 0;
46763 +
46764 +       /* we can glue, widen previous unit */
46765 +       extent_set_width(ext - 1,
46766 +                        extent_get_width(ext - 1) + extent_get_width(replace));
46767 +
46768 +       if (extent_get_width(ext) != extent_get_width(replace)) {
46769 +               /* make current extent narrower */
46770 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
46771 +                       extent_set_start(ext,
46772 +                                        extent_get_start(ext) +
46773 +                                        extent_get_width(replace));
46774 +               extent_set_width(ext,
46775 +                                extent_get_width(ext) -
46776 +                                extent_get_width(replace));
46777 +       } else {
46778 +               /* current extent completely glued with its left neighbor, remove it */
46779 +               coord_t from, to;
46780 +
46781 +               coord_dup(&from, coord);
46782 +               from.unit_pos = nr_units_extent(coord) - 1;
46783 +               coord_dup(&to, &from);
46784 +
46785 +               /* currently cut from extent can cut either from the beginning or from the end. Move the slot which got
46786 +                  freed by the unit removal to the end of the item */
46787 +               memmove(ext, ext + 1,
46788 +                       (from.unit_pos -
46789 +                        coord->unit_pos) * sizeof(reiser4_extent));
46790 +               /* cut the last unit of the item (@from == @to), so that node_check will not be confused */
46791 +               cut_node_content(&from, &to, NULL, NULL, NULL);
46792 +       }
46793 +       znode_make_dirty(coord->node);
46794 +       /* move coord back */
46795 +       coord->unit_pos--;
46796 +       return 1;
46797 +}
46798 +
46799 +/**
46800 + * conv_extent - replace extent with @replace and, if needed, a padding extent
46801 + * @coord: coordinate of extent to be replaced
46802 + * @replace: extent to overwrite the one @coord is set to
46803 + *
46804 + * Overwrites extent @coord is set to and pastes one extent unit after the
46805 + * overwritten one if @replace is shorter than initial extent. Returns 0, -ENOMEM or the result of reiser4_replace_extent().
46806 + */
46807 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
46808 +{
46809 +       int result;
46810 +       struct replace_handle *h;
46811 +       reiser4_extent *ext;
46812 +       reiser4_block_nr start, width, new_width;
46813 +       reiser4_block_nr grabbed;
46814 +       extent_state state;
46815 +
46816 +       ext = extent_by_coord(coord);
46817 +       state = state_of_extent(ext);
46818 +       start = extent_get_start(ext);
46819 +       width = extent_get_width(ext);
46820 +       new_width = extent_get_width(replace);
46821 +
46822 +       assert("vs-1458", (state == UNALLOCATED_EXTENT ||
46823 +                          state == ALLOCATED_EXTENT));
46824 +       assert("vs-1459", width >= new_width);
46825 +
46826 +       if (try_to_merge_with_left(coord, ext, replace)) {
46827 +               /* merged @replace with left neighbor. Current unit is either
46828 +                  removed or narrowed */
46829 +               return 0;
46830 +       }
46831 +
46832 +       if (width == new_width) {
46833 +               /* same width: overwrite current extent with @replace in place */
46834 +               *ext = *replace;
46835 +               znode_make_dirty(coord->node);
46836 +               return 0;
46837 +       }
46838 +
46839 +       h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
46840 +       if (h == NULL)
46841 +               return RETERR(-ENOMEM);
46842 +       h->coord = coord;
46843 +       h->lh = znode_lh(coord->node);
46844 +       h->pkey = &h->key;
46845 +       unit_key_by_coord(coord, h->pkey);
46846 +       set_key_offset(h->pkey,
46847 +                      (get_key_offset(h->pkey) + new_width * current_blocksize));
46848 +       h->overwrite = *replace;
46849 +
46850 +       /* replace @ext with @replace and a padding extent covering the remaining (width - new_width) blocks */
46851 +       reiser4_set_extent(&h->new_extents[0],
46852 +                          (state == ALLOCATED_EXTENT) ?
46853 +                          (start + new_width) :
46854 +                          UNALLOCATED_EXTENT_START,
46855 +                          width - new_width);
46856 +       h->nr_new_extents = 1;
46857 +       h->flags = COPI_DONT_SHIFT_LEFT;
46858 +       h->paste_key = h->key;
46859 +
46860 +       /* reserve space for extent unit paste; @grabbed is the amount grabbed before this reservation */
46861 +       grabbed = reserve_replace();
46862 +       result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
46863 +                                               extent */);
46864 +
46865 +       /* release what was grabbed on top of @grabbed */
46866 +       free_replace_reserved(grabbed);
46867 +       kfree(h);
46868 +       return result;
46869 +}
46870 +
46871 +/**
46872 + * assign_real_blocknrs - assign consecutive block numbers to a range of jnodes
46873 + * @flush_pos: flush position; its flush queue (->fq) is used to find the atom and is passed to unformatted_make_reloc()
46874 + * @oid: objectid of the file the jnodes belong to
46875 + * @index: first jnode on the range
46876 + * @count: number of jnodes to assign block numbers to
46877 + * @first: start of allocated block range
46878 + *
46879 + * Assigns block numbers to each of @count jnodes. Index of first jnode is
46880 + * @index. Jnodes are looked up with jlookup; every one of them must exist (BUG otherwise). The atom is unlocked before return.
46881 + */
46882 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
46883 +                                unsigned long index, reiser4_block_nr count,
46884 +                                reiser4_block_nr first)
46885 +{
46886 +       unsigned long i;
46887 +       reiser4_tree *tree;
46888 +       txn_atom *atom;
46889 +       int nr;
46890 +
46891 +       atom = atom_locked_by_fq(flush_pos->fq);
46892 +       assert("vs-1468", atom);
46893 +       BUG_ON(atom == NULL);
46894 +
46895 +       nr = 0;
46896 +       tree = current_tree;
46897 +       for (i = 0; i < count; ++i, ++index) {
46898 +               jnode *node;
46899 +
46900 +               node = jlookup(tree, oid, index);
46901 +               assert("", node != NULL);
46902 +               BUG_ON(node == NULL);
46903 +
46904 +               spin_lock_jnode(node);
46905 +               assert("", !jnode_is_flushprepped(node));
46906 +               assert("vs-1475", node->atom == atom);
46907 +               assert("vs-1476", atomic_read(&node->x_count) > 0);
46908 +
46909 +               JF_CLR(node, JNODE_FLUSH_RESERVED);
46910 +               jnode_set_block(node, &first);
46911 +               unformatted_make_reloc(node, flush_pos->fq);
46912 +               ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
46913 +                                    FQ_LIST, 0));
46914 +               spin_unlock_jnode(node);
46915 +               first++;
46916 +
46917 +               atomic_dec(&node->x_count);
46918 +               nr ++;
46919 +       }
46920 +
46921 +       spin_unlock_atom(atom);
46922 +       return;
46923 +}
46924 +
46925 +/**
46926 + * make_node_ovrwr - assign node to overwrite set
46927 + * @jnodes: overwrite set list head
46928 + * @node: jnode to belong to overwrite set
46929 + *
46930 + * Under the jnode spin lock, sets the OVRWR jnode state bit and moves @node to
46931 + * the tail of list @jnodes, which is an accumulator for nodes before they get
46932 + * to the overwrite set list of the atom. @node must be neither RELOC nor OVRWR yet.
46933 + */
46934 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
46935 +{
46936 +       spin_lock_jnode(node);
46937 +
46938 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
46939 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
46940 +
46941 +       JF_SET(node, JNODE_OVRWR);
46942 +       list_move_tail(&node->capture_link, jnodes);
46943 +       ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
46944 +
46945 +       spin_unlock_jnode(node);
46946 +}
46947 +
46948 +/**
46949 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
46950 + * @flush_pos: flush position
46951 + * @oid: objectid of file jnodes belong to
46952 + * @index: starting index
46953 + * @width: extent width
46954 + *
46955 + * Puts nodes of one extent (file objectid @oid, extent width @width) to atom's
46956 + * overwrite set. Starting from the one with index @index. If end of slum is
46957 + * detected (node is not found, is flushprepped, or belongs to another atom) - stop iterating and set flush
46958 + * position's state to POS_INVALID.
46959 + */
46960 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
46961 +                                 unsigned long index, reiser4_block_nr width)
46962 +{
46963 +       unsigned long i;
46964 +       reiser4_tree *tree;
46965 +       jnode *node;
46966 +       txn_atom *atom;
46967 +       LIST_HEAD(jnodes);
46968 +
46969 +       tree = current_tree;
46970 +
46971 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
46972 +       assert("vs-1478", atom);
46973 +
46974 +       for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
46975 +               node = jlookup(tree, oid, index);
46976 +               if (!node) {
46977 +                       flush_pos->state = POS_INVALID;
46978 +                       break;
46979 +               }
46980 +               if (jnode_check_flushprepped(node)) {
46981 +                       flush_pos->state = POS_INVALID;
46982 +                       atomic_dec(&node->x_count);
46983 +                       break;
46984 +               }
46985 +               if (node->atom != atom) {
46986 +                       flush_pos->state = POS_INVALID;
46987 +                       atomic_dec(&node->x_count);
46988 +                       break;
46989 +               }
46990 +               make_node_ovrwr(&jnodes, node);
46991 +               atomic_dec(&node->x_count);
46992 +       }
46993 +
46994 +       list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
46995 +       spin_unlock_atom(atom);
46996 +}
46997 +
46998 +/**
46999 + * allocated_extent_slum_size - count jnodes at the start of a range that can be flushed together
47000 + * @flush_pos: flush position; its flush queue is used to find the atom
47001 + * @oid: objectid of the file the jnodes belong to
47002 + * @index: index of the first jnode to check
47003 + * @count: maximal number of jnodes to check
47004 + *
47005 + * Returns the number of consecutive jnodes, starting at @index, which are found by jlookup, are not flushprepped and belong to the atom. The atom is unlocked before return.
47006 + */
47007 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
47008 +                                     unsigned long index, unsigned long count)
47009 +{
47010 +       unsigned long i;
47011 +       reiser4_tree *tree;
47012 +       txn_atom *atom;
47013 +       int nr;
47014 +
47015 +       atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
47016 +       assert("vs-1468", atom);
47017 +
47018 +       nr = 0;
47019 +       tree = current_tree;
47020 +       for (i = 0; i < count; ++i, ++index) {
47021 +               jnode *node;
47022 +
47023 +               node = jlookup(tree, oid, index);
47024 +               if (!node)
47025 +                       break;
47026 +
47027 +               if (jnode_check_flushprepped(node)) {
47028 +                       atomic_dec(&node->x_count);
47029 +                       break;
47030 +               }
47031 +
47032 +               if (node->atom != atom) {
47033 +                       /*
47034 +                        * this is possible on overwrite: extent_write may
47035 +                        * capture several unformatted nodes without capturing
47036 +                        * any formatted nodes.
47037 +                        */
47038 +                       atomic_dec(&node->x_count);
47039 +                       break;
47040 +               }
47041 +
47042 +               assert("vs-1476", atomic_read(&node->x_count) > 1);
47043 +               atomic_dec(&node->x_count);
47044 +               nr ++;
47045 +       }
47046 +
47047 +       spin_unlock_atom(atom);
47048 +       return nr;
47049 +}
47050 +
47051 +/**
47052 + * reiser4_alloc_extent - prepare the slum of one extent unit for flushing
47053 + * @flush_pos: flush position; its state must be POS_ON_EPOINT and its coord must be set to an existing extent unit
47054 + *
47055 + * Returns 0, or an error from split_allocated_extent() / conv_extent() (see the note on -ENOMEM in the body).
47056 + * This is called by handle_pos_on_twig to process the extent unit flush_pos->coord
47057 + * is set to. It is to prepare for flushing sequence of not flushprepped nodes
47058 + * (slum). It supposes that slum starts at flush_pos->pos_in_unit position
47059 + * within the extent. Slum gets to relocate set if flush_pos->leaf_relocate is
47060 + * set or the extent is unallocated, and to overwrite set otherwise. Hole extents set the state to POS_INVALID.
47061 + */
47062 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
47063 +{
47064 +       coord_t *coord;
47065 +       reiser4_extent *ext;
47066 +       reiser4_extent replace_ext;
47067 +       oid_t oid;
47068 +       reiser4_block_nr protected;
47069 +       reiser4_block_nr start;
47070 +       __u64 index;
47071 +       __u64 width;
47072 +       extent_state state;
47073 +       int result;
47074 +       reiser4_block_nr first_allocated;
47075 +       __u64 allocated;
47076 +       reiser4_key key;
47077 +       block_stage_t block_stage;
47078 +
47079 +       assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
47080 +       assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
47081 +              && item_is_extent(&flush_pos->coord));
47082 +
47083 +       coord = &flush_pos->coord;
47084 +
47085 +       ext = extent_by_coord(coord);
47086 +       state = state_of_extent(ext);
47087 +       if (state == HOLE_EXTENT) {
47088 +               flush_pos->state = POS_INVALID;
47089 +               return 0;
47090 +       }
47091 +
47092 +       item_key_by_coord(coord, &key);
47093 +       oid = get_key_objectid(&key);
47094 +       index = extent_unit_index(coord) + flush_pos->pos_in_unit;
47095 +       start = extent_get_start(ext);
47096 +       width = extent_get_width(ext);
47097 +
47098 +       assert("vs-1457", width > flush_pos->pos_in_unit);
47099 +
47100 +       if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
47101 +               /* relocate */
47102 +               if (flush_pos->pos_in_unit) {
47103 +                       /* split extent unit into two; the caller gets the result and pos_in_unit is reset to 0 */
47104 +                       result =
47105 +                           split_allocated_extent(coord,
47106 +                                                  flush_pos->pos_in_unit);
47107 +                       flush_pos->pos_in_unit = 0;
47108 +                       return result;
47109 +               }
47110 +
47111 +               /* limit number of nodes to allocate */
47112 +               if (flush_pos->nr_to_write < width)
47113 +                       width = flush_pos->nr_to_write;
47114 +
47115 +               if (state == ALLOCATED_EXTENT) {
47116 +                       /*
47117 +                        * all protected nodes are not flushprepped, therefore
47118 +                        * they are counted as flush_reserved
47119 +                        */
47120 +                       block_stage = BLOCK_FLUSH_RESERVED;
47121 +                       protected = allocated_extent_slum_size(flush_pos, oid,
47122 +                                                              index, width);
47123 +                       if (protected == 0) {
47124 +                               flush_pos->state = POS_INVALID;
47125 +                               flush_pos->pos_in_unit = 0;
47126 +                               return 0;
47127 +                       }
47128 +               } else {
47129 +                       block_stage = BLOCK_UNALLOCATED;
47130 +                       protected = width;
47131 +               }
47132 +
47133 +               /*
47134 +                * look at previous unit if possible. If it is allocated, make
47135 +                * preceder more precise
47136 +                */
47137 +               if (coord->unit_pos &&
47138 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47139 +                       reiser4_pos_hint(flush_pos)->blk =
47140 +                               extent_get_start(ext - 1) +
47141 +                               extent_get_width(ext - 1);
47142 +
47143 +               /* allocate new block numbers for protected nodes */
47144 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47145 +                                      protected,
47146 +                                      &first_allocated, &allocated,
47147 +                                      block_stage);
47148 +
47149 +               if (state == ALLOCATED_EXTENT)
47150 +                       /*
47151 +                        * on relocating - free nodes which are going to be
47152 +                        * relocated
47153 +                        */
47154 +                       reiser4_dealloc_blocks(&start, &allocated,
47155 +                                              BLOCK_ALLOCATED, BA_DEFER);
47156 +
47157 +               /* assign new block numbers to protected nodes */
47158 +               assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
47159 +
47160 +               /* prepare extent which will replace current one */
47161 +               reiser4_set_extent(&replace_ext, first_allocated, allocated);
47162 +
47163 +               /* adjust extent item. NOTE(review): a -ENOMEM result is not returned below; it falls through and the function returns 0 - confirm this is intended */
47164 +               result = conv_extent(coord, &replace_ext);
47165 +               if (result != 0 && result != -ENOMEM) {
47166 +                       warning("vs-1461",
47167 +                               "Failed to allocate extent. Should not happen\n");
47168 +                       return result;
47169 +               }
47170 +
47171 +               /*
47172 +                * break flush: we prepared for flushing as many blocks as we
47173 +                * were asked for
47174 +                */
47175 +               if (flush_pos->nr_to_write == allocated)
47176 +                       flush_pos->state = POS_INVALID;
47177 +       } else {
47178 +               /* overwrite */
47179 +               mark_jnodes_overwrite(flush_pos, oid, index, width);
47180 +       }
47181 +       flush_pos->pos_in_unit = 0;
47182 +       return 0;
47183 +}
47184 +
47185 +/* Returns 0 if @key is glueable to the item @coord is set to (an extent item whose append key equals @key), 1 if a new item must be inserted */
47186 +static int must_insert(const coord_t *coord, const reiser4_key *key)
47187 +{
47188 +       reiser4_key last;
47189 +
47190 +       if (item_id_by_coord(coord) == EXTENT_POINTER_ID
47191 +           && keyeq(append_key_extent(coord, &last), key))
47192 +               return 0;
47193 +       return 1;
47194 +}
47195 +
47196 +/* Copy extent @copy_ext to the end of @node. It may have to either insert a new item after the last one, or append to the last item,
47197 +   or widen the last unit of the last item. Returns 0 or -E_NODE_FULL (asserted) */
47198 +static int put_unit_to_end(znode *node, const reiser4_key *key,
47199 +                          reiser4_extent *copy_ext)
47200 +{
47201 +       int result;
47202 +       coord_t coord;
47203 +       cop_insert_flag flags;
47204 +       reiser4_extent *last_ext;
47205 +       reiser4_item_data data;
47206 +
47207 +       /* set coord after last unit in an item */
47208 +       coord_init_last_unit(&coord, node);
47209 +       coord.between = AFTER_UNIT;
47210 +
47211 +       flags =
47212 +           COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
47213 +       if (must_insert(&coord, key)) {
47214 +               result =
47215 +                   insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
47216 +                                   key, NULL /*lh */ , flags);
47217 +
47218 +       } else {
47219 +               /* try to glue with last unit. NOTE(review): state_of_extent() is used as a truth value below - presumably HOLE_EXTENT is 0, confirm */
47220 +               last_ext = extent_by_coord(&coord);
47221 +               if (state_of_extent(last_ext) &&
47222 +                   extent_get_start(last_ext) + extent_get_width(last_ext) ==
47223 +                   extent_get_start(copy_ext)) {
47224 +                       /* widen last unit of node */
47225 +                       extent_set_width(last_ext,
47226 +                                        extent_get_width(last_ext) +
47227 +                                        extent_get_width(copy_ext));
47228 +                       znode_make_dirty(node);
47229 +                       return 0;
47230 +               }
47231 +
47232 +               /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
47233 +               result =
47234 +                   insert_into_item(&coord, NULL /*lh */ , key,
47235 +                                    init_new_extent(&data, copy_ext, 1),
47236 +                                    flags);
47237 +       }
47238 +
47239 +       assert("vs-438", result == 0 || result == -E_NODE_FULL);
47240 +       return result;
47241 +}
47242 +
47243 +/* @coord is set to extent unit */
47244 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
47245 +                              flush_pos_t *flush_pos,
47246 +                              reiser4_key *stop_key)
47247 +{
47248 +       reiser4_extent *ext;
47249 +       __u64 index;
47250 +       __u64 width;
47251 +       reiser4_block_nr start;
47252 +       extent_state state;
47253 +       oid_t oid;
47254 +       reiser4_block_nr first_allocated;
47255 +       __u64 allocated;
47256 +       __u64 protected;
47257 +       reiser4_extent copy_extent;
47258 +       reiser4_key key;
47259 +       int result;
47260 +       block_stage_t block_stage;
47261 +
47262 +       assert("vs-1457", flush_pos->pos_in_unit == 0);
47263 +       assert("vs-1467", coord_is_leftmost_unit(coord));
47264 +       assert("vs-1467", item_is_extent(coord));
47265 +
47266 +       ext = extent_by_coord(coord);
47267 +       index = extent_unit_index(coord);
47268 +       start = extent_get_start(ext);
47269 +       width = extent_get_width(ext);
47270 +       state = state_of_extent(ext);
47271 +       unit_key_by_coord(coord, &key);
47272 +       oid = get_key_objectid(&key);
47273 +
47274 +       if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
47275 +           (state == UNALLOCATED_EXTENT)) {
47276 +               /* relocate */
47277 +               if (state == ALLOCATED_EXTENT) {
47278 +                       /* all protected nodes are not flushprepped, therefore
47279 +                        * they are counted as flush_reserved */
47280 +                       block_stage = BLOCK_FLUSH_RESERVED;
47281 +                       protected = allocated_extent_slum_size(flush_pos, oid,
47282 +                                                              index, width);
47283 +                       if (protected == 0) {
47284 +                               flush_pos->state = POS_INVALID;
47285 +                               flush_pos->pos_in_unit = 0;
47286 +                               return 0;
47287 +                       }
47288 +               } else {
47289 +                       block_stage = BLOCK_UNALLOCATED;
47290 +                       protected = width;
47291 +               }
47292 +
47293 +               /*
47294 +                * look at previous unit if possible. If it is allocated, make
47295 +                * preceder more precise
47296 +                */
47297 +               if (coord->unit_pos &&
47298 +                   (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
47299 +                       reiser4_pos_hint(flush_pos)->blk =
47300 +                               extent_get_start(ext - 1) +
47301 +                               extent_get_width(ext - 1);
47302 +
47303 +               /* allocate new block numbers for protected nodes */
47304 +               extent_allocate_blocks(reiser4_pos_hint(flush_pos),
47305 +                                      protected,
47306 +                                      &first_allocated, &allocated,
47307 +                                      block_stage);
47308 +
47309 +               /* prepare extent which will be copied to left */
47310 +               reiser4_set_extent(&copy_extent, first_allocated, allocated);
47311 +
47312 +               result = put_unit_to_end(left, &key, &copy_extent);
47313 +               if (result == -E_NODE_FULL) {
47314 +                       int target_block_stage;
47315 +
47316 +                       /* free blocks which were just allocated */
47317 +                       target_block_stage =
47318 +                           (state ==
47319 +                            ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
47320 +                           BLOCK_UNALLOCATED;
47321 +                       reiser4_dealloc_blocks(&first_allocated, &allocated,
47322 +                                              target_block_stage,
47323 +                                              BA_PERMANENT);
47324 +
47325 +                       /* rewind the preceder. */
47326 +                       flush_pos->preceder.blk = first_allocated;
47327 +                       check_preceder(flush_pos->preceder.blk);
47328 +
47329 +                       return SQUEEZE_TARGET_FULL;
47330 +               }
47331 +
47332 +               if (state == ALLOCATED_EXTENT) {
47333 +                       /* free nodes which were relocated */
47334 +                       reiser4_dealloc_blocks(&start, &allocated,
47335 +                                              BLOCK_ALLOCATED, BA_DEFER);
47336 +               }
47337 +
47338 +               /* assign new block numbers to protected nodes */
47339 +               assign_real_blocknrs(flush_pos, oid, index, allocated,
47340 +                                    first_allocated);
47341 +
47342 +               set_key_offset(&key,
47343 +                              get_key_offset(&key) +
47344 +                              (allocated << current_blocksize_bits));
47345 +       } else {
47346 +               /*
47347 +                * overwrite: try to copy unit as it is to left neighbor and
47348 +                * make all first not flushprepped nodes overwrite nodes
47349 +                */
47350 +               reiser4_set_extent(&copy_extent, start, width);
47351 +               result = put_unit_to_end(left, &key, &copy_extent);
47352 +               if (result == -E_NODE_FULL)
47353 +                       return SQUEEZE_TARGET_FULL;
47354 +
47355 +               if (state != HOLE_EXTENT)
47356 +                       mark_jnodes_overwrite(flush_pos, oid, index, width);
47357 +               set_key_offset(&key,
47358 +                              get_key_offset(&key) +
47359 +                              (width << current_blocksize_bits));
47360 +       }
47361 +       *stop_key = key;
47362 +       return SQUEEZE_CONTINUE;
47363 +}
47364 +/* forwards its arguments unchanged to key_by_inode_and_offset_common() */
47365 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
47366 +{
47367 +       return key_by_inode_and_offset_common(inode, off, key);
47368 +}
47369 +
47370 +/*
47371 + * Local variables:
47372 + * c-indentation-style: "K&R"
47373 + * mode-name: "LC"
47374 + * c-basic-offset: 8
47375 + * tab-width: 8
47376 + * fill-column: 79
47377 + * scroll-step: 1
47378 + * End:
47379 + */
47380 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/extent.h linux-2.6.27/fs/reiser4/plugin/item/extent.h
47381 --- linux-2.6.27.orig/fs/reiser4/plugin/item/extent.h   1970-01-01 03:00:00.000000000 +0300
47382 +++ linux-2.6.27/fs/reiser4/plugin/item/extent.h        2008-10-12 18:20:01.000000000 +0400
47383 @@ -0,0 +1,231 @@
47384 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47385 +
47386 +#ifndef __REISER4_EXTENT_H__
47387 +#define __REISER4_EXTENT_H__
47388 +
47389 +/* on-disk extent unit; both fields are 8-byte little-endian, see accessors */
47390 +typedef struct {
47391 +       reiser4_dblock_nr start;
47392 +       reiser4_dblock_nr width;
47393 +} reiser4_extent;
47394 +
47395 +struct extent_stat {
47396 +       int unallocated_units;
47397 +       int unallocated_blocks;
47398 +       int allocated_units;
47399 +       int allocated_blocks;
47400 +       int hole_units;
47401 +       int hole_blocks;
47402 +};
47403 +
47404 +/* each extent of an extent item is in one of three states: a hole, an
47405 +   unallocated extent or an allocated extent */
47406 +typedef enum {
47407 +       HOLE_EXTENT,
47408 +       UNALLOCATED_EXTENT,
47409 +       ALLOCATED_EXTENT
47410 +} extent_state;
47411 +/* NOTE(review): presumably reserved "start" values marking the state - confirm */
47412 +#define HOLE_EXTENT_START 0
47413 +#define UNALLOCATED_EXTENT_START 1
47414 +#define UNALLOCATED_EXTENT_START2 2
47415 +
47416 +struct extent_coord_extension {
47417 +       reiser4_block_nr pos_in_unit;
47418 +       reiser4_block_nr width; /* width of current unit */
47419 +       pos_in_node_t nr_units; /* number of units */
47420 +       int ext_offset;         /* offset from the beginning of zdata() */
47421 +       unsigned long expected_page;
47422 +#if REISER4_DEBUG
47423 +       reiser4_extent extent;
47424 +#endif
47425 +};
47426 +
47427 +/* inline accessors for the little-endian fields of an on-disk extent */
47428 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47429 +{
47430 +       return le64_to_cpu(ext->start);
47431 +}
47432 +
47433 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47434 +{
47435 +       return le64_to_cpu(ext->width);
47436 +}
47437 +
47438 +extern __u64 reiser4_current_block_count(void);
47439 +/* stores @start little-endian; values above 1 are asserted against reiser4_current_block_count() */
47440 +static inline void
47441 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47442 +{
47443 +       cassert(sizeof(ext->start) == 8);
47444 +       assert("nikita-2510",
47445 +              ergo(start > 1, start < reiser4_current_block_count()));
47446 +       put_unaligned(cpu_to_le64(start), &ext->start);
47447 +}
47448 +/* stores @width little-endian; the range assert reads ext->start, so set the start first */
47449 +static inline void
47450 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
47451 +{
47452 +       cassert(sizeof(ext->width) == 8);
47453 +       assert("", width > 0);
47454 +       put_unaligned(cpu_to_le64(width), &ext->width);
47455 +       assert("nikita-2511",
47456 +              ergo(extent_get_start(ext) > 1,
47457 +                   extent_get_start(ext) + width <=
47458 +                   reiser4_current_block_count()));
47459 +}
47460 +/* coord -> extent helpers (statement expressions); @coord is evaluated more than once */
47461 +#define extent_item(coord)                                     \
47462 +({                                                             \
47463 +       assert("nikita-3143", item_is_extent(coord));           \
47464 +       ((reiser4_extent *)item_body_by_coord (coord));         \
47465 +})
47466 +
47467 +#define extent_by_coord(coord)                                 \
47468 +({                                                             \
47469 +       assert("nikita-3144", item_is_extent(coord));           \
47470 +       (extent_item (coord) + (coord)->unit_pos);              \
47471 +})
47472 +
47473 +#define width_by_coord(coord)                                  \
47474 +({                                                             \
47475 +       assert("nikita-3145", item_is_extent(coord));           \
47476 +       extent_get_width (extent_by_coord(coord));              \
47477 +})
47478 +
47479 +struct carry_cut_data;
47480 +struct carry_kill_data;
47481 +
47482 +/* plugin->u.item.b.* */
47483 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
47484 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47485 +                          const reiser4_item_data *);
47486 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
47487 +pos_in_node_t nr_units_extent(const coord_t *);
47488 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
47489 +void init_coord_extent(coord_t *);
47490 +int init_extent(coord_t *, reiser4_item_data *);
47491 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
47492 +int can_shift_extent(unsigned free_space,
47493 +                    coord_t * source, znode * target, shift_direction,
47494 +                    unsigned *size, unsigned want);
47495 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
47496 +                      unsigned count, shift_direction where_is_free_space,
47497 +                      unsigned free_space);
47498 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
47499 +                    struct carry_kill_data *);
47500 +int create_hook_extent(const coord_t * coord, void *arg);
47501 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47502 +                    struct carry_cut_data *, reiser4_key * smallest_removed,
47503 +                    reiser4_key * new_first);
47504 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47505 +                     struct carry_kill_data *, reiser4_key * smallest_removed,
47506 +                     reiser4_key * new_first);
47507 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
47508 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
47509 +void print_extent(const char *, coord_t *);
47510 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
47511 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
47512 +                                  reiser4_block_nr * block);
47513 +void item_stat_extent(const coord_t * coord, void *vp);
47514 +int reiser4_check_extent(const coord_t * coord, const char **error);
47515 +
47516 +/* plugin->u.item.s.file.* */
47517 +ssize_t reiser4_write_extent(struct file *, struct inode * inode,
47518 +                            const char __user *, size_t, loff_t *);
47519 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
47520 +int reiser4_readpage_extent(void *, struct page *);
47521 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
47522 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
47523 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
47524 +int get_block_address_extent(const coord_t *, sector_t block,
47525 +                            sector_t * result);
47526 +
47527 +/* these are used in flush.c
47528 +   FIXME-VS: should they be somewhere in item_plugin? */
47529 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
47530 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
47531 +                            reiser4_key * stop_key);
47532 +
47533 +int extent_is_unallocated(const coord_t * item);       /* True if this extent is unallocated (i.e., not a hole, not allocated). */
47534 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
47535 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
47536 +
47537 +/* plugin->u.item.f. */
47538 +int reiser4_scan_extent(flush_scan * scan);
47539 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
47540 +
47541 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47542 +                                  int nr_extents);
47543 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
47544 +extent_state state_of_extent(reiser4_extent * ext);
47545 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
47546 +                       reiser4_block_nr width);
47547 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
47548 +                         int *plugged_hole);
47549 +
47550 +#include "../../coord.h"
47551 +#include "../../lock.h"
47552 +#include "../../tap.h"
47553 +
47554 +struct replace_handle {
47555 +       /* these are to be set before calling reiser4_replace_extent */
47556 +       coord_t *coord;
47557 +       lock_handle *lh;
47558 +       reiser4_key key;
47559 +       reiser4_key *pkey;
47560 +       reiser4_extent overwrite;
47561 +       reiser4_extent new_extents[2];
47562 +       int nr_new_extents;
47563 +       unsigned flags;
47564 +
47565 +       /* these are used by reiser4_replace_extent */
47566 +       reiser4_item_data item;
47567 +       coord_t coord_after;
47568 +       lock_handle lh_after;
47569 +       tap_t watch;
47570 +       reiser4_key paste_key;
47571 +#if REISER4_DEBUG
47572 +       reiser4_extent orig_ext;
47573 +       reiser4_key tmp;
47574 +#endif
47575 +};
47576 +
47577 +/* this structure is kmalloc-ed before make_extent is called, to avoid
47578 +   excessive stack consumption on plug_hole->reiser4_replace_extent */
47579 +struct make_extent_handle {
47580 +       uf_coord_t *uf_coord;
47581 +       reiser4_block_nr blocknr;
47582 +       int created;
47583 +       struct inode *inode;
47584 +       union {
47585 +               struct {
47586 +               } append;       /* empty: carries no fields */
47587 +               struct replace_handle replace;
47588 +       } u;
47589 +};
47590 +
47591 +int reiser4_replace_extent(struct replace_handle *,
47592 +                          int return_inserted_position);
47593 +lock_handle *znode_lh(znode *);
47594 +
47595 +/* the reiser4 repacker support */
47596 +struct repacker_cursor;
47597 +extern int process_extent_backward_for_repacking(tap_t *,
47598 +                                                struct repacker_cursor *);
47599 +extern int mark_extent_for_repacking(tap_t *, int);
47600 +
47601 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
47602 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
47603 +
47604 +/* __REISER4_EXTENT_H__ */
47605 +#endif
47606 +/*
47607 +   Local variables:
47608 +   c-indentation-style: "K&R"
47609 +   mode-name: "LC"
47610 +   c-basic-offset: 8
47611 +   tab-width: 8
47612 +   fill-column: 120
47613 +   End:
47614 +*/
47615 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/extent_item_ops.c linux-2.6.27/fs/reiser4/plugin/item/extent_item_ops.c
47616 --- linux-2.6.27.orig/fs/reiser4/plugin/item/extent_item_ops.c  1970-01-01 03:00:00.000000000 +0300
47617 +++ linux-2.6.27/fs/reiser4/plugin/item/extent_item_ops.c       2008-10-12 18:20:01.000000000 +0400
47618 @@ -0,0 +1,889 @@
47619 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47620 +
47621 +#include "item.h"
47622 +#include "../../inode.h"
47623 +#include "../../tree_walk.h"   /* check_sibling_list() */
47624 +#include "../../page_cache.h"
47625 +#include "../../carry.h"
47626 +
47627 +#include <linux/quotaops.h>
47628 +
47629 +/* item_plugin->b.max_key_inside: item key of @coord with the offset of reiser4_max_key() */
47630 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
47631 +{
47632 +       item_key_by_coord(coord, key);
47633 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
47634 +       return key;
47635 +}
47636 +
47637 +/* item_plugin->b.can_contain_key
47638 +   returns 1 when @data with @key fits the item at @coord, 0 otherwise */
47639 +int
47640 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
47641 +                      const reiser4_item_data * data)
47642 +{
47643 +       reiser4_key ikey;
47644 +
47645 +       /* the item plugins have to be the same */
47646 +       if (item_plugin_by_coord(coord) != data->iplug)
47647 +               return 0;
47648 +
47649 +       /* @key fits if its locality, objectid and ordering equal those of
47650 +          the item key; the offset is not looked at */
47651 +       item_key_by_coord(coord, &ikey);
47652 +       return get_key_locality(key) == get_key_locality(&ikey) &&
47653 +           get_key_objectid(key) == get_key_objectid(&ikey) &&
47654 +           get_key_ordering(key) == get_key_ordering(&ikey);
47655 +}
47656 +
47657 +/* item_plugin->b.mergeable
47658 +   @p1 is an extent item; returns 1 if item @p2 directly continues it */
47659 +/* Audited by: green(2002.06.13) */
47660 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
47661 +{
47662 +       reiser4_key head, tail;
47663 +
47664 +       assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
47665 +       /* FIXME-VS: Which is it? Assert or return 0 */
47666 +       if (item_id_by_coord(p2) != EXTENT_POINTER_ID)
47667 +               return 0;
47668 +
47669 +       item_key_by_coord(p1, &head);
47670 +       item_key_by_coord(p2, &tail);
47671 +       /* locality, objectid, ordering and type have to be equal */
47672 +       if (!(get_key_locality(&head) == get_key_locality(&tail) &&
47673 +             get_key_objectid(&head) == get_key_objectid(&tail) &&
47674 +             get_key_ordering(&head) == get_key_ordering(&tail) &&
47675 +             get_key_type(&head) == get_key_type(&tail)))
47676 +               return 0;
47677 +
47678 +       /* @p2 has to start exactly where the units of @p1 end */
47679 +       return get_key_offset(&head) +
47680 +           reiser4_extent_size(p1, nr_units_extent(p1)) ==
47681 +           get_key_offset(&tail);
47682 +}
47683 +
47684 +/* item_plugin->b.nr_units: item length divided by the size of one extent */
47685 +pos_in_node_t nr_units_extent(const coord_t * coord)
47686 +{
47687 +       /* length of extent item has to be multiple of extent size */
47688 +       assert("vs-1424",
47689 +              (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
47690 +       return item_length_by_coord(coord) / sizeof(reiser4_extent);
47691 +}
47692 +
47693 +/* item_plugin->b.lookup
47694 +   @coord is set to an extent item; point it at the unit which addresses the
47695 +   offset of @key, or after the last unit if there is none. The result is
47696 +   always CBK_COORD_FOUND. @bias is not used */
47697 +lookup_result
47698 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
47699 +             coord_t * coord)
47700 +{
47701 +       reiser4_key item_key;
47702 +       reiser4_block_nr lookuped, offset;
47703 +       unsigned i, nr_units;
47704 +       reiser4_extent *ext;
47705 +       unsigned char blocksize_bits;
47706 +
47707 +       item_key_by_coord(coord, &item_key);
47708 +       offset = get_key_offset(&item_key);
47709 +
47710 +       /* key we are looking for must be greater than key of item @coord */
47711 +       assert("vs-414", keygt(key, &item_key));
47712 +
47713 +       assert("umka-99945",
47714 +              !keygt(key, max_key_inside_extent(coord, &item_key)));
47715 +
47716 +       ext = extent_item(coord);
47717 +       assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
47718 +
47719 +       blocksize_bits = current_blocksize_bits;
47720 +
47721 +       /* offset we are looking for */
47722 +       lookuped = get_key_offset(key);
47723 +
47724 +       nr_units = nr_units_extent(coord);
47725 +       /* walk the units, adding the byte length of each one to @offset, until
47726 +          @offset passes the offset we are looking for */
47727 +       for (i = 0; i < nr_units; i++, ext++) {
47728 +               offset += (extent_get_width(ext) << blocksize_bits);
47729 +               if (offset > lookuped) {
47730 +                       /* desired byte is somewhere in this extent */
47731 +                       coord->unit_pos = i;
47732 +                       coord->between = AT_UNIT;
47733 +                       return CBK_COORD_FOUND;
47734 +               }
47735 +       }
47736 +
47737 +       /* set coord after last unit */
47738 +       coord->unit_pos = nr_units - 1;
47739 +       coord->between = AFTER_UNIT;
47740 +       return CBK_COORD_FOUND;
47741 +}
47742 +
47743 +/* item_plugin->b.paste
47744 +   The item @coord is set to has already been extended by @data->length bytes
47745 +   of free space. @data->data holds the units to be pasted into the item at
47746 +   position @coord->unit_pos; they must fit into that free space.
47747 +   @coord must be set between units (or at unit 0 of a still empty item).
47748 +*/
47749 +int
47750 +paste_extent(coord_t * coord, reiser4_item_data * data,
47751 +            carry_plugin_info * info UNUSED_ARG)
47752 +{
47753 +       unsigned old_nr_units;
47754 +       reiser4_extent *ext;
47755 +       int item_length;
47756 +
47757 +       ext = extent_item(coord);
47758 +       item_length = item_length_by_coord(coord);
47759 +       old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
47760 +
47761 +       /* this is also used to copy extent into newly created item, so
47762 +          old_nr_units could be 0 */
47763 +       assert("vs-260", item_length >= data->length);
47764 +
47765 +       /* make sure that coord is set properly */
47766 +       assert("vs-35",
47767 +              ((!coord_is_existing_unit(coord))
47768 +               || (!old_nr_units && !coord->unit_pos)));
47769 +
47770 +       /* first unit to be moved */
47771 +       switch (coord->between) {
47772 +       case AFTER_UNIT:
47773 +               coord->unit_pos++;      /* fall through */
47774 +       case BEFORE_UNIT:
47775 +               coord->between = AT_UNIT;
47776 +               break;
47777 +       case AT_UNIT:
47778 +               assert("vs-331", !old_nr_units && !coord->unit_pos);
47779 +               break;
47780 +       default:
47781 +               impossible("vs-330", "coord is set improperly");
47782 +       }
47783 +
47784 +       /* make room: move the old units at and after unit_pos towards the end */
47785 +       memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
47786 +               ext + coord->unit_pos,
47787 +               (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
47788 +
47789 +       /* copy new data from kernel space */
47790 +       assert("vs-556", data->user == 0);
47791 +       memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
47792 +
47793 +       /* after paste @coord is set to first of pasted units */
47794 +       assert("vs-332", coord_is_existing_unit(coord));
47795 +       assert("vs-333",
47796 +              !memcmp(data->data, extent_by_coord(coord),
47797 +                      (unsigned)data->length));
47798 +       return 0;
47799 +}
47800 +
47801 +/* item_plugin->b.can_shift
47802 +   returns how many whole units can be shifted (at most @want, and no more
47803 +   than fit into @free_space); *@size is set to their length in bytes */
47804 +int
47805 +can_shift_extent(unsigned free_space, coord_t * source,
47806 +                znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
47807 +                unsigned *size, unsigned want)
47808 +{
47809 +       *size = item_length_by_coord(source);
47810 +       if (*size > free_space)
47811 +               /* never split a unit of extent item */
47812 +               *size = free_space - free_space % sizeof(reiser4_extent);
47813 +       /* we can shift *size bytes, calculate how many we want to shift */
47814 +       if (*size > want * sizeof(reiser4_extent))
47815 +               *size = want * sizeof(reiser4_extent);
47816 +
47817 +       if (*size % sizeof(reiser4_extent) != 0)
47818 +               impossible("vs-119", "Wrong extent size: %u %zu", *size,
47819 +                          sizeof(reiser4_extent));
47820 +       return *size / sizeof(reiser4_extent);
47821 +}
47822 +
47823 +/* item_plugin->b.copy_units: copy @count units of @source into free space of @target */
47824 +void
47825 +copy_units_extent(coord_t * target, coord_t * source,
47826 +                 unsigned from, unsigned count,
47827 +                 shift_direction where_is_free_space, unsigned free_space)
47828 +{
47829 +       char *from_ext, *to_ext;
47830 +
47831 +       assert("vs-217", free_space == count * sizeof(reiser4_extent));
47832 +
47833 +       from_ext = item_body_by_coord(source);
47834 +       to_ext = item_body_by_coord(target);
47835 +
47836 +       if (where_is_free_space == SHIFT_LEFT) {
47837 +               assert("vs-215", from == 0);
47838 +
47839 +               /* At this moment, item length was already updated in the item
47840 +                  header by shifting code, hence nr_units_extent() will
47841 +                  return "new" number of units---one we obtain after copying
47842 +                  units. So the copy lands in the last @count units of @target.
47843 +                */
47844 +               to_ext +=
47845 +                   (nr_units_extent(target) - count) * sizeof(reiser4_extent);
47846 +       } else {
47847 +               reiser4_key key;
47848 +               coord_t coord;
47849 +
47850 +               assert("vs-216",
47851 +                      from + count == coord_last_unit_pos(source) + 1);
47852 +
47853 +               from_ext += item_length_by_coord(source) - free_space;
47854 +
47855 +               /* new units are inserted before the first unit of the target
47856 +                  item, therefore we have to update its item key */
47857 +               coord = *source;
47858 +               coord.unit_pos = from;
47859 +               unit_key_extent(&coord, &key);
47860 +
47861 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
47862 +                                                                  NULL /*info */);
47863 +       }
47864 +
47865 +       memcpy(to_ext, from_ext, free_space);
47866 +}
47867 +
47868 +/* item_plugin->b.create_hook
47869 +   @arg is NULL (then nothing is done) or a coord_t set within a leaf node */
47870 +int create_hook_extent(const coord_t * coord, void *arg)
47871 +{
47872 +       coord_t *child_coord;
47873 +       znode *node;
47874 +       reiser4_key key;
47875 +       reiser4_tree *tree;
47876 +
47877 +       if (!arg)
47878 +               return 0;
47879 +
47880 +       child_coord = arg;
47881 +       tree = znode_get_tree(coord->node);
47882 +
47883 +       assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
47884 +
47885 +       write_lock_tree(tree);
47886 +       write_lock_dk(tree);
47887 +       /* pick the leaf whose right delimiting key has to be updated: the
47888 +          left neighbor of child_coord->node or that node itself */
47889 +       if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
47890 +               assert("vs-411", znode_is_left_connected(child_coord->node));
47891 +               node = child_coord->node->left;
47892 +       } else {
47893 +               assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
47894 +               node = child_coord->node;
47895 +               assert("nikita-3314", node != NULL);
47896 +       }
47897 +
47898 +       if (node != NULL) {
47899 +               znode_set_rd_key(node, item_key_by_coord(coord, &key));
47900 +
47901 +               assert("nikita-3282", check_sibling_list(node));
47902 +               /* break sibling links */
47903 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
47904 +                       ON_DEBUG(node->right->left_version =
47905 +                                atomic_inc_return(&delim_key_version);
47906 +                                node->right_version =
47907 +                                atomic_inc_return(&delim_key_version););
47908 +
47909 +                       node->right->left = NULL;
47910 +                       node->right = NULL;
47911 +               }
47912 +       }
47913 +       write_unlock_dk(tree);
47914 +       write_unlock_tree(tree);
47915 +       return 0;
47916 +}
47917 +
47918 +#define ITEM_TAIL_KILLED 0
47919 +#define ITEM_HEAD_KILLED 1
47920 +#define ITEM_KILLED 2
47921 +
47922 +/* item_plugin->b.kill_hook
47923 +   this is called when @count units starting from @from-th one are going to be removed
47924 +   */
47925 +int
47926 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
47927 +                struct carry_kill_data *kdata)
47928 +{
47929 +       reiser4_extent *ext;
47930 +       reiser4_block_nr start, length;
47931 +       const reiser4_key *pfrom_key, *pto_key;
47932 +       struct inode *inode;
47933 +       reiser4_tree *tree;
47934 +       pgoff_t from_off, to_off, offset, skip;
47935 +       int retval;
47936 +
47937 +       /* these are located in memory kmalloc-ed by kill_node_content */
47938 +       reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
47939 +       coord_t *dup, *next;
47940 +
47941 +       assert("zam-811", znode_is_write_locked(coord->node));
47942 +       assert("nikita-3315", kdata != NULL);
47943 +       assert("vs-34", kdata->buf != NULL);
47944 +
47945 +       /* map structures to kdata->buf */
47946 +       min_item_key = (reiser4_key *) (kdata->buf);
47947 +       max_item_key = min_item_key + 1;
47948 +       from_key = max_item_key + 1;
47949 +       to_key = from_key + 1;
47950 +       key = to_key + 1;
47951 +       dup = (coord_t *) (key + 1);
47952 +       next = dup + 1;
47953 +
47954 +       item_key_by_coord(coord, min_item_key);
47955 +       max_item_key_by_coord(coord, max_item_key);
47956 +
47957 +       if (kdata->params.from_key) {
47958 +               pfrom_key = kdata->params.from_key;
47959 +               pto_key = kdata->params.to_key;
47960 +       } else {
47961 +               assert("vs-1549", from == coord->unit_pos);
47962 +               unit_key_by_coord(coord, from_key);
47963 +               pfrom_key = from_key;
47964 +
47965 +               coord_dup(dup, coord);
47966 +               dup->unit_pos = from + count - 1;
47967 +               max_unit_key_by_coord(dup, to_key);
47968 +               pto_key = to_key;
47969 +       }
47970 +
47971 +       if (!keylt(pto_key, max_item_key)) {
47972 +               if (!keygt(pfrom_key, min_item_key)) {
47973 +                       znode *left, *right;
47974 +
47975 +                       /* item is to be removed completely */
47976 +                       assert("nikita-3316", kdata->left != NULL
47977 +                              && kdata->right != NULL);
47978 +
47979 +                       left = kdata->left->node;
47980 +                       right = kdata->right->node;
47981 +
47982 +                       tree = current_tree;
47983 +                       /* we have to do two things:
47984 +                        *
47985 +                        *     1. link left and right formatted neighbors of
47986 +                        *        extent being removed, and
47987 +                        *
47988 +                        *     2. update their delimiting keys.
47989 +                        *
47990 +                        * atomicity of these operations is protected by
47991 +                        * taking dk-lock and tree-lock.
47992 +                        */
47993 +                       /* if neighbors of item being removed are znodes -
47994 +                        * link them */
47995 +                       write_lock_tree(tree);
47996 +                       write_lock_dk(tree);
47997 +                       link_left_and_right(left, right);
47998 +                       if (left) {
47999 +                               /* update right delimiting key of left
48000 +                                * neighbor of extent item */
48001 +                               /*coord_t next;
48002 +                                  reiser4_key key; */
48003 +
48004 +                               coord_dup(next, coord);
48005 +
48006 +                               if (coord_next_item(next))
48007 +                                       *key = *znode_get_rd_key(coord->node);
48008 +                               else
48009 +                                       item_key_by_coord(next, key);
48010 +                               znode_set_rd_key(left, key);
48011 +                       }
48012 +                       write_unlock_dk(tree);
48013 +                       write_unlock_tree(tree);
48014 +
48015 +                       from_off =
48016 +                           get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
48017 +                       to_off =
48018 +                           (get_key_offset(max_item_key) +
48019 +                            1) >> PAGE_CACHE_SHIFT;
48020 +                       retval = ITEM_KILLED;
48021 +               } else {
48022 +                       /* tail of item is to be removed */
48023 +                       from_off =
48024 +                           (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
48025 +                            1) >> PAGE_CACHE_SHIFT;
48026 +                       to_off =
48027 +                           (get_key_offset(max_item_key) +
48028 +                            1) >> PAGE_CACHE_SHIFT;
48029 +                       retval = ITEM_TAIL_KILLED;
48030 +               }
48031 +       } else {
48032 +               /* head of item is to be removed */
48033 +               assert("vs-1571", keyeq(pfrom_key, min_item_key));
48034 +               assert("vs-1572",
48035 +                      (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
48036 +                      0);
48037 +               assert("vs-1573",
48038 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48039 +                                                        1)) == 0);
48040 +
48041 +               if (kdata->left->node) {
48042 +                       /* update right delimiting key of left neighbor of extent item */
48043 +                       /*reiser4_key key; */
48044 +
48045 +                       *key = *pto_key;
48046 +                       set_key_offset(key, get_key_offset(pto_key) + 1);
48047 +
48048 +                       write_lock_dk(current_tree);
48049 +                       znode_set_rd_key(kdata->left->node, key);
48050 +                       write_unlock_dk(current_tree);
48051 +               }
48052 +
48053 +               from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
48054 +               to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
48055 +               retval = ITEM_HEAD_KILLED;
48056 +       }
48057 +
48058 +       inode = kdata->inode;
48059 +       assert("vs-1545", inode != NULL);
48060 +       if (inode != NULL)
48061 +               /* take care of pages and jnodes corresponding to part of item being killed */
48062 +               reiser4_invalidate_pages(inode->i_mapping, from_off,
48063 +                                        to_off - from_off,
48064 +                                        kdata->params.truncate);
48065 +
48066 +       ext = extent_item(coord) + from;
48067 +       offset =
48068 +           (get_key_offset(min_item_key) +
48069 +            reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
48070 +
48071 +       assert("vs-1551", from_off >= offset);
48072 +       assert("vs-1552", from_off - offset <= extent_get_width(ext));
48073 +       skip = from_off - offset;
48074 +       offset = from_off;
48075 +
48076 +       while (offset < to_off) {
48077 +               length = extent_get_width(ext) - skip;
48078 +               if (state_of_extent(ext) == HOLE_EXTENT) {
48079 +                       skip = 0;
48080 +                       offset += length;
48081 +                       ext++;
48082 +                       continue;
48083 +               }
48084 +
48085 +               if (offset + length > to_off) {
48086 +                       length = to_off - offset;
48087 +               }
48088 +
48089 +               DQUOT_FREE_BLOCK_NODIRTY(inode, length);
48090 +
48091 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48092 +                       /* some jnodes corresponding to this unallocated extent */
48093 +                       fake_allocated2free(length, 0 /* unformatted */ );
48094 +
48095 +                       skip = 0;
48096 +                       offset += length;
48097 +                       ext++;
48098 +                       continue;
48099 +               }
48100 +
48101 +               assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
48102 +
48103 +               if (length != 0) {
48104 +                       start = extent_get_start(ext) + skip;
48105 +
48106 +                       /* BA_DEFER bit parameter is turned on because blocks which get freed are not safe to be freed
48107 +                          immediately */
48108 +                       reiser4_dealloc_blocks(&start, &length,
48109 +                                              0 /* not used */ ,
48110 +                                              BA_DEFER
48111 +                                              /* unformatted with defer */ );
48112 +               }
48113 +               skip = 0;
48114 +               offset += length;
48115 +               ext++;
48116 +       }
48117 +       return retval;
48118 +}
48119 +
48120 +/* item_plugin->b.kill_units: kill units @from..@to; returns byte size of the units removed whole */
48121 +int
48122 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48123 +                 struct carry_kill_data *kdata, reiser4_key * smallest_removed,
48124 +                 reiser4_key * new_first)
48125 +{
48126 +       reiser4_extent *ext;
48127 +       reiser4_key item_key;
48128 +       pos_in_node_t count;
48129 +       reiser4_key from_key, to_key;
48130 +       const reiser4_key *pfrom_key, *pto_key;
48131 +       loff_t off;
48132 +       int result;
48133 +
48134 +       assert("vs-1541",
48135 +              ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
48136 +               || (kdata->params.from_key != NULL
48137 +                   && kdata->params.to_key != NULL)));
48138 +
48139 +       if (kdata->params.from_key) {
48140 +               pfrom_key = kdata->params.from_key;
48141 +               pto_key = kdata->params.to_key;
48142 +       } else {
48143 +               coord_t dup;
48144 +
48145 +               /* no explicit key range was given: derive the range of the kill from units @from..@to */
48146 +               assert("vs-1549", from == coord->unit_pos);
48147 +               unit_key_by_coord(coord, &from_key);
48148 +               pfrom_key = &from_key;
48149 +
48150 +               coord_dup(&dup, coord);
48151 +               dup.unit_pos = to;
48152 +               max_unit_key_by_coord(&dup, &to_key);
48153 +               pto_key = &to_key;
48154 +       }
48155 +
48156 +       item_key_by_coord(coord, &item_key);
48157 +
48158 +#if REISER4_DEBUG
48159 +       {
48160 +               reiser4_key max_item_key;
48161 +
48162 +               max_item_key_by_coord(coord, &max_item_key);
48163 +
48164 +               if (new_first) {
48165 +                       /* head of item is to be cut */
48166 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
48167 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
48168 +               } else {
48169 +                       /* tail of item is to be cut */
48170 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
48171 +                       assert("vs-1543", !keylt(pto_key, &max_item_key));
48172 +               }
48173 +       }
48174 +#endif
48175 +
48176 +       if (smallest_removed)
48177 +               *smallest_removed = *pfrom_key;
48178 +
48179 +       if (new_first) {
48180 +               /* item head is cut. Item key will change. The new key, one byte past @pto_key, is calculated here */
48181 +               assert("vs-1556",
48182 +                      (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48183 +                      (PAGE_CACHE_SIZE - 1));
48184 +               *new_first = *pto_key;
48185 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
48186 +       }
48187 +
48188 +       count = to - from + 1;  /* number of units in [@from, @to] */
48189 +       result = kill_hook_extent(coord, from, count, kdata);
48190 +       if (result == ITEM_TAIL_KILLED) {
48191 +               assert("vs-1553",
48192 +                      get_key_offset(pfrom_key) >=
48193 +                      get_key_offset(&item_key) +
48194 +                      reiser4_extent_size(coord, from));
48195 +               off =
48196 +                   get_key_offset(pfrom_key) -
48197 +                       (get_key_offset(&item_key) +
48198 +                        reiser4_extent_size(coord, from));
48199 +               if (off) {
48200 +                       /* unit @from is cut partially: keep its first @off bytes, rounded up to whole pages */
48201 +                       ext = extent_item(coord) + from;
48202 +                       extent_set_width(ext,
48203 +                                        (off + PAGE_CACHE_SIZE -
48204 +                                         1) >> PAGE_CACHE_SHIFT);
48205 +                       count--;        /* shortened unit @from stays in the item */
48206 +               }
48207 +       } else {
48208 +               __u64 max_to_offset;
48209 +               __u64 rest;
48210 +
48211 +               assert("vs-1575", result == ITEM_HEAD_KILLED);
48212 +               assert("", from == 0);
48213 +               assert("",
48214 +                      ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
48215 +                                                        1)) == 0);
48216 +               assert("",
48217 +                      get_key_offset(pto_key) + 1 >
48218 +                      get_key_offset(&item_key) +
48219 +                      reiser4_extent_size(coord, to));
48220 +               max_to_offset =
48221 +                   get_key_offset(&item_key) +
48222 +                       reiser4_extent_size(coord, to + 1) - 1;
48223 +               assert("", get_key_offset(pto_key) <= max_to_offset);
48224 +
48225 +               rest =
48226 +                   (max_to_offset -
48227 +                    get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
48228 +               if (rest) {
48229 +                       /* unit @to is to be cut partially: only its last @rest pages are kept */
48230 +                       ext = extent_item(coord) + to;
48231 +
48232 +                       assert("", extent_get_width(ext) > rest);
48233 +
48234 +                       if (state_of_extent(ext) == ALLOCATED_EXTENT)
48235 +                               extent_set_start(ext,
48236 +                                                extent_get_start(ext) +
48237 +                                                (extent_get_width(ext) -
48238 +                                                 rest));
48239 +
48240 +                       extent_set_width(ext, rest);
48241 +                       count--;        /* shortened unit @to stays in the item */
48242 +               }
48243 +       }
48244 +       return count * sizeof(reiser4_extent);
48245 +}
48246 +
48247 +/* item_plugin->b.cut_units: cut units @from..@to; returns byte size of the units removed whole.
48248 +   this is too similar to kill_units_extent */
48249 +int
48250 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48251 +                struct carry_cut_data *cdata, reiser4_key * smallest_removed,
48252 +                reiser4_key * new_first)
48253 +{
48254 +       reiser4_extent *ext;
48255 +       reiser4_key item_key;
48256 +       pos_in_node_t count;
48257 +       reiser4_key from_key, to_key;
48258 +       const reiser4_key *pfrom_key, *pto_key;
48259 +       loff_t off;
48260 +
48261 +       assert("vs-1541",
48262 +              ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
48263 +               || (cdata->params.from_key != NULL
48264 +                   && cdata->params.to_key != NULL)));
48265 +
48266 +       if (cdata->params.from_key) {
48267 +               pfrom_key = cdata->params.from_key;
48268 +               pto_key = cdata->params.to_key;
48269 +       } else {
48270 +               coord_t dup;
48271 +
48272 +               /* no explicit key range was given: derive the range of the cut from units @from..@to */
48273 +               coord_dup(&dup, coord);
48274 +               dup.unit_pos = from;
48275 +               unit_key_by_coord(&dup, &from_key);
48276 +
48277 +               dup.unit_pos = to;
48278 +               max_unit_key_by_coord(&dup, &to_key);
48279 +
48280 +               pfrom_key = &from_key;
48281 +               pto_key = &to_key;
48282 +       }
48283 +
48284 +       assert("vs-1555",
48285 +              (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
48286 +       assert("vs-1556",
48287 +              (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
48288 +              (PAGE_CACHE_SIZE - 1));
48289 +
48290 +       item_key_by_coord(coord, &item_key);
48291 +
48292 +#if REISER4_DEBUG
48293 +       {
48294 +               reiser4_key max_item_key;
48295 +
48296 +               assert("vs-1584",
48297 +                      get_key_locality(pfrom_key) ==
48298 +                      get_key_locality(&item_key));
48299 +               assert("vs-1585",
48300 +                      get_key_type(pfrom_key) == get_key_type(&item_key));
48301 +               assert("vs-1586",
48302 +                      get_key_objectid(pfrom_key) ==
48303 +                      get_key_objectid(&item_key));
48304 +               assert("vs-1587",
48305 +                      get_key_ordering(pfrom_key) ==
48306 +                      get_key_ordering(&item_key));
48307 +
48308 +               max_item_key_by_coord(coord, &max_item_key);
48309 +
48310 +               if (new_first != NULL) {
48311 +                       /* head of item is to be cut */
48312 +                       assert("vs-1542", keyeq(pfrom_key, &item_key));
48313 +                       assert("vs-1538", keylt(pto_key, &max_item_key));
48314 +               } else {
48315 +                       /* tail of item is to be cut */
48316 +                       assert("vs-1540", keygt(pfrom_key, &item_key));
48317 +                       assert("vs-1543", keyeq(pto_key, &max_item_key));
48318 +               }
48319 +       }
48320 +#endif
48321 +
48322 +       if (smallest_removed)
48323 +               *smallest_removed = *pfrom_key;
48324 +
48325 +       if (new_first) {
48326 +               /* item head is cut. Item key will change. The new key, one byte past @pto_key, is calculated here */
48327 +               *new_first = *pto_key;
48328 +               set_key_offset(new_first, get_key_offset(new_first) + 1);
48329 +       }
48330 +
48331 +       count = to - from + 1;  /* number of units in [@from, @to] */
48332 +
48333 +       assert("vs-1553",
48334 +              get_key_offset(pfrom_key) >=
48335 +              get_key_offset(&item_key) + reiser4_extent_size(coord, from));
48336 +       off =
48337 +           get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
48338 +                                        reiser4_extent_size(coord, from));
48339 +       if (off) {
48340 +               /* tail of unit @from is to be cut partially. Its width decreases */
48341 +               assert("vs-1582", new_first == NULL);
48342 +               ext = extent_item(coord) + from;
48343 +               extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
48344 +               count--;        /* shortened unit @from stays in the item */
48345 +       }
48346 +
48347 +       assert("vs-1554",
48348 +              get_key_offset(pto_key) <=
48349 +              get_key_offset(&item_key) +
48350 +              reiser4_extent_size(coord, to + 1) - 1);
48351 +       off =
48352 +               (get_key_offset(&item_key) +
48353 +                reiser4_extent_size(coord, to + 1) - 1) -
48354 +               get_key_offset(pto_key);
48355 +       if (off) {
48356 +               /* @to_key is smaller than max key of unit @to. Unit @to will not be removed. It gets start increased
48357 +                  and width decreased. */
48358 +               assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
48359 +               ext = extent_item(coord) + to;
48360 +               if (state_of_extent(ext) == ALLOCATED_EXTENT)
48361 +                       extent_set_start(ext,
48362 +                                        extent_get_start(ext) +
48363 +                                        (extent_get_width(ext) -
48364 +                                         (off >> PAGE_CACHE_SHIFT)));
48365 +
48366 +               extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
48367 +               count--;        /* shortened unit @to stays in the item */
48368 +       }
48369 +       return count * sizeof(reiser4_extent);
48370 +}
48371 +
48372 +/* item_plugin->b.unit_key: fill @key with the key of the unit at @coord and return @key */
48373 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
48374 +{
48375 +       __u64 offset;
48376 +
48377 +       assert("vs-300", coord_is_existing_unit(coord));
48378 +       item_key_by_coord(coord, key);
48379 +       offset = get_key_offset(key);
48380 +       offset += reiser4_extent_size(coord, coord->unit_pos);
48381 +       set_key_offset(key, offset);
48382 +       return key;
48383 +}
48384 +
48385 +/* item_plugin->b.max_unit_key: fill @key with the maximal key of the unit at @coord and return @key */
48386 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
48387 +{
48388 +       __u64 offset;
48389 +
48390 +       assert("vs-300", coord_is_existing_unit(coord));
48391 +       item_key_by_coord(coord, key);
48392 +       offset = get_key_offset(key) + reiser4_extent_size(coord, coord->unit_pos + 1) - 1;
48393 +       set_key_offset(key, offset);
48394 +       return key;
48395 +}
48396 +
48397 +/* item_plugin->b.estimate
48398 +   item_plugin->b.item_data_by_flow */
48399 +
48400 +#if REISER4_DEBUG
48401 +
48402 +/* item_plugin->b.check
48403 +   used for debugging, every item should have here the most complete
48404 +   possible check of the consistency of the item that the inventor can
48405 +   construct. Returns 0 if no problem is found, -1 with *@error set otherwise.
48406 +*/
48407 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
48408 +                        const char **error /* where to store error message */)
48409 +{
48410 +       reiser4_extent *ext, *first;
48411 +       unsigned i, j;
48412 +       reiser4_block_nr start, width, blk_cnt;
48413 +       unsigned num_units;
48414 +       reiser4_tree *tree;
48415 +       oid_t oid;
48416 +       reiser4_key key;
48417 +       coord_t scan;
48418 +
48419 +       assert("vs-933", REISER4_DEBUG);
48420 +
48421 +       if (znode_get_level(coord->node) != TWIG_LEVEL) {
48422 +               *error = "Extent on the wrong level";
48423 +               return -1;
48424 +       }
48425 +       if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
48426 +               *error = "Wrong item size";
48427 +               return -1;
48428 +       }
48429 +       ext = first = extent_item(coord);
48430 +       blk_cnt = reiser4_block_count(reiser4_get_current_sb());
48431 +       num_units = coord_num_units(coord);
48432 +       tree = znode_get_tree(coord->node);     /* only used by the disabled jnode check below */
48433 +       item_key_by_coord(coord, &key);
48434 +       oid = get_key_objectid(&key);           /* only used by the disabled jnode check below */
48435 +       coord_dup(&scan, coord);
48436 +
48437 +       for (i = 0; i < num_units; ++i, ++ext) {
48438 +               __u64 index;
48439 +
48440 +               scan.unit_pos = i;
48441 +               index = extent_unit_index(&scan);
48442 +
48443 +#if 0
48444 +               /* check that all jnodes are present for the unallocated
48445 +                * extent */
48446 +               if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
48447 +                       for (j = 0; j < extent_get_width(ext); j++) {
48448 +                               jnode *node;
48449 +
48450 +                               node = jlookup(tree, oid, index + j);
48451 +                               if (node == NULL) {
48452 +                                       print_coord("scan", &scan, 0);
48453 +                                       *error = "Jnode missing";
48454 +                                       return -1;
48455 +                               }
48456 +                               jput(node);
48457 +                       }
48458 +               }
48459 +#endif
48460 +
48461 +               start = extent_get_start(ext);
48462 +               if (start < 2)
48463 +                       continue;       /* not an allocated extent (presumably hole or unallocated): skip */
48464 +               /* extent is allocated one */
48465 +               width = extent_get_width(ext);
48466 +               if (start >= blk_cnt) {
48467 +                       *error = "Start too large";
48468 +                       return -1;
48469 +               }
48470 +               if (start + width > blk_cnt) {
48471 +                       *error = "End too large";
48472 +                       return -1;
48473 +               }
48474 +               /* make sure that this extent does not overlap with the
48475 +                  allocated extents preceding it in this item */
48476 +               for (j = 0; j < i; j++) {
48477 +                       if (state_of_extent(first + j) != ALLOCATED_EXTENT)
48478 +                               continue;
48479 +                       if (!
48480 +                           ((extent_get_start(ext) >=
48481 +                             extent_get_start(first + j) +
48482 +                             extent_get_width(first + j))
48483 +                            || (extent_get_start(ext) +
48484 +                                extent_get_width(ext) <=
48485 +                                extent_get_start(first + j)))) {
48486 +                               *error = "Extent overlaps with others";
48487 +                               return -1;
48488 +                       }
48489 +               }
48490 +
48491 +       }
48492 +
48493 +       return 0;
48494 +}
48495 +
48496 +#endif                         /* REISER4_DEBUG */
48497 +
48498 +/*
48499 +   Local variables:
48500 +   c-indentation-style: "K&R"
48501 +   mode-name: "LC"
48502 +   c-basic-offset: 8
48503 +   tab-width: 8
48504 +   fill-column: 120
48505 +   scroll-step: 1
48506 +   End:
48507 +*/
48508 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/internal.c linux-2.6.27/fs/reiser4/plugin/item/internal.c
48509 --- linux-2.6.27.orig/fs/reiser4/plugin/item/internal.c 1970-01-01 03:00:00.000000000 +0300
48510 +++ linux-2.6.27/fs/reiser4/plugin/item/internal.c      2008-10-12 18:20:01.000000000 +0400
48511 @@ -0,0 +1,404 @@
48512 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48513 +
48514 +/* Implementation of internal-item plugin methods. */
48515 +
48516 +#include "../../forward.h"
48517 +#include "../../debug.h"
48518 +#include "../../dformat.h"
48519 +#include "../../key.h"
48520 +#include "../../coord.h"
48521 +#include "internal.h"
48522 +#include "item.h"
48523 +#include "../node/node.h"
48524 +#include "../plugin.h"
48525 +#include "../../jnode.h"
48526 +#include "../../znode.h"
48527 +#include "../../tree_walk.h"
48528 +#include "../../tree_mod.h"
48529 +#include "../../tree.h"
48530 +#include "../../super.h"
48531 +#include "../../block_alloc.h"
48532 +
48533 +/* see internal.h for explanation */
48534 +
48535 +/* plugin->u.item.b.mergeable: both arguments are ignored, the answer is always 0 */
48536 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
48537 +                      const coord_t * p2 UNUSED_ARG /* second item */ )
48538 +{
48539 +       /* internal items are not mergeable */
48540 +       return 0;
48541 +}
48542 +
48543 +/* ->lookup() method for internal items: CBK_COORD_FOUND unless the unit key compares GREATER_THAN @key */
48544 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
48545 +                             lookup_bias bias UNUSED_ARG /* lookup bias */ ,
48546 +                             coord_t * coord /* coord of item */ )
48547 +{
48548 +       reiser4_key ukey;
48549 +
48550 +       switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
48551 +       default:
48552 +               impossible("", "keycmp()?!");   /* no break: falls through to LESS_THAN */
48553 +       case LESS_THAN:
48554 +               /* FIXME-VS: AFTER_ITEM used to be here. But with new coord
48555 +                  item plugin can not be taken using coord set this way */
48556 +               assert("vs-681", coord->unit_pos == 0);
48557 +               coord->between = AFTER_UNIT;    /* fall through */
48558 +       case EQUAL_TO:
48559 +               return CBK_COORD_FOUND;
48560 +       case GREATER_THAN:
48561 +               return CBK_COORD_NOTFOUND;
48562 +       }
48563 +}
48564 +
48565 +/* view the body of the item at @coord, which must be a node pointer item, as internal_item_layout */
48566 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
48567 +                                                                * item */ )
48568 +{
48569 +       internal_item_layout *body;
48570 +       assert("nikita-607", coord != NULL);
48571 +       assert("nikita-1650", item_plugin_by_coord(coord) == item_plugin_by_id(NODE_POINTER_ID));
48572 +       body = (internal_item_layout *) item_body_by_coord(coord);
48573 +       return body;
48574 +}
48575 +
48576 +void reiser4_update_internal(const coord_t * coord,
48577 +                            const reiser4_block_nr * blocknr)
48578 +{
48579 +       /* store *@blocknr in the item at @coord as an unaligned little-endian 64-bit value */
48580 +       internal_item_layout *layout = internal_at(coord);
48581 +       assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
48582 +       put_unaligned(cpu_to_le64(*blocknr), &layout->pointer);
48583 +}
48584 +
48585 +/* return child block number stored (unaligned, little-endian) in the internal item at @coord, in cpu byte order */
48586 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
48587 +{
48588 +       assert("nikita-608", coord != NULL);
48589 +       return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
48590 +}
48591 +
48592 +/* get znode pointed to by internal @item; callers in this file check the result for NULL and zput() it */
48593 +static znode *znode_at(const coord_t * item /* coord of item */ ,
48594 +                      znode * parent /* parent node */ )
48595 +{
48596 +       return child_znode(item, parent, 1, 0); /* NOTE(review): meaning of the 1, 0 flags is set by child_znode() - confirm */
48597 +}
48598 +
48599 +/* store pointer from internal item into "block". Implementation of
48600 +    ->down_link() method. @key is only looked at by an assertion. */
48601 +void down_link_internal(const coord_t * coord /* coord of item */ ,
48602 +                       const reiser4_key * key UNUSED_ARG      /* key to get
48603 +                                                                * pointer for */ ,
48604 +                       reiser4_block_nr * block /* resulting block number */ )
48605 +{
48606 +       ON_DEBUG(reiser4_key item_key); /* scratch key for the "nikita-612" assertion */
48607 +
48608 +       assert("nikita-609", coord != NULL);
48609 +       assert("nikita-611", block != NULL);
48610 +       assert("nikita-612", (key == NULL) ||
48611 +              /* twig horrors: the key ordering check is skipped on the twig level */
48612 +              (znode_get_level(coord->node) == TWIG_LEVEL)
48613 +              || keyle(item_key_by_coord(coord, &item_key), key));
48614 +
48615 +       *block = pointer_at(coord);
48616 +       assert("nikita-2960", reiser4_blocknr_is_sane(block));
48617 +}
48618 +
48619 +/* Store the child's block number in @block, or 0 if that number is fake (unallocated). Always returns 0. */
48620 +int
48621 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
48622 +                                reiser4_block_nr * block)
48623 +{
48624 +       assert("jmacd-2059", coord != NULL);
48625 +
48626 +       /* the pointer is read straight into the caller's buffer so that it
48627 +        * can be checked, and zeroed if necessary, in place */
48628 +       *block = pointer_at(coord);
48629 +       assert("nikita-2961", reiser4_blocknr_is_sane(block));
48630 +       if (reiser4_blocknr_is_fake(block))
48631 +               *block = 0;
48632 +
48633 +       return 0;
48634 +}
48635 +
48636 +/* Look up the child znode the item at @coord points to and return its jnode through @childp. */
48637 +int
48638 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
48639 +                     jnode ** childp)
48640 +{
48641 +       znode *child;
48642 +       reiser4_block_nr block;
48643 +
48644 +       block = pointer_at(coord);
48645 +       assert("jmacd-2059", childp != NULL);
48646 +       assert("nikita-2962", reiser4_blocknr_is_sane(&block));
48647 +
48648 +       child = zlook(znode_get_tree(coord->node), &block);
48649 +       if (IS_ERR(child))
48650 +               return PTR_ERR(child);
48651 +
48652 +       /* NOTE(review): only error pointers are filtered out above; confirm
48653 +        * what zlook() returns when the child is not found */
48654 +       *childp = ZJNODE(child);
48655 +       return 0;
48656 +}
48657 +
48658 +#if REISER4_DEBUG
48659 +/* walk the sibling list from @left towards @right, asserting that the links passed on the way are consistent */
48660 +static void check_link(znode * left, znode * right)
48661 +{
48662 +       znode *scan;
48663 +
48664 +       for (scan = left; scan != right; scan = scan->right) {
48665 +               if (ZF_ISSET(scan, JNODE_RIP))
48666 +                       break;  /* stop at a node flagged JNODE_RIP */
48667 +               if (znode_is_right_connected(scan) && scan->right != NULL) {
48668 +                       if (ZF_ISSET(scan->right, JNODE_RIP))
48669 +                               break;  /* likewise when the right neighbor is flagged */
48670 +                       assert("nikita-3285",
48671 +                              znode_is_left_connected(scan->right));
48672 +                       assert("nikita-3265",
48673 +                              ergo(scan != left,
48674 +                                   ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
48675 +                       assert("nikita-3284", scan->right->left == scan);
48676 +               } else
48677 +                       break;  /* no connected right neighbor: nothing more to walk */
48678 +       }
48679 +}
48680 +
48681 +int check__internal(const coord_t * coord, const char **error)
48682 +{
48683 +       reiser4_block_nr blk;
48684 +       znode *child;
48685 +       coord_t cpy;
48686 +
48687 +       blk = pointer_at(coord);
48688 +       if (!reiser4_blocknr_is_sane(&blk)) {
48689 +               *error = "Invalid pointer";
48690 +               return -1;
48691 +       }
48692 +       coord_dup(&cpy, coord);
48693 +       child = znode_at(&cpy, cpy.node);
48694 +       if (child != NULL) {
48695 +               znode *left_child;
48696 +               znode *right_child;
48697 +
48698 +               left_child = right_child = NULL;
48699 +
48700 +               assert("nikita-3256", znode_invariant(child));
48701 +               if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
48702 +                       left_child = znode_at(&cpy, cpy.node);
48703 +                       if (left_child != NULL) {
48704 +                               read_lock_tree(znode_get_tree(child));
48705 +                               check_link(left_child, child);
48706 +                               read_unlock_tree(znode_get_tree(child));
48707 +                               zput(left_child);
48708 +                       }
48709 +               }
48710 +               coord_dup(&cpy, coord);
48711 +               if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
48712 +                       right_child = znode_at(&cpy, cpy.node);
48713 +                       if (right_child != NULL) {
48714 +                               read_lock_tree(znode_get_tree(child));
48715 +                               check_link(child, right_child);
48716 +                               read_unlock_tree(znode_get_tree(child));
48717 +                               zput(right_child);
48718 +                       }
48719 +               }
48720 +               zput(child);
48721 +       }
48722 +       return 0;
48723 +}
48724 +
48725 +#endif  /*  REISER4_DEBUG  */
48726 +
48727 +/* return true only if the child pointer stored in this item equals *@block */
48728 +/* Audited by: green(2002.06.14) */
48729 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
48730 +                           const reiser4_block_nr * block      /* block number to
48731 +                                                                * check */ )
48732 +{
48733 +       assert("nikita-613", coord != NULL);
48734 +       assert("nikita-614", block != NULL);
48735 +
48736 +       return pointer_at(coord) == *block;
48737 +}
48738 +
48739 +/* hook called by ->create_item() method of node plugin after new internal
48740 +   item was just created.
48741 +
48742 +   This is the point where pointer to new node is inserted into tree. Initialize
48743 +   parent pointer in child znode, insert child into sibling list and slum.
48744 +   Returns 0, or a negative error code if no child znode is available (-EIO when it is NULL).
48745 +*/
48746 +int create_hook_internal(const coord_t * item /* coord of item */ ,
48747 +                        void *arg /* child's left neighbor, if any */ )
48748 +{
48749 +       znode *child;
48750 +       __u64 child_ptr;
48751 +
48752 +       assert("nikita-1252", item != NULL);
48753 +       assert("nikita-1253", item->node != NULL);
48754 +       assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
48755 +       assert("nikita-1450", item->unit_pos == 0);
48756 +
48757 +       /*
48758 +        * preparing to item insertion build_child_ptr_data sets pointer to
48759 +        * data to be inserted to jnode's blocknr which is in cpu byte
48760 +        * order. Node's create_item simply copied those data. As result we
48761 +        * have child pointer in cpu's byte order. Convert content of internal
48762 +        * item to little endian byte order.
48763 +        */
48764 +       child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
48765 +       reiser4_update_internal(item, &child_ptr);
48766 +
48767 +       child = znode_at(item, item->node);
48768 +       if (child != NULL && !IS_ERR(child)) {
48769 +               znode *left;
48770 +               int result = 0;
48771 +               reiser4_tree *tree;
48772 +
48773 +               left = arg;
48774 +               tree = znode_get_tree(item->node);
48775 +               write_lock_tree(tree);  /* tree lock is taken first, then the dk lock; released in reverse order */
48776 +               write_lock_dk(tree);
48777 +               assert("nikita-1400", (child->in_parent.node == NULL)
48778 +                      || (znode_above_root(child->in_parent.node)));
48779 +               ++item->node->c_count;
48780 +               coord_to_parent_coord(item, &child->in_parent);
48781 +               sibling_list_insert_nolock(child, left);
48782 +
48783 +               assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
48784 +               ZF_CLR(child, JNODE_ORPHAN);
48785 +
48786 +               if ((left != NULL) && !keyeq(znode_get_rd_key(left),
48787 +                                            znode_get_rd_key(child))) {
48788 +                       znode_set_rd_key(child, znode_get_rd_key(left));        /* child takes over @left's right delimiting key */
48789 +               }
48790 +               write_unlock_dk(tree);
48791 +               write_unlock_tree(tree);
48792 +               zput(child);    /* presumably drops the reference obtained through znode_at() - confirm */
48793 +               return result;
48794 +       } else {
48795 +               if (child == NULL)
48796 +                       child = ERR_PTR(-EIO);  /* a missing child is reported as -EIO */
48797 +               return PTR_ERR(child);
48798 +       }
48799 +}
48800 +
48801 +/* hook called by ->cut_and_kill() method of node plugin just before an internal
48802 +   item is removed.
48803 +
48804 +   This is the point where an empty node is removed from the tree. Clear the
48805 +   parent pointer in the child, and mark the node for pending deletion.
48806 +
48807 +   The node is actually deleted later, in several stages:
48808 +
48809 +    . when the last lock on this node is released, the node is removed from
48810 +    the sibling list and its lock is invalidated
48811 +
48812 +    . when the last reference to this node is dropped, the bitmap is updated
48813 +    and the node is actually removed from memory.
48814 +   Returns 0 (also when there is no child znode), an error from znode_at()/zload(), or -EIO if the child is not empty.
48815 +*/
48816 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
48817 +                      pos_in_node_t from UNUSED_ARG /* start unit, asserted to be 0 */ ,
48818 +                      pos_in_node_t count UNUSED_ARG /* number of units, asserted to be 1 */ ,
48819 +                      struct carry_kill_data *p UNUSED_ARG /* not used here */ )
48820 +{
48821 +       znode *child;
48822 +       int result = 0;
48823 +
48824 +       assert("nikita-1222", item != NULL);
48825 +       assert("nikita-1224", from == 0);
48826 +       assert("nikita-1225", count == 1);
48827 +
48828 +       child = znode_at(item, item->node);
48829 +       if (child == NULL)
48830 +               return 0; /* no child znode: nothing to unlink */
48831 +       if (IS_ERR(child))
48832 +               return PTR_ERR(child);
48833 +       result = zload(child); /* paired with zrelse() at the end */
48834 +       if (result) {
48835 +               zput(child);
48836 +               return result;
48837 +       }
48838 +       if (node_is_empty(child)) {
48839 +               reiser4_tree *tree;
48840 +
48841 +               assert("nikita-1397", znode_is_write_locked(child));
48842 +               assert("nikita-1398", child->c_count == 0);
48843 +               assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE)); /* flag is only checked here, not set */
48844 +
48845 +               tree = znode_get_tree(item->node);
48846 +               write_lock_tree(tree);
48847 +               init_parent_coord(&child->in_parent, NULL); /* clear parent pointer in child */
48848 +               --item->node->c_count; /* parent has one child less */
48849 +               write_unlock_tree(tree);
48850 +       } else {
48851 +               warning("nikita-1223",
48852 +                       "Cowardly refuse to remove link to non-empty node");
48853 +               result = RETERR(-EIO);
48854 +       }
48855 +       zrelse(child);
48856 +       zput(child);
48857 +       return result;
48858 +}
48859 +
48860 +/* hook called by ->shift() node plugin method when an internal item was just
48861 +   moved from one node to another.
48862 +
48863 +   Update parent pointer in child and c_counts in old and new parent
48864 +   Returns 0 on success or when there is no child znode, else the error encoded in the pointer from child_znode().
48865 +*/
48866 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
48867 +                       unsigned from UNUSED_ARG /* start unit, asserted to be 0 */ ,
48868 +                       unsigned count UNUSED_ARG /* number of units, asserted to be 1 */ ,
48869 +                       znode * old_node /* old parent */ )
48870 +{
48871 +       znode *child;
48872 +       znode *new_node;
48873 +       reiser4_tree *tree;
48874 +
48875 +       assert("nikita-1276", item != NULL);
48876 +       assert("nikita-1277", from == 0);
48877 +       assert("nikita-1278", count == 1);
48878 +       assert("nikita-1451", item->unit_pos == 0);
48879 +
48880 +       new_node = item->node; /* @item already sits in the new parent */
48881 +       assert("nikita-2132", new_node != old_node);
48882 +       tree = znode_get_tree(item->node);
48883 +       child = child_znode(item, old_node, 1, 0);
48884 +       if (child == NULL)
48885 +               return 0; /* no child znode: nothing to re-parent */
48886 +       if (!IS_ERR(child)) {
48887 +               write_lock_tree(tree);
48888 +               ++new_node->c_count; /* new parent gains the child */
48889 +               assert("nikita-1395", znode_parent(child) == old_node);
48890 +               assert("nikita-1396", old_node->c_count > 0);
48891 +               coord_to_parent_coord(item, &child->in_parent); /* re-point the child at its new parent coord */
48892 +               assert("nikita-1781", znode_parent(child) == new_node);
48893 +               assert("nikita-1782",
48894 +                      check_tree_pointer(item, child) == NS_FOUND);
48895 +               --old_node->c_count; /* old parent loses the child */
48896 +               write_unlock_tree(tree);
48897 +               zput(child);
48898 +               return 0;
48899 +       } else
48900 +               return PTR_ERR(child);
48901 +}
48902 +
48903 +/* plugin->u.item.b.max_key_inside - not defined */
48904 +
48905 +/* plugin->u.item.b.nr_units - item.c:single_unit */
48906 +
48907 +/* Make Linus happy.
48908 +   Local variables:
48909 +   c-indentation-style: "K&R"
48910 +   mode-name: "LC"
48911 +   c-basic-offset: 8
48912 +   tab-width: 8
48913 +   fill-column: 120
48914 +   End:
48915 +*/
48916 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/internal.h linux-2.6.27/fs/reiser4/plugin/item/internal.h
48917 --- linux-2.6.27.orig/fs/reiser4/plugin/item/internal.h 1970-01-01 03:00:00.000000000 +0300
48918 +++ linux-2.6.27/fs/reiser4/plugin/item/internal.h      2008-10-12 18:20:01.000000000 +0400
48919 @@ -0,0 +1,57 @@
48920 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48921 +/* Internal item contains down-link to the child of the internal/twig
48922 +   node in a tree. It is internal items that are actually used during
48923 +   tree traversal. */
48924 +
48925 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
48926 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
48927 +
48928 +#include "../../forward.h"
48929 +#include "../../dformat.h"
48930 +
48931 +/* on-disk layout of internal item: a single block-number down-link to the child node */
48932 +typedef struct internal_item_layout {
48933 +       /*  0 */ reiser4_dblock_nr pointer;
48934 +       /*  4 */ /* NOTE(review): end offset; create_hook_internal() reads the item body as __u64, so this is presumably 8 - confirm against dformat.h */
48935 +} internal_item_layout;
48936 +
48937 +struct cut_list;
48938 +/* methods of the internal item plugin; item.c installs them (except reiser4_print_internal) in item_plugins[NODE_POINTER_ID] */
48939 +int mergeable_internal(const coord_t * p1, const coord_t * p2); /* ->b.mergeable */
48940 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
48941 +                             coord_t * coord); /* ->b.lookup */
48942 +/* store pointer from internal item into "block". Implementation of
48943 +    ->down_link() method */
48944 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
48945 +                              reiser4_block_nr * block);
48946 +extern int has_pointer_to_internal(const coord_t * coord,
48947 +                                  const reiser4_block_nr * block); /* ->s.internal.has_pointer_to */
48948 +extern int create_hook_internal(const coord_t * item, void *arg); /* ->b.create_hook; @arg is the child's left neighbor, if any */
48949 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
48950 +                             pos_in_node_t count, struct carry_kill_data *); /* ->b.kill_hook */
48951 +extern int shift_hook_internal(const coord_t * item, unsigned from,
48952 +                              unsigned count, znode * old_node); /* ->b.shift_hook */
48953 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
48954 +
48955 +extern int utmost_child_internal(const coord_t * coord, sideof side,
48956 +                                jnode ** child); /* ->f.utmost_child */
48957 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
48958 +                                    reiser4_block_nr * block); /* ->f.utmost_child_real_block */
48959 +
48960 +extern void reiser4_update_internal(const coord_t * coord,
48961 +                                   const reiser4_block_nr * blocknr); /* ->f.update */
48962 +/* FIXME: reiserfs has check_internal */
48963 +extern int check__internal(const coord_t * coord, const char **error); /* ->b.check, installed only under REISER4_DEBUG */
48964 +
48965 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
48966 +#endif
48967 +
48968 +/* Make Linus happy.
48969 +   Local variables:
48970 +   c-indentation-style: "K&R"
48971 +   mode-name: "LC"
48972 +   c-basic-offset: 8
48973 +   tab-width: 8
48974 +   fill-column: 120
48975 +   End:
48976 +*/
48977 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/item.c linux-2.6.27/fs/reiser4/plugin/item/item.c
48978 --- linux-2.6.27.orig/fs/reiser4/plugin/item/item.c     1970-01-01 03:00:00.000000000 +0300
48979 +++ linux-2.6.27/fs/reiser4/plugin/item/item.c  2008-10-12 18:20:01.000000000 +0400
48980 @@ -0,0 +1,719 @@
48981 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48982 +
48983 +/* definition of item plugins. */
48984 +
48985 +#include "../../forward.h"
48986 +#include "../../debug.h"
48987 +#include "../../key.h"
48988 +#include "../../coord.h"
48989 +#include "../plugin_header.h"
48990 +#include "sde.h"
48991 +#include "internal.h"
48992 +#include "item.h"
48993 +#include "static_stat.h"
48994 +#include "../plugin.h"
48995 +#include "../../znode.h"
48996 +#include "../../tree.h"
48997 +#include "../../context.h"
48998 +#include "ctail.h"
48999 +
49000 +/* compute the offset of the item body within the node's data and cache it in @coord->offset; returns nothing */
49001 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
49002 +{
49003 +       assert("nikita-324", coord != NULL);
49004 +       assert("nikita-325", coord->node != NULL);
49005 +       assert("nikita-326", znode_is_loaded(coord->node));
49006 +       assert("nikita-3200", coord->offset == INVALID_OFFSET); /* offset must not be cached yet */
49007 +
49008 +       coord->offset =
49009 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
49010 +           zdata(coord->node); /* body address minus start of node data */
49011 +       ON_DEBUG(coord->body_v = coord->node->times_locked); /* remember the node's times_locked at caching time */
49012 +}
49013 +/* return pointer to item body, using the offset already cached in @coord->offset */
49014 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
49015 +{
49016 +       return zdata(coord->node) + coord->offset; /* start of node data plus cached body offset */
49017 +}
49018 +
49019 +#if REISER4_DEBUG
49020 +/* debugging check: non-zero iff the offset cached in @coord equals the one recomputed via the node plugin */
49021 +int item_body_is_valid(const coord_t * coord)
49022 +{
49023 +       return
49024 +           coord->offset ==
49025 +           node_plugin_by_node(coord->node)->item_by_coord(coord) -
49026 +           zdata(coord->node);
49027 +}
49028 +
49029 +#endif
49030 +
49031 +/* return length of item at @coord, as reported by the node plugin's ->length_by_coord() */
49032 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
49033 +{
49034 +       int len; /* NOTE(review): int here, while the function returns pos_in_node_t */
49035 +
49036 +       assert("nikita-327", coord != NULL);
49037 +       assert("nikita-328", coord->node != NULL);
49038 +       assert("nikita-329", znode_is_loaded(coord->node));
49039 +
49040 +       len = node_plugin_by_node(coord->node)->length_by_coord(coord);
49041 +       return len;
49042 +}
49043 +/* ask the node plugin which item plugin serves the item at @coord and store it in @coord */
49044 +void obtain_item_plugin(const coord_t * coord)
49045 +{
49046 +       assert("nikita-330", coord != NULL);
49047 +       assert("nikita-331", coord->node != NULL);
49048 +       assert("nikita-332", znode_is_loaded(coord->node));
49049 +
49050 +       coord_set_iplug((coord_t *) coord, /* @coord is const-qualified but is written through this cast */
49051 +                       node_plugin_by_node(coord->node)->
49052 +                       plugin_by_coord(coord));
49053 +       assert("nikita-2479", /* stored plugin must match a fresh lookup */
49054 +              coord_iplug(coord) ==
49055 +              node_plugin_by_node(coord->node)->plugin_by_coord(coord));
49056 +}
49057 +
49058 +/* return id of the item at @coord, i.e. the id of its item plugin */
49059 +/* Audited by: green(2002.06.15) */
49060 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
49061 +{
49062 +       assert("vs-539", coord != NULL);
49063 +       assert("vs-538", coord->node != NULL);
49064 +       assert("vs-537", znode_is_loaded(coord->node));
49065 +       assert("vs-536", item_plugin_by_coord(coord) != NULL);
49066 +       assert("vs-540", /* id must be below LAST_ITEM_ID */
49067 +              item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
49068 +
49069 +       return item_id_by_plugin(item_plugin_by_coord(coord));
49070 +}
49071 +
49072 +/* return key of item at @coord: delegates to the node plugin's ->key_at() and returns its result */
49073 +/* Audited by: green(2002.06.15) */
49074 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
49075 +                              reiser4_key * key /* result */ )
49076 +{
49077 +       assert("nikita-338", coord != NULL);
49078 +       assert("nikita-339", coord->node != NULL);
49079 +       assert("nikita-340", znode_is_loaded(coord->node));
49080 +
49081 +       return node_plugin_by_node(coord->node)->key_at(coord, key);
49082 +}
49083 +
49084 +/* store in @key the max key of the item at @coord (the max key of its last unit) and return @key */
49085 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
49086 +                                  reiser4_key * key /* result */ )
49087 +{
49088 +       coord_t last;
49089 +
49090 +       assert("nikita-338", coord != NULL);
49091 +       assert("nikita-339", coord->node != NULL);
49092 +       assert("nikita-340", znode_is_loaded(coord->node));
49093 +
49094 +       /* make @last a copy of @coord that points to the last unit of the item */
49095 +       coord_dup(&last, coord);
49096 +       last.unit_pos = coord_num_units(&last) - 1;
49097 +       assert("vs-1560", coord_is_existing_unit(&last));
49098 +
49099 +       max_unit_key_by_coord(&last, key);
49100 +       return key;
49101 +}
49102 +
49103 +/* return key of unit at @coord; falls back to the item key when the item plugin has no ->unit_key() */
49104 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49105 +                              reiser4_key * key /* result */ )
49106 +{
49107 +       assert("nikita-772", coord != NULL);
49108 +       assert("nikita-774", coord->node != NULL);
49109 +       assert("nikita-775", znode_is_loaded(coord->node));
49110 +
49111 +       if (item_plugin_by_coord(coord)->b.unit_key != NULL)
49112 +               return item_plugin_by_coord(coord)->b.unit_key(coord, key);
49113 +       else
49114 +               return item_key_by_coord(coord, key);
49115 +}
49116 +
49117 +/* return the biggest key contained in the unit at @coord; falls back to unit_key_by_coord() when there is no ->max_unit_key() */
49118 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
49119 +                                  reiser4_key * key /* result */ )
49120 +{
49121 +       assert("nikita-772", coord != NULL);
49122 +       assert("nikita-774", coord->node != NULL);
49123 +       assert("nikita-775", znode_is_loaded(coord->node));
49124 +
49125 +       if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
49126 +               return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
49127 +       else
49128 +               return unit_key_by_coord(coord, key);
49129 +}
49130 +
49131 +/* ->max_key_inside() method for items consisting of exactly one key (like
49132 +    stat-data): the max key is simply the unit key of @coord */
49133 +static reiser4_key *max_key_inside_single_key(const coord_t *
49134 +                                             coord /* coord of item */ ,
49135 +                                             reiser4_key *
49136 +                                             result /* resulting key */ )
49137 +{
49138 +       assert("nikita-604", coord != NULL);
49139 +
49140 +       /* coord -> key is the starting key of this item and it has to be
49141 +          filled in already */
49142 +       return unit_key_by_coord(coord, result);
49143 +}
49144 +
49145 +/* ->nr_units() method for items that always consist of exactly one unit; @coord is ignored */
49146 +pos_in_node_t
49147 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
49148 +{
49149 +       return 1; /* one unit, unconditionally */
49150 +}
49151 +/* ->paste() method used by the stat-data item plugin: ignores all arguments, does nothing and returns 0 */
49152 +static int
49153 +paste_no_paste(coord_t * coord UNUSED_ARG,
49154 +              reiser4_item_data * data UNUSED_ARG,
49155 +              carry_plugin_info * info UNUSED_ARG)
49156 +{
49157 +       return 0;
49158 +}
49159 +
49160 +/* default ->fast_paste() method: always answers 1, whatever @coord is */
49161 +static int
49162 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
49163 +{
49164 +       return 1;
49165 +}
49166 +/* non-zero if @key may belong to the item at @item; @data is only handed on to the plugin's ->can_contain_key() */
49167 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
49168 +                        const reiser4_key * key /* key to check */ ,
49169 +                        const reiser4_item_data * data /* parameters of item
49170 +                                                        * being created */ )
49171 +{
49172 +       item_plugin *iplug;
49173 +       reiser4_key min_key_in_item;
49174 +       reiser4_key max_key_in_item;
49175 +
49176 +       assert("nikita-1658", item != NULL);
49177 +       assert("nikita-1659", key != NULL);
49178 +
49179 +       iplug = item_plugin_by_coord(item);
49180 +       if (iplug->b.can_contain_key != NULL)
49181 +               return iplug->b.can_contain_key(item, key, data); /* the plugin's own answer takes precedence */
49182 +       else {
49183 +               assert("nikita-1681", iplug->b.max_key_inside != NULL); /* the fallback needs ->max_key_inside() */
49184 +               item_key_by_coord(item, &min_key_in_item);
49185 +               iplug->b.max_key_inside(item, &max_key_in_item);
49186 +
49187 +               /* can contain key if
49188 +                  min_key_in_item <= key &&
49189 +                  key <= max_key_in_item
49190 +                */
49191 +               return keyle(&min_key_in_item, key)
49192 +                   && keyle(key, &max_key_in_item);
49193 +       }
49194 +}
49195 +
49196 +/* ->mergeable() method for items that are never mergeable: ignores both coords and returns 0 */
49197 +static int
49198 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
49199 +{
49200 +       return 0;
49201 +}
49202 +
49203 +/* return 0 if @i1 and @i2 are not mergeable, !0 - otherwise; the decision is taken by the item plugin of @i1 */
49204 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
49205 +                       const coord_t * i2 /* coord of second item */ )
49206 +{
49207 +       item_plugin *iplug;
49208 +       reiser4_key k1;
49209 +       reiser4_key k2;
49210 +
49211 +       assert("nikita-1336", i1 != NULL);
49212 +       assert("nikita-1337", i2 != NULL);
49213 +
49214 +       iplug = item_plugin_by_coord(i1);
49215 +       assert("nikita-1338", iplug != NULL);
49216 +
49217 +       /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
49218 +          shifting code when nodes are in "suspended" state. */
49219 +       assert("nikita-1663", /* key of @i1 must not exceed key of @i2 */
49220 +              keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
49221 +
49222 +       if (iplug->b.mergeable != NULL) {
49223 +               return iplug->b.mergeable(i1, i2); /* 1st choice: the plugin's own ->mergeable() */
49224 +       } else if (iplug->b.max_key_inside != NULL) {
49225 +               iplug->b.max_key_inside(i1, &k1); /* NOTE(review): this call and the next are repeated in the return expression below */
49226 +               item_key_by_coord(i2, &k2);
49227 +
49228 +               /* mergeable if ->max_key_inside() >= key of i2; */
49229 +               return keyge(iplug->b.max_key_inside(i1, &k1),
49230 +                            item_key_by_coord(i2, &k2));
49231 +       } else {
49232 +               item_key_by_coord(i1, &k1); /* last resort: same locality, same objectid and same item plugin */
49233 +               item_key_by_coord(i2, &k2);
49234 +
49235 +               return
49236 +                   (get_key_locality(&k1) == get_key_locality(&k2)) &&
49237 +                   (get_key_objectid(&k1) == get_key_objectid(&k2))
49238 +                   && (iplug == item_plugin_by_coord(i2));
49239 +       }
49240 +}
49241 +/* non-zero iff the item at @item has id EXTENT_POINTER_ID */
49242 +int item_is_extent(const coord_t * item)
49243 +{
49244 +       assert("vs-482", coord_is_existing_item(item));
49245 +       return item_id_by_coord(item) == EXTENT_POINTER_ID;
49246 +}
49247 +/* non-zero iff the item at @item has id FORMATTING_ID (the "body (or tail?)" item plugin) */
49248 +int item_is_tail(const coord_t * item)
49249 +{
49250 +       assert("vs-482", coord_is_existing_item(item));
49251 +       return item_id_by_coord(item) == FORMATTING_ID;
49252 +}
49253 +
49254 +#if REISER4_DEBUG
49255 +/* debugging helper: result of plugin_of_group() for the item's plugin and STAT_DATA_ITEM_TYPE */
49256 +int item_is_statdata(const coord_t * item)
49257 +{
49258 +       assert("vs-516", coord_is_existing_item(item));
49259 +       return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
49260 +}
49261 +/* debugging helper: non-zero iff the item at @item has id CTAIL_ID */
49262 +int item_is_ctail(const coord_t * item)
49263 +{
49264 +       assert("edward-xx", coord_is_existing_item(item));
49265 +       return item_id_by_coord(item) == CTAIL_ID;
49266 +}
49267 +
49268 +#endif  /*  REISER4_DEBUG  */
49269 +/* ->change() method of item_plugin_ops: ignores its arguments and always fails with -EINVAL */
49270 +static int change_item(struct inode *inode,
49271 +                      reiser4_plugin * plugin,
49272 +                      pset_member memb)
49273 +{
49274 +       /* cannot change constituent item (sd, or dir_item) */
49275 +       return RETERR(-EINVAL);
49276 +}
49277 +/* plugin ops referenced as .pops by the sd, de and cde entries of item_plugins[]; only ->change is provided */
49278 +static reiser4_plugin_ops item_plugin_ops = {
49279 +       .init = NULL,
49280 +       .load = NULL,
49281 +       .save_len = NULL,
49282 +       .save = NULL,
49283 +       .change = change_item /* always refuses with -EINVAL */
49284 +};
49285 +
49286 +item_plugin item_plugins[LAST_ITEM_ID] = {
49287 +       [STATIC_STAT_DATA_ID] = {
49288 +               .h = {
49289 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49290 +                       .id = STATIC_STAT_DATA_ID,
49291 +                       .groups = (1 << STAT_DATA_ITEM_TYPE),
49292 +                       .pops = &item_plugin_ops,
49293 +                       .label = "sd",
49294 +                       .desc = "stat-data",
49295 +                       .linkage = {NULL, NULL}
49296 +               },
49297 +               .b = {
49298 +                       .max_key_inside = max_key_inside_single_key,
49299 +                       .can_contain_key = NULL,
49300 +                       .mergeable = not_mergeable,
49301 +                       .nr_units = nr_units_single_unit,
49302 +                       .lookup = NULL,
49303 +                       .init = NULL,
49304 +                       .paste = paste_no_paste,
49305 +                       .fast_paste = NULL,
49306 +                       .can_shift = NULL,
49307 +                       .copy_units = NULL,
49308 +                       .create_hook = NULL,
49309 +                       .kill_hook = NULL,
49310 +                       .shift_hook = NULL,
49311 +                       .cut_units = NULL,
49312 +                       .kill_units = NULL,
49313 +                       .unit_key = NULL,
49314 +                       .max_unit_key = NULL,
49315 +                       .estimate = NULL,
49316 +                       .item_data_by_flow = NULL,
49317 +#if REISER4_DEBUG
49318 +                       .check = NULL
49319 +#endif
49320 +               },
49321 +               .f = {
49322 +                       .utmost_child = NULL,
49323 +                       .utmost_child_real_block = NULL,
49324 +                       .update = NULL,
49325 +                       .scan = NULL,
49326 +                       .convert = NULL
49327 +               },
49328 +               .s = {
49329 +                       .sd = {
49330 +                               .init_inode = init_inode_static_sd,
49331 +                               .save_len = save_len_static_sd,
49332 +                               .save = save_static_sd
49333 +                       }
49334 +               }
49335 +       },
49336 +       [SIMPLE_DIR_ENTRY_ID] = {
49337 +               .h = {
49338 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49339 +                       .id = SIMPLE_DIR_ENTRY_ID,
49340 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49341 +                       .pops = &item_plugin_ops,
49342 +                       .label = "de",
49343 +                       .desc = "directory entry",
49344 +                       .linkage = {NULL, NULL}
49345 +               },
49346 +               .b = {
49347 +                       .max_key_inside = max_key_inside_single_key,
49348 +                       .can_contain_key = NULL,
49349 +                       .mergeable = NULL,
49350 +                       .nr_units = nr_units_single_unit,
49351 +                       .lookup = NULL,
49352 +                       .init = NULL,
49353 +                       .paste = NULL,
49354 +                       .fast_paste = NULL,
49355 +                       .can_shift = NULL,
49356 +                       .copy_units = NULL,
49357 +                       .create_hook = NULL,
49358 +                       .kill_hook = NULL,
49359 +                       .shift_hook = NULL,
49360 +                       .cut_units = NULL,
49361 +                       .kill_units = NULL,
49362 +                       .unit_key = NULL,
49363 +                       .max_unit_key = NULL,
49364 +                       .estimate = NULL,
49365 +                       .item_data_by_flow = NULL,
49366 +#if REISER4_DEBUG
49367 +                       .check = NULL
49368 +#endif
49369 +               },
49370 +               .f = {
49371 +                       .utmost_child = NULL,
49372 +                       .utmost_child_real_block = NULL,
49373 +                       .update = NULL,
49374 +                       .scan = NULL,
49375 +                       .convert = NULL
49376 +               },
49377 +               .s = {
49378 +                       .dir = {
49379 +                               .extract_key = extract_key_de,
49380 +                               .update_key = update_key_de,
49381 +                               .extract_name = extract_name_de,
49382 +                               .extract_file_type = extract_file_type_de,
49383 +                               .add_entry = add_entry_de,
49384 +                               .rem_entry = rem_entry_de,
49385 +                               .max_name_len = max_name_len_de
49386 +                       }
49387 +               }
49388 +       },
49389 +       [COMPOUND_DIR_ID] = {
49390 +               .h = {
49391 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49392 +                       .id = COMPOUND_DIR_ID,
49393 +                       .groups = (1 << DIR_ENTRY_ITEM_TYPE),
49394 +                       .pops = &item_plugin_ops,
49395 +                       .label = "cde",
49396 +                       .desc = "compressed directory entry",
49397 +                       .linkage = {NULL, NULL}
49398 +               },
49399 +               .b = {
49400 +                       .max_key_inside = max_key_inside_cde,
49401 +                       .can_contain_key = can_contain_key_cde,
49402 +                       .mergeable = mergeable_cde,
49403 +                       .nr_units = nr_units_cde,
49404 +                       .lookup = lookup_cde,
49405 +                       .init = init_cde,
49406 +                       .paste = paste_cde,
49407 +                       .fast_paste = agree_to_fast_op,
49408 +                       .can_shift = can_shift_cde,
49409 +                       .copy_units = copy_units_cde,
49410 +                       .create_hook = NULL,
49411 +                       .kill_hook = NULL,
49412 +                       .shift_hook = NULL,
49413 +                       .cut_units = cut_units_cde,
49414 +                       .kill_units = kill_units_cde,
49415 +                       .unit_key = unit_key_cde,
49416 +                       .max_unit_key = unit_key_cde,
49417 +                       .estimate = estimate_cde,
49418 +                       .item_data_by_flow = NULL,
49419 +#if REISER4_DEBUG
49420 +                       .check = reiser4_check_cde
49421 +#endif
49422 +               },
49423 +               .f = {
49424 +                       .utmost_child = NULL,
49425 +                       .utmost_child_real_block = NULL,
49426 +                       .update = NULL,
49427 +                       .scan = NULL,
49428 +                       .convert = NULL
49429 +               },
49430 +               .s = {
49431 +                       .dir = {
49432 +                               .extract_key = extract_key_cde,
49433 +                               .update_key = update_key_cde,
49434 +                               .extract_name = extract_name_cde,
49435 +                               .extract_file_type = extract_file_type_de,
49436 +                               .add_entry = add_entry_cde,
49437 +                               .rem_entry = rem_entry_cde,
49438 +                               .max_name_len = max_name_len_cde
49439 +                       }
49440 +               }
49441 +       },
49442 +       [NODE_POINTER_ID] = {
49443 +               .h = {
49444 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49445 +                       .id = NODE_POINTER_ID,
49446 +                       .groups = (1 << INTERNAL_ITEM_TYPE),
49447 +                       .pops = NULL,
49448 +                       .label = "internal",
49449 +                       .desc = "internal item",
49450 +                       .linkage = {NULL, NULL}
49451 +               },
49452 +               .b = {
49453 +                       .max_key_inside = NULL,
49454 +                       .can_contain_key = NULL,
49455 +                       .mergeable = mergeable_internal,
49456 +                       .nr_units = nr_units_single_unit,
49457 +                       .lookup = lookup_internal,
49458 +                       .init = NULL,
49459 +                       .paste = NULL,
49460 +                       .fast_paste = NULL,
49461 +                       .can_shift = NULL,
49462 +                       .copy_units = NULL,
49463 +                       .create_hook = create_hook_internal,
49464 +                       .kill_hook = kill_hook_internal,
49465 +                       .shift_hook = shift_hook_internal,
49466 +                       .cut_units = NULL,
49467 +                       .kill_units = NULL,
49468 +                       .unit_key = NULL,
49469 +                       .max_unit_key = NULL,
49470 +                       .estimate = NULL,
49471 +                       .item_data_by_flow = NULL,
49472 +#if REISER4_DEBUG
49473 +                       .check = check__internal
49474 +#endif
49475 +               },
49476 +               .f = {
49477 +                       .utmost_child = utmost_child_internal,
49478 +                       .utmost_child_real_block =
49479 +                       utmost_child_real_block_internal,
49480 +                       .update = reiser4_update_internal,
49481 +                       .scan = NULL,
49482 +                       .convert = NULL
49483 +               },
49484 +               .s = {
49485 +                       .internal = {
49486 +                               .down_link = down_link_internal,
49487 +                               .has_pointer_to = has_pointer_to_internal
49488 +                       }
49489 +               }
49490 +       },
49491 +       [EXTENT_POINTER_ID] = {
49492 +               .h = {
49493 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49494 +                       .id = EXTENT_POINTER_ID,
49495 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49496 +                       .pops = NULL,
49497 +                       .label = "extent",
49498 +                       .desc = "extent item",
49499 +                       .linkage = {NULL, NULL}
49500 +               },
49501 +               .b = {
49502 +                       .max_key_inside = max_key_inside_extent,
49503 +                       .can_contain_key = can_contain_key_extent,
49504 +                       .mergeable = mergeable_extent,
49505 +                       .nr_units = nr_units_extent,
49506 +                       .lookup = lookup_extent,
49507 +                       .init = NULL,
49508 +                       .paste = paste_extent,
49509 +                       .fast_paste = agree_to_fast_op,
49510 +                       .can_shift = can_shift_extent,
49511 +                       .create_hook = create_hook_extent,
49512 +                       .copy_units = copy_units_extent,
49513 +                       .kill_hook = kill_hook_extent,
49514 +                       .shift_hook = NULL,
49515 +                       .cut_units = cut_units_extent,
49516 +                       .kill_units = kill_units_extent,
49517 +                       .unit_key = unit_key_extent,
49518 +                       .max_unit_key = max_unit_key_extent,
49519 +                       .estimate = NULL,
49520 +                       .item_data_by_flow = NULL,
49521 +#if REISER4_DEBUG
49522 +                       .check = reiser4_check_extent
49523 +#endif
49524 +               },
49525 +               .f = {
49526 +                       .utmost_child = utmost_child_extent,
49527 +                       .utmost_child_real_block =
49528 +                       utmost_child_real_block_extent,
49529 +                       .update = NULL,
49530 +                       .scan = reiser4_scan_extent,
49531 +                       .convert = NULL,
49532 +                       .key_by_offset = key_by_offset_extent
49533 +               },
49534 +               .s = {
49535 +                       .file = {
49536 +                               .write = reiser4_write_extent,
49537 +                               .read = reiser4_read_extent,
49538 +                               .readpage = reiser4_readpage_extent,
49539 +                               .get_block = get_block_address_extent,
49540 +                               .append_key = append_key_extent,
49541 +                               .init_coord_extension =
49542 +                               init_coord_extension_extent
49543 +                       }
49544 +               }
49545 +       },
49546 +       [FORMATTING_ID] = {
49547 +               .h = {
49548 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49549 +                       .id = FORMATTING_ID,
49550 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49551 +                       .pops = NULL,
49552 +                       .label = "body",
49553 +                       .desc = "body (or tail?) item",
49554 +                       .linkage = {NULL, NULL}
49555 +               },
49556 +               .b = {
49557 +                       .max_key_inside = max_key_inside_tail,
49558 +                       .can_contain_key = can_contain_key_tail,
49559 +                       .mergeable = mergeable_tail,
49560 +                       .nr_units = nr_units_tail,
49561 +                       .lookup = lookup_tail,
49562 +                       .init = NULL,
49563 +                       .paste = paste_tail,
49564 +                       .fast_paste = agree_to_fast_op,
49565 +                       .can_shift = can_shift_tail,
49566 +                       .create_hook = NULL,
49567 +                       .copy_units = copy_units_tail,
49568 +                       .kill_hook = kill_hook_tail,
49569 +                       .shift_hook = NULL,
49570 +                       .cut_units = cut_units_tail,
49571 +                       .kill_units = kill_units_tail,
49572 +                       .unit_key = unit_key_tail,
49573 +                       .max_unit_key = unit_key_tail,
49574 +                       .estimate = NULL,
49575 +                       .item_data_by_flow = NULL,
49576 +#if REISER4_DEBUG
49577 +                       .check = NULL
49578 +#endif
49579 +               },
49580 +               .f = {
49581 +                       .utmost_child = NULL,
49582 +                       .utmost_child_real_block = NULL,
49583 +                       .update = NULL,
49584 +                       .scan = NULL,
49585 +                       .convert = NULL
49586 +               },
49587 +               .s = {
49588 +                       .file = {
49589 +                               .write = reiser4_write_tail,
49590 +                               .read = reiser4_read_tail,
49591 +                               .readpage = readpage_tail,
49592 +                               .get_block = get_block_address_tail,
49593 +                               .append_key = append_key_tail,
49594 +                               .init_coord_extension =
49595 +                               init_coord_extension_tail
49596 +                       }
49597 +               }
49598 +       },
49599 +       [CTAIL_ID] = {
49600 +               .h = {
49601 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49602 +                       .id = CTAIL_ID,
49603 +                       .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
49604 +                       .pops = NULL,
49605 +                       .label = "ctail",
49606 +                       .desc = "cryptcompress tail item",
49607 +                       .linkage = {NULL, NULL}
49608 +               },
49609 +               .b = {
49610 +                       .max_key_inside = max_key_inside_tail,
49611 +                       .can_contain_key = can_contain_key_ctail,
49612 +                       .mergeable = mergeable_ctail,
49613 +                       .nr_units = nr_units_ctail,
49614 +                       .lookup = NULL,
49615 +                       .init = init_ctail,
49616 +                       .paste = paste_ctail,
49617 +                       .fast_paste = agree_to_fast_op,
49618 +                       .can_shift = can_shift_ctail,
49619 +                       .create_hook = create_hook_ctail,
49620 +                       .copy_units = copy_units_ctail,
49621 +                       .kill_hook = kill_hook_ctail,
49622 +                       .shift_hook = shift_hook_ctail,
49623 +                       .cut_units = cut_units_ctail,
49624 +                       .kill_units = kill_units_ctail,
49625 +                       .unit_key = unit_key_tail,
49626 +                       .max_unit_key = unit_key_tail,
49627 +                       .estimate = estimate_ctail,
49628 +                       .item_data_by_flow = NULL,
49629 +#if REISER4_DEBUG
49630 +                       .check = check_ctail
49631 +#endif
49632 +               },
49633 +               .f = {
49634 +                       .utmost_child = utmost_child_ctail,
49635 +                       /* FIXME-EDWARD: write this */
49636 +                       .utmost_child_real_block = NULL,
49637 +                       .update = NULL,
49638 +                       .scan = scan_ctail,
49639 +                       .convert = convert_ctail
49640 +               },
49641 +               .s = {
49642 +                       .file = {
49643 +                               .write = NULL,
49644 +                               .read = read_ctail,
49645 +                               .readpage = readpage_ctail,
49646 +                               .get_block = get_block_address_tail,
49647 +                               .append_key = append_key_ctail,
49648 +                               .init_coord_extension =
49649 +                               init_coord_extension_tail
49650 +                       }
49651 +               }
49652 +       },
49653 +       [BLACK_BOX_ID] = {
49654 +               .h = {
49655 +                       .type_id = REISER4_ITEM_PLUGIN_TYPE,
49656 +                       .id = BLACK_BOX_ID,
49657 +                       .groups = (1 << OTHER_ITEM_TYPE),
49658 +                       .pops = NULL,
49659 +                       .label = "blackbox",
49660 +                       .desc = "black box item",
49661 +                       .linkage = {NULL, NULL}
49662 +               },
49663 +               .b = {
49664 +                       .max_key_inside = NULL,
49665 +                       .can_contain_key = NULL,
49666 +                       .mergeable = not_mergeable,
49667 +                       .nr_units = nr_units_single_unit,
49668 +                       /* no need for ->lookup method */
49669 +                       .lookup = NULL,
49670 +                       .init = NULL,
49671 +                       .paste = NULL,
49672 +                       .fast_paste = NULL,
49673 +                       .can_shift = NULL,
49674 +                       .copy_units = NULL,
49675 +                       .create_hook = NULL,
49676 +                       .kill_hook = NULL,
49677 +                       .shift_hook = NULL,
49678 +                       .cut_units = NULL,
49679 +                       .kill_units = NULL,
49680 +                       .unit_key = NULL,
49681 +                       .max_unit_key = NULL,
49682 +                       .estimate = NULL,
49683 +                       .item_data_by_flow = NULL,
49684 +#if REISER4_DEBUG
49685 +                       .check = NULL
49686 +#endif
49687 +               }
49688 +       }
49689 +};
49690 +
49691 +/* Make Linus happy.
49692 +   Local variables:
49693 +   c-indentation-style: "K&R"
49694 +   mode-name: "LC"
49695 +   c-basic-offset: 8
49696 +   tab-width: 8
49697 +   fill-column: 120
49698 +   End:
49699 +*/
49700 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/item.h linux-2.6.27/fs/reiser4/plugin/item/item.h
49701 --- linux-2.6.27.orig/fs/reiser4/plugin/item/item.h     1970-01-01 03:00:00.000000000 +0300
49702 +++ linux-2.6.27/fs/reiser4/plugin/item/item.h  2008-10-12 18:20:01.000000000 +0400
49703 @@ -0,0 +1,398 @@
49704 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49705 +
49706 +/* first read balance.c comments before reading this */
49707 +
49708 +/* An item_plugin implements all of the operations required for
49709 +   balancing that are item specific. */
49710 +
49711 +/* an item plugin also implements other operations that are specific to that
49712 +   item.  These go into the item specific operations portion of the item
49713 +   handler, and all of the item specific portions of the item handler are put
49714 +   into a union. */
49715 +
49716 +#if !defined( __REISER4_ITEM_H__ )
49717 +#define __REISER4_ITEM_H__
49718 +
49719 +#include "../../forward.h"
49720 +#include "../plugin_header.h"
49721 +#include "../../dformat.h"
49722 +#include "../../seal.h"
49723 +#include "../../plugin/file/file.h"
49724 +
49725 +#include <linux/fs.h>          /* for struct file, struct inode  */
49726 +#include <linux/mm.h>          /* for struct page */
49727 +#include <linux/dcache.h>      /* for struct dentry */
49728 +
49729 +typedef enum {        /* item groups; used as bit numbers in ->h.groups */
49730 +       STAT_DATA_ITEM_TYPE,
49731 +       DIR_ENTRY_ITEM_TYPE,
49732 +       INTERNAL_ITEM_TYPE,
49733 +       UNIX_FILE_METADATA_ITEM_TYPE,
49734 +       OTHER_ITEM_TYPE
49735 +} item_type_id;
49736 +
49737 +/* this is the part of each item plugin that all items are expected to
49738 +   support or at least explicitly fail to support by setting the
49739 +   pointer to null. */
49740 +struct balance_ops {
49741 +       /* operations called by balancing
49742 +
49743 +          It is interesting to consider that some of these item
49744 +          operations could be given sources or targets that are not
49745 +          really items in nodes.  This could be ok/useful.
49746 +
49747 +        */
49748 +       /* maximal key that can _possibly_ be occupied by this item
49749 +
49750 +          When inserting, and node ->lookup() method (called by
49751 +          coord_by_key()) reaches an item after binary search,
49752 +          the  ->max_key_inside() item plugin method is used to determine
49753 +          whether new item should be pasted into existing item
49754 +          (new_key<=max_key_inside()) or new item has to be created
49755 +          (new_key>max_key_inside()).
49756 +
49757 +          For items that occupy exactly one key (like stat-data)
49758 +          this method should return this key. For items that can
49759 +          grow indefinitely (extent, directory item) this should
49760 +          return reiser4_max_key().
49761 +
49762 +          For example extent with the key
49763 +
49764 +          (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
49765 +
49766 +          ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
49767 +        */
49768 +       reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
49769 +
49770 +       /* true if item @coord can merge data at @key. */
49771 +       int (*can_contain_key) (const coord_t *, const reiser4_key *,
49772 +                               const reiser4_item_data *);
49773 +       /* mergeable() - check items for mergeability
49774 +
49775 +          Optional method. Returns true if two items can be merged.
49776 +
49777 +        */
49778 +       int (*mergeable) (const coord_t *, const coord_t *);
49779 +
49780 +       /* number of atomic things in an item.
49781 +          NOTE FOR CONTRIBUTORS: use a generic method
49782 +          nr_units_single_unit() for solid (atomic) items, as
49783 +          tree operations use it as a criterion of solidness
49784 +          (see is_solid_item macro) */
49785 +       pos_in_node_t(*nr_units) (const coord_t *);
49786 +
49787 +       /* search within item for a unit within the item, and return a
49788 +          pointer to it.  This can be used to calculate how many
49789 +          bytes to shrink an item if you use pointer arithmetic and
49790 +          compare to the start of the item body if the item's data
49791 +          are continuous in the node, if the item's data are not
49792 +          continuous in the node, all sorts of other things are maybe
49793 +          going to break as well. */
49794 +        lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
49795 +       /* method called by node_plugin->create_item() to initialise new
49796 +          item */
49797 +       int (*init) (coord_t * target, coord_t * from,
49798 +                    reiser4_item_data * data);
49799 +       /* method called (e.g., by reiser4_resize_item()) to place new data
49800 +          into item when it grows */
49801 +       int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
49802 +       /* return true if paste into @coord is allowed to skip
49803 +          carry. That is, if such paste would not require any changes
49804 +          at the parent level
49805 +        */
49806 +       int (*fast_paste) (const coord_t *);
49807 +       /* how many but not more than @want units of @source can be
49808 +          shifted into @target node. If pend == append - we try to
49809 +          append last item of @target by first units of @source. If
49810 +          pend == prepend - we try to "prepend" first item in @target
49811 +          by last units of @source. @target node has @free_space
49812 +          bytes of free space. Total size of those units are returned
49813 +          via @size.
49814 +
49815 +          @target is not NULL if shifting to the mergeable item and
49816 +          NULL if new item will be created during shifting.
49817 +        */
49818 +       int (*can_shift) (unsigned free_space, coord_t *,
49819 +                         znode *, shift_direction, unsigned *size,
49820 +                         unsigned want);
49821 +
49822 +       /* starting off @from-th unit of item @source append or
49823 +          prepend @count units to @target. @target has been already
49824 +          expanded by @free_space bytes. That must be exactly what is
49825 +          needed for those items in @target. If @where_is_free_space
49826 +          == SHIFT_LEFT - free space is at the end of @target item,
49827 +          otherwise - it is in the beginning of it. */
49828 +       void (*copy_units) (coord_t *, coord_t *,
49829 +                           unsigned from, unsigned count,
49830 +                           shift_direction where_is_free_space,
49831 +                           unsigned free_space);
49832 +
49833 +       int (*create_hook) (const coord_t *, void *);
49834 +       /* do whatever is necessary to do when @count units starting
49835 +          from @from-th one are removed from the tree */
49836 +       /* FIXME-VS: this is used to be here for, in particular,
49837 +          extents and items of internal type to free blocks they point
49838 +          to at the same time with removing items from a
49839 +          tree. Problems start, however, when dealloc_block fails due
49840 +          to some reason. Item gets removed, but blocks it pointed to
49841 +          are not freed. It is not clear how to fix this for items of
49842 +          internal type because a need to remove internal item may
49843 +          appear in the middle of balancing, and there is no way to
49844 +          undo changes made. OTOH, if space allocator involves
49845 +          balancing to perform dealloc_block - this will probably
49846 +          break balancing due to deadlock issues
49847 +        */
49848 +       int (*kill_hook) (const coord_t *, pos_in_node_t from,
49849 +                         pos_in_node_t count, struct carry_kill_data *);
49850 +       int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
49851 +                          znode * _node);
49852 +
49853 +       /* unit @*from contains @from_key. unit @*to contains @to_key. Cut all keys between @from_key and @to_key
49854 +          including boundaries. When units are cut from item beginning - move space which gets freed to head of
49855 +          item. When units are cut from item end - move freed space to item end. When units are cut from the middle of
49856 +          item - move freed space to item head. Return amount of space which got freed. Save smallest removed key in
49857 +          @smallest_removed if it is not 0. Save new first item key in @new_first_key if it is not 0
49858 +        */
49859 +       int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49860 +                         struct carry_cut_data *,
49861 +                         reiser4_key * smallest_removed,
49862 +                         reiser4_key * new_first_key);
49863 +
49864 +       /* like cut_units, except that these units are removed from the
49865 +          tree, not only from a node */
49866 +       int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
49867 +                          struct carry_kill_data *,
49868 +                          reiser4_key * smallest_removed,
49869 +                          reiser4_key * new_first);
49870 +
49871 +       /* if @key_of_coord == 1 - returned key of coord, otherwise -
49872 +          key of unit is returned. If @coord is not set to certain
49873 +          unit - ERR_PTR(-ENOENT) is returned */
49874 +       reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
49875 +       reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
49876 +       /* estimate how much space is needed for paste @data into item at
49877 +          @coord. if @coord==0 - estimate insertion, otherwise - estimate
49878 +          pasting
49879 +        */
49880 +       int (*estimate) (const coord_t *, const reiser4_item_data *);
49881 +
49882 +       /* converts flow @f to item data. @coord == 0 on insert */
49883 +       int (*item_data_by_flow) (const coord_t *, const flow_t *,
49884 +                                 reiser4_item_data *);
49885 +
49886 +       /*void (*show) (struct seq_file *, coord_t *); */
49887 +
49888 +#if REISER4_DEBUG
49889 +       /* used for debugging, every item should have here the most
49890 +          complete possible check of the consistency of the item that
49891 +          the inventor can construct */
49892 +       int (*check) (const coord_t *, const char **error);
49893 +#endif
49894 +
49895 +};
49896 +
49897 +struct flush_ops {
49898 +       /* return the right or left child of @coord, only if it is in memory */
49899 +       int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
49900 +
49901 +       /* return whether the right or left child of @coord has a non-fake
49902 +          block number. */
49903 +       int (*utmost_child_real_block) (const coord_t *, sideof side,
49904 +                                       reiser4_block_nr *);
49905 +       /* relocate child at @coord to the @block */
49906 +       void (*update) (const coord_t *, const reiser4_block_nr *);
49907 +       /* count unformatted nodes per item for leave relocation policy, etc.. */
49908 +       int (*scan) (flush_scan * scan);
49909 +       /* convert item by flush */
49910 +       int (*convert) (flush_pos_t * pos);
49911 +       /* backward mapping from jnode offset to a key.  */
49912 +       int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
49913 +};
49914 +
49915 +/* operations specific to the directory item */
49916 +struct dir_entry_iops {
49917 +       /* extract stat-data key from directory entry at @coord and place it
49918 +          into @key. */
49919 +       int (*extract_key) (const coord_t *, reiser4_key * key);
49920 +       /* update object key in item. */
49921 +       int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
49922 +       /* extract name from directory entry at @coord and return it */
49923 +       char *(*extract_name) (const coord_t *, char *buf);
49924 +       /* extract file type (DT_* stuff) from directory entry at @coord and
49925 +          return it */
49926 +       unsigned (*extract_file_type) (const coord_t *);
49927 +       int (*add_entry) (struct inode * dir,
49928 +                         coord_t *, lock_handle *,
49929 +                         const struct dentry * name,
49930 +                         reiser4_dir_entry_desc * entry);
49931 +       int (*rem_entry) (struct inode * dir, const struct qstr * name,
49932 +                         coord_t *, lock_handle *,
49933 +                         reiser4_dir_entry_desc * entry);
49934 +       int (*max_name_len) (const struct inode * dir);
49935 +};
49936 +
49937 +/* operations specific to items that make up regular (unix) file metadata */
49938 +struct file_iops{
49939 +       int (*write) (struct file *, struct inode *,
49940 +                     const char __user *, size_t, loff_t *pos);
49941 +       int (*read) (struct file *, flow_t *, hint_t *);
49942 +       int (*readpage) (void *, struct page *);
49943 +       int (*get_block) (const coord_t *, sector_t, sector_t *);
49944 +       /*
49945 +        * key of first byte which is not addressed by the item @coord is set
49946 +        * to.
49947 +        * For example, for extent item with the key
49948 +        *
49949 +        * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
49950 +        *
49951 +        * ->append_key is
49952 +        *
49953 +        * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
49954 +        */
49955 +       reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
49956 +
49957 +       void (*init_coord_extension) (uf_coord_t *, loff_t);
49958 +};
49959 +
49960 +/* operations specific to items of stat data type */
49961 +struct sd_iops {
49962 +       int (*init_inode) (struct inode * inode, char *sd, int len);
49963 +       int (*save_len) (struct inode * inode);
49964 +       int (*save) (struct inode * inode, char **area);
49965 +};
49966 +
49967 +/* operations specific to internal item */
49968 +struct internal_iops{
49969 +       /* all that tree traversal wants to know from internal item is where
49970 +          to go next. */
49971 +       void (*down_link) (const coord_t * coord,
49972 +                          const reiser4_key * key, reiser4_block_nr * block);
49973 +       /* check that given internal item contains given pointer. */
49974 +       int (*has_pointer_to) (const coord_t * coord,
49975 +                              const reiser4_block_nr * block);
49976 +};
49977 +
49978 +struct item_plugin {
49979 +       /* generic fields */
49980 +       plugin_header h;
49981 +       /* methods common for all item types */
49982 +       struct balance_ops b; /* balance operations */
49983 +       struct flush_ops f;   /* flush operates with items via these methods */
49984 +
49985 +       /* methods specific to particular type of item */
49986 +       union {
49987 +               struct dir_entry_iops dir;
49988 +               struct      file_iops file;
49989 +               struct        sd_iops sd;
49990 +               struct  internal_iops internal;
49991 +       } s;
49992 +};
49993 +
49994 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
49995 +
49996 +static inline item_id item_id_by_plugin(item_plugin * plugin)
49997 +{
49998 +       return plugin->h.id;
49999 +}
50000 +
50001 +static inline char get_iplugid(item_plugin * iplug)
50002 +{
50003 +       assert("nikita-2838", iplug != NULL);
50004 +       assert("nikita-2839", iplug->h.id < 0xff);
50005 +       return (char)item_id_by_plugin(iplug);
50006 +}
50007 +
50008 +extern unsigned long znode_times_locked(const znode * z);
50009 +
50010 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
50011 +{
50012 +       assert("nikita-2837", coord != NULL);
50013 +       assert("nikita-2838", iplug != NULL);
50014 +       coord->iplugid = get_iplugid(iplug);
50015 +       ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
50016 +}
50017 +
50018 +static inline item_plugin *coord_iplug(const coord_t * coord)
50019 +{
50020 +       assert("nikita-2833", coord != NULL);
50021 +       assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
50022 +       assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
50023 +       return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
50024 +                                           coord->iplugid);
50025 +}
50026 +
50027 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
50028 +                               const reiser4_item_data *);
50029 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
50030 +extern int item_is_extent(const coord_t *);
50031 +extern int item_is_tail(const coord_t *);
50032 +extern int item_is_statdata(const coord_t * item);
50033 +extern int item_is_ctail(const coord_t *);
50034 +
50035 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
50036 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
50037 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
50038 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
50039 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
50040 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
50041 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
50042 +                                         reiser4_key * key);
50043 +extern void obtain_item_plugin(const coord_t * coord);
50044 +
50045 +#if defined(REISER4_DEBUG)
50046 +extern int znode_is_loaded(const znode * node);
50047 +#endif
50048 +
50049 +/* return plugin of item at @coord */
50050 +static inline item_plugin *item_plugin_by_coord(const coord_t *
50051 +                                               coord /* coord to query */ )
50052 +{
50053 +       assert("nikita-330", coord != NULL);
50054 +       assert("nikita-331", coord->node != NULL);
50055 +       assert("nikita-332", znode_is_loaded(coord->node));
50056 +
50057 +       if (unlikely(!coord_is_iplug_set(coord)))
50058 +               obtain_item_plugin(coord);
50059 +       return coord_iplug(coord);
50060 +}
50061 +
50062 +/* this returns true if item is of internal type */
50063 +static inline int item_is_internal(const coord_t * item)
50064 +{
50065 +       assert("vs-483", coord_is_existing_item(item));
50066 +       return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
50067 +}
50068 +
50069 +extern void item_body_by_coord_hard(coord_t * coord);
50070 +extern void *item_body_by_coord_easy(const coord_t * coord);
50071 +#if REISER4_DEBUG
50072 +extern int item_body_is_valid(const coord_t * coord);
50073 +#endif
50074 +
50075 +/* return pointer to item body */
50076 +static inline void *item_body_by_coord(const coord_t *
50077 +                                      coord /* coord to query */ )
50078 +{
50079 +       assert("nikita-324", coord != NULL);
50080 +       assert("nikita-325", coord->node != NULL);
50081 +       assert("nikita-326", znode_is_loaded(coord->node));
50082 +
50083 +       if (coord->offset == INVALID_OFFSET)
50084 +               item_body_by_coord_hard((coord_t *) coord);
50085 +       assert("nikita-3201", item_body_is_valid(coord));
50086 +       assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
50087 +       return item_body_by_coord_easy(coord);
50088 +}
50089 +
50090 +/* __REISER4_ITEM_H__ */
50091 +#endif
50092 +/* Make Linus happy.
50093 +   Local variables:
50094 +   c-indentation-style: "K&R"
50095 +   mode-name: "LC"
50096 +   c-basic-offset: 8
50097 +   tab-width: 8
50098 +   fill-column: 120
50099 +   scroll-step: 1
50100 +   End:
50101 +*/
50102 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/Makefile linux-2.6.27/fs/reiser4/plugin/item/Makefile
50103 --- linux-2.6.27.orig/fs/reiser4/plugin/item/Makefile   1970-01-01 03:00:00.000000000 +0300
50104 +++ linux-2.6.27/fs/reiser4/plugin/item/Makefile        2008-10-12 18:20:01.000000000 +0400
50105 @@ -0,0 +1,18 @@
50106 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
50107 +
50108 +item_plugins-objs :=           \
50109 +       item.o                  \
50110 +       static_stat.o           \
50111 +       sde.o                   \
50112 +       cde.o                   \
50113 +       blackbox.o              \
50114 +       internal.o              \
50115 +       tail.o                  \
50116 +       ctail.o                 \
50117 +       extent.o                \
50118 +       extent_item_ops.o       \
50119 +       extent_file_ops.o       \
50120 +       extent_flush_ops.o
50121 +
50122 +
50123 +
50124 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/sde.c linux-2.6.27/fs/reiser4/plugin/item/sde.c
50125 --- linux-2.6.27.orig/fs/reiser4/plugin/item/sde.c      1970-01-01 03:00:00.000000000 +0300
50126 +++ linux-2.6.27/fs/reiser4/plugin/item/sde.c   2008-10-12 18:20:01.000000000 +0400
50127 @@ -0,0 +1,190 @@
50128 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50129 +
50130 +/* Directory entry implementation */
50131 +#include "../../forward.h"
50132 +#include "../../debug.h"
50133 +#include "../../dformat.h"
50134 +#include "../../kassign.h"
50135 +#include "../../coord.h"
50136 +#include "sde.h"
50137 +#include "item.h"
50138 +#include "../plugin.h"
50139 +#include "../../znode.h"
50140 +#include "../../carry.h"
50141 +#include "../../tree.h"
50142 +#include "../../inode.h"
50143 +
50144 +#include <linux/fs.h>          /* for struct inode */
50145 +#include <linux/dcache.h>      /* for struct dentry */
50146 +#include <linux/quotaops.h>
50147 +
50148 +/* ->extract_key() method of simple directory item plugin. */
50149 +int extract_key_de(const coord_t * coord /* coord of item */ ,
50150 +                  reiser4_key * key /* resulting key */ )
50151 +{
50152 +       directory_entry_format *dent;
50153 +
50154 +       assert("nikita-1458", coord != NULL);
50155 +       assert("nikita-1459", key != NULL);
50156 +
50157 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50158 +       assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
50159 +       return extract_key_from_id(&dent->id, key);
50160 +}
50161 +
50162 +int
50163 +update_key_de(const coord_t * coord, const reiser4_key * key,
50164 +             lock_handle * lh UNUSED_ARG)
50165 +{
50166 +       directory_entry_format *dent;
50167 +       obj_key_id obj_id;
50168 +       int result;
50169 +
50170 +       assert("nikita-2342", coord != NULL);
50171 +       assert("nikita-2343", key != NULL);
50172 +
50173 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50174 +       result = build_obj_key_id(key, &obj_id);
50175 +       if (result == 0) {
50176 +               dent->id = obj_id;
50177 +               znode_make_dirty(coord->node);
50178 +       }
50179 +       return 0;
50180 +}
50181 +
50182 +/* name of entry @dent at @coord: "." for the dot key, dent->name for long
50183 +   names, otherwise whatever extract_name_from_key() returns for @buf */
50184 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
50185 +                       char *buf)
50186 +{
50187 +       reiser4_key unit_key;
50188 +
50189 +       unit_key_by_coord(coord, &unit_key);
50190 +       if (get_key_type(&unit_key) != KEY_FILE_NAME_MINOR)
50191 +               reiser4_print_address("oops", znode_get_block(coord->node));
50192 +       if (is_longname_key(&unit_key))
50193 +               return (char *)dent->name;
50194 +       if (is_dot_key(&unit_key))
50195 +               return (char *)".";
50196 +       return extract_name_from_key(&unit_key, buf);
50197 +}
50198 +
50199 +/* ->extract_name() of simple directory item; wraps extract_dent_name(). */
50200 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
50201 +{
50202 +       directory_entry_format *dent;
50203 +
50204 +       assert("nikita-1460", coord != NULL);
50205 +
50206 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50207 +       return extract_dent_name(coord, dent, buf);
50208 +}
50209 +
50210 +/* ->extract_file_type() of simple directory item: always DT_UNKNOWN. */
50211 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
50212 +                                                                * item */ )
50213 +{
50214 +       assert("nikita-1764", coord != NULL);
50215 +       /* we don't store file type in the directory entry yet.
50216 +
50217 +          But see comments at kassign.h:obj_key_id
50218 +        */
50219 +       return DT_UNKNOWN;
50220 +}
50221 +/* Insert entry for @de into @dir at @coord, charging quota; 0 or -errno. */
50222 +int add_entry_de(struct inode *dir /* directory of item */ ,
50223 +                coord_t * coord /* coord of item */ ,
50224 +                lock_handle * lh /* insertion lock handle */ ,
50225 +                const struct dentry *de /* name to add */ ,
50226 +                reiser4_dir_entry_desc * entry /* parameters of new directory
50227 +                                                * entry */ )
50228 +{
50229 +       reiser4_item_data data;
50230 +       directory_entry_format *dent;
50231 +       int result;
50232 +       const char *name;
50233 +       int len, longname;
50234 +
50235 +       name = de->d_name.name;
50236 +       len = de->d_name.len;
50237 +       assert("nikita-1163", strlen(name) == len);
50238 +       longname = is_longname(name, len);
50239 +
50240 +       data.length = sizeof *dent;
50241 +       if (longname)
50242 +               data.length += len + 1; /* name body and terminating 0 */
50243 +       data.data = NULL;
50244 +       data.user = 0;
50245 +       data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
50246 +
50247 +       /* NOTE-NIKITA quota plugin */
50248 +       if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
50249 +               return -EDQUOT;
50250 +
50251 +       result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
50252 +       if (result != 0) {
50253 +               DQUOT_FREE_SPACE_NODIRTY(dir, data.length); /* undo charge */
50254 +               return result;
50255 +       }
50256 +
50257 +       dent = (directory_entry_format *) item_body_by_coord(coord);
50258 +       build_inode_key_id(entry->obj, &dent->id);
50259 +       if (longname) {
50260 +               memcpy(dent->name, name, len);
50261 +               put_unaligned(0, &dent->name[len]);
50262 +       }
50263 +       return 0;
50264 +}
50265 +/* Kill the entry item at @coord and release its length from @dir's quota. */
50266 +int rem_entry_de(struct inode *dir /* directory of item */ ,
50267 +                const struct qstr *name UNUSED_ARG,
50268 +                coord_t * coord /* coord of item */ ,
50269 +                lock_handle * lh UNUSED_ARG    /* lock handle for
50270 +                                                * removal */ ,
50271 +                reiser4_dir_entry_desc * entry UNUSED_ARG      /* parameters of
50272 +                                                                * directory entry
50273 +                                                                * being removed */ )
50274 +{
50275 +       coord_t shadow;
50276 +       int result;
50277 +       int length;
50278 +
50279 +       length = item_length_by_coord(coord);
50280 +       if (inode_get_bytes(dir) < length) {    /* dir smaller than its entry */
50281 +               warning("nikita-2627", "Dir is broke: %llu: %llu",
50282 +                       (unsigned long long)get_inode_oid(dir),
50283 +                       inode_get_bytes(dir));
50284 +
50285 +               return RETERR(-EIO);
50286 +       }
50287 +
50288 +       /* cut_node() (here: kill_node_content()) is supposed to take
50289 +          pointers to _different_ coords, because it will modify them
50290 +          without respect to possible aliasing. To work around this,
50291 +          create temporary copy of @coord.
50292 +        */
50293 +       coord_dup(&shadow, coord);
50294 +       result =
50295 +           kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
50296 +       if (result == 0) {
50297 +               /* NOTE-NIKITA quota plugin */
50298 +               DQUOT_FREE_SPACE_NODIRTY(dir, length);
50299 +       }
50300 +       return result;
50301 +}
50302 +/* Max name length: nplug->max_item_size() less entry header and 2. */
50303 +int max_name_len_de(const struct inode *dir)
50304 +{
50305 +       return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
50306 +               sizeof(directory_entry_format) - 2;
50307 +}
50308 +
50309 +/* Make Linus happy.
50310 +   Local variables:
50311 +   c-indentation-style: "K&R"
50312 +   mode-name: "LC"
50313 +   c-basic-offset: 8
50314 +   tab-width: 8
50315 +   fill-column: 120
50316 +   End:
50317 +*/
50318 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/sde.h linux-2.6.27/fs/reiser4/plugin/item/sde.h
50319 --- linux-2.6.27.orig/fs/reiser4/plugin/item/sde.h      1970-01-01 03:00:00.000000000 +0300
50320 +++ linux-2.6.27/fs/reiser4/plugin/item/sde.h   2008-10-12 18:20:01.000000000 +0400
50321 @@ -0,0 +1,66 @@
50322 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50323 +
50324 +/* Directory entry. */
50325 +
50326 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
50327 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
50328 +
50329 +#include "../../forward.h"
50330 +#include "../../dformat.h"
50331 +#include "../../kassign.h"
50332 +#include "../../key.h"
50333 +
50334 +#include <linux/fs.h>
50335 +#include <linux/dcache.h>      /* for struct dentry */
50336 +
50337 +typedef struct directory_entry_format {
50338 +       /* key of object stat-data. It's not necessary to store whole
50339 +          key here, because it's always key of stat-data, so minor
50340 +          packing locality and offset can be omitted here. But this
50341 +          relies on particular key allocation scheme for stat-data, so,
50342 +          for extensibility's sake, whole key can be stored here.
50343 +
50344 +          We store key as array of bytes, because we don't want 8-byte
50345 +          alignment of dir entries.
50346 +        */
50347 +       obj_key_id id;
50348 +       /* file name, null terminated; only long names are stored here. */
50349 +       d8 name[0];
50350 +} directory_entry_format;
50351 +
50352 +void print_de(const char *prefix, coord_t * coord);
50353 +int extract_key_de(const coord_t * coord, reiser4_key * key);
50354 +int update_key_de(const coord_t * coord, const reiser4_key * key,
50355 +                 lock_handle * lh);
50356 +char *extract_name_de(const coord_t * coord, char *buf);
50357 +unsigned extract_file_type_de(const coord_t * coord);
50358 +int add_entry_de(struct inode *dir, coord_t * coord,
50359 +                lock_handle * lh, const struct dentry *name,
50360 +                reiser4_dir_entry_desc * entry);
50361 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
50362 +                lock_handle * lh, reiser4_dir_entry_desc * entry);
50363 +int max_name_len_de(const struct inode *dir);
50364 +
50365 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
50366 +
50367 +char *extract_dent_name(const coord_t * coord,
50368 +                       directory_entry_format * dent, char *buf);
50369 +
50370 +#if REISER4_LARGE_KEY
50371 +#define DE_NAME_BUF_LEN (24)
50372 +#else
50373 +#define DE_NAME_BUF_LEN (16)
50374 +#endif
50375 +
50376 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
50377 +#endif
50378 +
50379 +/* Make Linus happy.
50380 +   Local variables:
50381 +   c-indentation-style: "K&R"
50382 +   mode-name: "LC"
50383 +   c-basic-offset: 8
50384 +   tab-width: 8
50385 +   fill-column: 120
50386 +   End:
50387 +*/
50388 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/static_stat.c linux-2.6.27/fs/reiser4/plugin/item/static_stat.c
50389 --- linux-2.6.27.orig/fs/reiser4/plugin/item/static_stat.c      1970-01-01 03:00:00.000000000 +0300
50390 +++ linux-2.6.27/fs/reiser4/plugin/item/static_stat.c   2008-10-12 18:20:01.000000000 +0400
50391 @@ -0,0 +1,1107 @@
50392 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50393 +
50394 +/* stat data manipulation. */
50395 +
50396 +#include "../../forward.h"
50397 +#include "../../super.h"
50398 +#include "../../vfs_ops.h"
50399 +#include "../../inode.h"
50400 +#include "../../debug.h"
50401 +#include "../../dformat.h"
50402 +#include "../object.h"
50403 +#include "../plugin.h"
50404 +#include "../plugin_header.h"
50405 +#include "static_stat.h"
50406 +#include "item.h"
50407 +
50408 +#include <linux/types.h>
50409 +#include <linux/fs.h>
50410 +
50411 +/* see static_stat.h for explanation */
50412 +
50413 +/* helper used while dumping/loading inode/plugin state to/from the
50414 +    stat-data: advance *area by size_of bytes, shrink *length to match. */
50415 +
50416 +static void move_on(int *length /* space remaining in stat-data */ ,
50417 +                   char **area /* current coord in stat data */ ,
50418 +                   int size_of /* how many bytes to move forward */ )
50419 +{
50420 +       assert("nikita-615", length != NULL);
50421 +       assert("nikita-616", area != NULL);
50422 +
50423 +       *length -= size_of;
50424 +       *area += size_of;
50425 +
50426 +       assert("nikita-617", *length >= 0);
50427 +}
50428 +
50429 +/* helper function used while loading inode/plugin state from stat-data.
50430 +    Warn that stat-data of @inode is too short for @where, give -EINVAL.
50431 +    Can only happen on disk corruption. */
50432 +static int not_enough_space(struct inode *inode /* object being processed */ ,
50433 +                           const char *where /* error message */ )
50434 +{
50435 +       assert("nikita-618", inode != NULL);
50436 +
50437 +       warning("nikita-619", "Not enough space in %llu while loading %s",
50438 +               (unsigned long long)get_inode_oid(inode), where);
50439 +
50440 +       return RETERR(-EINVAL);
50441 +}
50442 +
50443 +/* helper function used while loading inode/plugin state from stat-data.
50444 +    Call it if invalid plugin id was found: warns and gives -EINVAL. */
50445 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
50446 +                         struct inode *inode /* object being processed */ )
50447 +{
50448 +       warning("nikita-620", "Unknown plugin %i in %llu",
50449 +               id, (unsigned long long)get_inode_oid(inode));
50450 +
50451 +       return RETERR(-EINVAL);
50452 +}
50453 +
50454 +/* this is installed as ->init_inode() method of
50455 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
50456 +    Copies data from on-disk stat-data format into inode: walks the extension
50457 +    mask and calls ->present() or ->absent() of each extension plugin. */
50458 +/* was sd_load. Returns 0, -EINVAL on a malformed mask, or extension's error */
50459 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
50460 +                        char *sd /* stat-data body */ ,
50461 +                        int len /* length of stat-data */ )
50462 +{
50463 +       int result;
50464 +       int bit;
50465 +       int chunk;
50466 +       __u16 mask;
50467 +       __u64 bigmask;
50468 +       reiser4_stat_data_base *sd_base;
50469 +       reiser4_inode *state;
50470 +
50471 +       assert("nikita-625", inode != NULL);
50472 +       assert("nikita-626", sd != NULL);
50473 +
50474 +       result = 0;
50475 +       sd_base = (reiser4_stat_data_base *) sd;
50476 +       state = reiser4_inode_data(inode);
50477 +       mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
50478 +       bigmask = mask;
50479 +       reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
50480 +
50481 +       move_on(&len, &sd, sizeof *sd_base);
50482 +       for (bit = 0, chunk = 0;
50483 +            mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
50484 +            ++bit, mask >>= 1) {
50485 +               if (((bit + 1) % 16) != 0) {
50486 +                       /* handle extension */
50487 +                       sd_ext_plugin *sdplug;
50488 +
50489 +                       if (bit >= LAST_SD_EXTENSION) {
50490 +                               warning("vpf-1904",
50491 +                                       "No such extension %i in inode %llu",
50492 +                                       bit,
50493 +                                       (unsigned long long)
50494 +                                       get_inode_oid(inode));
50495 +
50496 +                               result = RETERR(-EINVAL);
50497 +                               break;
50498 +                       }
50499 +
50500 +                       sdplug = sd_ext_plugin_by_id(bit);
50501 +                       if (sdplug == NULL) {
50502 +                               warning("nikita-627",
50503 +                                       "No such extension %i in inode %llu",
50504 +                                       bit,
50505 +                                       (unsigned long long)
50506 +                                       get_inode_oid(inode));
50507 +
50508 +                               result = RETERR(-EINVAL);
50509 +                               break;
50510 +                       }
50511 +                       if (mask & 1) {
50512 +                               assert("nikita-628", sdplug->present);
50513 +                               /* alignment is not supported in node layout
50514 +                                  plugin yet.
50515 +                                  result = align( inode, &len, &sd,
50516 +                                  sdplug -> alignment );
50517 +                                  if( result != 0 )
50518 +                                  return result; */
50519 +                               result = sdplug->present(inode, &sd, &len);
50520 +                       } else if (sdplug->absent != NULL)
50521 +                               result = sdplug->absent(inode);
50522 +                       if (result)
50523 +                               break;
50524 +                       /* else, we are looking at the last bit in 16-bit
50525 +                          portion of bitmask */
50526 +               } else if (mask & 1) {
50527 +                       /* next portion of bitmask */
50528 +                       if (len < (int)sizeof(d16)) {
50529 +                               warning("nikita-629",
50530 +                                       "No space for bitmap in inode %llu",
50531 +                                       (unsigned long long)
50532 +                                       get_inode_oid(inode));
50533 +
50534 +                               result = RETERR(-EINVAL);
50535 +                               break;
50536 +                       }
50537 +                       mask = le16_to_cpu(get_unaligned((d16 *)sd));
50538 +                       bigmask <<= 16;
50539 +                       bigmask |= mask;
50540 +                       move_on(&len, &sd, sizeof(d16));
50541 +                       ++chunk;
50542 +                       if (chunk == 3) {
50543 +                               if (!(mask & 0x8000)) {
50544 +                                       /* NOTE(review): bit is already clear here */
50545 +                                       mask &= ~0x8000;
50546 +                                       continue;
50547 +                               }
50548 +                               /* too much */
50549 +                               warning("nikita-630",
50550 +                                       "Too many extensions in %llu",
50551 +                                       (unsigned long long)
50552 +                                       get_inode_oid(inode));
50553 +
50554 +                               result = RETERR(-EINVAL);
50555 +                               break;
50556 +                       }
50557 +               } else
50558 +                       /* bitmask exhausted */
50559 +                       break;
50560 +       }
50561 +       state->extmask = bigmask;
50562 +       /* common initialisations */
50563 +       if (len - (bit / 16 * sizeof(d16)) > 0) {
50564 +               /* alignment in save_len_static_sd() is taken into account
50565 +                  -edward. NOTE(review): sizeof makes this test unsigned */
50566 +               warning("nikita-631", "unused space in inode %llu",
50567 +                       (unsigned long long)get_inode_oid(inode));
50568 +       }
50569 +
50570 +       return result;
50571 +}
50572 +
50573 +/* estimates size of stat-data required to store inode: base part plus
50574 +    extensions in extmask plus a d16 per 16 mask bits. Is ->save_len() method of
50575 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50576 +/* was sd_len */
50577 +int save_len_static_sd(struct inode *inode /* object being processed */ )
50578 +{
50579 +       unsigned int result;
50580 +       __u64 mask;
50581 +       int bit;
50582 +
50583 +       assert("nikita-632", inode != NULL);
50584 +
50585 +       result = sizeof(reiser4_stat_data_base);
50586 +       mask = reiser4_inode_data(inode)->extmask;
50587 +       for (bit = 0; mask != 0; ++bit, mask >>= 1) {
50588 +               if (mask & 1) {
50589 +                       sd_ext_plugin *sdplug;
50590 +
50591 +                       sdplug = sd_ext_plugin_by_id(bit);
50592 +                       assert("nikita-633", sdplug != NULL);
50593 +                       /* no alignment support
50594 +                          result +=
50595 +                          round_up( result, sdplug -> alignment ) - result; */
50596 +                       result += sdplug->save_len(inode);
50597 +               }
50598 +       }
50599 +       result += bit / 16 * sizeof(d16);
50600 +       return result;
50601 +}
50602 +
50603 +/* saves inode into stat-data at *area, advancing *area: base part first, then
50604 +    each extension set in extmask via its ->save(). Installed as ->save() of
50605 +    item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
50606 +/* was sd_save */
50607 +int save_static_sd(struct inode *inode /* object being processed */ ,
50608 +                  char **area /* where to save stat-data */ )
50609 +{
50610 +       int result;
50611 +       __u64 emask;
50612 +       int bit;
50613 +       unsigned int len;
50614 +       reiser4_stat_data_base *sd_base;
50615 +
50616 +       assert("nikita-634", inode != NULL);
50617 +       assert("nikita-635", area != NULL);
50618 +
50619 +       result = 0;
50620 +       emask = reiser4_inode_data(inode)->extmask;
50621 +       sd_base = (reiser4_stat_data_base *) * area;
50622 +       put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
50623 +       /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
50624 +
50625 +       *area += sizeof *sd_base;
50626 +       len = 0xffffffffu;      /* NOTE(review): set but never read */
50627 +       for (bit = 0; emask != 0; ++bit, emask >>= 1) {
50628 +               if (emask & 1) {
50629 +                       if ((bit + 1) % 16 != 0) {
50630 +                               sd_ext_plugin *sdplug;
50631 +                               sdplug = sd_ext_plugin_by_id(bit);
50632 +                               assert("nikita-636", sdplug != NULL);
50633 +                               /* no alignment support yet
50634 +                                  align( inode, &len, area,
50635 +                                  sdplug -> alignment ); */
50636 +                               result = sdplug->save(inode, area);
50637 +                               if (result)
50638 +                                       break;
50639 +                       } else {
50640 +                               put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
50641 +                                             (d16 *)(*area));
50642 +                               /*cputod16((unsigned)(emask & 0xffff),
50643 +                                 (d16 *) * area);*/
50644 +                               *area += sizeof(d16);
50645 +                       }
50646 +               }
50647 +       }
50648 +       return result;
50649 +}
50650 +
50651 +/* stat-data extension handling functions. */
50652 +/* Load mode, nlink and size from the light-weight extension at *area. */
50653 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
50654 +                        char **area /* position in stat-data */ ,
50655 +                        int *len /* remaining length */ )
50656 +{
50657 +       if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
50658 +               reiser4_light_weight_stat *sd_lw;
50659 +
50660 +               sd_lw = (reiser4_light_weight_stat *) * area;
50661 +
50662 +               inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
50663 +               inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
50664 +               inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
50665 +               if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
50666 +                       inode->i_mode &= ~S_IFIFO;      /* keep S_IFREG only */
50667 +                       warning("", "partially converted file is encountered");
50668 +                       reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
50669 +               }
50670 +               move_on(len, area, sizeof *sd_lw);
50671 +               return 0;
50672 +       } else
50673 +               return not_enough_space(inode, "lw sd");
50674 +}
50675 +/* Size of light-weight extension: sizeof(reiser4_light_weight_stat). */
50676 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG       /* object being
50677 +                                                                * processed */ )
50678 +{
50679 +       return sizeof(reiser4_light_weight_stat);
50680 +}
50681 +/* Store mode, nlink and size of @inode at *area and advance *area. */
50682 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
50683 +                     char **area /* position in stat-data */ )
50684 +{
50685 +       reiser4_light_weight_stat *sd;
50686 +       mode_t delta;
50687 +
50688 +       assert("nikita-2705", inode != NULL);
50689 +       assert("nikita-2706", area != NULL);
50690 +       assert("nikita-2707", *area != NULL);
50691 +
50692 +       sd = (reiser4_light_weight_stat *) * area;
50693 +       /* REISER4_PART_MIXED inodes get S_IFIFO or-ed into the saved mode */
50694 +       delta = (reiser4_inode_get_flag(inode,
50695 +                                       REISER4_PART_MIXED) ? S_IFIFO : 0);
50696 +       put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
50697 +       put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
50698 +       put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
50699 +       *area += sizeof *sd;
50700 +       return 0;
50701 +}
50702 +/* Load uid, gid, times and rdev-or-bytes from the unix extension at *area. */
50703 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
50704 +                          char **area /* position in stat-data */ ,
50705 +                          int *len /* remaining length */ )
50706 +{
50707 +       assert("nikita-637", inode != NULL);
50708 +       assert("nikita-638", area != NULL);
50709 +       assert("nikita-639", *area != NULL);
50710 +       assert("nikita-640", len != NULL);
50711 +       assert("nikita-641", *len > 0);
50712 +
50713 +       if (*len >= (int)sizeof(reiser4_unix_stat)) {
50714 +               reiser4_unix_stat *sd;
50715 +
50716 +               sd = (reiser4_unix_stat *) * area;
50717 +
50718 +               inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
50719 +               inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
50720 +               inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
50721 +               inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
50722 +               inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
50723 +               if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50724 +                       inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
50725 +               else
50726 +                       inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
50727 +               move_on(len, area, sizeof *sd);
50728 +               return 0;
50729 +       } else
50730 +               return not_enough_space(inode, "unix sd");
50731 +}
50732 +/* No unix extension: take super block default ids and the current time. */
50733 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
50734 +{
50735 +       inode->i_uid = get_super_private(inode->i_sb)->default_uid;
50736 +       inode->i_gid = get_super_private(inode->i_sb)->default_gid;
50737 +       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
50738 +       inode_set_bytes(inode, inode->i_size);
50739 +       /* mark inode as lightweight, so that caller (lookup_common) will
50740 +          complete initialisation by copying [ug]id from a parent. */
50741 +       reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
50742 +       return 0;
50743 +}
50744 +/* Size of the unix extension: sizeof(reiser4_unix_stat). */
50745 +/* Audited by: green(2002.06.14) */
50746 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG     /* object being
50747 +                                                                * processed */ )
50748 +{
50749 +       return sizeof(reiser4_unix_stat);
50750 +}
50751 +/* Store uid, gid, times and rdev-or-bytes of @inode at *area; advance *area. */
50752 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
50753 +                       char **area /* position in stat-data */ )
50754 +{
50755 +       reiser4_unix_stat *sd;
50756 +
50757 +       assert("nikita-642", inode != NULL);
50758 +       assert("nikita-643", area != NULL);
50759 +       assert("nikita-644", *area != NULL);
50760 +
50761 +       sd = (reiser4_unix_stat *) * area;
50762 +       put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
50763 +       put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
50764 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
50765 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
50766 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
50767 +       if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
50768 +               put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
50769 +       else
50770 +               put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
50771 +       *area += sizeof *sd;
50772 +       return 0;
50773 +}
50774 +/* Load tv_nsec of atime/mtime/ctime from large-times extension at *area. */
50775 +static int
50776 +present_large_times_sd(struct inode *inode /* object being processed */ ,
50777 +                      char **area /* position in stat-data */ ,
50778 +                      int *len /* remaining length */ )
50779 +{
50780 +       if (*len >= (int)sizeof(reiser4_large_times_stat)) {
50781 +               reiser4_large_times_stat *sd_lt;
50782 +
50783 +               sd_lt = (reiser4_large_times_stat *) * area;
50784 +
50785 +               inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
50786 +               inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
50787 +               inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
50788 +
50789 +               move_on(len, area, sizeof *sd_lt);
50790 +               return 0;
50791 +       } else
50792 +               return not_enough_space(inode, "large times sd");
50793 +}
50794 +/* Size of large-times extension: sizeof(reiser4_large_times_stat). */
50795 +static int
50796 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
50797 +                       /* object being processed */ )
50798 +{
50799 +       return sizeof(reiser4_large_times_stat);
50800 +}
50801 +/* Store tv_nsec of atime/ctime/mtime of @inode at *area; advance *area. */
50802 +static int
50803 +save_large_times_sd(struct inode *inode /* object being processed */ ,
50804 +                   char **area /* position in stat-data */ )
50805 +{
50806 +       reiser4_large_times_stat *sd;
50807 +
50808 +       assert("nikita-2817", inode != NULL);
50809 +       assert("nikita-2818", area != NULL);
50810 +       assert("nikita-2819", *area != NULL);
50811 +
50812 +       sd = (reiser4_large_times_stat *) * area;
50813 +
50814 +       put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
50815 +       put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
50816 +       put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
50817 +
50818 +       *area += sizeof *sd;
50819 +       return 0;
50820 +}
50821 +
50822 +/* symlink stat data extension */
50823 +
50824 +/* kmalloc a 0-terminated copy of @target, attach it to inode->i_private */
50825 +static int
50826 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
50827 +{
50828 +       assert("vs-845", inode->i_private == NULL);
50829 +       assert("vs-846", !reiser4_inode_get_flag(inode,
50830 +                                                REISER4_GENERIC_PTR_USED));
50831 +       /* FIXME-VS: this is prone to deadlock. Not more than other similar
50832 +          places, though */
50833 +       inode->i_private = kmalloc((size_t) len + 1,
50834 +                                  reiser4_ctx_gfp_mask_get());
50835 +       if (!inode->i_private)
50836 +               return RETERR(-ENOMEM);
50837 +
50838 +       memcpy((char *)(inode->i_private), target, (size_t) len);
50839 +       ((char *)(inode->i_private))[len] = 0;
50840 +       reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
50841 +       return 0;
50842 +}
50843 +
50844 +/* this is called on read_inode: sanity-checks the symlink body stored in
50845 +   stat-data and attaches a copy of it to the inode */
50846 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
50847 +{
50848 +       int result;
50849 +       int length;
50850 +       reiser4_symlink_stat *sd;
50851 +
50852 +       length = (int)inode->i_size;
50853 +       /*
50854 +        * *len is number of bytes left in the item from *area on; it must hold
50855 +        * the symlink body plus ending 0. Negative length: i_size overflowed int
50856 +        */
50857 +       if (length < 0 || length >= *len)
50858 +               return not_enough_space(inode, "symlink");
50859 +
50860 +       if (*(*area + length) != 0) {
50861 +               warning("vs-840", "Symlink is not zero terminated");
50862 +               return RETERR(-EIO);
50863 +       }
50864 +
50865 +       sd = (reiser4_symlink_stat *) * area;
50866 +       result = symlink_target_to_inode(inode, sd->body, length);
50867 +
50868 +       move_on(len, area, length + 1);
50869 +       return result;
50870 +}
50871 +/* Size of symlink extension: inode->i_size plus one byte for ending 0. */
50872 +static int save_len_symlink_sd(struct inode *inode)
50873 +{
50874 +       return inode->i_size + 1;
50875 +}
50876 +
50877 +/* called on create and update of stat data. On create copies the target from
50878 +   inode->i_private into @area; on update only advances @area */
50879 +static int save_symlink_sd(struct inode *inode, char **area)
50880 +{
50881 +       int result;
50882 +       int length;
50883 +       reiser4_symlink_stat *sd;
50884 +
50885 +       length = (int)inode->i_size;
50886 +       /* inode->i_size must be set already */
50887 +       assert("vs-841", length);
50888 +
50889 +       result = 0;
50890 +       sd = (reiser4_symlink_stat *) * area;
50891 +       if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
50892 +               const char *target;
50893 +
50894 +               target = (const char *)(inode->i_private);
50895 +               inode->i_private = NULL;
50896 +
50897 +               result = symlink_target_to_inode(inode, target, length);
50898 +               /* NOTE(review): on failure we still fall through to the copy */
50899 +               /* copy symlink to stat data */
50900 +               memcpy(sd->body, target, (size_t) length);
50901 +               (*area)[length] = 0;
50902 +       } else {
50903 +               /* there is nothing to do in update but move area */
50904 +               assert("vs-844",
50905 +                      !memcmp(inode->i_private, sd->body,
50906 +                              (size_t) length + 1));
50907 +       }
50908 +
50909 +       *area += (length + 1);
50910 +       return result;
50911 +}
50912 +/* Load inode->i_flags from the flags extension at *area. */
50913 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
50914 +                           char **area /* position in stat-data */ ,
50915 +                           int *len /* remaining length */ )
50916 +{
50917 +       assert("nikita-645", inode != NULL);
50918 +       assert("nikita-646", area != NULL);
50919 +       assert("nikita-647", *area != NULL);
50920 +       assert("nikita-648", len != NULL);
50921 +       assert("nikita-649", *len > 0);
50922 +
50923 +       if (*len >= (int)sizeof(reiser4_flags_stat)) {
50924 +               reiser4_flags_stat *sd;
50925 +
50926 +               sd = (reiser4_flags_stat *) * area;
50927 +               inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
50928 +               move_on(len, area, sizeof *sd);
50929 +               return 0;
50930 +       } else
50931 +               return not_enough_space(inode, "generation and attrs");
50932 +}
50933 +
50934 +/* ->save_len() of flags-sd: fixed size. Audited by: green(2002.06.14) */
50935 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG    /* object being
50936 +                                                                * processed */ )
50937 +{
50938 +       return sizeof(reiser4_flags_stat);
50939 +}
50940 +
50941 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
50942 +                        char **area /* position in stat-data */ )
50943 +{
50944 +       reiser4_flags_stat *sd;
50945 +
50946 +       assert("nikita-650", inode != NULL);
50947 +       assert("nikita-651", area != NULL);
50948 +       assert("nikita-652", *area != NULL);
50949 +       /* store inode->i_flags little-endian at *area, then step over it */
50950 +       sd = (reiser4_flags_stat *) * area;
50951 +       put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
50952 +       *area += sizeof *sd;
50953 +       return 0;
50954 +}
50955 +
50956 +static int absent_plugin_sd(struct inode *inode);
50957 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
50958 +                            char **area /* position in stat-data */ ,
50959 +                            int *len /* remaining length */,
50960 +                            int is_pset /* 1 if plugin set, 0 if heir set. */)
50961 +{
50962 +       reiser4_plugin_stat *sd;
50963 +       reiser4_plugin *plugin;
50964 +       reiser4_inode *info;
50965 +       int i;
50966 +       __u16 mask;
50967 +       int result;
50968 +       int num_of_plugins;
50969 +       /* parse a plugin extension: a slot count followed by that many slots */
50970 +       assert("nikita-653", inode != NULL);
50971 +       assert("nikita-654", area != NULL);
50972 +       assert("nikita-655", *area != NULL);
50973 +       assert("nikita-656", len != NULL);
50974 +       assert("nikita-657", *len > 0);
50975 +
50976 +       if (*len < (int)sizeof(reiser4_plugin_stat))
50977 +               return not_enough_space(inode, "plugin");
50978 +
50979 +       sd = (reiser4_plugin_stat *) * area;
50980 +       info = reiser4_inode_data(inode);
50981 +
50982 +       mask = 0;       /* one bit per set member already seen */
50983 +       num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
50984 +       move_on(len, area, sizeof *sd);
50985 +       result = 0;
50986 +       for (i = 0; i < num_of_plugins; ++i) {
50987 +               reiser4_plugin_slot *slot;
50988 +               reiser4_plugin_type type;
50989 +               pset_member memb;
50990 +
50991 +               slot = (reiser4_plugin_slot *) * area;
50992 +               if (*len < (int)sizeof *slot)   /* checked before slot is read */
50993 +                       return not_enough_space(inode, "additional plugin");
50994 +
50995 +               memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
50996 +               type = aset_member_to_type_unsafe(memb);
50997 +
50998 +               if (type == REISER4_PLUGIN_TYPES) {     /* member not recognized */
50999 +                       warning("nikita-3502",
51000 +                               "wrong %s member (%i) for %llu", is_pset ?
51001 +                               "pset" : "hset", memb,
51002 +                               (unsigned long long)get_inode_oid(inode));
51003 +                       return RETERR(-EINVAL);
51004 +               }
51005 +               plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
51006 +                                          type, &slot->id);
51007 +               if (plugin == NULL)
51008 +                       return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
51009 +
51010 +               /* plugin is loaded into inode, mark this into inode's
51011 +                  bitmask of loaded non-standard plugins */
51012 +               if (!(mask & (1 << memb))) {
51013 +                       mask |= (1 << memb);
51014 +               } else {
51015 +                       warning("nikita-658", "duplicate plugin for %llu",
51016 +                               (unsigned long long)get_inode_oid(inode));
51017 +                       return RETERR(-EINVAL);
51018 +               }
51019 +               move_on(len, area, sizeof *slot);
51020 +               /* load plugin data, if any; otherwise just record the plugin */
51021 +               if (plugin->h.pops != NULL && plugin->h.pops->load)
51022 +                       result = plugin->h.pops->load(inode, plugin, area, len);
51023 +               else
51024 +                       result = aset_set_unsafe(is_pset ? &info->pset :
51025 +                                                &info->hset, memb, plugin);
51026 +               if (result)
51027 +                       return result;
51028 +       }
51029 +       if (is_pset) {
51030 +               /* if object plugin wasn't loaded from stat-data, guess it by
51031 +                  mode bits */
51032 +               plugin = file_plugin_to_plugin(inode_file_plugin(inode));
51033 +               if (plugin == NULL)
51034 +                       result = absent_plugin_sd(inode);
51035 +               info->plugin_mask = mask;       /* set even if the guess failed */
51036 +       } else
51037 +               info->heir_mask = mask;
51038 +
51039 +       return result;
51040 +}
51041 +
51042 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
51043 +       return present_plugin_sd(inode, area, len, 1 /* is_pset: plugin set */);
51044 +}
51045 +
51046 +/* Determine object plugin for @inode based on i_mode.
51047 +
51048 +   Many objects in reiser4 file system are controlled by standard object
51049 +   plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
51050 +
51051 +   For such files we don't explicitly store plugin id in object stat
51052 +   data. Rather required plugin is guessed from mode bits, where file "type"
51053 +   is encoded (see stat(2)). Returns 0, or -EIO for an unrecognized type.
51054 +*/
51055 +static int
51056 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
51057 +{
51058 +       int fplug_id;
51059 +       int dplug_id;
51060 +       reiser4_inode *info;
51061 +
51062 +       assert("nikita-736", inode != NULL);
51063 +
51064 +       dplug_id = fplug_id = -1;       /* negative: no plugin chosen */
51065 +
51066 +       switch (inode->i_mode & S_IFMT) {
51067 +       case S_IFSOCK:
51068 +       case S_IFBLK:
51069 +       case S_IFCHR:
51070 +       case S_IFIFO:
51071 +               fplug_id = SPECIAL_FILE_PLUGIN_ID;
51072 +               break;
51073 +       case S_IFLNK:
51074 +               fplug_id = SYMLINK_FILE_PLUGIN_ID;
51075 +               break;
51076 +       case S_IFDIR:
51077 +               fplug_id = DIRECTORY_FILE_PLUGIN_ID;
51078 +               dplug_id = HASHED_DIR_PLUGIN_ID;
51079 +               break;
51080 +       default:        /* returns, so it does not fall into S_IFREG below */
51081 +               warning("nikita-737", "wrong file mode: %o", inode->i_mode);
51082 +               return RETERR(-EIO);
51083 +       case S_IFREG:
51084 +               fplug_id = UNIX_FILE_PLUGIN_ID;
51085 +               break;
51086 +       }
51087 +       info = reiser4_inode_data(inode);
51088 +       set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
51089 +                  plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
51090 +       set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
51091 +                  plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
51092 +       return 0;
51093 +}
51094 +
51095 +/* Audited by: green(2002.06.14) */
51096 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
51097 +{
51098 +       int result;
51099 +
51100 +       assert("nikita-659", inode != NULL);
51101 +
51102 +       result = guess_plugin_by_mode(inode);
51103 +       /* NOTE: for an unrecognized mode guess_plugin_by_mode() in this file
51104 +          warns and returns RETERR(-EIO); it does not pick "regular file".
51105 +          An earlier note here proposed a "bad-file plugin" as a more
51106 +          logical, if somewhat more complex, way to handle such inodes. */
51107 +       /* FIXME-VS: activate was called here */
51108 +       return result;
51109 +}
51110 +
51111 +/* helper function for save_len_plugin_sd(): add to @len the space
51112 +    required to save state of given plugin and return the new total */
51113 +/* Audited by: green(2002.06.14) */
51114 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
51115 +                  struct inode *inode /* object being processed */ ,
51116 +                  pset_member memb,
51117 +                  int len, int is_pset)
51118 +{
51119 +       reiser4_inode *info;
51120 +       assert("nikita-661", inode != NULL);
51121 +
51122 +       if (plugin == NULL)
51123 +               return len;     /* empty member: nothing to save */
51124 +
51125 +       info = reiser4_inode_data(inode);
51126 +       if (is_pset ?
51127 +           info->plugin_mask & (1 << memb) :
51128 +           info->heir_mask & (1 << memb)) {
51129 +               len += sizeof(reiser4_plugin_slot);
51130 +               if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
51131 +                       /* non-standard plugin, call method */
51132 +                       /* commented as it is incompatible with alignment
51133 +                        * policy in save_plug() -edward */
51134 +                       /* len = round_up(len, plugin->h.pops->alignment); */
51135 +                       len += plugin->h.pops->save_len(inode, plugin);
51136 +               }
51137 +       }
51138 +       return len;
51139 +}
51140 +
51141 +/* calculate how much space is required to save state of all plugins,
51142 +    associated with inode; @is_pset selects pset (1) or hset (0) */
51143 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
51144 +                             int is_pset)
51145 +{
51146 +       int len;
51147 +       int last;
51148 +       reiser4_inode *state;
51149 +       pset_member memb;
51150 +
51151 +       assert("nikita-663", inode != NULL);
51152 +
51153 +       state = reiser4_inode_data(inode);
51154 +
51155 +       /* common case: no non-standard plugins */
51156 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51157 +               return 0;
51158 +       len = sizeof(reiser4_plugin_stat);      /* header holding the slot count */
51159 +       last = PSET_LAST;
51160 +
51161 +       for (memb = 0; memb < last; ++memb) {
51162 +             len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
51163 +                           inode, memb, len, is_pset);
51164 +       }
51165 +       assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
51166 +       return len;
51167 +}
51168 +
51169 +static int save_len_pset_sd(struct inode *inode) {
51170 +       return save_len_plugin_sd(inode, 1 /* is_pset: plugin set */);
51171 +}
51172 +
51173 +/* helper function for save_plugin_sd(): save plugin, associated with
51174 +    inode. */
51175 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
51176 +                    struct inode *inode /* object being processed */ ,
51177 +                    int memb /* what element of pset is saved */ ,
51178 +                    char **area /* position in stat-data */ ,
51179 +                    int *count /* incremented if plugin were actually saved. */,
51180 +                    int is_pset /* 1 for plugin set, 0 for heir set */)
51181 +{
51182 +       reiser4_plugin_slot *slot;
51183 +       int fake_len;
51184 +       int result;
51185 +
51186 +       assert("nikita-665", inode != NULL);
51187 +       assert("nikita-666", area != NULL);
51188 +       assert("nikita-667", *area != NULL);
51189 +
51190 +       if (plugin == NULL)
51191 +               return 0;
51192 +       /* members whose mask bit is clear are not stored */
51193 +       if (is_pset ?
51194 +           !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
51195 +           !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
51196 +               return 0;
51197 +       slot = (reiser4_plugin_slot *) * area;
51198 +       put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
51199 +       put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
51200 +       fake_len = (int)0xffff; /* dummy length: only *area matters on save */
51201 +       move_on(&fake_len, area, sizeof *slot);
51202 +       ++*count;
51203 +       result = 0;
51204 +       if (plugin->h.pops != NULL) {
51205 +               if (plugin->h.pops->save != NULL)
51206 +                       result = plugin->h.pops->save(inode, plugin, area);
51207 +       }
51208 +       return result;
51209 +}
51210 +
51211 +/* save state of all non-standard plugins associated with inode */
51212 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
51213 +                         char **area /* position in stat-data */,
51214 +                         int is_pset /* 1 for pset, 0 for hset */)
51215 +{
51216 +       int fake_len;
51217 +       int result = 0;
51218 +       int num_of_plugins;
51219 +       reiser4_plugin_stat *sd;
51220 +       reiser4_inode *state;
51221 +       pset_member memb;
51222 +
51223 +       assert("nikita-669", inode != NULL);
51224 +       assert("nikita-670", area != NULL);
51225 +       assert("nikita-671", *area != NULL);
51226 +
51227 +       state = reiser4_inode_data(inode);
51228 +       if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
51229 +               return 0;       /* nothing to store, *area is left untouched */
51230 +       sd = (reiser4_plugin_stat *) * area;
51231 +       fake_len = (int)0xffff; /* dummy length: only *area matters on save */
51232 +       move_on(&fake_len, area, sizeof *sd);
51233 +
51234 +       num_of_plugins = 0;
51235 +       for (memb = 0; memb < PSET_LAST; ++memb) {
51236 +               result = save_plug(aset_get(is_pset ? state->pset : state->hset,
51237 +                                           memb),
51238 +                                  inode, memb, area, &num_of_plugins, is_pset);
51239 +               if (result != 0)
51240 +                       break;
51241 +       }
51242 +       /* the count is known only now; it is written even after a failure */
51243 +       put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
51244 +       return result;
51245 +}
51246 +
51247 +static int save_pset_sd(struct inode *inode, char **area) {
51248 +       return save_plugin_sd(inode, area, 1 /* is_pset: plugin set */);
51249 +}
51250 +
51251 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
51252 +       return present_plugin_sd(inode, area, len, 0 /* is_pset = 0: heir set */);
51253 +}
51254 +
51255 +static int save_len_hset_sd(struct inode *inode) {
51256 +       return save_len_plugin_sd(inode, 0 /* hset */);
51257 +}
51258 +
51259 +static int save_hset_sd(struct inode *inode, char **area) {
51260 +       return save_plugin_sd(inode, area, 0 /* is_pset = 0: heir set */);
51261 +}
51262 +
51263 +/* helper function for present_crypto_sd().
51264 +   Extract crypto info from stat-data and attach it to inode */
51265 +static int extract_crypto_info (struct inode * inode,
51266 +                               reiser4_crypto_stat * sd)
51267 +{
51268 +       struct reiser4_crypto_info * info;
51269 +       assert("edward-11", !inode_crypto_info(inode));
51270 +       assert("edward-1413",
51271 +              !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
51272 +       /* create and attach a crypto-stat without secret key loaded */
51273 +       info = reiser4_alloc_crypto_info(inode);
51274 +       if (IS_ERR(info))
51275 +               return PTR_ERR(info);
51276 +       info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
51277 +       memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize); /* caller must ensure @sd holds fipsize bytes */
51278 +       reiser4_attach_crypto_info(inode, info);
51279 +       reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51280 +       return 0;
51281 +}
51282 +
51283 +/* ->present() of crypto-sd: load key size and key id from the item at *area
51284 +   into @inode. Returns not_enough_space() if the item cannot hold both. */
51285 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
51286 +{
51287 +       int result;
51288 +       reiser4_crypto_stat *sd;
51289 +       digest_plugin *dplug = inode_digest_plugin(inode);
51290 +
51291 +       assert("edward-06", dplug != NULL);
51292 +       assert("edward-684", dplug->fipsize);
51293 +       assert("edward-07", area != NULL);
51294 +       assert("edward-08", *area != NULL);
51295 +       assert("edward-09", len != NULL);
51296 +       assert("edward-10", *len > 0);
51297 +
51298 +       if (*len < (int)(sizeof(reiser4_crypto_stat) + dplug->fipsize)) {
51299 +               return not_enough_space(inode, "crypto-sd");
51300 +       }
51301 +       /* *len is number of bytes in stat data item from *area to the end of
51302 +          item. The check above also covers the key id after the header. */
51303 +       assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
51304 +
51305 +       sd = (reiser4_crypto_stat *) * area;
51306 +       result = extract_crypto_info(inode, sd);
51307 +       move_on(len, area, sizeof(*sd) + dplug->fipsize);
51308 +
51309 +       return result;
51310 +}
51311 +
51312 +static int save_len_crypto_sd(struct inode *inode)
51313 +{
51314 +       return sizeof(reiser4_crypto_stat) +    /* fixed header ... */
51315 +               inode_digest_plugin(inode)->fipsize;    /* ... plus the key id */
51316 +}
51317 +
51318 +static int save_crypto_sd(struct inode *inode, char **area)
51319 +{
51320 +       int result = 0; /* never changed below: this function always returns 0 */
51321 +       reiser4_crypto_stat *sd;
51322 +       struct reiser4_crypto_info * info = inode_crypto_info(inode);
51323 +       digest_plugin *dplug = inode_digest_plugin(inode);
51324 +
51325 +       assert("edward-12", dplug != NULL);
51326 +       assert("edward-13", area != NULL);
51327 +       assert("edward-14", *area != NULL);
51328 +       assert("edward-15", info != NULL);
51329 +       assert("edward-1414", info->keyid != NULL);
51330 +       assert("edward-1415", info->keysize != 0);
51331 +       assert("edward-76", reiser4_inode_data(inode) != NULL);
51332 +
51333 +       if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
51334 +               /* file is just created */
51335 +               sd = (reiser4_crypto_stat *) *area;
51336 +               /* copy everything but private key to the disk stat-data */
51337 +               put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
51338 +               memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
51339 +               reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
51340 +       }
51341 +       *area += (sizeof(*sd) + dplug->fipsize);        /* advanced in both cases */
51342 +       return result;
51343 +}
51344 +
51345 +static int eio(struct inode *inode, char **area, int *len)
51346 +{
51347 +       return RETERR(-EIO);    /* ->present() stub: arguments are ignored */
51348 +}
51349 +
51350 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = { /* one entry per extension id */
51351 +       [LIGHT_WEIGHT_STAT] = {
51352 +               .h = {
51353 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51354 +                       .id = LIGHT_WEIGHT_STAT,
51355 +                       .pops = NULL,
51356 +                       .label = "light-weight sd",
51357 +                       .desc = "sd for light-weight files",
51358 +                       .linkage = {NULL,NULL}
51359 +               },
51360 +               .present = present_lw_sd,
51361 +               .absent = NULL,
51362 +               .save_len = save_len_lw_sd,
51363 +               .save = save_lw_sd,
51364 +               .alignment = 8
51365 +       },
51366 +       [UNIX_STAT] = {
51367 +               .h = {
51368 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51369 +                       .id = UNIX_STAT,
51370 +                       .pops = NULL,
51371 +                       .label = "unix-sd",
51372 +                       .desc = "unix stat-data fields",
51373 +                       .linkage = {NULL,NULL}
51374 +               },
51375 +               .present = present_unix_sd,
51376 +               .absent = absent_unix_sd,
51377 +               .save_len = save_len_unix_sd,
51378 +               .save = save_unix_sd,
51379 +               .alignment = 8
51380 +       },
51381 +       [LARGE_TIMES_STAT] = {
51382 +               .h = {
51383 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51384 +                       .id = LARGE_TIMES_STAT,
51385 +                       .pops = NULL,
51386 +                       .label = "64time-sd",
51387 +                       .desc = "nanosecond resolution for times",
51388 +                       .linkage = {NULL,NULL}
51389 +               },
51390 +               .present = present_large_times_sd,
51391 +               .absent = NULL,
51392 +               .save_len = save_len_large_times_sd,
51393 +               .save = save_large_times_sd,
51394 +               .alignment = 8
51395 +       },
51396 +       [SYMLINK_STAT] = {
51397 +               /* stat data of symlink has this extension */
51398 +               .h = {
51399 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51400 +                       .id = SYMLINK_STAT,
51401 +                       .pops = NULL,
51402 +                       .label = "symlink-sd",
51403 +                       .desc =
51404 +                       "stat data is appended with symlink name",
51405 +                       .linkage = {NULL,NULL}
51406 +               },
51407 +               .present = present_symlink_sd,
51408 +               .absent = NULL,
51409 +               .save_len = save_len_symlink_sd,
51410 +               .save = save_symlink_sd,
51411 +               .alignment = 8
51412 +       },
51413 +       [PLUGIN_STAT] = {
51414 +               .h = {
51415 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51416 +                       .id = PLUGIN_STAT,
51417 +                       .pops = NULL,
51418 +                       .label = "plugin-sd",
51419 +                       .desc = "plugin stat-data fields",
51420 +                       .linkage = {NULL,NULL}
51421 +               },
51422 +               .present = present_pset_sd,
51423 +               .absent = absent_plugin_sd,     /* guesses plugins from i_mode */
51424 +               .save_len = save_len_pset_sd,
51425 +               .save = save_pset_sd,
51426 +               .alignment = 8
51427 +       },
51428 +       [HEIR_STAT] = {
51429 +               .h = {
51430 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51431 +                       .id = HEIR_STAT,
51432 +                       .pops = NULL,
51433 +                       .label = "heir-plugin-sd",
51434 +                       .desc = "heir plugin stat-data fields",
51435 +                       .linkage = {NULL,NULL}
51436 +               },
51437 +               .present = present_hset_sd,
51438 +               .absent = NULL,
51439 +               .save_len = save_len_hset_sd,
51440 +               .save = save_hset_sd,
51441 +               .alignment = 8
51442 +       },
51443 +       [FLAGS_STAT] = {
51444 +               .h = {
51445 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51446 +                       .id = FLAGS_STAT,
51447 +                       .pops = NULL,
51448 +                       .label = "flags-sd",
51449 +                       .desc = "inode bit flags",
51450 +                       .linkage = {NULL, NULL}
51451 +               },
51452 +               .present = present_flags_sd,
51453 +               .absent = NULL,
51454 +               .save_len = save_len_flags_sd,
51455 +               .save = save_flags_sd,
51456 +               .alignment = 8
51457 +       },
51458 +       [CAPABILITIES_STAT] = {
51459 +               .h = {
51460 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51461 +                       .id = CAPABILITIES_STAT,
51462 +                       .pops = NULL,
51463 +                       .label = "capabilities-sd",
51464 +                       .desc = "capabilities",
51465 +                       .linkage = {NULL, NULL}
51466 +               },
51467 +               .present = eio, /* loading this extension always fails with -EIO */
51468 +               .absent = NULL,
51469 +               .save_len = save_len_flags_sd, /* NOTE(review): flags-sd methods */
51470 +               .save = save_flags_sd,         /* are reused here - confirm intent */
51471 +               .alignment = 8
51472 +       },
51473 +       [CRYPTO_STAT] = {
51474 +               .h = {
51475 +                       .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
51476 +                       .id = CRYPTO_STAT,
51477 +                       .pops = NULL,
51478 +                       .label = "crypto-sd",
51479 +                       .desc = "secret key size and id",
51480 +                       .linkage = {NULL, NULL}
51481 +               },
51482 +               .present = present_crypto_sd,
51483 +               .absent = NULL,
51484 +               .save_len = save_len_crypto_sd,
51485 +               .save = save_crypto_sd,
51486 +               .alignment = 8
51487 +       }
51488 +};
51489 +
51490 +/* Make Linus happy.
51491 +   Local variables:
51492 +   c-indentation-style: "K&R"
51493 +   mode-name: "LC"
51494 +   c-basic-offset: 8
51495 +   tab-width: 8
51496 +   fill-column: 120
51497 +   End:
51498 +*/
51499 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/static_stat.h linux-2.6.27/fs/reiser4/plugin/item/static_stat.h
51500 --- linux-2.6.27.orig/fs/reiser4/plugin/item/static_stat.h      1970-01-01 03:00:00.000000000 +0300
51501 +++ linux-2.6.27/fs/reiser4/plugin/item/static_stat.h   2008-10-12 18:20:01.000000000 +0400
51502 @@ -0,0 +1,224 @@
51503 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51504 +
51505 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
51506 +
51507 +In the case where each file has not less than the fields needed by the
51508 +stat() syscall, it is more compact to store those fields in this
51509 +struct.
51510 +
51511 +If this item does not exist, then all stats are dynamically resolved.
51512 +At the moment, we either resolve all stats dynamically or all of them
51513 +statically.  If you think this is not fully optimal, and the rest of
51514 +reiser4 is working, then fix it...:-)
51515 +
51516 +*/
51517 +
51518 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
51519 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
51520 +
51521 +#include "../../forward.h"
51522 +#include "../../dformat.h"
51523 +
51524 +#include <linux/fs.h>          /* for struct inode */
51525 +
51526 +/* Stat data layout: goals and implementation.
51527 +
51528 +   We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
51529 +   them, including not having semantic metadata attached to them.
51530 +
51531 +   There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
51532 +   want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
51533 +   sized structure because the statically sized structure knows without recording it what the names and lengths of the
51534 +   attributes are.
51535 +
51536 +   This leads to a natural compromise, which is to special case those files which have simply the standard unix file
51537 +   attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
51538 +   file in their use of file attributes.
51539 +
51540 +   Yet this compromise deserves to be compromised a little.
51541 +
51542 +   We accommodate the case where you have no more than the standard unix file attributes by using an "extension
51543 +   bitmask": each bit in it indicates presence or absence of a particular stat data extension (see sd_ext_bits enum).
51544 +
51545 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
51546 +   from parent directory (as uid, gid) or initialised to some sane values.
51547 +
51548 +   To capitalize on existing code infrastructure, extensions are
51549 +   implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
51550 +   Each stat-data extension plugin implements four methods:
51551 +
51552 +    ->present() called by sd_load() when this extension is found in stat-data
51553 +    ->absent() called by sd_load() when this extension is not found in stat-data
51554 +    ->save_len() called by sd_len() to calculate total length of stat-data
51555 +    ->save() called by sd_save() to store extension data into stat-data
51556 +
51557 +    Implementation is in fs/reiser4/plugin/item/static_stat.c
51558 +*/
51559 +
51560 +/* stat-data extension. Please order this by presumed frequency of use */
51561 +typedef enum {
51562 +       /* support for light-weight files */
51563 +       LIGHT_WEIGHT_STAT,
51564 +       /* data required to implement unix stat(2) call. Layout is in
51565 +          reiser4_unix_stat. If this is not present, file is light-weight */
51566 +       UNIX_STAT,
51567 +       /* this contains additional set of 32bit [amc]time fields to implement
51568 +          nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
51569 +          of this extension is governed by 32bittimes mount option. */
51570 +       LARGE_TIMES_STAT,
51571 +       /* stat data has link name included */
51572 +       SYMLINK_STAT,
51573 +       /* on-disk slots of non-standard plugins for main plugin table
51574 +          (@reiser4_inode->pset), that is, plugins that cannot be deduced
51575 +          from file mode bits, for example, aggregation, interpolation etc. */
51576 +       PLUGIN_STAT,
51577 +       /* this extension contains persistent inode flags. These flags are
51578 +          single bits: immutable, append-only, etc. Layout is in
51579 +          reiser4_flags_stat. */
51580 +       FLAGS_STAT,
51581 +       /* this extension contains capabilities sets, associated with this
51582 +          file. Layout is in reiser4_capabilities_stat */
51583 +       CAPABILITIES_STAT,
51584 +       /* this extension contains size and public id of the secret key.
51585 +          Layout is in reiser4_crypto_stat */
51586 +       CRYPTO_STAT,
51587 +       /* on-disk slots of non-default plugins for inheritance, which
51588 +          are extracted to special plugin table (@reiser4_inode->hset).
51589 +          By default, children of the object will inherit plugins from
51590 +          its main plugin table (pset). */
51591 +       HEIR_STAT,
51592 +       LAST_SD_EXTENSION,
51593 +       /*
51594 +        * init_inode_static_sd() iterates over extension mask until all
51595 +        * non-zero bits are processed. This means, that neither ->present(),
51596 +        * nor ->absent() methods will be called for stat-data extensions that
51597 +        * go after last present extension. But for some basic extensions we want
51598 +        * either ->absent() or ->present() method to be called, because these
51599 +        * extensions set up something in inode even when they are not
51600 +        * present. This is what LAST_IMPORTANT_SD_EXTENSION is for: for all
51601 +        * extensions before and including LAST_IMPORTANT_SD_EXTENSION either
51602 +        * ->present(), or ->absent() method will be called, independently of
51603 +        * what other extensions are present.
51604 +        */
51605 +       LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
51606 +} sd_ext_bits;
51607 +
51608 +/* minimal stat-data: only the extension bitmask. Supports light-weight files. */
51609 +typedef struct reiser4_stat_data_base {
51610 +       /*  0 */ __le16 extmask;
51611 +       /*  2 */
51612 +} PACKED reiser4_stat_data_base;
51613 +
51614 +typedef struct reiser4_light_weight_stat {
51615 +       /*  0 */ __le16 mode;
51616 +       /*  2 */ __le32 nlink;
51617 +       /*  6 */ __le64 size;
51618 +       /* ^ size in bytes */
51619 +       /* 14: end of structure */
51620 +} PACKED reiser4_light_weight_stat;
51621 +
51622 +typedef struct reiser4_unix_stat {
51623 +       /* owner id */
51624 +       /*  0 */ __le32 uid;
51625 +       /* group id */
51626 +       /*  4 */ __le32 gid;
51627 +       /* access time */
51628 +       /*  8 */ __le32 atime;
51629 +       /* modification time */
51630 +       /* 12 */ __le32 mtime;
51631 +       /* change time */
51632 +       /* 16 */ __le32 ctime;
51633 +       union { /* both members occupy the same 8 bytes at offset 20 */
51634 +               /* minor:major for device files */
51635 +               /* 20 */ __le64 rdev;
51636 +               /* bytes used by file */
51637 +               /* 20 */ __le64 bytes;
51638 +       } u;
51639 +       /* 28: end of structure */
51640 +} PACKED reiser4_unix_stat;
51641 +
51642 +/* symlink stored as part of inode; @body is a zero-length array marking where the symlink bytes start */
51643 +typedef struct reiser4_symlink_stat {
51644 +       char body[0];
51645 +} PACKED reiser4_symlink_stat;
51646 +
51647 +typedef struct reiser4_plugin_slot {
51648 +       /*  0 */ __le16 pset_memb;
51649 +       /*  2 */ __le16 id;
51650 +       /*  4 */ /* from here on the plugin stores its persistent state */
51651 +} PACKED reiser4_plugin_slot;
51652 +
51653 +/* stat-data extension for files with non-standard plugins. */
51654 +typedef struct reiser4_plugin_stat {
51655 +       /* number of additional plugins, associated with this object */
51656 +       /*  0 */ __le16 plugins_no;
51657 +       /*  2 */ reiser4_plugin_slot slot[0];
51658 +       /*  2: size of the fixed part; slot[] is a zero-length array */
51659 +} PACKED reiser4_plugin_stat;
51660 +
51661 +/* stat-data extension for inode flags. Currently it is just a fixed-width 32
51662 + * bit mask. If the need arises, this can be replaced with a variable width
51663 + * bitmask. */
51664 +typedef struct reiser4_flags_stat {
51665 +       /*  0 */ __le32 flags;
51666 +       /*  4: total size of the packed structure */
51667 +} PACKED reiser4_flags_stat;
51668 +
51669 +typedef struct reiser4_capabilities_stat {
51670 +       /*  0 */ __le32 effective;
51671 +       /*  4 */ __le32 permitted;
51672 +       /*  8: total size of the packed structure (two 32-bit fields) */
51673 +} PACKED reiser4_capabilities_stat;
51674 +
51675 +typedef struct reiser4_cluster_stat {
51676 +/* cluster size (an attribute of cryptcompress objects) is defined as PAGE_SIZE << cluster_shift */
51677 +       /* 0 */ d8 cluster_shift;
51678 +       /* 1: total size of the packed structure */
51679 +} PACKED reiser4_cluster_stat;
51680 +
51681 +typedef struct reiser4_crypto_stat {
51682 +       /* secret key size, in bits */
51683 +       /*  0 */ d16 keysize;
51684 +       /* secret key id; zero-length array marking where the id bytes start */
51685 +       /*  2 */ d8 keyid[0];
51686 +       /* 2: size of the fixed part */
51687 +} PACKED reiser4_crypto_stat;
51688 +
51689 +typedef struct reiser4_large_times_stat {
51690 +       /* access time */
51691 +       /* 0 */ d32 atime;
51692 +       /* modification time */
51693 +       /* 4 */ d32 mtime;
51694 +       /* change time */
51695 +       /* 8 */ d32 ctime;
51696 +       /* 12: total size of the packed structure */
51697 +} PACKED reiser4_large_times_stat;
51698 +
51699 +/* this structure is filled by sd_item_stat (NOTE(review): that name is not visible here - possibly item_stat_static_sd(); confirm) */
51700 +typedef struct sd_stat {
51701 +       int dirs;
51702 +       int files;
51703 +       int others;
51704 +} sd_stat;
51705 +
51706 +/* plugin->item.common.* */
51707 +extern void print_sd(const char *prefix, coord_t * coord);
51708 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
51709 +
51710 +/* plugin->item.s.sd.* */
51711 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
51712 +extern int save_len_static_sd(struct inode *inode);
51713 +extern int save_static_sd(struct inode *inode, char **area);
51714 +
51715 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
51716 +#endif
51717 +
51718 +/* Make Linus happy.
51719 +   Local variables:
51720 +   c-indentation-style: "K&R"
51721 +   mode-name: "LC"
51722 +   c-basic-offset: 8
51723 +   tab-width: 8
51724 +   fill-column: 120
51725 +   End:
51726 +*/
51727 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/tail.c linux-2.6.27/fs/reiser4/plugin/item/tail.c
51728 --- linux-2.6.27.orig/fs/reiser4/plugin/item/tail.c     1970-01-01 03:00:00.000000000 +0300
51729 +++ linux-2.6.27/fs/reiser4/plugin/item/tail.c  2008-10-12 18:20:01.000000000 +0400
51730 @@ -0,0 +1,807 @@
51731 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51732 +
51733 +#include "item.h"
51734 +#include "../../inode.h"
51735 +#include "../../page_cache.h"
51736 +#include "../../carry.h"
51737 +#include "../../vfs_ops.h"
51738 +
51739 +#include <linux/quotaops.h>
51740 +#include <asm/uaccess.h>
51741 +#include <linux/swap.h>
51742 +#include <linux/writeback.h>
51743 +
51744 +/* plugin->u.item.b.max_key_inside: fill @key with the key of the item at @coord, then set its offset to that of reiser4_max_key(); returns @key */
51745 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
51746 +{
51747 +       item_key_by_coord(coord, key);
51748 +       set_key_offset(key, get_key_offset(reiser4_max_key()));
51749 +       return key;
51750 +}
51751 +
51752 +/* plugin->u.item.b.can_contain_key: returns 1 if the item at @coord has plugin @data->iplug and the same locality and objectid as @key, 0 otherwise */
51753 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
51754 +                        const reiser4_item_data *data)
51755 +{
51756 +       reiser4_key item_key;
51757 +
51758 +       if (item_plugin_by_coord(coord) != data->iplug)
51759 +               return 0;
51760 +
51761 +       item_key_by_coord(coord, &item_key);
51762 +       if (get_key_locality(key) != get_key_locality(&item_key) ||
51763 +           get_key_objectid(key) != get_key_objectid(&item_key))
51764 +               return 0;
51765 +
51766 +       return 1;
51767 +}
51768 +
51769 +/* plugin->u.item.b.mergeable
51770 +   first item (@p1) is of tail type; returns 1 if @p2 is a tail item of the same object starting right after @p1 ends, 0 otherwise */
51771 +/* Audited by: green(2002.06.14) */
51772 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
51773 +{
51774 +       reiser4_key key1, key2;
51775 +
51776 +       assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
51777 +                                        UNIX_FILE_METADATA_ITEM_TYPE));
51778 +       assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
51779 +
51780 +       if (item_id_by_coord(p2) != FORMATTING_ID) {
51781 +               /* second item is of another type */
51782 +               return 0;
51783 +       }
51784 +
51785 +       item_key_by_coord(p1, &key1);
51786 +       item_key_by_coord(p2, &key2);
51787 +       if (get_key_locality(&key1) != get_key_locality(&key2) ||
51788 +           get_key_objectid(&key1) != get_key_objectid(&key2)
51789 +           || get_key_type(&key1) != get_key_type(&key2)) {
51790 +               /* items of different objects, or keys of different types */
51791 +               return 0;
51792 +       }
51793 +       if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
51794 +               /* not adjacent items: @p2 does not start at the offset where @p1 ends */
51795 +               return 0;
51796 +       }
51797 +       return 1;
51798 +}
51799 +
51800 +/* plugin->u.item.b.print
51801 +   plugin->u.item.b.check */
51802 +
51803 +/* plugin->u.item.b.nr_units: for a tail item the number of units equals the item length */
51804 +pos_in_node_t nr_units_tail(const coord_t * coord)
51805 +{
51806 +       return item_length_by_coord(coord);
51807 +}
51808 +
51809 +/* plugin->u.item.b.lookup: set @coord at the unit with the offset of @key if the item has it, else after the last unit. NOTE: item_key is filled twice below; the first call is redundant */
51810 +lookup_result
51811 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
51812 +{
51813 +       reiser4_key item_key;
51814 +       __u64 lookuped, offset;
51815 +       unsigned nr_units;
51816 +
51817 +       item_key_by_coord(coord, &item_key);
51818 +       offset = get_key_offset(item_key_by_coord(coord, &item_key));
51819 +       nr_units = nr_units_tail(coord);
51820 +
51821 +       /* key we are looking for must be greater than the key of the item at @coord */
51822 +       assert("vs-416", keygt(key, &item_key));
51823 +
51824 +       /* offset we are looking for */
51825 +       lookuped = get_key_offset(key);
51826 +
51827 +       if (lookuped >= offset && lookuped < offset + nr_units) {
51828 +               /* byte we are looking for is in this item */
51829 +               coord->unit_pos = lookuped - offset;
51830 +               coord->between = AT_UNIT;
51831 +               return CBK_COORD_FOUND;
51832 +       }
51833 +
51834 +       /* set coord after last unit; this is reported as found only for the FIND_MAX_NOT_MORE_THAN bias */
51835 +       coord->unit_pos = nr_units - 1;
51836 +       coord->between = AFTER_UNIT;
51837 +       return bias ==
51838 +           FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
51839 +}
51840 +
51841 +/* plugin->u.item.b.paste: fill @data->length bytes of the already resized item at @coord from user space (@data->user), from kernel space, or with zeros if @data->data is NULL; returns 0 or -EFAULT */
51842 +int
51843 +paste_tail(coord_t *coord, reiser4_item_data *data,
51844 +          carry_plugin_info *info UNUSED_ARG)
51845 +{
51846 +       unsigned old_item_length;
51847 +       char *item;
51848 +
51849 +       /* length the item had before it was resized for this paste */
51850 +       old_item_length = item_length_by_coord(coord) - data->length;
51851 +
51852 +       /* tail items never get pasted in the middle */
51853 +       assert("vs-363",
51854 +              (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
51855 +              (coord->unit_pos == old_item_length - 1 &&
51856 +               coord->between == AFTER_UNIT) ||
51857 +              (coord->unit_pos == 0 && old_item_length == 0
51858 +               && coord->between == AT_UNIT));
51859 +
51860 +       item = item_body_by_coord(coord);
51861 +       if (coord->unit_pos == 0)
51862 +               /* make space for pasted data when pasting at the beginning of
51863 +                  the item: move the old content up by @data->length bytes */
51864 +               memmove(item + data->length, item, old_item_length);
51865 +
51866 +       if (coord->between == AFTER_UNIT)
51867 +               coord->unit_pos++;
51868 +
51869 +       if (data->data) {
51870 +               assert("vs-554", data->user == 0 || data->user == 1);
51871 +               if (data->user) {
51872 +                       assert("nikita-3035", reiser4_schedulable());
51873 +                       /* copy from user space; a non-zero result is reported as -EFAULT */
51874 +                       if (__copy_from_user(item + coord->unit_pos,
51875 +                                            (const char __user *)data->data,
51876 +                                            (unsigned)data->length))
51877 +                               return RETERR(-EFAULT);
51878 +               } else
51879 +                       /* copy from kernel space */
51880 +                       memcpy(item + coord->unit_pos, data->data,
51881 +                              (unsigned)data->length);
51882 +       } else {
51883 +               memset(item + coord->unit_pos, 0, (unsigned)data->length);
51884 +       }
51885 +       return 0;
51886 +}
51887 +
51888 +/* plugin->u.item.b.fast_paste */
51889 +
51890 +/* plugin->u.item.b.can_shift
51891 +   number of units is returned via return value, number of bytes via @size. For
51892 +   tail items they coincide: both are min(@want, @free_space) */
51893 +int
51894 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
51895 +              znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
51896 +              unsigned *size, unsigned want)
51897 +{
51898 +       /* make sure that we do not want to shift more than we have */
51899 +       assert("vs-364", want > 0
51900 +              && want <= (unsigned)item_length_by_coord(source));
51901 +
51902 +       *size = min(want, free_space);
51903 +       return *size;
51904 +}
51905 +
51906 +/* plugin->u.item.b.copy_units: copy @count bytes of @source, starting at @from, into the already expanded @target; which end of @target gets them depends on @where_is_free_space */
51907 +void
51908 +copy_units_tail(coord_t * target, coord_t * source,
51909 +               unsigned from, unsigned count,
51910 +               shift_direction where_is_free_space,
51911 +               unsigned free_space UNUSED_ARG)
51912 +{
51913 +       /* make sure that item @target is expanded already */
51914 +       assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
51915 +       assert("vs-370", free_space >= count);
51916 +
51917 +       if (where_is_free_space == SHIFT_LEFT) {
51918 +               /* append the first @count bytes of @source to the end of item @target */
51919 +               assert("vs-365", from == 0);
51920 +
51921 +               memcpy((char *)item_body_by_coord(target) +
51922 +                      item_length_by_coord(target) - count,
51923 +                      (char *)item_body_by_coord(source), count);
51924 +       } else {
51925 +               /* target item is moved to right already; the last @count bytes of @source go to its beginning */
51926 +               reiser4_key key;
51927 +
51928 +               assert("vs-367",
51929 +                      (unsigned)item_length_by_coord(source) == from + count);
51930 +
51931 +               memcpy((char *)item_body_by_coord(target),
51932 +                      (char *)item_body_by_coord(source) + from, count);
51933 +
51934 +               /* new units are inserted before first unit in an item,
51935 +                  therefore, we have to update item key: source key with offset advanced by @from */
51936 +               item_key_by_coord(source, &key);
51937 +               set_key_offset(&key, get_key_offset(&key) + from);
51938 +
51939 +               node_plugin_by_node(target->node)->update_item_key(target, &key,
51940 +                                                                  NULL /*info */);
51941 +       }
51942 +}
51943 +
51944 +/* plugin->u.item.b.create_hook */
51945 +
51946 +/* item_plugin->b.kill_hook
51947 +   this is called when @count units starting from @from-th one are going to be removed.
51948 +   Passes the offset range [start, end) being removed to fake_kill_hook_tail(); always returns 0 */
51949 +int
51950 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
51951 +              pos_in_node_t count, struct carry_kill_data *kdata)
51952 +{
51953 +       reiser4_key key;
51954 +       loff_t start, end;
51955 +
51956 +       assert("vs-1577", kdata);
51957 +       assert("vs-1579", kdata->inode);
51958 +
51959 +       item_key_by_coord(coord, &key);
51960 +       start = get_key_offset(&key) + from;
51961 +       end = start + count;
51962 +       fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
51963 +       return 0;
51964 +}
51965 +
51966 +/* plugin->u.item.b.shift_hook */
51967 +
51968 +/* helper for kill_units_tail and cut_units_tail: reports the removed range [@from, @to] via the optional key arguments and returns the number of units removed */
51969 +static int
51970 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51971 +              reiser4_key * smallest_removed, reiser4_key * new_first)
51972 +{
51973 +       pos_in_node_t count;
51974 +
51975 +       /* this method is only called to remove part of item */
51976 +       assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
51977 +       /* tail items are never cut from the middle of an item */
51978 +       assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
51979 +       assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
51980 +
51981 +       count = to - from + 1;
51982 +
51983 +       if (smallest_removed) {
51984 +               /* store smallest key removed: item key with offset advanced by @from */
51985 +               item_key_by_coord(coord, smallest_removed);
51986 +               set_key_offset(smallest_removed,
51987 +                              get_key_offset(smallest_removed) + from);
51988 +       }
51989 +       if (new_first) {
51990 +               /* head of item is cut: the new first key is the item key advanced past the removed units */
51991 +               assert("vs-1529", from == 0);
51992 +
51993 +               item_key_by_coord(coord, new_first);
51994 +               set_key_offset(new_first,
51995 +                              get_key_offset(new_first) + from + count);
51996 +       }
51997 +
51998 +       if (REISER4_DEBUG)
51999 +               memset((char *)item_body_by_coord(coord) + from, 0, count);
52000 +       return count;
52001 +}
52002 +
52003 +/* plugin->u.item.b.cut_units: thin wrapper around do_cut_or_kill(); @cdata is not used */
52004 +int
52005 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52006 +              struct carry_cut_data *cdata UNUSED_ARG,
52007 +              reiser4_key * smallest_removed, reiser4_key * new_first)
52008 +{
52009 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52010 +}
52011 +
52012 +/* plugin->u.item.b.kill_units: calls kill_hook_tail() for units [@from, @to] and then does the same as cut_units_tail() */
52013 +int
52014 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
52015 +               struct carry_kill_data *kdata, reiser4_key * smallest_removed,
52016 +               reiser4_key * new_first)
52017 +{
52018 +       kill_hook_tail(coord, from, to - from + 1, kdata);
52019 +       return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
52020 +}
52021 +
52022 +/* plugin->u.item.b.unit_key: fill @key with the item key whose offset is advanced by @coord->unit_pos; returns @key */
52023 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
52024 +{
52025 +       assert("vs-375", coord_is_existing_unit(coord));
52026 +
52027 +       item_key_by_coord(coord, key);
52028 +       set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
52029 +
52030 +       return key;
52031 +}
52032 +
52033 +/* plugin->u.item.b.estimate
52034 +   plugin->u.item.b.item_data_by_flow */
52035 +
52036 +/* tail readpage function. It is called from readpage_tail(). NOTE(review): the out_tap_relse and out_tap_done error paths skip unlock_page() - confirm the caller copes with a still locked page */
52037 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
52038 +{
52039 +       tap_t tap;
52040 +       int result;
52041 +       coord_t coord;
52042 +       lock_handle lh;
52043 +       int count, mapped;
52044 +       struct inode *inode;
52045 +       char *pagedata;
52046 +
52047 +       /* work on copies of the passed coord and lock handle so that the tap does not move the caller's coord */
52048 +       init_lh(&lh);
52049 +       copy_lh(&lh, uf_coord->lh);
52050 +       inode = page->mapping->host;
52051 +       coord_dup(&coord, &uf_coord->coord);
52052 +
52053 +       reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
52054 +
52055 +       if ((result = reiser4_tap_load(&tap)))
52056 +               goto out_tap_done;
52057 +
52058 +       /* lookup until page is filled up; @mapped counts the bytes copied into the page so far */
52059 +       for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
52060 +               /* number of bytes to be copied to page: rest of the item, capped by the room left in the page */
52061 +               count = item_length_by_coord(&coord) - coord.unit_pos;
52062 +               if (count > PAGE_CACHE_SIZE - mapped)
52063 +                       count = PAGE_CACHE_SIZE - mapped;
52064 +
52065 +               /* map @page and get data address */
52066 +               pagedata = kmap_atomic(page, KM_USER0);
52067 +
52068 +               /* copy tail item to page */
52069 +               memcpy(pagedata + mapped,
52070 +                      ((char *)item_body_by_coord(&coord) + coord.unit_pos),
52071 +                      count);
52072 +               mapped += count;
52073 +
52074 +               flush_dcache_page(page);
52075 +
52076 +               /* unmap the page */
52077 +               kunmap_atomic(pagedata, KM_USER0);
52078 +
52079 +               /* Getting next tail item. */
52080 +               if (mapped < PAGE_CACHE_SIZE) {
52081 +                       /*
52082 +                        * unlock page in order to avoid keeping it locked
52083 +                        * during tree lookup, which takes long term locks
52084 +                        */
52085 +                       unlock_page(page);
52086 +
52087 +                       /* getting right neighbour. */
52088 +                       result = go_dir_el(&tap, RIGHT_SIDE, 0);
52089 +
52090 +                       /* lock page back */
52091 +                       lock_page(page);
52092 +                       if (PageUptodate(page)) {
52093 +                               /*
52094 +                                * another thread read the page, we have
52095 +                                * nothing to do
52096 +                                */
52097 +                               result = 0;
52098 +                               goto out_unlock_page;
52099 +                       }
52100 +
52101 +                       if (result) {
52102 +                               if (result == -E_NO_NEIGHBOR) {
52103 +                                       /*
52104 +                                        * right neighbor is not a formatted
52105 +                                        * node
52106 +                                        */
52107 +                                       result = 0;
52108 +                                       goto done;
52109 +                               } else {
52110 +                                       goto out_tap_relse;
52111 +                               }
52112 +                       } else {
52113 +                               if (!inode_file_plugin(inode)->
52114 +                                   owns_item(inode, &coord)) {
52115 +                                       /* item of another file is found */
52116 +                                       result = 0;
52117 +                                       goto done;
52118 +                               }
52119 +                       }
52120 +               }
52121 +       }
52122 +
52123 + done:
52124 +       if (mapped != PAGE_CACHE_SIZE)
52125 +               zero_user_segment(page, mapped, PAGE_CACHE_SIZE);
52126 +       SetPageUptodate(page);
52127 + out_unlock_page:
52128 +       unlock_page(page);
52129 + out_tap_relse:
52130 +       reiser4_tap_relse(&tap);
52131 + out_tap_done:
52132 +       reiser4_tap_done(&tap);
52133 +       return result;
52134 +}
52135 +
52136 +/*
52137 +   plugin->s.file.readpage
52138 +   reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
52139 +   or
52140 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
52141 +
52142 +   At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
52143 +   item. @vp points to the uf_coord_t to read from; the work itself is done by do_readpage_tail(). */
52144 +int readpage_tail(void *vp, struct page *page)
52145 +{
52146 +       uf_coord_t *uf_coord = vp;
52147 +       ON_DEBUG(coord_t * coord = &uf_coord->coord);
52148 +       ON_DEBUG(reiser4_key key);
52149 +
52150 +       assert("umka-2515", PageLocked(page));
52151 +       assert("umka-2516", !PageUptodate(page));
52152 +       assert("umka-2517", !jprivate(page) && !PagePrivate(page));
52153 +       assert("umka-2518", page->mapping && page->mapping->host);
52154 +
52155 +       assert("umka-2519", znode_is_loaded(coord->node));
52156 +       assert("umka-2520", item_is_tail(coord));
52157 +       assert("umka-2521", coord_is_existing_unit(coord));
52158 +       assert("umka-2522", znode_is_rlocked(coord->node));
52159 +       assert("umka-2523",
52160 +              page->mapping->host->i_ino ==
52161 +              get_key_objectid(item_key_by_coord(coord, &key)));
52162 +
52163 +       return do_readpage_tail(uf_coord, page);
52164 +}
52165 +
52166 +/**
52167 + * overwrite_tail
52168 + * @flow: user-space data to write; flow->user must be 1 and flow->data set
52169 + * @coord: existing unit of a tail item in a write-locked node
52170 + *
52171 + * Overwrites tail item or its part by user data. Returns number of bytes
52172 + * written or error code. Never writes past the end of the item at @coord.
52173 + */
52174 +static int overwrite_tail(flow_t *flow, coord_t *coord)
52175 +{
52176 +       unsigned count;
52177 +
52178 +       assert("vs-570", flow->user == 1);
52179 +       assert("vs-946", flow->data);
52180 +       assert("vs-947", coord_is_existing_unit(coord));
52181 +       assert("vs-948", znode_is_write_locked(coord->node));
52182 +       assert("nikita-3036", reiser4_schedulable());
52183 +
52184 +       count = item_length_by_coord(coord) - coord->unit_pos;
52185 +       if (count > flow->length)
52186 +               count = flow->length;
52187 +
52188 +       if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
52189 +                            (const char __user *)flow->data, count))
52190 +               return RETERR(-EFAULT);
52191 +
52192 +       znode_make_dirty(coord->node);
52193 +       return count;
52194 +}
52195 +
52196 +/**
52197 + * insert_first_tail
52198 + * @inode: inode of the file written to; used for quota and container state
52199 + * @flow: data to insert; turned into a hole flow if its key offset is not 0
52200 + * @coord: coord passed on to reiser4_insert_flow()
52201 + * @lh: lock handle passed on to reiser4_insert_flow()
52202 + *
52203 + * Returns number of bytes written or error code.
52204 + */
52205 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
52206 +                                coord_t *coord, lock_handle *lh)
52207 +{
52208 +       int result;
52209 +       loff_t to_write;
52210 +       struct unix_file_info *uf_info;
52211 +
52212 +       if (get_key_offset(&flow->key) != 0) {
52213 +               /*
52214 +                * file is empty and we have to write not to the beginning of
52215 +                * file. Create a hole at the beginning of file. On success
52216 +                * insert_flow returns 0 as number of written bytes which is
52217 +                * what we have to return on padding a file with holes
52218 +                */
52219 +               flow->data = NULL;
52220 +               flow->length = get_key_offset(&flow->key);
52221 +               set_key_offset(&flow->key, 0);
52222 +               /*
52223 +                * holes in files built of tails are stored just like if there
52224 +                * were real data which are all zeros. Therefore we have to
52225 +                * allocate quota here as well
52226 +                */
52227 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52228 +                       return RETERR(-EDQUOT);
52229 +               result = reiser4_insert_flow(coord, lh, flow);
52230 +               if (flow->length)
52231 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52232 +
52233 +               uf_info = unix_file_inode_data(inode);
52234 +
52235 +               /*
52236 +                * first item insertion is only possible when writing to empty
52237 +                * file or performing tail conversion
52238 +                */
52239 +               assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
52240 +                           (reiser4_inode_get_flag(inode,
52241 +                                                   REISER4_PART_MIXED) &&
52242 +                            reiser4_inode_get_flag(inode,
52243 +                                                   REISER4_PART_IN_CONV))));
52244 +               /* if file was empty - update its state */
52245 +               if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
52246 +                       uf_info->container = UF_CONTAINER_TAILS;
52247 +               return result;
52248 +       }
52249 +
52250 +       /* check quota before inserting data */
52251 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52252 +               return RETERR(-EDQUOT);
52253 +
52254 +       to_write = flow->length;
52255 +       result = reiser4_insert_flow(coord, lh, flow);
52256 +       if (flow->length)
52257 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52258 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
52259 +}
52260 +
52261 +/**
52262 + * append_tail
52263 + * @inode: inode of the file written to; used for quota accounting
52264 + * @flow: data to append; turned into a hole flow if its key is not the append key
52265 + * @coord: coord of the tail item to append to
52266 + * @lh: lock handle passed on to reiser4_insert_flow()
52267 + *
52268 + * Returns number of bytes written or error code.
52269 + */
52270 +static ssize_t append_tail(struct inode *inode,
52271 +                          flow_t *flow, coord_t *coord, lock_handle *lh)
52272 +{
52273 +       int result;
52274 +       reiser4_key append_key;
52275 +       loff_t to_write;
52276 +
52277 +       if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
52278 +               flow->data = NULL;
52279 +               flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
52280 +               set_key_offset(&flow->key, get_key_offset(&append_key));
52281 +               /*
52282 +                * holes in files built of tails are stored just like if there
52283 +                * were real data which are all zeros. Therefore we have to
52284 +                * allocate quota here as well
52285 +                */
52286 +               if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52287 +                       return RETERR(-EDQUOT);
52288 +               result = reiser4_insert_flow(coord, lh, flow);
52289 +               if (flow->length)
52290 +                       DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52291 +               return result;
52292 +       }
52293 +
52294 +       /* check quota before appending data */
52295 +       if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
52296 +               return RETERR(-EDQUOT);
52297 +
52298 +       to_write = flow->length;
52299 +       result = reiser4_insert_flow(coord, lh, flow);
52300 +       if (flow->length)
52301 +               DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
52302 +       return (to_write - flow->length) ? (to_write - flow->length) : result;
52303 +}
52304 +
52305 +/**
52306 + * write_extent_reserve_space - reserve space for tail write operation
52307 + * @inode: inode of the file to be written; used to find the tree
52308 + *
52309 + * Estimates and reserves space which may be required for writing one flow to a
52310 + * file. Returns the result of reiser4_grab_space().
52311 + */
52312 +static int write_extent_reserve_space(struct inode *inode)
52313 +{
52314 +       __u64 count;
52315 +       reiser4_tree *tree;
52316 +
52317 +       /*
52318 +        * to write one flow to a file by tails we have to reserve disk space for:
52319 +
52320 +        * 1. find_file_item may have to insert empty node to the tree (empty
52321 +        * leaf node between two extent items). This requires 1 block and
52322 +        * number of blocks which are necessary to perform insertion of an
52323 +        * internal item into twig level.
52324 +        *
52325 +        * 2. flow insertion
52326 +        *
52327 +        * 3. stat data update
52328 +        */
52329 +       tree = reiser4_tree_by_inode(inode);
52330 +       count = estimate_one_insert_item(tree) +
52331 +               estimate_insert_flow(tree->height) +
52332 +               estimate_one_insert_item(tree);
52333 +       grab_space_enable();
52334 +       return reiser4_grab_space(count, 0 /* flags */);
52335 +}
52336 +
52337 +#define PAGE_PER_FLOW 4
52338 +/* fault in user buffer @buf in PAGE_CACHE_SIZE chunks, at most PAGE_PER_FLOW pages; returns @count capped at that limit (the result of fault_in_pages_readable() is ignored) */
52339 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
52340 +{
52341 +       loff_t faulted;
52342 +       int to_fault;
52343 +
52344 +       if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
52345 +               count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
52346 +       faulted = 0;
52347 +       while (count > 0) {
52348 +               to_fault = PAGE_CACHE_SIZE;
52349 +               if (count < to_fault)
52350 +                       to_fault = count;
52351 +               fault_in_pages_readable(buf + faulted, to_fault);
52352 +               count -= to_fault;
52353 +               faulted += to_fault;
52354 +       }
52355 +       return faulted;
52356 +}
52357 +
52358 +/**
52359 + * reiser4_write_tail - write method of tail item plugin
52360 + * @file: file to write to; its hint is loaded at entry and saved on success
52361 + * @buf: address of user-space buffer
52362 + * @count: number of bytes to write; at most PAGE_PER_FLOW pages are taken per call
52363 + * @pos: pointer to the position in file to write to
52364 + *
52365 + * Writes to @inode (must not be NULL). Returns number of written bytes or error code.
52366 + */
52367 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52368 +                          const char __user *buf, size_t count, loff_t *pos)
52369 +{
52370 +       struct hint hint;
52371 +       int result;
52372 +       flow_t flow;
52373 +       coord_t *coord;
52374 +       lock_handle *lh;
52375 +       znode *loaded;
52376 +
52377 +       assert("edward-1548", inode != NULL);
52378 +
52379 +       if (write_extent_reserve_space(inode))
52380 +               return RETERR(-ENOSPC);
52381 +
52382 +       result = load_file_hint(file, &hint);
52383 +       BUG_ON(result != 0);
52384 +
52385 +       flow.length = faultin_user_pages(buf, count);
52386 +       flow.user = 1;
52387 +       memcpy(&flow.data, &buf, sizeof(buf));
52388 +       flow.op = WRITE_OP;
52389 +       key_by_inode_and_offset_common(inode, *pos, &flow.key);
52390 +
52391 +       result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
52392 +       if (IS_CBKERR(result))
52393 +               return result;
52394 +
52395 +       coord = &hint.ext_coord.coord;
52396 +       lh = hint.ext_coord.lh;
52397 +
52398 +       result = zload(coord->node);
52399 +       BUG_ON(result != 0);
52400 +       loaded = coord->node;
52401 +
52402 +       if (coord->between == AFTER_UNIT) {
52403 +               /* append with data or hole */
52404 +               result = append_tail(inode, &flow, coord, lh);
52405 +       } else if (coord->between == AT_UNIT) {
52406 +               /* overwrite existing bytes of the item */
52407 +               result = overwrite_tail(&flow, coord);
52408 +       } else {
52409 +               /* no items of this file yet. insert data or hole */
52410 +               result = insert_first_tail(inode, &flow, coord, lh);
52411 +       }
52412 +       zrelse(loaded);
52413 +       if (result < 0) {
52414 +               done_lh(lh);
52415 +               return result;
52416 +       }
52417 +
52418 +       /* seal and unlock znode. NOTE: valid is cleared right here, so the set_hint branch below is never taken */
52419 +       hint.ext_coord.valid = 0;
52420 +       if (hint.ext_coord.valid)
52421 +               reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
52422 +       else
52423 +               reiser4_unset_hint(&hint);
52424 +
52425 +       save_file_hint(file, &hint);
52426 +       return result;
52427 +}
52428 +
52429 +#if REISER4_DEBUG
52430 +/* Debug helper: nonzero iff offset of @key == offset of the item key + coord->unit_pos. */
52431 +static int
52432 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
52433 +{
52434 +       reiser4_key item_key;
52435 +
52436 +       assert("vs-1356", coord_is_existing_unit(coord));
52437 +       assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
52438 +       assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key))); /* item_key is reused: it holds the item's own key from here on */
52439 +       return get_key_offset(key) ==
52440 +           get_key_offset(&item_key) + coord->unit_pos;
52441 +
52442 +}
52443 +
52444 +#endif
52445 +
52446 +/* plugin->u.item.s.file.read: copy bytes of the tail item at @hint's coord to the user buffer of flow @f; returns 0, or -EFAULT if the copy fails */
52447 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
52448 +{
52449 +       unsigned count;
52450 +       int item_length;
52451 +       coord_t *coord;
52452 +       uf_coord_t *uf_coord;
52453 +
52454 +       uf_coord = &hint->ext_coord;
52455 +       coord = &uf_coord->coord;
52456 +
52457 +       assert("vs-571", f->user == 1);
52458 +       assert("vs-571", f->data);
52459 +       assert("vs-967", coord && coord->node);
52460 +       assert("vs-1117", znode_is_rlocked(coord->node));
52461 +       assert("vs-1118", znode_is_loaded(coord->node));
52462 +
52463 +       assert("nikita-3037", reiser4_schedulable());
52464 +       assert("vs-1357", coord_matches_key_tail(coord, &f->key));
52465 +
52466 +       /* calculate number of bytes to read off the item */
52467 +       item_length = item_length_by_coord(coord);
52468 +       count = item_length_by_coord(coord) - coord->unit_pos; /* bytes left in the item from unit_pos on */
52469 +       if (count > f->length)
52470 +               count = f->length;
52471 +
52472 +       /* user page has to be brought in so that major page fault does not
52473 +        * occur here when long-term lock is held */
52474 +       if (__copy_to_user((char __user *)f->data,
52475 +                          ((char *)item_body_by_coord(coord) + coord->unit_pos),
52476 +                          count))
52477 +               return RETERR(-EFAULT);
52478 +
52479 +       /* probably mark_page_accessed() should only be called if
52480 +        * coord->unit_pos is zero. */
52481 +       mark_page_accessed(znode_page(coord->node));
52482 +       move_flow_forward(f, count);
52483 +       /* advance within the item; if its end was reached, step back onto the last byte and mark AFTER_UNIT */
52484 +       coord->unit_pos += count;
52485 +       if (item_length == coord->unit_pos) {
52486 +               coord->unit_pos--;
52487 +               coord->between = AFTER_UNIT;
52488 +       }
52489 +       reiser4_set_hint(hint, &f->key, ZNODE_READ_LOCK);
52490 +       return 0;
52491 +}
52492 +
52493 +/*
52494 +   plugin->u.item.s.file.append_key
52495 +   key of the first byte past the last byte addressed by this item; stored in @key, which is also returned
52496 +*/
52497 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
52498 +{
52499 +       item_key_by_coord(coord, key);
52500 +       set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord)); /* item key offset + item length */
52501 +       return key;
52502 +}
52503 +
52504 +/* plugin->u.item.s.file.init_coord_extension: only marks @uf_coord as valid; @lookuped is not used */
52505 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
52506 +{
52507 +       uf_coord->valid = 1;
52508 +}
52509 +
52510 +/*
52511 +  plugin->u.item.s.file.get_block: store in *@block the block number of the node holding @coord (@lblock is not used); always returns 0
52512 +*/
52513 +int
52514 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
52515 +{
52516 +       assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
52517 +
52518 +       if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
52519 +               /* if node hasn't obtained its block number yet, report block 0.
52520 +                * Let's avoid upsetting users with some cosmic numbers beyond
52521 +                * the device capacity. */
52522 +               *block = 0;
52523 +       else
52524 +               *block = *znode_get_block(coord->node);
52525 +       return 0;
52526 +}
52527 +
52528 +/*
52529 + * Local variables:
52530 + * c-indentation-style: "K&R"
52531 + * mode-name: "LC"
52532 + * c-basic-offset: 8
52533 + * tab-width: 8
52534 + * fill-column: 79
52535 + * scroll-step: 1
52536 + * End:
52537 + */
52538 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/item/tail.h linux-2.6.27/fs/reiser4/plugin/item/tail.h
52539 --- linux-2.6.27.orig/fs/reiser4/plugin/item/tail.h     1970-01-01 03:00:00.000000000 +0300
52540 +++ linux-2.6.27/fs/reiser4/plugin/item/tail.h  2008-10-12 18:20:01.000000000 +0400
52541 @@ -0,0 +1,58 @@
52542 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52543 +
52544 +#if !defined( __REISER4_TAIL_H__ )
52545 +#define __REISER4_TAIL_H__
52546 +
52547 +struct tail_coord_extension {
52548 +       int not_used; /* NOTE(review): presumably a placeholder so the struct is not empty - confirm */
52549 +};
52550 +
52551 +struct cut_list;
52552 +
52553 +/* plugin->u.item.b.* */
52554 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
52555 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
52556 +                        const reiser4_item_data *);
52557 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
52558 +pos_in_node_t nr_units_tail(const coord_t *);
52559 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
52560 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
52561 +int can_shift_tail(unsigned free_space, coord_t * source,
52562 +                  znode * target, shift_direction, unsigned *size,
52563 +                  unsigned want);
52564 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
52565 +                    unsigned count, shift_direction, unsigned free_space);
52566 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
52567 +                  struct carry_kill_data *);
52568 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52569 +                  struct carry_cut_data *, reiser4_key * smallest_removed,
52570 +                  reiser4_key * new_first);
52571 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
52572 +                   struct carry_kill_data *, reiser4_key * smallest_removed,
52573 +                   reiser4_key * new_first);
52574 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
52575 +
52576 +/* plugin->u.item.s.* */
52577 +ssize_t reiser4_write_tail(struct file *file, struct inode * inode,
52578 +                          const char __user *buf, size_t count, loff_t *pos);
52579 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
52580 +int readpage_tail(void *vp, struct page *page);
52581 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
52582 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
52583 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
52584 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
52585 +                            hint_t *, int back_to_dirty, int set_hint);
52586 +
52587 +/* __REISER4_TAIL_H__ */
52588 +#endif
52589 +
52590 +/* Make Linus happy.
52591 +   Local variables:
52592 +   c-indentation-style: "K&R"
52593 +   mode-name: "LC"
52594 +   c-basic-offset: 8
52595 +   tab-width: 8
52596 +   fill-column: 120
52597 +   scroll-step: 1
52598 +   End:
52599 +*/
52600 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/Makefile linux-2.6.27/fs/reiser4/plugin/Makefile
52601 --- linux-2.6.27.orig/fs/reiser4/plugin/Makefile        1970-01-01 03:00:00.000000000 +0300
52602 +++ linux-2.6.27/fs/reiser4/plugin/Makefile     2008-10-12 18:20:01.000000000 +0400
52603 @@ -0,0 +1,26 @@
52604 +obj-$(CONFIG_REISER4_FS) += plugins.o
52605 +
52606 +plugins-objs :=                        \
52607 +       plugin.o                \
52608 +       plugin_set.o            \
52609 +       object.o                \
52610 +       inode_ops.o             \
52611 +       inode_ops_rename.o      \
52612 +       file_ops.o              \
52613 +       file_ops_readdir.o      \
52614 +       file_plugin_common.o    \
52615 +       dir_plugin_common.o     \
52616 +       digest.o                \
52617 +       hash.o                  \
52618 +       fibration.o             \
52619 +       tail_policy.o           \
52620 +       regular.o
52621 +
52622 +obj-$(CONFIG_REISER4_FS) += item/
52623 +obj-$(CONFIG_REISER4_FS) += file/
52624 +obj-$(CONFIG_REISER4_FS) += dir/
52625 +obj-$(CONFIG_REISER4_FS) += node/
52626 +obj-$(CONFIG_REISER4_FS) += compress/
52627 +obj-$(CONFIG_REISER4_FS) += space/
52628 +obj-$(CONFIG_REISER4_FS) += disk_format/
52629 +obj-$(CONFIG_REISER4_FS) += security/
52630 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/node/Makefile linux-2.6.27/fs/reiser4/plugin/node/Makefile
52631 --- linux-2.6.27.orig/fs/reiser4/plugin/node/Makefile   1970-01-01 03:00:00.000000000 +0300
52632 +++ linux-2.6.27/fs/reiser4/plugin/node/Makefile        2008-10-12 18:20:01.000000000 +0400
52633 @@ -0,0 +1,5 @@
52634 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
52635 +
52636 +node_plugins-objs :=   \
52637 +       node.o          \
52638 +       node40.o
52639 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/node/node40.c linux-2.6.27/fs/reiser4/plugin/node/node40.c
52640 --- linux-2.6.27.orig/fs/reiser4/plugin/node/node40.c   1970-01-01 03:00:00.000000000 +0300
52641 +++ linux-2.6.27/fs/reiser4/plugin/node/node40.c        2008-10-12 18:20:01.000000000 +0400
52642 @@ -0,0 +1,2924 @@
52643 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52644 +
52645 +#include "../../debug.h"
52646 +#include "../../key.h"
52647 +#include "../../coord.h"
52648 +#include "../plugin_header.h"
52649 +#include "../item/item.h"
52650 +#include "node.h"
52651 +#include "node40.h"
52652 +#include "../plugin.h"
52653 +#include "../../jnode.h"
52654 +#include "../../znode.h"
52655 +#include "../../pool.h"
52656 +#include "../../carry.h"
52657 +#include "../../tap.h"
52658 +#include "../../tree.h"
52659 +#include "../../super.h"
52660 +#include "../../reiser4.h"
52661 +
52662 +#include <asm/uaccess.h>
52663 +#include <linux/types.h>
52664 +#include <linux/prefetch.h>
52665 +
52666 +/* leaf 40 format:
52667 +
52668 +  [node header | item 0, item 1, .., item N-1 |  free space | item_head N-1, .. item_head 1, item head 0 ]
52669 +   plugin_id (16)                                                key
52670 +   free_space (16)                                               pluginid (16)
52671 +   free_space_start (16)                                         offset (16)
52672 +   level (8)
52673 +   num_items (16)
52674 +   magic (32)
52675 +   flush_time (32)
52676 +*/
52677 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs.  Change to "ReIs". */
52678 +/* magic number that is stored in ->magic field of node header */
52679 +static const __u32 REISER4_NODE_MAGIC = 0x52344653;    /* (*(__u32 *)"R4FS"); */
52680 +
52681 +static int prepare_for_update(znode * left, znode * right,
52682 +                             carry_plugin_info * info);
52683 +
52684 +/* header of node of reiser40 format is at the beginning of node: returns zdata(@node) viewed as node40_header */
52685 +static inline node40_header *node40_node_header(const znode * node     /* node to
52686 +                                                                        * query */ )
52687 +{
52688 +       assert("nikita-567", node != NULL);
52689 +       assert("nikita-568", znode_page(node) != NULL);
52690 +       assert("nikita-569", zdata(node) != NULL);
52691 +       return (node40_header *) zdata(node);
52692 +}
52693 +
52694 +/* accessors for node40_header fields: unaligned access, with little-endian <-> cpu conversion for every field except level */
52695 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
52696 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
52697 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
52698 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
52699 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
52700 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
52701 +
52702 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
52703 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
52704 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
52705 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
52706 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
52707 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
52708 +
52709 +/* plugin field of node header should be read/set by
52710 +   plugin_by_disk_id/save_disk_plugin */
52711 +
52712 +/* array of item headers is at the end of node: header of item @pos is the (pos + 1)-th item_header40 counting back from the end of node data */
52713 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
52714 +{
52715 +       return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
52716 +}
52717 +
52718 +/* item header of the item @coord points to:
52719 + * (item_header40 *)(zdata(node) + znode_size(node)) - item_pos - 1 */
52720 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
52721 +{
52722 +       return (item_header40 *) (zdata(coord->node) +
52723 +                                 znode_size(coord->node)) - (coord->item_pos) -
52724 +           1;
52725 +}
52726 +
52727 +/* accessors for the offset field of item_header40: 16-bit little-endian, accessed unaligned */
52728 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
52729 +
52730 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
52731 +
52732 +/* plugin field of item header should be read/set by
52733 +   plugin_by_disk_id/save_disk_plugin */
52734 +
52735 +/* plugin methods */
52736 +
52737 +/* plugin->u.node.item_overhead
52738 +   look for description of this method in plugin/node/node.h */
52739 +size_t
52740 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
52741 +{
52742 +       return sizeof(item_header40); /* per-item overhead is one item header; both arguments are ignored */
52743 +}
52744 +
52745 +/* plugin->u.node.free_space
52746 +   look for description of this method in plugin/node/node.h */
52747 +size_t free_space_node40(znode * node)
52748 +{
52749 +       assert("nikita-577", node != NULL);
52750 +       assert("nikita-578", znode_is_loaded(node));
52751 +       assert("nikita-579", zdata(node) != NULL);
52752 +
52753 +       return nh40_get_free_space(node40_node_header(node)); /* value of the free_space field of the node header */
52754 +}
52755 +
52756 +/* private inline version of node40_num_of_items() for use in this file. This
52757 +   is necessary, because address of node40_num_of_items() is taken and it is
52758 +   never inlined as a result. */
52759 +static inline short node40_num_of_items_internal(const znode * node)
52760 +{
52761 +       return nh40_get_num_items(node40_node_header(node)); /* 16-bit header field in cpu order, returned as a signed short */
52762 +}
52763 +
52764 +#if REISER4_DEBUG
52765 +static inline void check_num_items(const znode * node)
52766 +{
52767 +       assert("nikita-2749",
52768 +              node40_num_of_items_internal(node) == node->nr_items); /* header count must match the count cached in the znode */
52769 +       assert("nikita-2746", znode_is_write_locked(node));
52770 +}
52771 +#else /* !REISER4_DEBUG: the check expands to noop */
52772 +#define check_num_items(node) noop
52773 +#endif
52774 +
52775 +/* plugin->u.node.num_of_items
52776 +   look for description of this method in plugin/node/node.h */
52777 +int num_of_items_node40(const znode * node)
52778 +{
52779 +       return node40_num_of_items_internal(node); /* non-inline entry point wrapping the inline helper */
52780 +}
52781 +
52782 +static void
52783 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
52784 +{
52785 +       assert("nikita-2751", node != NULL);
52786 +       assert("nikita-2750", nh == node40_node_header(node));
52787 +       /* store @value both in the node header and in the znode's cached nr_items, checking consistency before and after */
52788 +       check_num_items(node);
52789 +       nh40_set_num_items(nh, value);
52790 +       node->nr_items = value;
52791 +       check_num_items(node);
52792 +}
52793 +
52794 +/* plugin->u.node.item_by_coord
52795 +   look for description of this method in plugin/node/node.h */
52796 +char *item_by_coord_node40(const coord_t * coord)
52797 +{
52798 +       item_header40 *ih;
52799 +       char *p;
52800 +
52801 +       /* @coord is set to existing item */
52802 +       assert("nikita-596", coord != NULL);
52803 +       assert("vs-255", coord_is_existing_item(coord));
52804 +
52805 +       ih = node40_ih_at_coord(coord);
52806 +       p = zdata(coord->node) + ih40_get_offset(ih); /* item body starts at the header's offset from the start of node data */
52807 +       return p;
52808 +}
52809 +
52810 +/* plugin->u.node.length_by_coord
52811 +   look for description of this method in plugin/node/node.h */
52812 +int length_by_coord_node40(const coord_t * coord)
52813 +{
52814 +       item_header40 *ih;
52815 +       int result;
52816 +
52817 +       /* @coord is set to existing item */
52818 +       assert("vs-256", coord != NULL);
52819 +       assert("vs-257", coord_is_existing_item(coord));
52820 +       /* length = offset of the next item (ih - 1), or start of free space for the last item, minus this item's offset */
52821 +       ih = node40_ih_at_coord(coord);
52822 +       if ((int)coord->item_pos ==
52823 +           node40_num_of_items_internal(coord->node) - 1)
52824 +               result =
52825 +                   nh40_get_free_space_start(node40_node_header(coord->node)) -
52826 +                   ih40_get_offset(ih);
52827 +       else
52828 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52829 +
52830 +       return result;
52831 +}
52832 +
52833 +static pos_in_node_t
52834 +node40_item_length(const znode * node, pos_in_node_t item_pos)
52835 +{
52836 +       item_header40 *ih;
52837 +       pos_in_node_t result;
52838 +
52839 +       /* @item_pos must refer to an existing item of @node */
52840 +       assert("vs-256", node != NULL);
52841 +       assert("vs-257", node40_num_of_items_internal(node) > item_pos);
52842 +       /* length = offset of the next item (ih - 1), or start of free space for the last item, minus this item's offset */
52843 +       ih = node40_ih_at(node, item_pos);
52844 +       if (item_pos == node40_num_of_items_internal(node) - 1)
52845 +               result =
52846 +                   nh40_get_free_space_start(node40_node_header(node)) -
52847 +                   ih40_get_offset(ih);
52848 +       else
52849 +               result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
52850 +
52851 +       return result;
52852 +}
52853 +
52854 +/* plugin->u.node.plugin_by_coord
52855 +   look for description of this method in plugin/node/node.h */
52856 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
52857 +{
52858 +       item_header40 *ih;
52859 +       item_plugin *result;
52860 +
52861 +       /* @coord is set to existing item */
52862 +       assert("vs-258", coord != NULL);
52863 +       assert("vs-259", coord_is_existing_item(coord));
52864 +
52865 +       ih = node40_ih_at_coord(coord);
52866 +       /* pass NULL instead of the current tree. This is a time-critical call. */
52867 +       result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
52868 +       return result;
52869 +}
52870 +
52871 +/* plugin->u.node.key_at
52872 +   look for description of this method in plugin/node/node.h */
52873 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
52874 +{
52875 +       item_header40 *ih;
52876 +
52877 +       assert("nikita-1765", coord_is_existing_item(coord));
52878 +
52879 +       /* @coord is set to existing item */
52880 +       ih = node40_ih_at_coord(coord);
52881 +       memcpy(key, &ih->key, sizeof(reiser4_key)); /* copy the key kept in the item header into the caller's @key */
52882 +       return key;
52883 +}
52884 +
52885 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled. Note: neither macro is referenced before the #undefs that follow lookup_node40(). */
52886 +
52887 +#define NODE_INCSTAT(n, counter)                                               \
52888 +       reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
52889 +
52890 +#define NODE_ADDSTAT(n, counter, val)                                          \
52891 +       reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
52892 +
52893 +/* plugin->u.node.lookup
52894 +   look for description of this method in plugin/node/node.h */
52895 +node_search_result lookup_node40(znode * node /* node to query */ ,
52896 +                                const reiser4_key * key /* key to look for */ ,
52897 +                                lookup_bias bias /* search bias */ ,
52898 +                                coord_t * coord /* resulting coord */ )
52899 +{
52900 +       int left;
52901 +       int right;
52902 +       int found;
52903 +       int items;
52904 +
52905 +       item_header40 *lefth;
52906 +       item_header40 *righth;
52907 +
52908 +       item_plugin *iplug;
52909 +       item_header40 *bstop;
52910 +       item_header40 *ih;
52911 +       cmp_t order;
52912 +       /* outline: narrow [left, right] by binary search, finish with a sequential scan, then let the item plugin position @coord inside the item */
52913 +       assert("nikita-583", node != NULL);
52914 +       assert("nikita-584", key != NULL);
52915 +       assert("nikita-585", coord != NULL);
52916 +       assert("nikita-2693", znode_is_any_locked(node));
52917 +       cassert(REISER4_SEQ_SEARCH_BREAK > 2);
52918 +
52919 +       items = node_num_items(node);
52920 +
52921 +       if (unlikely(items == 0)) {
52922 +               coord_init_first_unit(coord, node);
52923 +               return NS_NOT_FOUND;
52924 +       }
52925 +
52926 +       /* binary search for item that can contain given key */
52927 +       left = 0;
52928 +       right = items - 1;
52929 +       coord->node = node;
52930 +       coord_clear_iplug(coord);
52931 +       found = 0;
52932 +
52933 +       lefth = node40_ih_at(node, left);
52934 +       righth = node40_ih_at(node, right);
52935 +
52936 +       /* It is known that for small arrays sequential search is on average
52937 +          more efficient than binary. This is because sequential search is
52938 +          coded as tight loop that can be better optimized by compilers and
52939 +          for small array size gain from this optimization makes sequential
52940 +          search the winner. Another, maybe more important, reason for this,
52941 +          is that sequential array is more CPU cache friendly, whereas binary
52942 +          search effectively destroys CPU caching.
52943 +
52944 +          Critical here is the notion of "smallness". Reasonable value of
52945 +          REISER4_SEQ_SEARCH_BREAK can be found by playing with code in
52946 +          fs/reiser4/ulevel/ulevel.c:test_search().
52947 +
52948 +          Don't try to further optimize sequential search by scanning from
52949 +          right to left in attempt to use more efficient loop termination
52950 +          condition (comparison with 0). This doesn't work.
52951 +
52952 +        */
52953 +
52954 +       while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
52955 +               int median;
52956 +               item_header40 *medianh;
52957 +
52958 +               median = (left + right) / 2;
52959 +               medianh = node40_ih_at(node, median);
52960 +
52961 +               assert("nikita-1084", median >= 0);
52962 +               assert("nikita-1085", median < items);
52963 +               switch (keycmp(key, &medianh->key)) {
52964 +               case LESS_THAN:
52965 +                       right = median;
52966 +                       righth = medianh;
52967 +                       break;
52968 +               default:
52969 +                       wrong_return_value("nikita-586", "keycmp"); /* no break: continues into the GREATER_THAN case */
52970 +               case GREATER_THAN:
52971 +                       left = median;
52972 +                       lefth = medianh;
52973 +                       break;
52974 +               case EQUAL_TO:
52975 +                       do {
52976 +                               --median;
52977 +                               /* headers are ordered from right to left */
52978 +                               ++medianh;
52979 +                       } while (median >= 0 && keyeq(key, &medianh->key));
52980 +                       right = left = median + 1; /* lowest-numbered item whose key equals @key; also ends the while loop */
52981 +                       ih = lefth = righth = medianh - 1;
52982 +                       found = 1;
52983 +                       break;
52984 +               }
52985 +       }
52986 +       /* sequential scan. Item headers, and, therefore, keys are stored at
52987 +          the rightmost part of a node from right to left. We are trying to
52988 +          access memory from left to right, and hence, scan in _descending_
52989 +          order of item numbers.
52990 +        */
52991 +       if (!found) {
52992 +               for (left = right, ih = righth; left >= 0; ++ih, --left) {
52993 +                       cmp_t comparison;
52994 +
52995 +                       prefetchkey(&(ih + 1)->key);
52996 +                       comparison = keycmp(&ih->key, key);
52997 +                       if (comparison == GREATER_THAN)
52998 +                               continue;
52999 +                       if (comparison == EQUAL_TO) {
53000 +                               found = 1;
53001 +                               do {
53002 +                                       --left;
53003 +                                       ++ih;
53004 +                               } while (left >= 0 && keyeq(&ih->key, key));
53005 +                               ++left; /* step back onto the lowest-numbered item with an equal key */
53006 +                               --ih;
53007 +                       } else {
53008 +                               assert("nikita-1256", comparison == LESS_THAN);
53009 +                       }
53010 +                       break;
53011 +               }
53012 +               if (unlikely(left < 0))
53013 +                       left = 0;
53014 +       }
53015 +
53016 +       assert("nikita-3212", right >= left);
53017 +       assert("nikita-3214",
53018 +              equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
53019 +
53020 +       coord_set_item_pos(coord, left);
53021 +       coord->unit_pos = 0;
53022 +       coord->between = AT_UNIT;
53023 +
53024 +       /* key < leftmost key in a node or node is corrupted and keys
53025 +          are not sorted  */
53026 +       bstop = node40_ih_at(node, (unsigned)left);
53027 +       order = keycmp(&bstop->key, key);
53028 +       if (unlikely(order == GREATER_THAN)) {
53029 +               if (unlikely(left != 0)) {
53030 +                       /* screw up */
53031 +                       warning("nikita-587", "Key less than %i key in a node",
53032 +                               left);
53033 +                       reiser4_print_key("key", key);
53034 +                       reiser4_print_key("min", &bstop->key);
53035 +                       print_coord_content("coord", coord);
53036 +                       return RETERR(-EIO);
53037 +               } else {
53038 +                       coord->between = BEFORE_UNIT;
53039 +                       return NS_NOT_FOUND;
53040 +               }
53041 +       }
53042 +       /* left <= key, ok */
53043 +       iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
53044 +
53045 +       if (unlikely(iplug == NULL)) {
53046 +               warning("nikita-588", "Unknown plugin %i",
53047 +                       le16_to_cpu(get_unaligned(&bstop->plugin_id)));
53048 +               reiser4_print_key("key", key);
53049 +               print_coord_content("coord", coord);
53050 +               return RETERR(-EIO);
53051 +       }
53052 +
53053 +       coord_set_iplug(coord, iplug);
53054 +
53055 +       /* if exact key from item header was found by the search above
53056 +          (binary or sequential), no further checks are necessary. */
53057 +       if (found) {
53058 +               assert("nikita-1259", order == EQUAL_TO);
53059 +               return NS_FOUND;
53060 +       }
53061 +       if (iplug->b.max_key_inside != NULL) {
53062 +               reiser4_key max_item_key;
53063 +
53064 +               /* key > max_item_key --- outside of an item */
53065 +               if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
53066 +                       coord->unit_pos = 0;
53067 +                       coord->between = AFTER_ITEM;
53068 +                       /* FIXME-VS: key we are looking for does not fit into
53069 +                          found item. Return NS_NOT_FOUND then. Without that
53070 +                          the following case does not work: there is extent of
53071 +                          file 10000, 10001. File 10000, 10002 has been just
53072 +                          created. When writing to position 0 in that file -
53073 +                          traverse_tree will stop here on twig level. When we
53074 +                          want it to go down to leaf level
53075 +                        */
53076 +                       return NS_NOT_FOUND;
53077 +               }
53078 +       }
53079 +
53080 +       if (iplug->b.lookup != NULL) {
53081 +               return iplug->b.lookup(key, bias, coord);
53082 +       } else {
53083 +               assert("nikita-1260", order == LESS_THAN);
53084 +               coord->between = AFTER_UNIT;
53085 +               return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
53086 +       }
53087 +}
53088 +
53089 +#undef NODE_ADDSTAT
53090 +#undef NODE_INCSTAT
53091 +
53092 +/* plugin->u.node.estimate: free space of @node less one item header, 0 if a header does not fit
53093 +   look for description of this method in plugin/node/node.h */
53094 +size_t estimate_node40(znode * node)
53095 +{
53096 +       size_t space;
53097 +
53098 +       assert("nikita-597", node != NULL);
53099 +       space = free_space_node40(node);
53100 +       /* size_t is unsigned: a plain subtraction would wrap around when free
53101 +          space is smaller than an item header, so compare before subtracting */
53102 +       return (space > sizeof(item_header40)) ?
53103 +           space - sizeof(item_header40) : 0;
53104 +}
53104 +
53105 +/* plugin->u.node.check
53106 +   look for description of this method in plugin/node/node.h */
53107 +int check_node40(const znode * node /* node to check */ ,
53108 +                __u32 flags /* check flags */ ,
53109 +                const char **error /* where to store error message */ )
53110 +{
53111 +       int nr_items;
53112 +       int i;
53113 +       reiser4_key prev;
53114 +       unsigned old_offset;
53115 +       tree_level level;
53116 +       coord_t coord;
53117 +       int result;
53118 +
53119 +       assert("nikita-580", node != NULL);
53120 +       assert("nikita-581", error != NULL);
53121 +       assert("nikita-2948", znode_is_loaded(node));
53122 +
53123 +       if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
53124 +               return 0;
53125 +
53126 +       assert("nikita-582", zdata(node) != NULL);
53127 +
53128 +       nr_items = node40_num_of_items_internal(node);
53129 +       if (nr_items < 0) {
53130 +               *error = "Negative number of items";
53131 +               return -1;
53132 +       }
53133 +
53134 +       if (flags & REISER4_NODE_DKEYS)
53135 +               prev = *znode_get_ld_key((znode *) node);
53136 +       else
53137 +               prev = *reiser4_min_key();
53138 +
53139 +       old_offset = 0;
53140 +       coord_init_zero(&coord);
53141 +       coord.node = (znode *) node;
53142 +       coord.unit_pos = 0;
53143 +       coord.between = AT_UNIT;
53144 +       level = znode_get_level(node);
53145 +       for (i = 0; i < nr_items; i++) {
53146 +               item_header40 *ih;
53147 +               reiser4_key unit_key;
53148 +               unsigned j;
53149 +
53150 +               ih = node40_ih_at(node, (unsigned)i);
53151 +               coord_set_item_pos(&coord, i);
53152 +               if ((ih40_get_offset(ih) >=
53153 +                    znode_size(node) - nr_items * sizeof(item_header40)) ||
53154 +                   (ih40_get_offset(ih) < sizeof(node40_header))) {
53155 +                       *error = "Offset is out of bounds";
53156 +                       return -1;
53157 +               }
53158 +               if (ih40_get_offset(ih) <= old_offset) {
53159 +                       *error = "Offsets are in wrong order";
53160 +                       return -1;
53161 +               }
53162 +               if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
53163 +                       *error = "Wrong offset of first item";
53164 +                       return -1;
53165 +               }
53166 +               old_offset = ih40_get_offset(ih);
53167 +
53168 +               if (keygt(&prev, &ih->key)) {
53169 +                       *error = "Keys are in wrong order";
53170 +                       return -1;
53171 +               }
53172 +               if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
53173 +                       *error = "Wrong key of first unit";
53174 +                       return -1;
53175 +               }
53176 +               prev = ih->key;
53177 +               for (j = 0; j < coord_num_units(&coord); ++j) {
53178 +                       coord.unit_pos = j;
53179 +                       unit_key_by_coord(&coord, &unit_key);
53180 +                       if (keygt(&prev, &unit_key)) {
53181 +                               *error = "Unit keys are in wrong order";
53182 +                               return -1;
53183 +                       }
53184 +                       prev = unit_key;
53185 +               }
53186 +               coord.unit_pos = 0;
53187 +               if (level != TWIG_LEVEL && item_is_extent(&coord)) {
53188 +                       *error = "extent on the wrong level";
53189 +                       return -1;
53190 +               }
53191 +               if (level == LEAF_LEVEL && item_is_internal(&coord)) {
53192 +                       *error = "internal item on the wrong level";
53193 +                       return -1;
53194 +               }
53195 +               if (level != LEAF_LEVEL &&
53196 +                   !item_is_internal(&coord) && !item_is_extent(&coord)) {
53197 +                       *error = "wrong item on the internal level";
53198 +                       return -1;
53199 +               }
53200 +               if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
53201 +                       *error = "non-internal item on the internal level";
53202 +                       return -1;
53203 +               }
53204 +#if REISER4_DEBUG
53205 +               if (item_plugin_by_coord(&coord)->b.check
53206 +                   && item_plugin_by_coord(&coord)->b.check(&coord, error))
53207 +                       return -1;
53208 +#endif
53209 +               if (i) {
53210 +                       coord_t prev_coord;
53211 +                       /* two neighboring items can not be mergeable */
53212 +                       coord_dup(&prev_coord, &coord);
53213 +                       coord_prev_item(&prev_coord);
53214 +                       if (are_items_mergeable(&prev_coord, &coord)) {
53215 +                               *error = "mergeable items in one node";
53216 +                               return -1;
53217 +                       }
53218 +
53219 +               }
53220 +       }
53221 +
53222 +       if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
53223 +               coord_t coord;
53224 +               item_plugin *iplug;
53225 +
53226 +               coord_init_last_unit(&coord, node);
53227 +               iplug = item_plugin_by_coord(&coord);
53228 +               if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
53229 +                   iplug->s.file.append_key != NULL) {
53230 +                       reiser4_key mkey;
53231 +
53232 +                       iplug->s.file.append_key(&coord, &mkey);
53233 +                       set_key_offset(&mkey, get_key_offset(&mkey) - 1);
53234 +                       read_lock_dk(current_tree);
53235 +                       result = keygt(&mkey, znode_get_rd_key((znode *) node));
53236 +                       read_unlock_dk(current_tree);
53237 +                       if (result) {
53238 +                               *error = "key of rightmost item is too large";
53239 +                               return -1;
53240 +                       }
53241 +               }
53242 +       }
53243 +       if (flags & REISER4_NODE_DKEYS) {
53244 +               read_lock_tree(current_tree);
53245 +               read_lock_dk(current_tree);
53246 +
53247 +               flags |= REISER4_NODE_TREE_STABLE;
53248 +
53249 +               if (keygt(&prev, znode_get_rd_key((znode *) node))) {
53250 +                       if (flags & REISER4_NODE_TREE_STABLE) {
53251 +                               *error = "Last key is greater than rdkey";
53252 +                               read_unlock_dk(current_tree);
53253 +                               read_unlock_tree(current_tree);
53254 +                               return -1;
53255 +                       }
53256 +               }
53257 +               if (keygt
53258 +                   (znode_get_ld_key((znode *) node),
53259 +                    znode_get_rd_key((znode *) node))) {
53260 +                       *error = "ldkey is greater than rdkey";
53261 +                       read_unlock_dk(current_tree);
53262 +                       read_unlock_tree(current_tree);
53263 +                       return -1;
53264 +               }
53265 +               if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
53266 +                   (node->left != NULL) &&
53267 +                   !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
53268 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
53269 +                        !keyeq(znode_get_rd_key(node->left),
53270 +                               znode_get_ld_key((znode *) node)))
53271 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53272 +                           keygt(znode_get_rd_key(node->left),
53273 +                                 znode_get_ld_key((znode *) node)))) {
53274 +                       *error = "left rdkey or ldkey is wrong";
53275 +                       read_unlock_dk(current_tree);
53276 +                       read_unlock_tree(current_tree);
53277 +                       return -1;
53278 +               }
53279 +               if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
53280 +                   (node->right != NULL) &&
53281 +                   !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
53282 +                   ergo(flags & REISER4_NODE_TREE_STABLE,
53283 +                        !keyeq(znode_get_rd_key((znode *) node),
53284 +                               znode_get_ld_key(node->right)))
53285 +                   && ergo(!(flags & REISER4_NODE_TREE_STABLE),
53286 +                           keygt(znode_get_rd_key((znode *) node),
53287 +                                 znode_get_ld_key(node->right)))) {
53288 +                       *error = "rdkey or right ldkey is wrong";
53289 +                       read_unlock_dk(current_tree);
53290 +                       read_unlock_tree(current_tree);
53291 +                       return -1;
53292 +               }
53293 +
53294 +               read_unlock_dk(current_tree);
53295 +               read_unlock_tree(current_tree);
53296 +       }
53297 +
53298 +       return 0;
53299 +}
53300 +
53301 +/* plugin->u.node.parse: check the level and magic recorded in the header of @node against the expected values and, if both match, cache the item count in @node->nr_items.
53302 +   Returns RETERR(0) on success and RETERR(-EIO), after a warning, on mismatch. Look for description of this method in plugin/node/node.h */
53303 +int parse_node40(znode * node /* node to parse */ )
53304 +{
53305 +       node40_header *header;
53306 +       int result;
53307 +       d8 level;
53308 +
53309 +       header = node40_node_header((znode *) node);
53310 +       result = -EIO; /* stays -EIO unless both checks below pass */
53311 +       level = nh40_get_level(header);
53312 +       if (unlikely(((__u8) znode_get_level(node)) != level)) /* level stored in the header vs. level recorded in the znode */
53313 +               warning("nikita-494", "Wrong level found in node: %i != %i",
53314 +                       znode_get_level(node), level);
53315 +       else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
53316 +               warning("nikita-495",
53317 +                       "Wrong magic in tree node: want %x, got %x",
53318 +                       REISER4_NODE_MAGIC, nh40_get_magic(header));
53319 +       else {
53320 +               node->nr_items = node40_num_of_items_internal(node);
53321 +               result = 0;
53322 +       }
53323 +       return RETERR(result);
53324 +}
53325 +
53326 +/* plugin->u.node.init: zero the node40 header of @node and fill in free space, free space start, plugin id, level, magic and mkfs id; the item count is left at 0. Always returns 0.
53327 +   look for description of this method in plugin/node/node.h */
53328 +int init_node40(znode * node /* node to initialise */ )
53329 +{
53330 +       node40_header *header;
53331 +
53332 +       assert("nikita-570", node != NULL);
53333 +       assert("nikita-572", zdata(node) != NULL);
53334 +
53335 +       header = node40_node_header(node);
53336 +       memset(header, 0, sizeof(node40_header));
53337 +       nh40_set_free_space(header, znode_size(node) - sizeof(node40_header)); /* everything but the node header is free */
53338 +       nh40_set_free_space_start(header, sizeof(node40_header)); /* free space begins right after the node header */
53339 +       /* sane hypothesis: 0 in CPU format is 0 in disk format */
53340 +       /* items: 0 (already set by the memset above) */
53341 +       save_plugin_id(node_plugin_to_plugin(node->nplug),
53342 +                      &header->common_header.plugin_id);
53343 +       nh40_set_level(header, znode_get_level(node));
53344 +       nh40_set_magic(header, REISER4_NODE_MAGIC);
53345 +       node->nr_items = 0;
53346 +       nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
53347 +
53348 +       /* flags: 0 (already set by the memset above) */
53349 +       return 0;
53350 +}
53351 +
53352 +#ifdef GUESS_EXISTS /* guess_node40() is compiled only when GUESS_EXISTS is defined */
53353 +int guess_node40(const znode * node /* node to guess plugin of */ )
53354 +{
53355 +       node40_header *nethack;
53356 +
53357 +       assert("nikita-1058", node != NULL);
53358 +       nethack = node40_node_header(node);
53359 +       return /* non-zero iff the header magic matches and the stored plugin id resolves to NODE40_ID */
53360 +           (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
53361 +           (plugin_by_disk_id(znode_get_tree(node),
53362 +                              REISER4_NODE_PLUGIN_TYPE,
53363 +                              &nethack->common_header.plugin_id)->h.id ==
53364 +            NODE40_ID);
53365 +}
53366 +#endif /* GUESS_EXISTS */
53367 +
53368 +/* plugin->u.node.change_item_size: change the size of the item at @coord by @by bytes, shifting the bodies of all following items and adjusting their offsets and the node's free space accounting.
53369 +   look for description of this method in plugin/node/node.h */
53370 +void change_item_size_node40(coord_t * coord, int by)
53371 +{
53372 +       node40_header *nh;
53373 +       item_header40 *ih;
53374 +       char *item_data;
53375 +       int item_length;
53376 +       unsigned i;
53377 +
53378 +       /* make sure that @coord is coord of existing item */
53379 +       assert("vs-210", coord_is_existing_item(coord));
53380 +
53381 +       nh = node40_node_header(coord->node);
53382 +
53383 +       item_data = item_by_coord_node40(coord);
53384 +       item_length = length_by_coord_node40(coord);
53385 +
53386 +       /* move bodies of the items that follow @coord by @by bytes */
53387 +       ih = node40_ih_at_coord(coord);
53388 +       memmove(item_data + item_length + by, item_data + item_length,
53389 +               nh40_get_free_space_start(node40_node_header(coord->node)) -
53390 +               (ih40_get_offset(ih) + item_length));
53391 +
53392 +       /* update offsets of moved items */
53393 +       for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
53394 +               ih = node40_ih_at(coord->node, i);
53395 +               ih40_set_offset(ih, ih40_get_offset(ih) + by);
53396 +       }
53397 +
53398 +       /* update node header: free space shrinks by @by, its start moves by @by */
53399 +       nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
53400 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
53401 +}
53402 +
53403 +static int should_notify_parent(const znode * node) /* non-zero when @node's block differs from the tree's root block */
53404 +{
53405 +       /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
53406 +       return !disk_addr_eq(znode_get_block(node),
53407 +                            &znode_get_tree(node)->root_block);
53408 +}
53409 +
53410 +/* plugin->u.node.create_item: insert a new item with key @key, described by @data, at @target; bodies and headers of the following items are shifted to make room. Always returns 0.
53411 +   look for description of this method in plugin/node/node.h */
53412 +int
53413 +create_item_node40(coord_t *target, const reiser4_key *key,
53414 +                  reiser4_item_data *data, carry_plugin_info *info)
53415 +{
53416 +       node40_header *nh;
53417 +       item_header40 *ih;
53418 +       unsigned offset;
53419 +       unsigned i;
53420 +
53421 +       nh = node40_node_header(target->node);
53422 +
53423 +       assert("vs-212", coord_is_between_items(target));
53424 +       /* node must have enough free space for the item body and its header */
53425 +       assert("vs-254",
53426 +              free_space_node40(target->node) >=
53427 +              data->length + sizeof(item_header40));
53428 +       assert("vs-1410", data->length >= 0);
53429 +
53430 +       if (coord_set_to_right(target))
53431 +               /* there are no items to the right of @target, so the new item
53432 +                  will be inserted after the last one */
53433 +               coord_set_item_pos(target, nh40_get_num_items(nh));
53434 +
53435 +       if (target->item_pos < nh40_get_num_items(nh)) {
53436 +               /* there are items to be moved to prepare space for new
53437 +                  item */
53438 +               ih = node40_ih_at_coord(target);
53439 +               /* new item will start at this offset */
53440 +               offset = ih40_get_offset(ih);
53441 +
53442 +               memmove(zdata(target->node) + offset + data->length,
53443 +                       zdata(target->node) + offset,
53444 +                       nh40_get_free_space_start(nh) - offset);
53445 +               /* update headers of moved items */
53446 +               for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
53447 +                       ih = node40_ih_at(target->node, i);
53448 +                       ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
53449 +               }
53450 +
53451 +               /* @ih now points to the item header of the last item; move the item headers at positions target->item_pos and above one slot down */
53452 +               memmove(ih - 1, ih,
53453 +                       sizeof(item_header40) * (nh40_get_num_items(nh) -
53454 +                                                target->item_pos));
53455 +       } else {
53456 +               /* new item will start at this offset */
53457 +               offset = nh40_get_free_space_start(nh);
53458 +       }
53459 +
53460 +       /* make item header for the new item */
53461 +       ih = node40_ih_at_coord(target);
53462 +       memcpy(&ih->key, key, sizeof(reiser4_key));
53463 +       ih40_set_offset(ih, offset);
53464 +       save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
53465 +
53466 +       /* update node header: the new body and the new item header both consume free space */
53467 +       nh40_set_free_space(nh,
53468 +                           nh40_get_free_space(nh) - data->length -
53469 +                           sizeof(item_header40));
53470 +       nh40_set_free_space_start(nh,
53471 +                                 nh40_get_free_space_start(nh) + data->length);
53472 +       node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
53473 +
53474 +       /* FIXME: check how create_item works when between is set to BEFORE_UNIT */
53475 +       target->unit_pos = 0;
53476 +       target->between = AT_UNIT;
53477 +       coord_clear_iplug(target);
53478 +
53479 +       /* initialize item */
53480 +       if (data->iplug->b.init != NULL) {
53481 +               data->iplug->b.init(target, NULL, data);
53482 +       }
53483 +       /* copy item body: through the plugin's paste method if it has one, otherwise by a plain copy of data->data */
53484 +       if (data->iplug->b.paste != NULL) {
53485 +               data->iplug->b.paste(target, data, info);
53486 +       } else if (data->data != NULL) {
53487 +               if (data->user) {
53488 +                       /* AUDIT: should we really not check that the pointer
53489 +                          from userspace is valid and that the data bytes are
53490 +                          available? How would we return -EFAULT of some kind
53491 +                          without this check? The result of __copy_from_user() below is not examined. */
53492 +                       assert("nikita-3038", reiser4_schedulable());
53493 +                       /* copy data from user space */
53494 +                       __copy_from_user(zdata(target->node) + offset,
53495 +                                        (const char __user *)data->data,
53496 +                                        (unsigned)data->length);
53497 +               } else
53498 +                       /* copy from kernel space */
53499 +                       memcpy(zdata(target->node) + offset, data->data,
53500 +                              (unsigned)data->length);
53501 +       }
53502 +
53503 +       if (target->item_pos == 0) {
53504 +               /* left delimiting key has to be updated */
53505 +               prepare_for_update(NULL, target->node, info);
53506 +       }
53507 +
53508 +       if (item_plugin_by_coord(target)->b.create_hook != NULL) {
53509 +               item_plugin_by_coord(target)->b.create_hook(target, data->arg);
53510 +       }
53511 +
53512 +       return 0;
53513 +}
53514 +
53515 +/* plugin->u.node.update_item_key: overwrite the key stored in the item header at @target with @key.
53516 +   look for description of this method in plugin/node/node.h */
53517 +void
53518 +update_item_key_node40(coord_t * target, const reiser4_key * key,
53519 +                      carry_plugin_info * info)
53520 +{
53521 +       item_header40 *ih;
53522 +
53523 +       ih = node40_ih_at_coord(target);
53524 +       memcpy(&ih->key, key, sizeof(reiser4_key));
53525 +
53526 +       if (target->item_pos == 0) { /* first item of the node: same follow-up as in create_item_node40, where it updates the left delimiting key */
53527 +               prepare_for_update(NULL, target->node, info);
53528 +       }
53529 +}
53530 +
53531 +/* these bits encode cut mode; they are OR-ed together into cut40_info.mode */
53532 +#define CMODE_TAIL 1 /* an item gets its tail removed */
53533 +#define CMODE_WHOLE 2 /* one or more items are removed completely */
53534 +#define CMODE_HEAD 4 /* an item gets its head removed */
53535 +
53536 +struct cut40_info { /* description of a cut; position fields hold MAX_POS_IN_NODE while unset (see init_cinfo) */
53537 +       int mode; /* bitwise OR of CMODE_* */
53538 +       pos_in_node_t tail_removed;     /* position of item which gets tail removed */
53539 +       pos_in_node_t first_removed;    /* position of the leftmost item among items removed completely */
53540 +       pos_in_node_t removed_count;    /* number of items removed completely */
53541 +       pos_in_node_t head_removed;     /* position of item which gets head removed */
53542 +
53543 +       pos_in_node_t freed_space_start; /* offset within the node at which the freed gap starts */
53544 +       pos_in_node_t freed_space_end; /* offset within the node at which the freed gap ends */
53545 +       pos_in_node_t first_moved; /* position of the first item whose header offset compact() updates */
53546 +       pos_in_node_t head_removed_location; /* new offset that compact() stores for the item which got its head removed */
53547 +};
53548 +
53549 +static void init_cinfo(struct cut40_info *cinfo) /* reset @cinfo: mode is 0, every position field is MAX_POS_IN_NODE, i.e. "not set" */
53550 +{
53551 +       cinfo->mode = 0;
53552 +       cinfo->tail_removed = MAX_POS_IN_NODE;
53553 +       cinfo->first_removed = MAX_POS_IN_NODE;
53554 +       cinfo->removed_count = MAX_POS_IN_NODE;
53555 +       cinfo->head_removed = MAX_POS_IN_NODE;
53556 +       cinfo->freed_space_start = MAX_POS_IN_NODE;
53557 +       cinfo->freed_space_end = MAX_POS_IN_NODE;
53558 +       cinfo->first_moved = MAX_POS_IN_NODE;
53559 +       cinfo->head_removed_location = MAX_POS_IN_NODE;
53560 +}
53561 +
53562 +/* complete cut_node40/kill_node40 content by removing the gap created by the removal: close the gap described by @cinfo in @node, fix item offsets, drop headers of removed items and update free space */
53563 +static void compact(znode * node, struct cut40_info *cinfo)
53564 +{
53565 +       node40_header *nh;
53566 +       item_header40 *ih;
53567 +       pos_in_node_t freed;
53568 +       pos_in_node_t pos, nr_items;
53569 +
53570 +       assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
53571 +                          cinfo->freed_space_end != MAX_POS_IN_NODE &&
53572 +                          cinfo->first_moved != MAX_POS_IN_NODE));
53573 +       assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
53574 +
53575 +       nh = node40_node_header(node);
53576 +       nr_items = nh40_get_num_items(nh);
53577 +
53578 +       /* remove gap made up by removal */
53579 +       memmove(zdata(node) + cinfo->freed_space_start,
53580 +               zdata(node) + cinfo->freed_space_end,
53581 +               nh40_get_free_space_start(nh) - cinfo->freed_space_end);
53582 +
53583 +       /* update item headers of moved items - change their locations */
53584 +       pos = cinfo->first_moved;
53585 +       ih = node40_ih_at(node, pos);
53586 +       if (cinfo->head_removed_location != MAX_POS_IN_NODE) { /* the item with removed head gets an explicit new offset instead of being shifted by @freed */
53587 +               assert("vs-1580", pos == cinfo->head_removed);
53588 +               ih40_set_offset(ih, cinfo->head_removed_location);
53589 +               pos++;
53590 +               ih--;
53591 +       }
53592 +
53593 +       freed = cinfo->freed_space_end - cinfo->freed_space_start;
53594 +       for (; pos < nr_items; pos++, ih--) { /* item headers are laid out at decreasing addresses, hence ih-- as pos grows */
53595 +               assert("vs-1581", ih == node40_ih_at(node, pos));
53596 +               ih40_set_offset(ih, ih40_get_offset(ih) - freed);
53597 +       }
53598 +
53599 +       /* free space start moved to left */
53600 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
53601 +
53602 +       if (cinfo->removed_count != MAX_POS_IN_NODE) {
53603 +               /* number of items changed. Remove item headers of those items */
53604 +               ih = node40_ih_at(node, nr_items - 1);
53605 +               memmove(ih + cinfo->removed_count, ih,
53606 +                       sizeof(item_header40) * (nr_items -
53607 +                                                cinfo->removed_count -
53608 +                                                cinfo->first_removed));
53609 +               freed += sizeof(item_header40) * cinfo->removed_count; /* space of the dropped item headers is freed as well */
53610 +               node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
53611 +       }
53612 +
53613 +       /* total amount of free space increased */
53614 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
53615 +}
53616 +
53617 +int shrink_item_node40(coord_t * coord, int delta) /* drop the last @delta bytes of the item at @coord: following item bodies move left by @delta and their offsets are decreased accordingly; always returns 0 */
53618 +{
53619 +       node40_header *nh;
53620 +       item_header40 *ih;
53621 +       pos_in_node_t pos;
53622 +       pos_in_node_t nr_items;
53623 +       char *end;
53624 +       znode *node;
53625 +       int off;
53626 +
53627 +       assert("nikita-3487", coord != NULL);
53628 +       assert("nikita-3488", delta >= 0);
53629 +
53630 +       node = coord->node;
53631 +       nh = node40_node_header(node);
53632 +       nr_items = nh40_get_num_items(nh);
53633 +
53634 +       ih = node40_ih_at_coord(coord);
53635 +       assert("nikita-3489", delta <= length_by_coord_node40(coord));
53636 +       off = ih40_get_offset(ih) + length_by_coord_node40(coord); /* offset of the first byte after the item */
53637 +       end = zdata(node) + off;
53638 +
53639 +       /* remove gap made up by removal */
53640 +       memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
53641 +
53642 +       /* update item headers of moved items - change their locations */
53643 +       pos = coord->item_pos + 1;
53644 +       ih = node40_ih_at(node, pos);
53645 +       for (; pos < nr_items; pos++, ih--) {
53646 +               assert("nikita-3490", ih == node40_ih_at(node, pos));
53647 +               ih40_set_offset(ih, ih40_get_offset(ih) - delta);
53648 +       }
53649 +
53650 +       /* free space start moved to left */
53651 +       nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
53652 +       /* total amount of free space increased */
53653 +       nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
53654 +       /*
53655 +        * This method does _not_ change the number of items. Hence, it cannot
53656 +        * make node empty. Also it doesn't remove items at all, which means
53657 +        * that no keys have to be updated either.
53658 +        */
53659 +       return 0;
53660 +}
53661 +
53662 +/* this is used by cut_node40 and kill_node40. It analyses input parameters and calculates cut mode. There are 2 types
53663 +   of cut. First is when a unit is removed from the middle of an item.  In this case this function returns 1. All the
53664 +   rest fits into second case: 0 or 1 item getting its tail cut, 0 or more items removed completely and 0 or 1 item
53665 +   getting its head cut. Function returns 0 in this case, with @cinfo->mode and the *_removed fields of @cinfo filled in */
53666 +static int
53667 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
53668 +{
53669 +       reiser4_key left_key, right_key;
53670 +       reiser4_key min_from_key, max_to_key;
53671 +       const reiser4_key *from_key, *to_key;
53672 +
53673 +       init_cinfo(cinfo);
53674 +
53675 +       /* calculate minimal key stored in first item of items to be cut (params->from) */
53676 +       item_key_by_coord(params->from, &min_from_key);
53677 +       /* and max key stored in last item of items to be cut (params->to) */
53678 +       max_item_key_by_coord(params->to, &max_to_key);
53679 +
53680 +       /* if cut key range is not defined in input parameters - define it using cut coord range */
53681 +       if (params->from_key == NULL) {
53682 +               assert("vs-1513", params->to_key == NULL);
53683 +               unit_key_by_coord(params->from, &left_key);
53684 +               from_key = &left_key;
53685 +               max_unit_key_by_coord(params->to, &right_key);
53686 +               to_key = &right_key;
53687 +       } else {
53688 +               from_key = params->from_key;
53689 +               to_key = params->to_key;
53690 +       }
53691 +
53692 +       if (params->from->item_pos == params->to->item_pos) { /* the cut range lies within a single item */
53693 +               if (keylt(&min_from_key, from_key)
53694 +                   && keylt(to_key, &max_to_key))
53695 +                       return 1; /* both ends of the item survive: cut from the middle */
53696 +
53697 +               if (keygt(from_key, &min_from_key)) {
53698 +                       /* tail of item is to be cut */
53699 +                       cinfo->tail_removed = params->from->item_pos;
53700 +                       cinfo->mode |= CMODE_TAIL;
53701 +               } else if (keylt(to_key, &max_to_key)) {
53702 +                       /* head of item is to be cut */
53703 +                       cinfo->head_removed = params->from->item_pos;
53704 +                       cinfo->mode |= CMODE_HEAD;
53705 +               } else {
53706 +                       /* item is removed completely */
53707 +                       cinfo->first_removed = params->from->item_pos;
53708 +                       cinfo->removed_count = 1;
53709 +                       cinfo->mode |= CMODE_WHOLE;
53710 +               }
53711 +       } else {
53712 +               cinfo->first_removed = params->from->item_pos + 1; /* start with the items strictly between @from and @to */
53713 +               cinfo->removed_count =
53714 +                   params->to->item_pos - params->from->item_pos - 1;
53715 +
53716 +               if (keygt(from_key, &min_from_key)) {
53717 +                       /* first item is not cut completely */
53718 +                       cinfo->tail_removed = params->from->item_pos;
53719 +                       cinfo->mode |= CMODE_TAIL;
53720 +               } else {
53721 +                       cinfo->first_removed--; /* first item is removed completely as well */
53722 +                       cinfo->removed_count++;
53723 +               }
53724 +               if (keylt(to_key, &max_to_key)) {
53725 +                       /* last item is not cut completely */
53726 +                       cinfo->head_removed = params->to->item_pos;
53727 +                       cinfo->mode |= CMODE_HEAD;
53728 +               } else {
53729 +                       cinfo->removed_count++; /* last item is removed completely as well */
53730 +               }
53731 +               if (cinfo->removed_count)
53732 +                       cinfo->mode |= CMODE_WHOLE;
53733 +       }
53734 +
53735 +       return 0;
53736 +}
53737 +
53738 +static void /* for each of @count items of @node starting at position @from, call the item plugin's kill_hook (if it has one) over all units of the item */
53739 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
53740 +               carry_kill_data * kdata)
53741 +{
53742 +       coord_t coord;
53743 +       item_plugin *iplug;
53744 +       pos_in_node_t pos;
53745 +
53746 +       coord.node = node;
53747 +       coord.unit_pos = 0;
53748 +       coord.between = AT_UNIT;
53749 +       for (pos = 0; pos < count; pos++) {
53750 +               coord_set_item_pos(&coord, from + pos);
53751 +               coord.unit_pos = 0;
53752 +               coord.between = AT_UNIT;
53753 +               iplug = item_plugin_by_coord(&coord);
53754 +               if (iplug->b.kill_hook) {
53755 +                       iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
53756 +                                          kdata);
53757 +               }
53758 +       }
53759 +}
53760 +
53761 +/* this is used to kill item partially: passes units @from..@to of the item at @coord to the item plugin's kill_units method (@data is a struct carry_kill_data) and returns that method's result */
53762 +static pos_in_node_t
53763 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53764 +          reiser4_key * smallest_removed, reiser4_key * new_first_key)
53765 +{
53766 +       struct carry_kill_data *kdata;
53767 +       item_plugin *iplug;
53768 +
53769 +       kdata = data;
53770 +       iplug = item_plugin_by_coord(coord);
53771 +
53772 +       assert("vs-1524", iplug->b.kill_units);
53773 +       return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
53774 +                                  new_first_key);
53775 +}
53776 +
53777 +/* call item plugin to kill tail of item: units from @coord->unit_pos up to the last unit of the item */
53778 +static pos_in_node_t
53779 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53780 +{
53781 +       struct carry_kill_data *kdata;
53782 +       pos_in_node_t to;
53783 +
53784 +       kdata = data;
53785 +       to = coord_last_unit_pos(coord);
53786 +       return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
53787 +                         NULL);
53788 +}
53789 +
53790 +/* call item plugin to kill head of item: units from 0 up to @coord->unit_pos */
53791 +static pos_in_node_t
53792 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53793 +         reiser4_key * new_first_key)
53794 +{
53795 +       return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
53796 +                         new_first_key);
53797 +}
53798 +
53799 +/* this is used to cut item partially: passes units @from..@to of the item at @coord to the item plugin's cut_units method (@data is a carry_cut_data) and returns that method's result */
53800 +static pos_in_node_t
53801 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
53802 +         reiser4_key * smallest_removed, reiser4_key * new_first_key)
53803 +{
53804 +       carry_cut_data *cdata;
53805 +       item_plugin *iplug;
53806 +
53807 +       cdata = data;
53808 +       iplug = item_plugin_by_coord(coord);
53809 +       assert("vs-302", iplug->b.cut_units);
53810 +       return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
53811 +                                 new_first_key);
53812 +}
53813 +
53814 +/* call item plugin to cut tail of item: units from @coord->unit_pos up to the last unit */
53815 +static pos_in_node_t
53816 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
53817 +{
53818 +       carry_cut_data *cdata;
53819 +       pos_in_node_t to;
53820 +
53821 +       cdata = data;
53822 +       to = coord_last_unit_pos(cdata->params.from); /* NOTE(review): last unit is taken from cdata->params.from rather than @coord (kill_tail uses @coord); prepare_for_compact passes params->from as @coord - confirm the two always coincide */
53823 +       return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
53824 +}
53825 +
53826 +/* call item plugin to cut head of item: units from 0 up to @coord->unit_pos */
53827 +static pos_in_node_t
53828 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
53829 +        reiser4_key * new_first_key)
53830 +{
53831 +       return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
53832 +                        new_first_key);
53833 +}
53834 +
53835 +/* this returns 1 if key of first item changed, 0 - if it did not */
53836 +static int
53837 +prepare_for_compact(struct cut40_info *cinfo,
53838 +                   const struct cut_kill_params *params, int is_cut,
53839 +                   void *data, carry_plugin_info * info)
53840 +{
53841 +       znode *node;
53842 +       item_header40 *ih;
53843 +       pos_in_node_t freed;
53844 +       pos_in_node_t item_pos;
53845 +       coord_t coord;
53846 +       reiser4_key new_first_key;
53847 +       pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
53848 +                                     void *, reiser4_key *, reiser4_key *);
53849 +       pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
53850 +       pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
53851 +                                    reiser4_key *);
53852 +       int retval;
53853 +
53854 +       retval = 0;
53855 +
53856 +       node = params->from->node;
53857 +
53858 +       assert("vs-184", node == params->to->node);
53859 +       assert("vs-312", !node_is_empty(node));
53860 +       assert("vs-297",
53861 +              coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
53862 +
53863 +       if (is_cut) {
53864 +               kill_units_f = cut_units;
53865 +               kill_tail_f = cut_tail;
53866 +               kill_head_f = cut_head;
53867 +       } else {
53868 +               kill_units_f = kill_units;
53869 +               kill_tail_f = kill_tail;
53870 +               kill_head_f = kill_head;
53871 +       }
53872 +
53873 +       if (parse_cut(cinfo, params) == 1) {
53874 +               /* cut from the middle of item */
53875 +               freed =
53876 +                   kill_units_f(params->from, params->from->unit_pos,
53877 +                                params->to->unit_pos, data,
53878 +                                params->smallest_removed, NULL);
53879 +
53880 +               item_pos = params->from->item_pos;
53881 +               ih = node40_ih_at(node, item_pos);
53882 +               cinfo->freed_space_start =
53883 +                   ih40_get_offset(ih) + node40_item_length(node,
53884 +                                                            item_pos) - freed;
53885 +               cinfo->freed_space_end = cinfo->freed_space_start + freed;
53886 +               cinfo->first_moved = item_pos + 1;
53887 +       } else {
53888 +               assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
53889 +                                  cinfo->first_removed != MAX_POS_IN_NODE ||
53890 +                                  cinfo->head_removed != MAX_POS_IN_NODE));
53891 +
53892 +               switch (cinfo->mode) {
53893 +               case CMODE_TAIL:
53894 +                       /* one item gets cut partially from its end */
53895 +                       assert("vs-1562",
53896 +                              cinfo->tail_removed == params->from->item_pos);
53897 +
53898 +                       freed =
53899 +                           kill_tail_f(params->from, data,
53900 +                                       params->smallest_removed);
53901 +
53902 +                       item_pos = cinfo->tail_removed;
53903 +                       ih = node40_ih_at(node, item_pos);
53904 +                       cinfo->freed_space_start =
53905 +                           ih40_get_offset(ih) + node40_item_length(node,
53906 +                                                                    item_pos) -
53907 +                           freed;
53908 +                       cinfo->freed_space_end =
53909 +                           cinfo->freed_space_start + freed;
53910 +                       cinfo->first_moved = cinfo->tail_removed + 1;
53911 +                       break;
53912 +
53913 +               case CMODE_WHOLE:
53914 +                       /* one or more items get removed completely */
53915 +                       assert("vs-1563",
53916 +                              cinfo->first_removed == params->from->item_pos);
53917 +                       assert("vs-1564", cinfo->removed_count > 0
53918 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
53919 +
53920 +                       /* call kill hook for all items removed completely */
53921 +                       if (is_cut == 0)
53922 +                               call_kill_hooks(node, cinfo->first_removed,
53923 +                                               cinfo->removed_count, data);
53924 +
53925 +                       item_pos = cinfo->first_removed;
53926 +                       ih = node40_ih_at(node, item_pos);
53927 +
53928 +                       if (params->smallest_removed)
53929 +                               memcpy(params->smallest_removed, &ih->key,
53930 +                                      sizeof(reiser4_key));
53931 +
53932 +                       cinfo->freed_space_start = ih40_get_offset(ih);
53933 +
53934 +                       item_pos += (cinfo->removed_count - 1);
53935 +                       ih -= (cinfo->removed_count - 1);
53936 +                       cinfo->freed_space_end =
53937 +                           ih40_get_offset(ih) + node40_item_length(node,
53938 +                                                                    item_pos);
53939 +                       cinfo->first_moved = item_pos + 1;
53940 +                       if (cinfo->first_removed == 0)
53941 +                               /* key of first item of the node changes */
53942 +                               retval = 1;
53943 +                       break;
53944 +
53945 +               case CMODE_HEAD:
53946 +                       /* one item gets cut partially from its head */
53947 +                       assert("vs-1565",
53948 +                              cinfo->head_removed == params->from->item_pos);
53949 +
53950 +                       freed =
53951 +                           kill_head_f(params->to, data,
53952 +                                       params->smallest_removed,
53953 +                                       &new_first_key);
53954 +
53955 +                       item_pos = cinfo->head_removed;
53956 +                       ih = node40_ih_at(node, item_pos);
53957 +                       cinfo->freed_space_start = ih40_get_offset(ih);
53958 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
53959 +                       cinfo->first_moved = cinfo->head_removed + 1;
53960 +
53961 +                       /* item head is removed, therefore, item key changed */
53962 +                       coord.node = node;
53963 +                       coord_set_item_pos(&coord, item_pos);
53964 +                       coord.unit_pos = 0;
53965 +                       coord.between = AT_UNIT;
53966 +                       update_item_key_node40(&coord, &new_first_key, NULL);
53967 +                       if (item_pos == 0)
53968 +                               /* key of first item of the node changes */
53969 +                               retval = 1;
53970 +                       break;
53971 +
53972 +               case CMODE_TAIL | CMODE_WHOLE:
53973 +                       /* one item gets cut from its end and one or more items get removed completely */
53974 +                       assert("vs-1566",
53975 +                              cinfo->tail_removed == params->from->item_pos);
53976 +                       assert("vs-1567",
53977 +                              cinfo->first_removed == cinfo->tail_removed + 1);
53978 +                       assert("vs-1564", cinfo->removed_count > 0
53979 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
53980 +
53981 +                       freed =
53982 +                           kill_tail_f(params->from, data,
53983 +                                       params->smallest_removed);
53984 +
53985 +                       item_pos = cinfo->tail_removed;
53986 +                       ih = node40_ih_at(node, item_pos);
53987 +                       cinfo->freed_space_start =
53988 +                           ih40_get_offset(ih) + node40_item_length(node,
53989 +                                                                    item_pos) -
53990 +                           freed;
53991 +
53992 +                       /* call kill hook for all items removed completely */
53993 +                       if (is_cut == 0)
53994 +                               call_kill_hooks(node, cinfo->first_removed,
53995 +                                               cinfo->removed_count, data);
53996 +
53997 +                       item_pos += cinfo->removed_count;
53998 +                       ih -= cinfo->removed_count;
53999 +                       cinfo->freed_space_end =
54000 +                           ih40_get_offset(ih) + node40_item_length(node,
54001 +                                                                    item_pos);
54002 +                       cinfo->first_moved = item_pos + 1;
54003 +                       break;
54004 +
54005 +               case CMODE_WHOLE | CMODE_HEAD:
54006 +                       /* one or more items get removed completely and one item gets cut partially from its head */
54007 +                       assert("vs-1568",
54008 +                              cinfo->first_removed == params->from->item_pos);
54009 +                       assert("vs-1564", cinfo->removed_count > 0
54010 +                              && cinfo->removed_count != MAX_POS_IN_NODE);
54011 +                       assert("vs-1569",
54012 +                              cinfo->head_removed ==
54013 +                              cinfo->first_removed + cinfo->removed_count);
54014 +
54015 +                       /* call kill hook for all items removed completely */
54016 +                       if (is_cut == 0)
54017 +                               call_kill_hooks(node, cinfo->first_removed,
54018 +                                               cinfo->removed_count, data);
54019 +
54020 +                       item_pos = cinfo->first_removed;
54021 +                       ih = node40_ih_at(node, item_pos);
54022 +
54023 +                       if (params->smallest_removed)
54024 +                               memcpy(params->smallest_removed, &ih->key,
54025 +                                      sizeof(reiser4_key));
54026 +
54027 +                       freed =
54028 +                           kill_head_f(params->to, data, NULL, &new_first_key);
54029 +
54030 +                       cinfo->freed_space_start = ih40_get_offset(ih);
54031 +
54032 +                       ih = node40_ih_at(node, cinfo->head_removed);
54033 +                       /* this is the most complex case. Item which got head removed and items which are to be moved
54034 +                          intact change their location differently. */
54035 +                       cinfo->freed_space_end = ih40_get_offset(ih) + freed;
54036 +                       cinfo->first_moved = cinfo->head_removed;
54037 +                       cinfo->head_removed_location = cinfo->freed_space_start;
54038 +
54039 +                       /* item head is removed, therefore, item key changed */
54040 +                       coord.node = node;
54041 +                       coord_set_item_pos(&coord, cinfo->head_removed);
54042 +                       coord.unit_pos = 0;
54043 +                       coord.between = AT_UNIT;
54044 +                       update_item_key_node40(&coord, &new_first_key, NULL);
54045 +
54046 +                       assert("vs-1579", cinfo->first_removed == 0);
54047 +                       /* key of first item of the node changes */
54048 +                       retval = 1;
54049 +                       break;
54050 +
54051 +               case CMODE_TAIL | CMODE_HEAD:
54052 +                       /* one item get cut from its end and its neighbor gets cut from its tail */
54053 +                       impossible("vs-1576", "this can not happen currently");
54054 +                       break;
54055 +
54056 +               case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
54057 +                       impossible("vs-1577", "this can not happen currently");
54058 +                       break;
54059 +               default:
54060 +                       impossible("vs-1578", "unexpected cut mode");
54061 +                       break;
54062 +               }
54063 +       }
54064 +       return retval;
54065 +}
54066 +
54067 +/* plugin->u.node.kill
54068 +   return value is number of items removed completely (0 when cinfo.removed_count is MAX_POS_IN_NODE) */
54069 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
54070 +{
54071 +       znode *node;
54072 +       struct cut40_info cinfo;
54073 +       int first_key_changed;
54074 +
54075 +       node = kdata->params.from->node;
54076 +       /* item-level kill first, then compact() the node using what was recorded in @cinfo */
54077 +       first_key_changed =
54078 +           prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
54079 +                               info);
54080 +       compact(node, &cinfo);
54081 +
54082 +       if (info) {
54083 +               /* it is not called by node40_shift, so we have to take care
54084 +                  of changes on upper levels */
54085 +               if (node_is_empty(node)
54086 +                   && !(kdata->flags & DELETE_RETAIN_EMPTY))
54087 +                       /* all contents of node is deleted */
54088 +                       prepare_removal_node40(node, info);
54089 +               else if (first_key_changed) {
54090 +                       prepare_for_update(NULL, node, info);
54091 +               }
54092 +       }
54093 +
54094 +       coord_clear_iplug(kdata->params.from);
54095 +       coord_clear_iplug(kdata->params.to);
54096 +
54097 +       znode_make_dirty(node);
54098 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54099 +}
54100 +
54101 +/* plugin->u.node.cut
54102 +   return value is number of items removed completely (0 when cinfo.removed_count is MAX_POS_IN_NODE) */
54103 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
54104 +{
54105 +       znode *node;
54106 +       struct cut40_info cinfo;
54107 +       int first_key_changed;
54108 +
54109 +       node = cdata->params.from->node;
54110 +       /* item-level cut first, then compact() the node using what was recorded in @cinfo */
54111 +       first_key_changed =
54112 +           prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
54113 +                               info);
54114 +       compact(node, &cinfo);
54115 +
54116 +       if (info) {
54117 +               /* it is not called by node40_shift, so we have to take care
54118 +                  of changes on upper levels */
54119 +               if (node_is_empty(node))
54120 +                       /* all contents of node is deleted */
54121 +                       prepare_removal_node40(node, info);
54122 +               else if (first_key_changed) {
54123 +                       prepare_for_update(NULL, node, info);
54124 +               }
54125 +       }
54126 +
54127 +       coord_clear_iplug(cdata->params.from);
54128 +       coord_clear_iplug(cdata->params.to);
54129 +
54130 +       znode_make_dirty(node);
54131 +       return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
54132 +}
54133 +
54134 +/* this structure is used by shift method of node40 plugin */
54135 +struct shift_params {
54136 +       shift_direction pend;   /* when @pend == SHIFT_LEFT (append) - we are shifting to
54137 +                                  left, when @pend == SHIFT_RIGHT (prepend) - to right */
54138 +       coord_t wish_stop;      /* when shifting to left this is last unit we
54139 +                                  want shifted, when shifting to right - this
54140 +                                  is set to unit we want to start shifting
54141 +                                  from */
54142 +       znode *target;          /* node the units are shifted into */
54143 +       int everything;         /* it is set to 1 if everything we have to shift is
54144 +                                  shifted, 0 - otherwise */
54145 +
54146 +       /* FIXME-VS: get rid of real_stop */
54147 +
54148 +       /* these are set by estimate_shift */
54149 +       coord_t real_stop;      /* this will be set to last unit which will be
54150 +                                  really shifted */
54151 +
54152 +       /* coordinate in source node before operation of unit which becomes
54153 +          first after shift to left or last after shift to right */
54154 +       union {
54155 +               coord_t future_first;
54156 +               coord_t future_last;
54157 +       } u;
54158 +
54159 +       unsigned merging_units; /* number of units of first item which have to
54160 +                                  be merged with last item of target node */
54161 +       unsigned merging_bytes; /* number of bytes in those units */
54162 +
54163 +       unsigned entire;        /* items shifted in their entirety */
54164 +       unsigned entire_bytes;  /* number of bytes in those items */
54165 +
54166 +       unsigned part_units;    /* number of units of partially copied item */
54167 +       unsigned part_bytes;    /* number of bytes in those units */
54168 +
54169 +       unsigned shift_bytes;   /* total number of bytes in items shifted (item
54170 +                                  headers not included) */
54171 +
54172 +};
54173 +/* overhead of creating an item in the node of @item, as reported by the node plugin's item_overhead method */
54174 +static int item_creation_overhead(coord_t *item)
54175 +{
54176 +       return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
54177 +}
54178 +
54179 +/* how many units are there in @source starting from source->unit_pos
54180 +   but not further than @stop_coord; all units of @source when @stop_coord is in another item */
54181 +static int
54182 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
54183 +{
54184 +       if (pend == SHIFT_LEFT) {
54185 +               assert("vs-181", source->unit_pos == 0);
54186 +       } else {
54187 +               assert("vs-182",
54188 +                      source->unit_pos == coord_last_unit_pos(source));
54189 +       }
54190 +
54191 +       if (source->item_pos != stop_coord->item_pos) {
54192 +               /* @source and @stop_coord are different items */
54193 +               return coord_last_unit_pos(source) + 1;
54194 +       }
54195 +       /* same item: count units from the shifting edge through @stop_coord->unit_pos */
54196 +       if (pend == SHIFT_LEFT) {
54197 +               return stop_coord->unit_pos + 1;
54198 +       } else {
54199 +               return source->unit_pos - stop_coord->unit_pos + 1;
54200 +       }
54201 +}
54202 +
54203 +/* this calculates what can be copied from @shift->wish_stop.node to
54204 +   @shift->target. shift_bytes, entire and entire_bytes are only added to here - presumably the caller zeroes *shift first (TODO confirm). @ctx is not used */
54205 +static void
54206 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
54207 +{
54208 +       unsigned target_free_space, size;
54209 +       pos_in_node_t stop_item;        /* item which estimating should not consider */
54210 +       unsigned want;          /* number of units of item we want shifted */
54211 +       coord_t source;         /* item being estimated */
54212 +       item_plugin *iplug;
54213 +
54214 +       /* shifting to left/right starts from first/last units of
54215 +          @shift->wish_stop.node */
54216 +       if (shift->pend == SHIFT_LEFT) {
54217 +               coord_init_first_unit(&source, shift->wish_stop.node);
54218 +       } else {
54219 +               coord_init_last_unit(&source, shift->wish_stop.node);
54220 +       }
54221 +       shift->real_stop = source;
54222 +
54223 +       /* free space in target node and number of items in source */
54224 +       target_free_space = znode_free_space(shift->target);
54225 +
54226 +       shift->everything = 0;
54227 +       if (!node_is_empty(shift->target)) {
54228 +               /* target node is not empty, check for boundary items
54229 +                  mergeability */
54230 +               coord_t to;
54231 +
54232 +               /* item we try to merge @source with */
54233 +               if (shift->pend == SHIFT_LEFT) {
54234 +                       coord_init_last_unit(&to, shift->target);
54235 +               } else {
54236 +                       coord_init_first_unit(&to, shift->target);
54237 +               }
54238 +
54239 +               if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
54240 +                                                                     &source) :
54241 +                   are_items_mergeable(&source, &to)) {
54242 +                       /* how many units of @source do we want to merge to
54243 +                          item @to */
54244 +                       want =
54245 +                           wanted_units(&source, &shift->wish_stop,
54246 +                                        shift->pend);
54247 +
54248 +                       /* how many units of @source we can merge to item
54249 +                          @to */
54250 +                       iplug = item_plugin_by_coord(&source);
54251 +                       if (iplug->b.can_shift != NULL)
54252 +                               shift->merging_units =
54253 +                                   iplug->b.can_shift(target_free_space,
54254 +                                                      &source, shift->target,
54255 +                                                      shift->pend, &size,
54256 +                                                      want);
54257 +                       else {
54258 +                               shift->merging_units = 0;
54259 +                               size = 0;
54260 +                       }
54261 +                       shift->merging_bytes = size;
54262 +                       shift->shift_bytes += size;
54263 +                       /* update stop coord to be set to last unit of @source
54264 +                          we can merge to @target */
54265 +                       if (shift->merging_units)
54266 +                               /* at least one unit can be shifted; the multiplication by shift->pend presumably relies on SHIFT_LEFT == 1 and SHIFT_RIGHT == -1 (confirm against shift_direction) */
54267 +                               shift->real_stop.unit_pos =
54268 +                                   (shift->merging_units - source.unit_pos -
54269 +                                    1) * shift->pend;
54270 +                       else {
54271 +                               /* nothing can be shifted */
54272 +                               if (shift->pend == SHIFT_LEFT)
54273 +                                       coord_init_before_first_item(&shift->
54274 +                                                                    real_stop,
54275 +                                                                    source.
54276 +                                                                    node);
54277 +                               else
54278 +                                       coord_init_after_last_item(&shift->
54279 +                                                                  real_stop,
54280 +                                                                  source.node);
54281 +                       }
54282 +                       assert("nikita-2081", shift->real_stop.unit_pos + 1);
54283 +
54284 +                       if (shift->merging_units != want) {
54285 +                               /* we could not copy as many as we want, so,
54286 +                                  there is no reason for estimating any
54287 +                                  longer */
54288 +                               return;
54289 +                       }
54290 +
54291 +                       target_free_space -= size;
54292 +                       coord_add_item_pos(&source, shift->pend);
54293 +               }
54294 +       }
54295 +
54296 +       /* position of the first item nothing of which we want to shift */
54297 +       stop_item = shift->wish_stop.item_pos + shift->pend;
54298 +
54299 +       /* calculate how many items can be copied into given free
54300 +          space as whole */
54301 +       for (; source.item_pos != stop_item;
54302 +            coord_add_item_pos(&source, shift->pend)) {
54303 +               if (shift->pend == SHIFT_RIGHT)
54304 +                       source.unit_pos = coord_last_unit_pos(&source);
54305 +
54306 +               /* how many units of @source do we want to copy */
54307 +               want = wanted_units(&source, &shift->wish_stop, shift->pend);
54308 +
54309 +               if (want == coord_last_unit_pos(&source) + 1) {
54310 +                       /* we want this item to be copied entirely */
54311 +                       size =
54312 +                           item_length_by_coord(&source) +
54313 +                           item_creation_overhead(&source);
54314 +                       if (size <= target_free_space) {
54315 +                               /* item fits into target node as whole */
54316 +                               target_free_space -= size;
54317 +                               shift->shift_bytes +=
54318 +                                   size - item_creation_overhead(&source);
54319 +                               shift->entire_bytes +=
54320 +                                   size - item_creation_overhead(&source);
54321 +                               shift->entire++;
54322 +
54323 +                               /* update shift->real_stop coord to be set to
54324 +                                  last unit of @source we can merge to
54325 +                                  @target */
54326 +                               shift->real_stop = source;
54327 +                               if (shift->pend == SHIFT_LEFT)
54328 +                                       shift->real_stop.unit_pos =
54329 +                                           coord_last_unit_pos(&shift->
54330 +                                                               real_stop);
54331 +                               else
54332 +                                       shift->real_stop.unit_pos = 0;
54333 +                               continue;
54334 +                       }
54335 +               }
54336 +
54337 +               /* we reach here only for an item which does not fit into
54338 +                  target node in its entirety. This item may be either
54339 +                  partially shifted, or not shifted at all. We will have to
54340 +                  create new item in target node, so decrease amount of free
54341 +                  space by an item creation overhead. We can reach here also
54342 +                  if stop coord is in this item */
54343 +               if (target_free_space >=
54344 +                   (unsigned)item_creation_overhead(&source)) {
54345 +                       target_free_space -= item_creation_overhead(&source);
54346 +                       iplug = item_plugin_by_coord(&source);
54347 +                       if (iplug->b.can_shift) {
54348 +                               shift->part_units = iplug->b.can_shift(target_free_space,
54349 +                                                                      &source,
54350 +                                                                      NULL, /* target */
54351 +                                                                      shift->pend,
54352 +                                                                      &size,
54353 +                                                                      want);
54354 +                       } else {
54355 +                               target_free_space = 0;
54356 +                               shift->part_units = 0;
54357 +                               size = 0;
54358 +                       }
54359 +               } else {
54360 +                       target_free_space = 0;
54361 +                       shift->part_units = 0;
54362 +                       size = 0;
54363 +               }
54364 +               shift->part_bytes = size;
54365 +               shift->shift_bytes += size;
54366 +
54367 +               /* set @shift->real_stop to last unit of @source we can merge
54368 +                  to @shift->target */
54369 +               if (shift->part_units) {
54370 +                       shift->real_stop = source;
54371 +                       shift->real_stop.unit_pos =
54372 +                           (shift->part_units - source.unit_pos -
54373 +                            1) * shift->pend;
54374 +                       assert("nikita-2082", shift->real_stop.unit_pos + 1);
54375 +               }
54376 +
54377 +               if (want != shift->part_units)
54378 +                       /* not everything wanted was shifted */
54379 +                       return;
54380 +               break;
54381 +       }
54382 +
54383 +       shift->everything = 1;
54384 +}
54385 +
54386 +static void
54387 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
54388 +          shift_direction dir, unsigned free_space)
54389 +{
54390 +       item_plugin *iplug;
54391 +       /* copies @count units of item @source starting at unit @from into item @target */
54392 +       assert("nikita-1463", target != NULL);
54393 +       assert("nikita-1464", source != NULL);
54394 +       assert("nikita-1465", from + count <= coord_num_units(source));
54395 +       /* the item plugin of @source (asserted to be the plugin of @target as well) does the actual copying */
54396 +       iplug = item_plugin_by_coord(source);
54397 +       assert("nikita-1468", iplug == item_plugin_by_coord(target));
54398 +       iplug->b.copy_units(target, source, from, count, dir, free_space);
54399 +
54400 +       if (dir == SHIFT_RIGHT) {
54401 +               /* FIXME-VS: this looks not necessary. update_item_key was
54402 +                  called already by copy_units method */
54403 +               reiser4_key split_key;
54404 +
54405 +               assert("nikita-1469", target->unit_pos == 0);
54406 +               /* re-set the key of item @target to the key of its unit at target->unit_pos (asserted to be 0) */
54407 +               unit_key_by_coord(target, &split_key);
54408 +               node_plugin_by_coord(target)->update_item_key(target,
54409 +                                                             &split_key, NULL);
54410 +       }
54411 +}
54412 +
54413 +/* copy part of @shift->real_stop.node starting either from its beginning or
54414 +   from its end and ending at @shift->real_stop to either the end or the
54415 +   beginning of @shift->target */
54416 +static void copy(struct shift_params *shift)
54417 +{
54418 +       node40_header *nh;
54419 +       coord_t from;
54420 +       coord_t to;
54421 +       item_header40 *from_ih, *to_ih;
54422 +       int free_space_start;
54423 +       int new_items;
54424 +       unsigned old_items;
54425 +       int old_offset;
54426 +       unsigned i;
54427 +
54428 +       nh = node40_node_header(shift->target);
54429 +       free_space_start = nh40_get_free_space_start(nh);
54430 +       old_items = nh40_get_num_items(nh);
54431 +       new_items = shift->entire + (shift->part_units ? 1 : 0);
54432 +       assert("vs-185",
54433 +              shift->shift_bytes ==
54434 +              shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
54435 +
54436 +       from = shift->wish_stop;
54437 +
54438 +       coord_init_first_unit(&to, shift->target);
54439 +
54440 +       /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
54441 +          hence to.between is set to EMPTY_NODE above. Looks like we want it
54442 +          to be AT_UNIT.
54443 +
54444 +          Oh, wonders of ->betweeness...
54445 +
54446 +        */
54447 +       to.between = AT_UNIT;
54448 +
54449 +       if (shift->pend == SHIFT_LEFT) {
54450 +               /* copying to left */
54451 +
54452 +               coord_set_item_pos(&from, 0);
54453 +               from_ih = node40_ih_at(from.node, 0);
54454 +
54455 +               coord_set_item_pos(&to,
54456 +                                  node40_num_of_items_internal(to.node) - 1);
54457 +               if (shift->merging_units) {
54458 +                       /* expand last item, so that plugin methods will see
54459 +                          correct data */
54460 +                       free_space_start += shift->merging_bytes;
54461 +                       nh40_set_free_space_start(nh,
54462 +                                                 (unsigned)free_space_start);
54463 +                       nh40_set_free_space(nh,
54464 +                                           nh40_get_free_space(nh) -
54465 +                                           shift->merging_bytes);
54466 +
54467 +                       /* appending last item of @target */
54468 +                       copy_units(&to, &from, 0,       /* starting from 0-th unit */
54469 +                                  shift->merging_units, SHIFT_LEFT,
54470 +                                  shift->merging_bytes);
54471 +                       coord_inc_item_pos(&from);
54472 +                       from_ih--;
54473 +                       coord_inc_item_pos(&to);
54474 +               }
54475 +
54476 +               to_ih = node40_ih_at(shift->target, old_items);
54477 +               if (shift->entire) {
54478 +                       /* copy @entire items entirely */
54479 +
54480 +                       /* copy item headers */
54481 +                       memcpy(to_ih - shift->entire + 1,
54482 +                              from_ih - shift->entire + 1,
54483 +                              shift->entire * sizeof(item_header40));
54484 +                       /* update item header offset */
54485 +                       old_offset = ih40_get_offset(from_ih);
54486 +                       /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
54487 +                       for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
54488 +                               ih40_set_offset(to_ih,
54489 +                                               ih40_get_offset(from_ih) -
54490 +                                               old_offset + free_space_start);
54491 +
54492 +                       /* copy item bodies */
54493 +                       memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset,  /*ih40_get_offset (from_ih), */
54494 +                              shift->entire_bytes);
54495 +
54496 +                       coord_add_item_pos(&from, (int)shift->entire);
54497 +                       coord_add_item_pos(&to, (int)shift->entire);
54498 +               }
54499 +
54500 +               nh40_set_free_space_start(nh,
54501 +                                         free_space_start +
54502 +                                         shift->shift_bytes -
54503 +                                         shift->merging_bytes);
54504 +               nh40_set_free_space(nh,
54505 +                                   nh40_get_free_space(nh) -
54506 +                                   (shift->shift_bytes - shift->merging_bytes +
54507 +                                    sizeof(item_header40) * new_items));
54508 +
54509 +               /* update node header */
54510 +               node40_set_num_items(shift->target, nh, old_items + new_items);
54511 +               assert("vs-170",
54512 +                      nh40_get_free_space(nh) < znode_size(shift->target));
54513 +
54514 +               if (shift->part_units) {
54515 +                       /* copy heading part (@part units) of @source item as
54516 +                          a new item into @target->node */
54517 +
54518 +                       /* copy item header of partially copied item */
54519 +                       coord_set_item_pos(&to,
54520 +                                          node40_num_of_items_internal(to.node)
54521 +                                          - 1);
54522 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
54523 +                       ih40_set_offset(to_ih,
54524 +                                       nh40_get_free_space_start(nh) -
54525 +                                       shift->part_bytes);
54526 +                       if (item_plugin_by_coord(&to)->b.init)
54527 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
54528 +                                                                 NULL);
54529 +                       copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
54530 +                                  shift->part_bytes);
54531 +               }
54532 +
54533 +       } else {
54534 +               /* copying to right */
54535 +
54536 +               coord_set_item_pos(&from,
54537 +                                  node40_num_of_items_internal(from.node) - 1);
54538 +               from_ih = node40_ih_at_coord(&from);
54539 +
54540 +               coord_set_item_pos(&to, 0);
54541 +
54542 +               /* prepare space for new items */
54543 +               memmove(zdata(to.node) + sizeof(node40_header) +
54544 +                       shift->shift_bytes,
54545 +                       zdata(to.node) + sizeof(node40_header),
54546 +                       free_space_start - sizeof(node40_header));
54547 +               /* update item headers of moved items */
54548 +               to_ih = node40_ih_at(to.node, 0);
54549 +               /* first item gets @merging_bytes longer. free space appears
54550 +                  at its beginning */
54551 +               if (!node_is_empty(to.node))
54552 +                       ih40_set_offset(to_ih,
54553 +                                       ih40_get_offset(to_ih) +
54554 +                                       shift->shift_bytes -
54555 +                                       shift->merging_bytes);
54556 +
54557 +               for (i = 1; i < old_items; i++)
54558 +                       ih40_set_offset(to_ih - i,
54559 +                                       ih40_get_offset(to_ih - i) +
54560 +                                       shift->shift_bytes);
54561 +
54562 +               /* move item headers to make space for new items */
54563 +               memmove(to_ih - old_items + 1 - new_items,
54564 +                       to_ih - old_items + 1,
54565 +                       sizeof(item_header40) * old_items);
54566 +               to_ih -= (new_items - 1);
54567 +
54568 +               nh40_set_free_space_start(nh,
54569 +                                         free_space_start +
54570 +                                         shift->shift_bytes);
54571 +               nh40_set_free_space(nh,
54572 +                                   nh40_get_free_space(nh) -
54573 +                                   (shift->shift_bytes +
54574 +                                    sizeof(item_header40) * new_items));
54575 +
54576 +               /* update node header */
54577 +               node40_set_num_items(shift->target, nh, old_items + new_items);
54578 +               assert("vs-170",
54579 +                      nh40_get_free_space(nh) < znode_size(shift->target));
54580 +
54581 +               if (shift->merging_units) {
54582 +                       coord_add_item_pos(&to, new_items);
54583 +                       to.unit_pos = 0;
54584 +                       to.between = AT_UNIT;
54585 +                       /* prepend first item of @to */
54586 +                       copy_units(&to, &from,
54587 +                                  coord_last_unit_pos(&from) -
54588 +                                  shift->merging_units + 1,
54589 +                                  shift->merging_units, SHIFT_RIGHT,
54590 +                                  shift->merging_bytes);
54591 +                       coord_dec_item_pos(&from);
54592 +                       from_ih++;
54593 +               }
54594 +
54595 +               if (shift->entire) {
54596 +                       /* copy @entire items entirely */
54597 +
54598 +                       /* copy item headers */
54599 +                       memcpy(to_ih, from_ih,
54600 +                              shift->entire * sizeof(item_header40));
54601 +
54602 +                       /* update item header offset */
54603 +                       old_offset =
54604 +                           ih40_get_offset(from_ih + shift->entire - 1);
54605 +                       /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
54606 +                       for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
54607 +                               ih40_set_offset(to_ih,
54608 +                                               ih40_get_offset(from_ih) -
54609 +                                               old_offset +
54610 +                                               sizeof(node40_header) +
54611 +                                               shift->part_bytes);
54612 +                       /* copy item bodies */
54613 +                       coord_add_item_pos(&from, -(int)(shift->entire - 1));
54614 +                       memcpy(zdata(to.node) + sizeof(node40_header) +
54615 +                              shift->part_bytes, item_by_coord_node40(&from),
54616 +                              shift->entire_bytes);
54617 +                       coord_dec_item_pos(&from);
54618 +               }
54619 +
54620 +               if (shift->part_units) {
54621 +                       coord_set_item_pos(&to, 0);
54622 +                       to.unit_pos = 0;
54623 +                       to.between = AT_UNIT;
54624 +                       /* copy heading part (@part units) of @source item as
54625 +                          a new item into @target->node */
54626 +
54627 +                       /* copy item header of partially copied item */
54628 +                       memcpy(to_ih, from_ih, sizeof(item_header40));
54629 +                       ih40_set_offset(to_ih, sizeof(node40_header));
54630 +                       if (item_plugin_by_coord(&to)->b.init)
54631 +                               item_plugin_by_coord(&to)->b.init(&to, &from,
54632 +                                                                 NULL);
54633 +                       copy_units(&to, &from,
54634 +                                  coord_last_unit_pos(&from) -
54635 +                                  shift->part_units + 1, shift->part_units,
54636 +                                  SHIFT_RIGHT, shift->part_bytes);
54637 +               }
54638 +       }
54639 +}
54640 +
54641 +/* remove everything up to (shift to left) or starting from (shift to right)
54642 +   @shift->real_stop. Number of items removed completely is returned */
54643 +static int delete_copied(struct shift_params *shift)
54644 +{
54645 +       coord_t from;
54646 +       coord_t to;
54647 +       struct carry_cut_data cdata;
54648 +
54649 +       if (shift->pend == SHIFT_LEFT) {
54650 +               /* we were shifting to left, remove everything from the
54651 +                  beginning of @shift->real_stop.node up to
54652 +                  @shift->real_stop */
54653 +               coord_init_first_unit(&from, shift->real_stop.node);
54654 +               to = shift->real_stop;
54655 +
54656 +               /* store old coordinate of unit which will be first after
54657 +                  shift to left: the unit next to @to */
54658 +               shift->u.future_first = to;
54659 +               coord_next_unit(&shift->u.future_first);
54660 +       } else {
54661 +               /* we were shifting to right, remove everything from
54662 +                  @shift->real_stop up to the end of
54663 +                  @shift->real_stop.node */
54664 +               from = shift->real_stop;
54665 +               coord_init_last_unit(&to, from.node);
54666 +
54667 +               /* store old coordinate of unit which will be last after
54668 +                  shift to right: the unit previous to @from */
54669 +               shift->u.future_last = from;
54670 +               coord_prev_unit(&shift->u.future_last);
54671 +       }
54672 +
54673 +       cdata.params.from = &from;
54674 +       cdata.params.to = &to;
54675 +       cdata.params.from_key = NULL;
54676 +       cdata.params.to_key = NULL;
54677 +       cdata.params.smallest_removed = NULL;
54678 +       return cut_node40(&cdata, NULL);
54679 +}
54680 +
54681 +/* something was moved between @left and @right. Post COP_UPDATE to @info (if
54682 +   not NULL) to have carry update their delimiting key. Returns 0 or -errno */
54683 +static int
54684 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
54685 +{
54686 +       carry_op *op;
54687 +       carry_node *cn;
54688 +
54689 +       if (info == NULL)
54690 +               /* nowhere to send operation to. */
54691 +               return 0;
54692 +
54693 +       if (!should_notify_parent(right))
54694 +               return 0;
54695 +
54696 +       op = node_post_carry(info, COP_UPDATE, right, 1);
54697 +       if (IS_ERR(op) || op == NULL)
54698 +               return op ? PTR_ERR(op) : -EIO; /* NULL @op is reported as -EIO */
54699 +
54700 +       if (left != NULL) {
54701 +               carry_node *reference;
54702 +
54703 +               if (info->doing)
54704 +                       reference = insert_carry_node(info->doing,
54705 +                                                     info->todo, left);
54706 +               else
54707 +                       reference = op->node;
54708 +               assert("nikita-2992", reference != NULL);
54709 +               cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
54710 +               if (IS_ERR(cn))
54711 +                       return PTR_ERR(cn);
54712 +               cn->parent = 1;
54713 +               cn->node = left;
54714 +               if (ZF_ISSET(left, JNODE_ORPHAN))
54715 +                       cn->left_before = 1;
54716 +               op->u.update.left = cn; /* carry node standing for @left */
54717 +       } else
54718 +               op->u.update.left = NULL;
54719 +       return 0;
54720 +}
54721 +
54722 +/* plugin->u.node.prepare_removal
54723 +   to delete a pointer to @empty from the tree add corresponding carry
54724 +   operation (COP_DELETE) to @info list. Returns 0 or negative error code */
54725 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
54726 +{
54727 +       carry_op *op;
54728 +       reiser4_tree *tree;
54729 +
54730 +       if (!should_notify_parent(empty))
54731 +               return 0;
54732 +       /* already on a road to Styx: flag is set at the end of this function */
54733 +       if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
54734 +               return 0;
54735 +       op = node_post_carry(info, COP_DELETE, empty, 1);
54736 +       if (IS_ERR(op) || op == NULL)
54737 +               return RETERR(op ? PTR_ERR(op) : -EIO);
54738 +
54739 +       op->u.delete.child = NULL;
54740 +       op->u.delete.flags = 0;
54741 +
54742 +       /* fare thee well: collapse key range of @empty onto its right key */
54743 +       tree = znode_get_tree(empty);
54744 +       read_lock_tree(tree);
54745 +       write_lock_dk(tree);
54746 +       znode_set_ld_key(empty, znode_get_rd_key(empty));
54747 +       if (znode_is_left_connected(empty) && empty->left)
54748 +               znode_set_rd_key(empty->left, znode_get_rd_key(empty));
54749 +       write_unlock_dk(tree);
54750 +       read_unlock_tree(tree);
54751 +
54752 +       ZF_SET(empty, JNODE_HEARD_BANSHEE);
54753 +       return 0;
54754 +}
54755 +
54756 +/* something was shifted from @insert_coord->node to @shift->target, update
54757 +   @insert_coord correspondingly. @removed: number of items shifted entirely */
54758 +static void
54759 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
54760 +            int including_insert_coord)
54761 +{
54762 +       /* item plugin was invalidated by shifting */
54763 +       coord_clear_iplug(insert_coord);
54764 +
54765 +       if (node_is_empty(shift->wish_stop.node)) {
54766 +               assert("vs-242", shift->everything);
54767 +               if (including_insert_coord) {
54768 +                       if (shift->pend == SHIFT_RIGHT) {
54769 +                               /* set @insert_coord before first unit of
54770 +                                  @shift->target node */
54771 +                               coord_init_before_first_item(insert_coord,
54772 +                                                            shift->target);
54773 +                       } else {
54774 +                               /* set @insert_coord after last in target node */
54775 +                               coord_init_after_last_item(insert_coord,
54776 +                                                          shift->target);
54777 +                       }
54778 +               } else {
54779 +                       /* set @insert_coord inside of empty node. There is
54780 +                          only one possible coord within an empty
54781 +                          node. init_first_unit will set that coord */
54782 +                       coord_init_first_unit(insert_coord,
54783 +                                             shift->wish_stop.node);
54784 +               }
54785 +               return;
54786 +       }
54787 +
54788 +       if (shift->pend == SHIFT_RIGHT) {
54789 +               /* there was shifting to right */
54790 +               if (shift->everything) {
54791 +                       /* everything wanted was shifted */
54792 +                       if (including_insert_coord) {
54793 +                               /* @insert_coord is set before first unit of
54794 +                                  @shift->target node */
54795 +                               coord_init_before_first_item(insert_coord,
54796 +                                                            shift->target);
54797 +                               insert_coord->between = BEFORE_UNIT;
54798 +                       } else {
54799 +                               /* @insert_coord is set after last unit of
54800 +                                  @shift->wish_stop.node */
54801 +                               coord_init_last_unit(insert_coord,
54802 +                                                    shift->wish_stop.node);
54803 +                               insert_coord->between = AFTER_UNIT;
54804 +                       }
54805 +               }
54806 +               return; /* otherwise position of @insert_coord is left as is */
54807 +       }
54808 +
54809 +       /* there was shifting to left */
54810 +       if (shift->everything) {
54811 +               /* everything wanted was shifted */
54812 +               if (including_insert_coord) {
54813 +                       /* @insert_coord is set after last unit in target */
54814 +                       coord_init_after_last_item(insert_coord, shift->target);
54815 +               } else {
54816 +                       /* @insert_coord is set before first unit in the same
54817 +                          node */
54818 +                       coord_init_before_first_item(insert_coord,
54819 +                                                    shift->wish_stop.node);
54820 +               }
54821 +               return;
54822 +       }
54823 +
54824 +       /* FIXME-VS: the code below is complicated because with between ==
54825 +          AFTER_ITEM unit_pos is set to 0 */
54826 +
54827 +       if (!removed) {
54828 +               /* no items were shifted entirely */
54829 +               assert("vs-195", shift->merging_units == 0
54830 +                      || shift->part_units == 0);
54831 +
54832 +               if (shift->real_stop.item_pos == insert_coord->item_pos) {
54833 +                       if (shift->merging_units) {
54834 +                               if (insert_coord->between == AFTER_UNIT) {
54835 +                                       assert("nikita-1441",
54836 +                                              insert_coord->unit_pos >=
54837 +                                              shift->merging_units);
54838 +                                       insert_coord->unit_pos -=
54839 +                                           shift->merging_units;
54840 +                               } else if (insert_coord->between == BEFORE_UNIT) {
54841 +                                       assert("nikita-2090",
54842 +                                              insert_coord->unit_pos >
54843 +                                              shift->merging_units);
54844 +                                       insert_coord->unit_pos -=
54845 +                                           shift->merging_units;
54846 +                               }
54847 +
54848 +                               assert("nikita-2083",
54849 +                                      insert_coord->unit_pos + 1);
54850 +                       } else {
54851 +                               if (insert_coord->between == AFTER_UNIT) {
54852 +                                       assert("nikita-1442",
54853 +                                              insert_coord->unit_pos >=
54854 +                                              shift->part_units);
54855 +                                       insert_coord->unit_pos -=
54856 +                                           shift->part_units;
54857 +                               } else if (insert_coord->between == BEFORE_UNIT) {
54858 +                                       assert("nikita-2089",
54859 +                                              insert_coord->unit_pos >
54860 +                                              shift->part_units);
54861 +                                       insert_coord->unit_pos -=
54862 +                                           shift->part_units;
54863 +                               }
54864 +
54865 +                               assert("nikita-2084",
54866 +                                      insert_coord->unit_pos + 1);
54867 +                       }
54868 +               }
54869 +               return;
54870 +       }
54871 +
54872 +       /* we shifted to left and there was not enough space for everything */
54873 +       switch (insert_coord->between) {
54874 +       case AFTER_UNIT:
54875 +       case BEFORE_UNIT:
54876 +               if (shift->real_stop.item_pos == insert_coord->item_pos)
54877 +                       insert_coord->unit_pos -= shift->part_units;
54878 +       case AFTER_ITEM:        /* unit cases above fall through to here */
54879 +               coord_add_item_pos(insert_coord, -removed);
54880 +               break;
54881 +       default:
54882 +               impossible("nikita-2087", "not ready");
54883 +       }
54884 +       assert("nikita-2085", insert_coord->unit_pos + 1);
54885 +}
54886 +
54887 +static int call_shift_hooks(struct shift_params *shift)
54888 +{
54889 +       unsigned i, shifted;
54890 +       coord_t coord;
54891 +       item_plugin *iplug;
54892 +
54893 +       assert("vs-275", !node_is_empty(shift->target));
54894 +       /* call ->b.shift_hook (where defined) for every item of @shift->target
54895 +          the shift touched; 0 is returned. Number of items shift touches */
54896 +       shifted =
54897 +           shift->entire + (shift->merging_units ? 1 : 0) +
54898 +           (shift->part_units ? 1 : 0);
54899 +
54900 +       if (shift->pend == SHIFT_LEFT) {
54901 +               /* moved items are at the end */
54902 +               coord_init_last_unit(&coord, shift->target);
54903 +               coord.unit_pos = 0;
54904 +
54905 +               assert("vs-279", shift->pend == 1);
54906 +               for (i = 0; i < shifted; i++) {
54907 +                       unsigned from, count;
54908 +
54909 +                       iplug = item_plugin_by_coord(&coord);
54910 +                       if (i == 0 && shift->part_units) {
54911 +                               assert("vs-277",
54912 +                                      coord_num_units(&coord) ==
54913 +                                      shift->part_units);
54914 +                               count = shift->part_units; /* new item */
54915 +                               from = 0;
54916 +                       } else if (i == shifted - 1 && shift->merging_units) {
54917 +                               count = shift->merging_units; /* merged item */
54918 +                               from = coord_num_units(&coord) - count;
54919 +                       } else {
54920 +                               count = coord_num_units(&coord); /* whole item */
54921 +                               from = 0;
54922 +                       }
54923 +
54924 +                       if (iplug->b.shift_hook) {
54925 +                               iplug->b.shift_hook(&coord, from, count,
54926 +                                                   shift->wish_stop.node);
54927 +                       }
54928 +                       coord_add_item_pos(&coord, -shift->pend); /* step back */
54929 +               }
54930 +       } else {
54931 +               /* moved items are at the beginning */
54932 +               coord_init_first_unit(&coord, shift->target);
54933 +
54934 +               assert("vs-278", shift->pend == -1);
54935 +               for (i = 0; i < shifted; i++) {
54936 +                       unsigned from, count;
54937 +
54938 +                       iplug = item_plugin_by_coord(&coord);
54939 +                       if (i == 0 && shift->part_units) {
54940 +                               assert("vs-277",
54941 +                                      coord_num_units(&coord) ==
54942 +                                      shift->part_units);
54943 +                               count = coord_num_units(&coord); /* new item */
54944 +                               from = 0;
54945 +                       } else if (i == shifted - 1 && shift->merging_units) {
54946 +                               count = shift->merging_units; /* merged item */
54947 +                               from = 0;
54948 +                       } else {
54949 +                               count = coord_num_units(&coord); /* whole item */
54950 +                               from = 0;
54951 +                       }
54952 +
54953 +                       if (iplug->b.shift_hook) {
54954 +                               iplug->b.shift_hook(&coord, from, count,
54955 +                                                   shift->wish_stop.node);
54956 +                       }
54957 +                       coord_add_item_pos(&coord, -shift->pend); /* step forward */
54958 +               }
54959 +       }
54960 +
54961 +       return 0;
54962 +}
54963 +
54964 +/* shift to left is done. Return 1 if unit @old went to the left neighbor */
54965 +static int
54966 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
54967 +{
54968 +       const coord_t *stop = &shift->real_stop;
54969 +
54970 +       assert("vs-944", stop->node == old->node);
54971 +       /* @old moved unless it lies after @stop in (item, unit) order */
54972 +       if (old->item_pos > stop->item_pos)
54973 +               return 0;
54974 +       if (old->item_pos < stop->item_pos)
54975 +               return 1;
54976 +       return old->unit_pos <= stop->unit_pos;
54977 +}
54978 +
54979 +/* shift to right is done. Return 1 if unit @old went to the right
54980 +   neighbor, 0 if it stayed in its node */
54981 +static int
54982 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
54983 +{
54984 +       const coord_t *stop = &shift->real_stop;
54985 +
54986 +       assert("vs-944", stop->node == old->node);
54987 +       /* @old moved unless it lies before @stop in (item, unit) order */
54988 +       if (old->item_pos < stop->item_pos)
54989 +               return 0;
54990 +       if (old->item_pos > stop->item_pos)
54991 +               return 1;
54992 +       return old->unit_pos >= stop->unit_pos;
54993 +}
54994 +
54995 +/* coord @old is set in a node which took part in the shift described by
54996 +   @shift. Store in @new where @old is after that shift, and return @new */
54997 +static coord_t *adjust_coord2(const struct shift_params *shift,
54998 +                             const coord_t * old, coord_t * new)
54999 +{
55000 +       /* item plugin possibly cached in @new is of no use for the result */
55001 +       coord_clear_iplug(new);
55002 +       new->between = old->between;
55003 +
55004 +       if (old->node == shift->target) {
55005 +               if (shift->pend == SHIFT_LEFT) {
55006 +                       /* coord which is set inside of left neighbor does not
55007 +                          change during shift to left */
55008 +                       coord_dup(new, old);
55009 +                       return new;
55010 +               }
55011 +               new->node = old->node;
55012 +               coord_set_item_pos(new,
55013 +                                  old->item_pos + shift->entire +
55014 +                                  (shift->part_units ? 1 : 0));
55015 +               new->unit_pos = old->unit_pos;
55016 +               if (old->item_pos == 0 && shift->merging_units)
55017 +                       new->unit_pos += shift->merging_units;
55018 +               return new;
55019 +       }
55020 +
55021 +       assert("vs-977", old->node == shift->wish_stop.node);
55022 +       if (shift->pend == SHIFT_LEFT) {
55023 +               if (unit_moved_left(shift, old)) {
55024 +                       /* unit @old moved to left neighbor. Calculate its
55025 +                          coordinate there */
55026 +                       new->node = shift->target;
55027 +                       coord_set_item_pos(new,
55028 +                                          node_num_items(shift->target) -
55029 +                                          shift->entire -
55030 +                                          (shift->part_units ? 1 : 0) +
55031 +                                          old->item_pos);
55032 +
55033 +                       new->unit_pos = old->unit_pos;
55034 +                       if (shift->merging_units) {
55035 +                               coord_dec_item_pos(new);
55036 +                               if (old->item_pos == 0) {
55037 +                                       /* unit_pos only changes if item got
55038 +                                          merged */
55039 +                                       new->unit_pos =
55040 +                                           coord_num_units(new) -
55041 +                                           (shift->merging_units -
55042 +                                            old->unit_pos);
55043 +                               }
55044 +                       }
55045 +               } else {
55046 +                       /* unit @old did not move to left neighbor.
55047 +
55048 +                          Use _nocheck, because @old is outside of its node.
55049 +                        */
55050 +                       coord_dup_nocheck(new, old);
55051 +                       coord_add_item_pos(new,
55052 +                                          -shift->u.future_first.item_pos);
55053 +                       if (new->item_pos == 0)
55054 +                               new->unit_pos -= shift->u.future_first.unit_pos;
55055 +               }
55056 +       } else {
55057 +               if (unit_moved_right(shift, old)) {
55058 +                       /* unit @old moved to right neighbor */
55059 +                       new->node = shift->target;
55060 +                       coord_set_item_pos(new,
55061 +                                          old->item_pos -
55062 +                                          shift->real_stop.item_pos);
55063 +                       new->unit_pos = old->unit_pos;
55064 +                       if (new->item_pos == 0) {
55065 +                               /* only units of the item @shift->real_stop
55066 +                                  is in might change unit pos */
55067 +                               new->unit_pos -= shift->real_stop.unit_pos;
55068 +                       }
55069 +               } else {
55070 +                       /* unit @old did not move to right neighbor, therefore
55071 +                          it did not change */
55072 +                       coord_dup(new, old);
55073 +               }
55074 +       }
55075 +       coord_set_iplug(new, item_plugin_by_coord(new));
55076 +       return new;
55077 +}
55078 +
55079 +/* called once shift is completed (units of source node were copied to
55080 +   target and deleted in source): re-point every tap of current context
55081 +   which is set to either of the two nodes */
55082 +static void update_taps(const struct shift_params *shift)
55083 +{
55084 +       coord_t adjusted;
55085 +       tap_t *cur;
55086 +
55087 +       for_all_taps(cur) {
55088 +               /* taps set to nodes not participating in shift stay put */
55089 +               if (cur->coord->node != shift->wish_stop.node &&
55090 +                   cur->coord->node != shift->target)
55091 +                       continue;
55092 +               tap_to_coord(cur, adjust_coord2(shift, cur->coord, &adjusted));
55093 +       }
55094 +}
55095 +
55096 +#if REISER4_DEBUG
55097 +
55098 +struct shift_check {    /* per-item record filled by shift_check_prepare() */
55099 +       reiser4_key key;        /* key copied from item header */
55100 +       __u16 plugin_id;        /* item plugin id read from item header */
55101 +       union {
55102 +               __u64 bytes;    /* CTAIL_ID, FORMATTING_ID, EXTENT_POINTER_ID */
55103 +               __u64 entries;  /* COMPOUND_DIR_ID: number of units */
55104 +               void *unused;   /* items of any other plugin: set to NULL */
55105 +       } u;
55106 +};
55107 +/* Debug aid: snapshot key, plugin id and size of each item of @left and @right into an array for shift_check() */
55108 +void *shift_check_prepare(const znode * left, const znode * right)
55109 +{
55110 +       pos_in_node_t i, nr_items;
55111 +       int mergeable;          /* non-zero if last item of @left is mergeable with first item of @right */
55112 +       struct shift_check *data;
55113 +       item_header40 *ih;
55114 +
55115 +       if (node_is_empty(left) || node_is_empty(right))
55116 +               mergeable = 0;
55117 +       else {
55118 +               coord_t l, r;
55119 +
55120 +               coord_init_last_unit(&l, left);
55121 +               coord_init_first_unit(&r, right);
55122 +               mergeable = are_items_mergeable(&l, &r);
55123 +       }
55124 +       nr_items =              /* a mergeable boundary pair is recorded as a single item */
55125 +           node40_num_of_items_internal(left) +
55126 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55127 +       data =
55128 +               kmalloc(sizeof(struct shift_check) * nr_items,
55129 +                       reiser4_ctx_gfp_mask_get());
55130 +       if (data != NULL) {
55131 +               coord_t coord;
55132 +               pos_in_node_t item_pos;
55133 +
55134 +               coord_init_first_unit(&coord, left);
55135 +               i = 0;
55136 +               /* first pass: one record per item of @left */
55137 +               for (item_pos = 0;
55138 +                    item_pos < node40_num_of_items_internal(left);
55139 +                    item_pos++) {
55140 +
55141 +                       coord_set_item_pos(&coord, item_pos);
55142 +                       ih = node40_ih_at_coord(&coord);
55143 +
55144 +                       data[i].key = ih->key;
55145 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55146 +                       switch (data[i].plugin_id) {
55147 +                       case CTAIL_ID:
55148 +                       case FORMATTING_ID:
55149 +                               data[i].u.bytes = coord_num_units(&coord);
55150 +                               break;
55151 +                       case EXTENT_POINTER_ID:
55152 +                               data[i].u.bytes =
55153 +                                       reiser4_extent_size(&coord,
55154 +                                                      coord_num_units(&coord));
55155 +                               break;
55156 +                       case COMPOUND_DIR_ID:
55157 +                               data[i].u.entries = coord_num_units(&coord);
55158 +                               break;
55159 +                       default:
55160 +                               data[i].u.unused = NULL;
55161 +                               break;
55162 +                       }
55163 +                       i++;
55164 +               }
55165 +
55166 +               coord_init_first_unit(&coord, right);
55167 +               /* mergeable: add size of first item of @right to the record of last item of @left */
55168 +               if (mergeable) {
55169 +                       assert("vs-1609", i != 0);
55170 +
55171 +                       ih = node40_ih_at_coord(&coord);
55172 +
55173 +                       assert("vs-1589",
55174 +                              data[i - 1].plugin_id ==
55175 +                              le16_to_cpu(get_unaligned(&ih->plugin_id)));
55176 +                       switch (data[i - 1].plugin_id) {
55177 +                       case CTAIL_ID:
55178 +                       case FORMATTING_ID:
55179 +                               data[i - 1].u.bytes += coord_num_units(&coord);
55180 +                               break;
55181 +                       case EXTENT_POINTER_ID:
55182 +                               data[i - 1].u.bytes +=
55183 +                                   reiser4_extent_size(&coord,
55184 +                                               coord_num_units(&coord));
55185 +                               break;
55186 +                       case COMPOUND_DIR_ID:
55187 +                               data[i - 1].u.entries +=
55188 +                                   coord_num_units(&coord);
55189 +                               break;
55190 +                       default:
55191 +                               impossible("vs-1605", "wrong mergeable item");
55192 +                               break;
55193 +                       }
55194 +                       item_pos = 1;   /* first item of @right is already accounted for */
55195 +               } else
55196 +                       item_pos = 0;
55197 +               for (; item_pos < node40_num_of_items_internal(right);
55198 +                    item_pos++) {
55199 +                       /* second pass: one record per remaining item of @right */
55200 +                       assert("vs-1604", i < nr_items);
55201 +                       coord_set_item_pos(&coord, item_pos);
55202 +                       ih = node40_ih_at_coord(&coord);
55203 +
55204 +                       data[i].key = ih->key;
55205 +                       data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
55206 +                       switch (data[i].plugin_id) {
55207 +                       case CTAIL_ID:
55208 +                       case FORMATTING_ID:
55209 +                               data[i].u.bytes = coord_num_units(&coord);
55210 +                               break;
55211 +                       case EXTENT_POINTER_ID:
55212 +                               data[i].u.bytes =
55213 +                                   reiser4_extent_size(&coord,
55214 +                                               coord_num_units(&coord));
55215 +                               break;
55216 +                       case COMPOUND_DIR_ID:
55217 +                               data[i].u.entries = coord_num_units(&coord);
55218 +                               break;
55219 +                       default:
55220 +                               data[i].u.unused = NULL;
55221 +                               break;
55222 +                       }
55223 +                       i++;
55224 +               }
55225 +               assert("vs-1606", i == nr_items);
55226 +       }
55227 +       return data;            /* NULL if kmalloc() failed; otherwise freed by shift_check() */
55228 +}
55229 +/* Debug aid: assert that items of @left and @right match the snapshot @vp made by shift_check_prepare(), then free @vp */
55230 +void shift_check(void *vp, const znode * left, const znode * right)
55231 +{
55232 +       pos_in_node_t i, nr_items;
55233 +       coord_t coord;
55234 +       __u64 last_bytes;       /* size of last item of @left when it is mergeable with first item of @right */
55235 +       int mergeable;
55236 +       item_header40 *ih;
55237 +       pos_in_node_t item_pos;
55238 +       struct shift_check *data;
55239 +
55240 +       data = (struct shift_check *)vp;
55241 +
55242 +       if (data == NULL)       /* no snapshot to compare against */
55243 +               return;
55244 +
55245 +       if (node_is_empty(left) || node_is_empty(right))
55246 +               mergeable = 0;
55247 +       else {
55248 +               coord_t l, r;
55249 +
55250 +               coord_init_last_unit(&l, left);
55251 +               coord_init_first_unit(&r, right);
55252 +               mergeable = are_items_mergeable(&l, &r);
55253 +       }
55254 +
55255 +       nr_items =              /* a mergeable boundary pair counts as a single item */
55256 +           node40_num_of_items_internal(left) +
55257 +           node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
55258 +
55259 +       i = 0;
55260 +       last_bytes = 0;
55261 +
55262 +       coord_init_first_unit(&coord, left);
55263 +       /* first pass: items of @left against records 0 .. number of items in @left - 1 */
55264 +       for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
55265 +            item_pos++) {
55266 +
55267 +               coord_set_item_pos(&coord, item_pos);
55268 +               ih = node40_ih_at_coord(&coord);
55269 +
55270 +               assert("vs-1611", i == item_pos);
55271 +               assert("vs-1590", keyeq(&ih->key, &data[i].key));
55272 +               assert("vs-1591",
55273 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55274 +               if ((i < (node40_num_of_items_internal(left) - 1))
55275 +                   || !mergeable) {    /* item's size is compared on its own */
55276 +                       switch (data[i].plugin_id) {
55277 +                       case CTAIL_ID:
55278 +                       case FORMATTING_ID:
55279 +                               assert("vs-1592",
55280 +                                      data[i].u.bytes ==
55281 +                                      coord_num_units(&coord));
55282 +                               break;
55283 +                       case EXTENT_POINTER_ID:
55284 +                               assert("vs-1593",
55285 +                                      data[i].u.bytes ==
55286 +                                      reiser4_extent_size(&coord,
55287 +                                                          coord_num_units
55288 +                                                          (&coord)));
55289 +                               break;
55290 +                       case COMPOUND_DIR_ID:
55291 +                               assert("vs-1594",
55292 +                                      data[i].u.entries ==
55293 +                                      coord_num_units(&coord));
55294 +                               break;
55295 +                       default:
55296 +                               break;
55297 +                       }
55298 +               }
55299 +               if (item_pos == (node40_num_of_items_internal(left) - 1)
55300 +                   && mergeable) {     /* boundary item: remember its size, it is checked with first item of @right */
55301 +                       switch (data[i].plugin_id) {
55302 +                       case CTAIL_ID:
55303 +                       case FORMATTING_ID:
55304 +                               last_bytes = coord_num_units(&coord);
55305 +                               break;
55306 +                       case EXTENT_POINTER_ID:
55307 +                               last_bytes =
55308 +                                   reiser4_extent_size(&coord,
55309 +                                               coord_num_units(&coord));
55310 +                               break;
55311 +                       case COMPOUND_DIR_ID:
55312 +                               last_bytes = coord_num_units(&coord);
55313 +                               break;
55314 +                       default:
55315 +                               impossible("vs-1595", "wrong mergeable item");
55316 +                               break;
55317 +                       }
55318 +               }
55319 +               i++;
55320 +       }
55321 +
55322 +       coord_init_first_unit(&coord, right);
55323 +       if (mergeable) {        /* recorded size must equal size of last item of @left plus first item of @right */
55324 +               ih = node40_ih_at_coord(&coord);
55325 +
55326 +               assert("vs-1589",
55327 +                      data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
55328 +               assert("vs-1608", last_bytes != 0);
55329 +               switch (data[i - 1].plugin_id) {
55330 +               case CTAIL_ID:
55331 +               case FORMATTING_ID:
55332 +                       assert("vs-1596",
55333 +                              data[i - 1].u.bytes ==
55334 +                              last_bytes + coord_num_units(&coord));
55335 +                       break;
55336 +
55337 +               case EXTENT_POINTER_ID:
55338 +                       assert("vs-1597",
55339 +                              data[i - 1].u.bytes ==
55340 +                              last_bytes + reiser4_extent_size(&coord,
55341 +                                                               coord_num_units
55342 +                                                               (&coord)));
55343 +                       break;
55344 +
55345 +               case COMPOUND_DIR_ID:   /* directory items are recorded in u.entries by shift_check_prepare() */
55346 +                       assert("vs-1598",
55347 +                              data[i - 1].u.entries ==
55348 +                              last_bytes + coord_num_units(&coord));
55349 +                       break;
55350 +               default:
55351 +                       impossible("vs-1599", "wrong mergeable item");
55352 +                       break;
55353 +               }
55354 +               item_pos = 1;   /* first item of @right has just been checked */
55355 +       } else
55356 +               item_pos = 0;
55357 +
55358 +       for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
55359 +               /* second pass: remaining items of @right against the following records */
55360 +               coord_set_item_pos(&coord, item_pos);
55361 +               ih = node40_ih_at_coord(&coord);
55362 +
55363 +               assert("vs-1612", keyeq(&ih->key, &data[i].key));
55364 +               assert("vs-1613",
55365 +                      le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
55366 +               switch (data[i].plugin_id) {
55367 +               case CTAIL_ID:
55368 +               case FORMATTING_ID:
55369 +                       assert("vs-1600",
55370 +                              data[i].u.bytes == coord_num_units(&coord));
55371 +                       break;
55372 +               case EXTENT_POINTER_ID:
55373 +                       assert("vs-1601",
55374 +                              data[i].u.bytes ==
55375 +                              reiser4_extent_size(&coord,
55376 +                                                  coord_num_units
55377 +                                                  (&coord)));
55378 +                       break;
55379 +               case COMPOUND_DIR_ID:
55380 +                       assert("vs-1602",
55381 +                              data[i].u.entries == coord_num_units(&coord));
55382 +                       break;
55383 +               default:
55384 +                       break;
55385 +               }
55386 +               i++;
55387 +       }
55388 +
55389 +       assert("vs-1603", i == nr_items);
55390 +       kfree(data);
55391 +}
55392 +
55393 +#endif
55394 +
55395 +/* plugin->u.node.shift: shift units from @from->node into @to. Returns non-zero result of prepare_for_update() or
55396 +   prepare_removal_node40() if any, else number of bytes shifted (0 if none). See also plugin/node/node.h */
55397 +int shift_node40(coord_t * from, znode * to, shift_direction pend, int delete_child,   /* if @from->node becomes empty - it will be
55398 +                                                                                          deleted from the tree if this is set to 1 */
55399 +                int including_stop_coord, carry_plugin_info * info)
55400 +{
55401 +       struct shift_params shift;
55402 +       int result;
55403 +       znode *left, *right;
55404 +       znode *source;
55405 +       int target_empty;
55406 +
55407 +       assert("nikita-2161", coord_check(from));
55408 +
55409 +       memset(&shift, 0, sizeof(shift));
55410 +       shift.pend = pend;
55411 +       shift.wish_stop = *from;
55412 +       shift.target = to;
55413 +
55414 +       assert("nikita-1473", znode_is_write_locked(from->node));
55415 +       assert("nikita-1474", znode_is_write_locked(to));
55416 +
55417 +       source = from->node;
55418 +
55419 +       /* set @shift.wish_stop to rightmost/leftmost unit among units we want
55420 +          shifted */
55421 +       if (pend == SHIFT_LEFT) {
55422 +               result = coord_set_to_left(&shift.wish_stop);
55423 +               left = to;
55424 +               right = from->node;
55425 +       } else {
55426 +               result = coord_set_to_right(&shift.wish_stop);
55427 +               left = from->node;
55428 +               right = to;
55429 +       }
55430 +
55431 +       if (result) {
55432 +               /* move insertion coord even if there is nothing to move */
55433 +               if (including_stop_coord) {
55434 +                       /* move insertion coord (@from) */
55435 +                       if (pend == SHIFT_LEFT) {
55436 +                               /* after last item in target node */
55437 +                               coord_init_after_last_item(from, to);
55438 +                       } else {
55439 +                               /* before first item in target node */
55440 +                               coord_init_before_first_item(from, to);
55441 +                       }
55442 +               }
55443 +
55444 +               if (delete_child && node_is_empty(shift.wish_stop.node))
55445 +                       result =
55446 +                           prepare_removal_node40(shift.wish_stop.node, info);
55447 +               else
55448 +                       result = 0;
55449 +               /* there is nothing to shift */
55450 +               assert("nikita-2078", coord_check(from));
55451 +               return result;
55452 +       }
55453 +
55454 +       target_empty = node_is_empty(to);
55455 +
55456 +       /* when first node plugin with item body compression is implemented,
55457 +          this must be changed to call node specific plugin */
55458 +
55459 +       /* shift->stop_coord is updated to last unit which really will be
55460 +          shifted */
55461 +       estimate_shift(&shift, get_current_context());
55462 +       if (!shift.shift_bytes) {
55463 +               /* we could not shift anything */
55464 +               assert("nikita-2079", coord_check(from));
55465 +               return 0;
55466 +       }
55467 +
55468 +       copy(&shift);
55469 +
55470 +       /* result value of this is important. It is used by adjust_coord below */
55471 +       result = delete_copied(&shift);
55472 +
55473 +       assert("vs-1610", result >= 0);
55474 +       assert("vs-1471",
55475 +              ((reiser4_context *) current->journal_info)->magic ==
55476 +              context_magic);
55477 +
55478 +       /* item which has been moved from one node to another might want to do
55479 +          something on that event. This can be done by item's shift_hook
55480 +          method, which will be now called for every moved items */
55481 +       call_shift_hooks(&shift);
55482 +
55483 +       assert("vs-1472",
55484 +              ((reiser4_context *) current->journal_info)->magic ==
55485 +              context_magic);
55486 +
55487 +       update_taps(&shift);
55488 +
55489 +       assert("vs-1473",
55490 +              ((reiser4_context *) current->journal_info)->magic ==
55491 +              context_magic);
55492 +
55493 +       /* adjust @from pointer in accordance with @including_stop_coord flag
55494 +          and amount of data which was really shifted */
55495 +       adjust_coord(from, &shift, result, including_stop_coord);
55496 +
55497 +       /* items were shifted into empty node: update delimiting key. @result
55498 +          still holds the value of delete_copied() here, so restart from 0
55499 +          and keep an error of this call from being overwritten below */
55500 +       result = target_empty ? prepare_for_update(NULL, left, info) : 0;
55501 +
55502 +       /* add update operation to @info, which is the list of operations to
55503 +          be performed on a higher level */
55504 +       if (!result)
55505 +               result = prepare_for_update(left, right, info);
55506 +       if (!result && node_is_empty(source) && delete_child) {
55507 +               /* all contents of @from->node is moved to @to and @from->node
55508 +                  has to be removed from the tree, so, on higher level we
55509 +                  will be removing the pointer to node @from->node */
55510 +               result = prepare_removal_node40(source, info);
55511 +       }
55512 +       assert("nikita-2080", coord_check(from));
55513 +       return result ? result : (int)shift.shift_bytes;
55514 +}
55515 +
55516 +/* plugin->u.node.fast_insert(): node40 returns 1 for every @coord, which is not examined.
55517 +   Look for description of this method in plugin/node/node.h */
55518 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55519 +{
55520 +       return 1;
55521 +}
55522 +
55523 +/* plugin->u.node.fast_paste(): node40 returns 1 for every @coord, which is not examined.
55524 +   Look for description of this method in plugin/node/node.h */
55525 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55526 +{
55527 +       return 1;
55528 +}
55529 +
55530 +/* plugin->u.node.fast_cut(): node40 returns 1 for every @coord, which is not examined.
55531 +   Look for description of this method in plugin/node/node.h */
55532 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
55533 +{
55534 +       return 1;
55535 +}
55536 +
55537 +/* plugin->u.node.modify - not defined */
55538 +
55539 +/* plugin->u.node.max_item_size: block size less one node header and one item header */
55540 +int max_item_size_node40(void)
55541 +{
55542 +       return reiser4_get_current_sb()->s_blocksize -
55543 +           (sizeof(node40_header) + sizeof(item_header40));
55544 +}
55545 +
55546 +/* plugin->u.node.set_item_plugin: store @id in the item header at @coord and in @coord itself; returns 0 */
55547 +int set_item_plugin_node40(coord_t *coord, item_id id)
55548 +{
55549 +       item_header40 *header = node40_ih_at_coord(coord);
55550 +
55551 +       /* header field is written little-endian through an unaligned store */
55552 +       put_unaligned(cpu_to_le16(id), &header->plugin_id);
55553 +       coord->iplugid = id;
55554 +       return 0;
55555 +}
55556 +
55557 +/*
55558 +   Local variables:
55559 +   c-indentation-style: "K&R"
55560 +   mode-name: "LC"
55561 +   c-basic-offset: 8
55562 +   tab-width: 8
55563 +   fill-column: 120
55564 +   scroll-step: 1
55565 +   End:
55566 +*/
55567 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/node/node40.h linux-2.6.27/fs/reiser4/plugin/node/node40.h
55568 --- linux-2.6.27.orig/fs/reiser4/plugin/node/node40.h   1970-01-01 03:00:00.000000000 +0300
55569 +++ linux-2.6.27/fs/reiser4/plugin/node/node40.h        2008-10-12 18:20:01.000000000 +0400
55570 @@ -0,0 +1,125 @@
55571 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55572 +
55573 +#if !defined( __REISER4_NODE40_H__ )
55574 +#define __REISER4_NODE40_H__
55575 +
55576 +#include "../../forward.h"
55577 +#include "../../dformat.h"
55578 +#include "node.h"
55579 +
55580 +#include <linux/types.h>
55581 +
55582 +/* format of node header for 40 node layouts. Keep bloat out of this struct.  */
55583 +typedef struct node40_header {
55584 +       /* identifier of node plugin. Must be located at the very beginning
55585 +          of a node. */
55586 +       common_node_header common_header;       /* this is 16 bits */
55587 +       /* number of items. Should be first element in the node header
55588 +          (after common_header), because we haven't yet finally decided
55589 +          whether it shouldn't go into common_header.
55590 +        */
55591 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
55592 + * node format at compile time, and it is this one, accesses do not function dereference when
55593 + * accessing these fields (and otherwise they do).  Probably 80% of users will only have one node format at a time throughout the life of reiser4.  */
55594 +       d16 nr_items;
55595 +       /* free space in node measured in bytes */
55596 +       d16 free_space;
55597 +       /* offset to start of free space in node */
55598 +       d16 free_space_start;
55599 +       /* for reiser4_fsck.  When information about what is a free
55600 +          block is corrupted, and we try to recover everything even
55601 +          if marked as freed, then old versions of data may
55602 +          duplicate newer versions, and this field allows us to
55603 +          restore the newer version.  Also useful for when users
55604 +          who don't have the new trashcan installed on their linux distro
55605 +          delete the wrong files and send us desperate emails
55606 +          offering $25 for them back.  */
55607 +
55608 +       /* magic value used to recognize formatted nodes. NIKITA-FIXME-HANS: improve this comment */
55609 +       d32 magic;
55610 +       /* flushstamp is made of mk_id and write_counter. mk_id is an
55611 +          id generated randomly at mkreiserfs time. So we can just
55612 +          skip all nodes with different mk_id. write_counter is d64
55613 +          incrementing counter of writes on disk. It is used for
55614 +          choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
55615 +       /* NOTE(review): mk_id and write_counter above look like the mkfs_id and flush_id fields below - confirm */
55616 +       d32 mkfs_id;
55617 +       d64 flush_id;
55618 +       /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
55619 +          and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
55620 +       d16 flags;
55621 +
55622 +       /* 1 is leaf level, 2 is twig level, root is the numerically
55623 +          largest level */
55624 +       d8 level;
55625 +
55626 +       d8 pad;         /* presumably explicit padding of the PACKED header - TODO confirm */
55627 +} PACKED node40_header;
55628 +
55629 +/* node40 item header. Item headers are not standard across all node layouts, pass
55630 +   pos_in_node to functions instead. Leading numbers below look like byte offsets of the fields - TODO confirm */
55631 +typedef struct item_header40 {
55632 +       /* key of item */
55633 +       /*  0 */ reiser4_key key;
55634 +       /* offset from start of a node measured in 8-byte chunks */
55635 +       /* 24 */ d16 offset;
55636 +       /* 26 */ d16 flags;
55637 +       /* 28 */ d16 plugin_id; /* item plugin id; set_item_plugin_node40() stores it with cpu_to_le16() */
55638 +} PACKED item_header40;
55639 +/* node40 implementations of node plugin methods; node.c installs them into node_plugins[NODE40_ID] */
55640 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
55641 +size_t free_space_node40(znode * node);
55642 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
55643 +                                lookup_bias bias, coord_t * coord);
55644 +int num_of_items_node40(const znode * node);
55645 +char *item_by_coord_node40(const coord_t * coord);
55646 +int length_by_coord_node40(const coord_t * coord);
55647 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
55648 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
55649 +size_t estimate_node40(znode * node);
55650 +int check_node40(const znode * node, __u32 flags, const char **error);
55651 +int parse_node40(znode * node);
55652 +int init_node40(znode * node);
55653 +#ifdef GUESS_EXISTS
55654 +int guess_node40(const znode * node);
55655 +#endif
55656 +void change_item_size_node40(coord_t * coord, int by);
55657 +int create_item_node40(coord_t * target, const reiser4_key * key,
55658 +                      reiser4_item_data * data, carry_plugin_info * info);
55659 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
55660 +                           carry_plugin_info * info);
55661 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);        /* installed as ->cut_and_kill */
55662 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
55663 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
55664 +                /* if @from->node becomes
55665 +                   empty - it will be deleted from
55666 +                   the tree if this is set to 1
55667 +                 */
55668 +                int delete_child, int including_stop_coord,
55669 +                carry_plugin_info * info);
55670 +
55671 +int fast_insert_node40(const coord_t * coord);
55672 +int fast_paste_node40(const coord_t * coord);
55673 +int fast_cut_node40(const coord_t * coord);
55674 +int max_item_size_node40(void);
55675 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
55676 +int set_item_plugin_node40(coord_t * coord, item_id id);
55677 +int shrink_item_node40(coord_t * coord, int delta);
55678 +
55679 +#if REISER4_DEBUG      /* debug-only helpers: snapshot items of two nodes, then verify them against the snapshot */
55680 +void *shift_check_prepare(const znode *left, const znode *right);
55681 +void shift_check(void *vp, const znode *left, const znode *right);
55682 +#endif
55683 +
55684 +/* __REISER4_NODE40_H__ */
55685 +#endif
55686 +/*
55687 +   Local variables:
55688 +   c-indentation-style: "K&R"
55689 +   mode-name: "LC"
55690 +   c-basic-offset: 8
55691 +   tab-width: 8
55692 +   fill-column: 120
55693 +   scroll-step: 1
55694 +   End:
55695 +*/
55696 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/node/node.c linux-2.6.27/fs/reiser4/plugin/node/node.c
55697 --- linux-2.6.27.orig/fs/reiser4/plugin/node/node.c     1970-01-01 03:00:00.000000000 +0300
55698 +++ linux-2.6.27/fs/reiser4/plugin/node/node.c  2008-10-12 18:20:01.000000000 +0400
55699 @@ -0,0 +1,131 @@
55700 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55701 +
55702 +/* Node plugin interface.
55703 +
55704 +   Description: The tree provides the abstraction of flows, which it
55705 +   internally fragments into items which it stores in nodes.
55706 +
55707 +   A key_atom is a piece of data bound to a single key.
55708 +
55709 +   For reasonable space efficiency to be achieved it is often
55710 +   necessary to store key_atoms in the nodes in the form of items, where
55711 +   an item is a sequence of key_atoms of the same or similar type. It is
55712 +   more space-efficient, because the item can implement (very)
55713 +   efficient compression of key_atom's bodies using internal knowledge
55714 +   about their semantics, and it can often avoid having a key for each
55715 +   key_atom. Each type of item has specific operations implemented by its
55716 +   item handler (see balance.c).
55717 +
55718 +   Rationale: the rest of the code (specifically balancing routines)
55719 +   accesses leaf level nodes through this interface. This way we can
55720 +   implement various block layouts and even combine various layouts
55721 +   within the same tree. Balancing/allocating algorithms should not
55722 +   care about peculiarities of splitting/merging specific item types,
55723 +   but rather should leave that to the item's item handler.
55724 +
55725 +   Items, including those that provide the abstraction of flows, have
55726 +   the property that if you move them in part or in whole to another
55727 +   node, the balancing code invokes their is_left_mergeable()
55728 +   item_operation to determine if they are mergeable with their new
55729 +   neighbor in the node you have moved them to.  For some items the
55730 +   is_left_mergeable() function always returns null.
55731 +
55732 +   When moving the bodies of items from one node to another:
55733 +
55734 +     if a partial item is shifted to another node the balancing code invokes
55735 +     an item handler method to handle the item splitting.
55736 +
55737 +     if the balancing code needs to merge with an item in the node it
55738 +     is shifting to, it will invoke an item handler method to handle
55739 +     the item merging.
55740 +
55741 +     if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(),
55742 +     and the item headers are adjusted afterwards by the node handler.
55743 +*/
55744 +
55745 +#include "../../forward.h"
55746 +#include "../../debug.h"
55747 +#include "../../key.h"
55748 +#include "../../coord.h"
55749 +#include "../plugin_header.h"
55750 +#include "../item/item.h"
55751 +#include "node.h"
55752 +#include "../plugin.h"
55753 +#include "../../znode.h"
55754 +#include "../../tree.h"
55755 +#include "../../super.h"
55756 +#include "../../reiser4.h"
55757 +
55758 +/**
55759 + * leftmost_key_in_node - get the smallest key in node
55760 + * @node: node whose first key is wanted
55761 + * @key: store result here
55762 + *
55763 + * Stores the key of the first item of @node in @key, or reiser4_max_key() if @node is empty. Returns @key.
55764 + */
55765 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55766 +{
55767 +       coord_t head;
55768 +
55769 +       assert("nikita-1634", node != NULL);
55770 +       assert("nikita-1635", key != NULL);
55771 +       if (node_is_empty(node)) {
55772 +               *key = *reiser4_max_key();
55773 +               return key;
55774 +       }
55775 +       coord_init_first_unit(&head, (znode *) node);
55776 +       item_key_by_coord(&head, key);
55777 +       return key;
55778 +}
55779 +
55780 +node_plugin node_plugins[LAST_NODE_ID] = {     /* node plugin table indexed by node plugin id */
55781 +       [NODE40_ID] = {                 /* the only entry initialized here: node40 layout */
55782 +               .h = {
55783 +                       .type_id = REISER4_NODE_PLUGIN_TYPE,
55784 +                       .id = NODE40_ID,
55785 +                       .pops = NULL,
55786 +                       .label = "unified",
55787 +                       .desc = "unified node layout",
55788 +                       .linkage = {NULL, NULL}
55789 +               },
55790 +               .item_overhead = item_overhead_node40,
55791 +               .free_space = free_space_node40,
55792 +               .lookup = lookup_node40,
55793 +               .num_of_items = num_of_items_node40,
55794 +               .item_by_coord = item_by_coord_node40,
55795 +               .length_by_coord = length_by_coord_node40,
55796 +               .plugin_by_coord = plugin_by_coord_node40,
55797 +               .key_at = key_at_node40,
55798 +               .estimate = estimate_node40,
55799 +               .check = check_node40,
55800 +               .parse = parse_node40,
55801 +               .init = init_node40,
55802 +#ifdef GUESS_EXISTS
55803 +               .guess = guess_node40,
55804 +#endif
55805 +               .change_item_size = change_item_size_node40,
55806 +               .create_item = create_item_node40,
55807 +               .update_item_key = update_item_key_node40,
55808 +               .cut_and_kill = kill_node40,    /* note: method and function names differ */
55809 +               .cut = cut_node40,
55810 +               .shift = shift_node40,
55811 +               .shrink_item = shrink_item_node40,
55812 +               .fast_insert = fast_insert_node40,
55813 +               .fast_paste = fast_paste_node40,
55814 +               .fast_cut = fast_cut_node40,
55815 +               .max_item_size = max_item_size_node40,
55816 +               .prepare_removal = prepare_removal_node40,
55817 +               .set_item_plugin = set_item_plugin_node40
55818 +       }
55819 +};
55820 +
55821 +/*
55822 +   Local variables:
55823 +   c-indentation-style: "K&R"
55824 +   mode-name: "LC"
55825 +   c-basic-offset: 8
55826 +   tab-width: 8
55827 +   fill-column: 120
55828 +   scroll-step: 1
55829 +   End:
55830 +*/
55831 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/node/node.h linux-2.6.27/fs/reiser4/plugin/node/node.h
55832 --- linux-2.6.27.orig/fs/reiser4/plugin/node/node.h     1970-01-01 03:00:00.000000000 +0300
55833 +++ linux-2.6.27/fs/reiser4/plugin/node/node.h  2008-10-12 18:20:01.000000000 +0400
55834 @@ -0,0 +1,272 @@
55835 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55836 +
55837 +/* We need a definition of the default node layout here. */
55838 +
55839 +/* Generally speaking, it is best to have free space in the middle of the
55840 +   node so that two sets of things can grow towards it, and to have the
55841 +   item bodies on the left so that the last one of them grows into free
55842 +   space.  We optimize for the case where we append new items to the end
55843 +   of the node, or grow the last item, because it hurts nothing to so
55844 +   optimize and it is a common special case to do massive insertions in
55845 +   increasing key order (and one of the cases where a real user is more
55846 +   likely to notice the delay).
55847 +
55848 +   formatted leaf default layout: (leaf1)
55849 +
55850 +   |node header:item bodies:free space:key + pluginid + item offset|
55851 +
55852 +   We grow towards the middle, optimizing layout for the case where we
55853 +   append new items to the end of the node.  The node header is fixed
55854 +   length.  Keys, and item offsets plus pluginids for the items
55855 +   corresponding to them are in increasing key order, and are fixed
55856 +   length.  Item offsets are relative to start of node (16 bits creating
55857 +   a node size limit of 64k, 12 bits might be a better choice....).  Item
55858 +   bodies are in decreasing key order.  Item bodies have a variable size.
55859 +   There is a one to one to one mapping of keys to item offsets to item
55860 +   bodies.  Item offsets consist of pointers to the zeroth byte of the
55861 +   item body.  Item length equals the start of the next item minus the
55862 +   start of this item, except the zeroth item whose length equals the end
55863 +   of the node minus the start of that item (plus a byte).  In other
55864 +   words, the item length is not recorded anywhere, and it does not need
55865 +   to be since it is computable.
55866 +
55867 +   Leaf variable length items and keys layout : (lvar)
55868 +
55869 +   |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55870 +
55871 +   We grow towards the middle, optimizing layout for the case where we
55872 +   append new items to the end of the node.  The node header is fixed
55873 +   length.  Keys and item offsets for the items corresponding to them are
55874 +   in increasing key order, and keys are variable length.  Item offsets
55875 +   are relative to start of node (16 bits).  Item bodies are in
55876 +   decreasing key order.  Item bodies have a variable size.  There is a
55877 +   one to one to one mapping of keys to item offsets to item bodies.
55878 +   Item offsets consist of pointers to the zeroth byte of the item body.
55879 +   Item length equals the start of the next item's key minus the start of
55880 +   this item, except the zeroth item whose length equals the end of the
55881 +   node minus the start of that item (plus a byte).
55882 +
55883 +   leaf compressed keys layout: (lcomp)
55884 +
55885 +   |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55886 +
55887 +   We grow towards the middle, optimizing layout for the case where we
55888 +   append new items to the end of the node.  The node header is fixed
55889 +   length.  Keys and item offsets for the items corresponding to them are
55890 +   in increasing key order, and keys are variable length.  The "key
55891 +   inherit" field indicates how much of the key prefix is identical to
55892 +   the previous key (stem compression as described in "Managing
55893 +   Gigabytes" is used).  key_inherit is a one byte integer.  The
55894 +   intra-node searches performed through this layout are linear searches,
55895 +   and this is theorized to not hurt performance much due to the high
55896 +   cost of processor stalls on modern CPUs, and the small number of keys
55897 +   in a single node.  Item offsets are relative to start of node (16
55898 +   bits).  Item bodies are in decreasing key order.  Item bodies have a
55899 +   variable size.  There is a one to one to one mapping of keys to item
55900 +   offsets to item bodies.  Item offsets consist of pointers to the
55901 +   zeroth byte of the item body.  Item length equals the start of the
55902 +   next item minus the start of this item, except the zeroth item whose
55903 +   length equals the end of the node minus the start of that item (plus a
55904 +   byte).  In other words, item length and key length is not recorded
55905 +   anywhere, and it does not need to be since it is computable.
55906 +
55907 +   internal node default layout: (idef1)
55908 +
55909 +   just like the formatted leaf default layout above (ldef1), except that
55910 +   item bodies are either blocknrs of children or extents, and moving them
55911 +   may require updating parent pointers in the nodes that they point to.
55912 +*/
55913 +
55914 +/* There is an inherent 3-way tradeoff between optimizing and
55915 +   exchanging disks between different architectures and code
55916 +   complexity.  This is optimal and simple and inexchangeable.
55917 +   Someone else can do the code for exchanging disks and make it
55918 +   complex. It would not be that hard.  Using other than the PAGE_SIZE
55919 +   might be suboptimal.
55920 +*/
55921 +
55922 +#if !defined( __REISER4_NODE_H__ )
55923 +#define __REISER4_NODE_H__
55924 +
55925 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55926 +
55927 +#include "../../dformat.h"
55928 +#include "../plugin_header.h"
55929 +
55930 +#include <linux/types.h>
55931 +
55932 +typedef enum {
55933 +       NS_FOUND = 0,
55934 +       NS_NOT_FOUND = -ENOENT
55935 +} node_search_result;
55936 +
55937 +/* Maximal possible space overhead for creation of new item in a node */
55938 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55939 +
55940 +typedef enum {
55941 +       REISER4_NODE_DKEYS = (1 << 0),
55942 +       REISER4_NODE_TREE_STABLE = (1 << 1)
55943 +} reiser4_node_check_flag;
55944 +
55945 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on stack */
55946 +struct cut_list {
55947 +       coord_t *from;
55948 +       coord_t *to;
55949 +       const reiser4_key *from_key;
55950 +       const reiser4_key *to_key;
55951 +       reiser4_key *smallest_removed;
55952 +       carry_plugin_info *info;
55953 +       __u32 flags;
55954 +       struct inode *inode;    /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55955 +       lock_handle *left;
55956 +       lock_handle *right;
55957 +};
55958 +
55959 +struct carry_cut_data;
55960 +struct carry_kill_data;
55961 +
55962 +/* The responsibility of the node plugin is to store and give access
55963 +   to the sequence of items within the node.  */
55964 +typedef struct node_plugin {
55965 +       /* generic plugin fields */
55966 +       plugin_header h;
55967 +
55968 +       /* calculates the amount of space that will be required to store an
55969 +          item which is in addition to the space consumed by the item body.
55970 +          (the space consumed by the item body can be gotten by calling
55971 +          item->estimate) */
55972 +        size_t(*item_overhead) (const znode * node, flow_t * f);
55973 +
55974 +       /* returns free space by looking into node (i.e., without using
55975 +          znode->free_space). */
55976 +        size_t(*free_space) (znode * node);
55977 +       /* search within the node for the one item which might
55978 +          contain the key, invoking item->search_within to search within
55979 +          that item to see if it is in there */
55980 +        node_search_result(*lookup) (znode * node, const reiser4_key * key,
55981 +                                     lookup_bias bias, coord_t * coord);
55982 +       /* number of items in node */
55983 +       int (*num_of_items) (const znode * node);
55984 +
55985 +       /* store information about item in @coord in @data */
55986 +       /* break into several node ops, don't add any more uses of this before doing so */
55987 +       /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55988 +       char *(*item_by_coord) (const coord_t * coord);
55989 +       int (*length_by_coord) (const coord_t * coord);
55990 +       item_plugin *(*plugin_by_coord) (const coord_t * coord);
55991 +
55992 +       /* store item key in @key */
55993 +       reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55994 +       /* conservatively estimate what size of unit can fit
55995 +          into node. This estimation should be performed without
55996 +          actually looking into the node's content (free space is saved in
55997 +          znode). */
55998 +        size_t(*estimate) (znode * node);
55999 +
56000 +       /* performs every consistency check the node plugin author could
56001 +          imagine. Optional. */
56002 +       int (*check) (const znode * node, __u32 flags, const char **error);
56003 +
56004 +       /* Called when node is read into memory and node plugin is
56005 +          already detected. This should read some data into znode (like free
56006 +          space counter) and, optionally, check data consistency.
56007 +        */
56008 +       int (*parse) (znode * node);
56009 +       /* This method is called on a new node to initialise plugin specific
56010 +          data (header, etc.) */
56011 +       int (*init) (znode * node);
56012 +       /* Check whether @node content conforms to this plugin format.
56013 +          Probably only useful after support for old V3.x formats is added.
56014 +          Uncomment after 4.0 only.
56015 +        */
56016 +       /*      int ( *guess )( const znode *node ); */
56017 +#if REISER4_DEBUG
56018 +       void (*print) (const char *prefix, const znode * node, __u32 flags);
56019 +#endif
56020 +       /* change size of @item by @by bytes. @item->node has enough free
56021 +          space. When @by > 0 - free space is appended to end of item. When
56022 +          @by < 0 - item is truncated - it is assumed that last @by bytes of
56023 +          the item are freed already */
56024 +       void (*change_item_size) (coord_t * item, int by);
56025 +
56026 +       /* create new item @length bytes long in coord @target */
56027 +       int (*create_item) (coord_t * target, const reiser4_key * key,
56028 +                           reiser4_item_data * data, carry_plugin_info * info);
56029 +
56030 +       /* update key of item. */
56031 +       void (*update_item_key) (coord_t * target, const reiser4_key * key,
56032 +                                carry_plugin_info * info);
56033 +
56034 +       int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
56035 +       int (*cut) (struct carry_cut_data *, carry_plugin_info *);
56036 +
56037 +       /*
56038 +        * shrink item pointed to by @coord by @delta bytes.
56039 +        */
56040 +       int (*shrink_item) (coord_t * coord, int delta);
56041 +
56042 +       /* copy as much as possible but not more than up to @stop from
56043 +          @stop->node to @target. If (pend == append) then data from beginning of
56044 +          @stop->node are copied to the end of @target. If (pend == prepend) then
56045 +          data from the end of @stop->node are copied to the beginning of
56046 +          @target. Copied data are removed from @stop->node. Information
56047 +          about what to do on upper level is stored in @info */
56048 +       int (*shift) (coord_t * stop, znode * target, shift_direction pend,
56049 +                     int delete_node, int including_insert_coord,
56050 +                     carry_plugin_info * info);
56051 +       /* return true if this node allows skip carry() in some situations
56052 +          (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56053 +          emulation doesn't.
56054 +
56055 +          This will speed up insertions that don't require updates to the
56056 +          parent, by bypassing initialisation of carry() structures. It's
56057 +          believed that majority of insertions will fit there.
56058 +
56059 +        */
56060 +       int (*fast_insert) (const coord_t * coord);
56061 +       int (*fast_paste) (const coord_t * coord);
56062 +       int (*fast_cut) (const coord_t * coord);
56063 +       /* this limits max size of item which can be inserted into a node and
56064 +          number of bytes item in a node may be appended with */
56065 +       int (*max_item_size) (void);
56066 +       int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56067 +       /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
56068 +        * files */
56069 +       int (*set_item_plugin) (coord_t * coord, item_id);
56070 +} node_plugin;
56071 +
56072 +typedef enum {
56073 +       /* standard unified node layout used for both leaf and internal
56074 +          nodes */
56075 +       NODE40_ID,
56076 +       LAST_NODE_ID
56077 +} reiser4_node_id;
56078 +
56079 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56080 +#if REISER4_DEBUG
56081 +extern void print_node_content(const char *prefix, const znode * node,
56082 +                              __u32 flags);
56083 +#endif
56084 +
56085 +extern void indent_znode(const znode * node);
56086 +
56087 +typedef struct common_node_header {
56088 +       /*
56089 +        * identifier of node plugin. Must be located at the very beginning of
56090 +        * a node.
56091 +        */
56092 +       __le16 plugin_id;
56093 +} common_node_header;
56094 +
56095 +/* __REISER4_NODE_H__ */
56096 +#endif
56097 +/*
56098 + * Local variables:
56099 + * c-indentation-style: "K&R"
56100 + * mode-name: "LC"
56101 + * c-basic-offset: 8
56102 + * tab-width: 8
56103 + * fill-column: 79
56104 + * scroll-step: 1
56105 + * End:
56106 + */
56107 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/object.c linux-2.6.27/fs/reiser4/plugin/object.c
56108 --- linux-2.6.27.orig/fs/reiser4/plugin/object.c        1970-01-01 03:00:00.000000000 +0300
56109 +++ linux-2.6.27/fs/reiser4/plugin/object.c     2008-10-13 02:24:12.000000000 +0400
56110 @@ -0,0 +1,531 @@
56111 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56112 + * reiser4/README */
56113 +
56114 +/*
56115 + * Examples of object plugins: file, directory, symlink, special file.
56116 + *
56117 + * Plugins associated with inode:
56118 + *
56119 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
56120 + * stat-data. How we store this plugin in in-core inode is not
56121 + * important. Currently pointers are used, another variant is to store offsets
56122 + * and do array lookup on each access.
56123 + *
56124 + * Now, each inode has one selected plugin: object plugin that
56125 + * determines what type of file this object is: directory, regular etc.
56126 + *
56127 + * This main plugin can use other plugins that are thus subordinated to
56128 + * it. Directory instance of object plugin uses hash; regular file
56129 + * instance uses tail policy plugin.
56130 + *
56131 + * Object plugin is either taken from id in stat-data or guessed from
56132 + * i_mode bits. Once it is established we ask it to install its
56133 + * subordinate plugins, by looking again in stat-data or inheriting them
56134 + * from parent.
56135 + *
56136 + * How new inode is initialized during ->read_inode():
56137 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
56138 + *   i_generation, capabilities etc.
56139 + * 2 read plugin id from stat data or try to guess plugin id
56140 + *   from inode->i_mode bits if plugin id is missing.
56141 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
56142 + *
56143 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3?  What
56144 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
56145 + *
56146 + * 4 Call ->activate() method of object's plugin. Plugin is either read from
56147 + *    stat-data or guessed from mode bits
56148 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
56149 + *    plugins from parent.
56150 + *
56151 + * Easy induction proves that on last step all plugins of inode would be
56152 + * initialized.
56153 + *
56154 + * When creating new object:
56155 + * 1 obtain object plugin id (see next period)
56156 + * NIKITA-FIXME-HANS: period?
56157 + * 2 ->install() this plugin
56158 + * 3 ->inherit() the rest from the parent
56159 + *
56160 + * We need some examples of creating an object with default and non-default
56161 + * plugin ids.  Nikita, please create them.
56162 + */
56163 +
56164 +#include "../inode.h"
56165 +
56166 +static int _bugop(void)
56167 +{
56168 +       BUG_ON(1);
56169 +       return 0;
56170 +}
56171 +
56172 +#define bugop ((void *)_bugop)
56173 +
56174 +static int _dummyop(void)
56175 +{
56176 +       return 0;
56177 +}
56178 +
56179 +#define dummyop ((void *)_dummyop)
56180 +
56181 +static int change_file(struct inode *inode,
56182 +                      reiser4_plugin * plugin,
56183 +                      pset_member memb)
56184 +{
56185 +       /* cannot change object plugin of already existing object */
56186 +       if (memb == PSET_FILE)
56187 +               return RETERR(-EINVAL);
56188 +
56189 +       /* Change PSET_CREATE */
56190 +       return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
56191 +}
56192 +
56193 +static reiser4_plugin_ops file_plugin_ops = {
56194 +       .change = change_file
56195 +};
56196 +
56197 +static struct inode_operations         null_i_ops = {.create = NULL};
56198 +static struct file_operations          null_f_ops = {.owner = NULL};
56199 +static struct address_space_operations null_a_ops = {.writepage = NULL};
56200 +
56201 +/* VFS methods for regular files */
56202 +static struct inode_operations regular_file_i_ops = {
56203 +       .permission = reiser4_permission_common,
56204 +       .setattr = reiser4_setattr,
56205 +       .getattr = reiser4_getattr_common
56206 +};
56207 +static struct file_operations regular_file_f_ops = {
56208 +       .llseek = generic_file_llseek,
56209 +       .read = reiser4_read_careful,
56210 +       .write = reiser4_write_careful,
56211 +       .aio_read = generic_file_aio_read,
56212 +       .ioctl = reiser4_ioctl_careful,
56213 +       .mmap = reiser4_mmap_careful,
56214 +       .open = reiser4_open_careful,
56215 +       .release = reiser4_release_careful,
56216 +       .fsync = reiser4_sync_file_common,
56217 +       .splice_read = generic_file_splice_read,
56218 +       .splice_write = generic_file_splice_write
56219 +};
56220 +static struct address_space_operations regular_file_a_ops = {
56221 +       .writepage = reiser4_writepage,
56222 +       .readpage = reiser4_readpage,
56223 +       .sync_page = block_sync_page,
56224 +       .writepages = reiser4_writepages,
56225 +       .set_page_dirty = reiser4_set_page_dirty,
56226 +       .readpages = reiser4_readpages,
56227 +       .prepare_write = reiser4_prepare_write,
56228 +       .commit_write = reiser4_commit_write,
56229 +       .bmap = reiser4_bmap_careful,
56230 +       .invalidatepage = reiser4_invalidatepage,
56231 +       .releasepage = reiser4_releasepage
56232 +};
56233 +
56234 +/* VFS methods for symlink files */
56235 +static struct inode_operations symlink_file_i_ops = {
56236 +       .readlink = generic_readlink,
56237 +       .follow_link = reiser4_follow_link_common,
56238 +       .permission = reiser4_permission_common,
56239 +       .setattr = reiser4_setattr_common,
56240 +       .getattr = reiser4_getattr_common
56241 +};
56242 +
56243 +/* VFS methods for special files */
56244 +static struct inode_operations special_file_i_ops = {
56245 +       .permission = reiser4_permission_common,
56246 +       .setattr = reiser4_setattr_common,
56247 +       .getattr = reiser4_getattr_common
56248 +};
56249 +
56250 +/* VFS methods for directories */
56251 +static struct inode_operations directory_i_ops = {
56252 +       .create = reiser4_create_common,
56253 +       .lookup = reiser4_lookup_common,
56254 +       .link = reiser4_link_common,
56255 +       .unlink = reiser4_unlink_common,
56256 +       .symlink = reiser4_symlink_common,
56257 +       .mkdir = reiser4_mkdir_common,
56258 +       .rmdir = reiser4_unlink_common,
56259 +       .mknod = reiser4_mknod_common,
56260 +       .rename = reiser4_rename_common,
56261 +       .permission = reiser4_permission_common,
56262 +       .setattr = reiser4_setattr_common,
56263 +       .getattr = reiser4_getattr_common
56264 +};
56265 +static struct file_operations directory_f_ops = {
56266 +       .llseek = reiser4_llseek_dir_common,
56267 +       .read = generic_read_dir,
56268 +       .readdir = reiser4_readdir_common,
56269 +       .release = reiser4_release_dir_common,
56270 +       .fsync = reiser4_sync_common
56271 +};
56272 +static struct address_space_operations directory_a_ops = {
56273 +       .writepage = bugop,
56274 +       .sync_page = bugop,
56275 +       .writepages = dummyop,
56276 +       .set_page_dirty = bugop,
56277 +       .readpages = bugop,
56278 +       .prepare_write = bugop,
56279 +       .commit_write = bugop,
56280 +       .bmap = bugop,
56281 +       .invalidatepage = bugop,
56282 +       .releasepage = bugop
56283 +};
56284 +
56285 +/*
56286 + * Definitions of object plugins.
56287 + */
56288 +
56289 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
56290 +       [UNIX_FILE_PLUGIN_ID] = {
56291 +               .h = {
56292 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56293 +                       .id = UNIX_FILE_PLUGIN_ID,
56294 +                       .groups = (1 << REISER4_REGULAR_FILE),
56295 +                       .pops = &file_plugin_ops,
56296 +                       .label = "reg",
56297 +                       .desc = "regular file",
56298 +                       .linkage = {NULL, NULL},
56299 +               },
56300 +               /*
56301 +                * invariant vfs ops
56302 +                */
56303 +               .inode_ops = &regular_file_i_ops,
56304 +               .file_ops = &regular_file_f_ops,
56305 +               .as_ops = &regular_file_a_ops,
56306 +               /*
56307 +                * private i_ops (setattr) and f_ops (the rest)
56308 +                */
56309 +               .setattr = setattr_unix_file,
56310 +               .open = open_unix_file,
56311 +               .read = read_unix_file,
56312 +               .write = write_unix_file,
56313 +               .ioctl = ioctl_unix_file,
56314 +               .mmap = mmap_unix_file,
56315 +               .release = release_unix_file,
56316 +               /*
56317 +                * private a_ops
56318 +                */
56319 +               .readpage = readpage_unix_file,
56320 +               .readpages = readpages_unix_file,
56321 +               .writepages = writepages_unix_file,
56322 +               .prepare_write = prepare_write_unix_file,
56323 +               .commit_write = commit_write_unix_file,
56324 +               /*
56325 +                * private a_ops
56326 +                */
56327 +               .bmap = bmap_unix_file,
56328 +               /*
56329 +                * other private methods
56330 +                */
56331 +               .write_sd_by_inode = write_sd_by_inode_common,
56332 +               .flow_by_inode = flow_by_inode_unix_file,
56333 +               .key_by_inode = key_by_inode_and_offset_common,
56334 +               .set_plug_in_inode = set_plug_in_inode_common,
56335 +               .adjust_to_parent = adjust_to_parent_common,
56336 +               .create_object = reiser4_create_object_common,
56337 +               .delete_object = delete_object_unix_file,
56338 +               .add_link = reiser4_add_link_common,
56339 +               .rem_link = reiser4_rem_link_common,
56340 +               .owns_item = owns_item_unix_file,
56341 +               .can_add_link = can_add_link_common,
56342 +               .detach = dummyop,
56343 +               .bind = dummyop,
56344 +               .safelink = safelink_common,
56345 +               .estimate = {
56346 +                       .create = estimate_create_common,
56347 +                       .update = estimate_update_common,
56348 +                       .unlink = estimate_unlink_common
56349 +               },
56350 +               .init_inode_data = init_inode_data_unix_file,
56351 +               .cut_tree_worker = cut_tree_worker_common,
56352 +               .wire = {
56353 +                       .write = wire_write_common,
56354 +                       .read = wire_read_common,
56355 +                       .get = wire_get_common,
56356 +                       .size = wire_size_common,
56357 +                       .done = wire_done_common
56358 +               }
56359 +       },
56360 +       [DIRECTORY_FILE_PLUGIN_ID] = {
56361 +               .h = {
56362 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56363 +                       .id = DIRECTORY_FILE_PLUGIN_ID,
56364 +                       .groups = (1 << REISER4_DIRECTORY_FILE),
56365 +                       .pops = &file_plugin_ops,
56366 +                       .label = "dir",
56367 +                       .desc = "directory",
56368 +                       .linkage = {NULL, NULL}
56369 +               },
56370 +               .inode_ops = &null_i_ops,
56371 +               .file_ops = &null_f_ops,
56372 +               .as_ops = &null_a_ops,
56373 +
56374 +               .write_sd_by_inode = write_sd_by_inode_common,
56375 +               .flow_by_inode = bugop,
56376 +               .key_by_inode = bugop,
56377 +               .set_plug_in_inode = set_plug_in_inode_common,
56378 +               .adjust_to_parent = adjust_to_parent_common_dir,
56379 +               .create_object = reiser4_create_object_common,
56380 +               .delete_object = reiser4_delete_dir_common,
56381 +               .add_link = reiser4_add_link_common,
56382 +               .rem_link = rem_link_common_dir,
56383 +               .owns_item = owns_item_common_dir,
56384 +               .can_add_link = can_add_link_common,
56385 +               .can_rem_link = can_rem_link_common_dir,
56386 +               .detach = reiser4_detach_common_dir,
56387 +               .bind = reiser4_bind_common_dir,
56388 +               .safelink = safelink_common,
56389 +               .estimate = {
56390 +                       .create = estimate_create_common_dir,
56391 +                       .update = estimate_update_common,
56392 +                       .unlink = estimate_unlink_common_dir
56393 +               },
56394 +               .wire = {
56395 +                       .write = wire_write_common,
56396 +                       .read = wire_read_common,
56397 +                       .get = wire_get_common,
56398 +                       .size = wire_size_common,
56399 +                       .done = wire_done_common
56400 +               },
56401 +               .init_inode_data = init_inode_ordering,
56402 +               .cut_tree_worker = cut_tree_worker_common,
56403 +       },
56404 +       [SYMLINK_FILE_PLUGIN_ID] = {
56405 +               .h = {
56406 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56407 +                       .id = SYMLINK_FILE_PLUGIN_ID,
56408 +                       .groups = (1 << REISER4_SYMLINK_FILE),
56409 +                       .pops = &file_plugin_ops,
56410 +                       .label = "symlink",
56411 +                       .desc = "symbolic link",
56412 +                       .linkage = {NULL,NULL}
56413 +               },
56414 +               .inode_ops = &symlink_file_i_ops,
56415 +               /* inode->i_fop of symlink is initialized
56416 +                  by NULL in setup_inode_ops */
56417 +               .file_ops = &null_f_ops,
56418 +               .as_ops = &null_a_ops,
56419 +
56420 +               .write_sd_by_inode = write_sd_by_inode_common,
56421 +               .set_plug_in_inode = set_plug_in_inode_common,
56422 +               .adjust_to_parent = adjust_to_parent_common,
56423 +               .create_object = reiser4_create_symlink,
56424 +               .delete_object = reiser4_delete_object_common,
56425 +               .add_link = reiser4_add_link_common,
56426 +               .rem_link = reiser4_rem_link_common,
56427 +               .can_add_link = can_add_link_common,
56428 +               .detach = dummyop,
56429 +               .bind = dummyop,
56430 +               .safelink = safelink_common,
56431 +               .estimate = {
56432 +                       .create = estimate_create_common,
56433 +                       .update = estimate_update_common,
56434 +                       .unlink = estimate_unlink_common
56435 +               },
56436 +               .init_inode_data = init_inode_ordering,
56437 +               .cut_tree_worker = cut_tree_worker_common,
56438 +               .destroy_inode = destroy_inode_symlink,
56439 +               .wire = {
56440 +                       .write = wire_write_common,
56441 +                       .read = wire_read_common,
56442 +                       .get = wire_get_common,
56443 +                       .size = wire_size_common,
56444 +                       .done = wire_done_common
56445 +               }
56446 +       },
56447 +       [SPECIAL_FILE_PLUGIN_ID] = {
56448 +               .h = {
56449 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56450 +                       .id = SPECIAL_FILE_PLUGIN_ID,
56451 +                       .groups = (1 << REISER4_SPECIAL_FILE),
56452 +                       .pops = &file_plugin_ops,
56453 +                       .label = "special",
56454 +                       .desc =
56455 +                       "special: fifo, device or socket",
56456 +                       .linkage = {NULL, NULL}
56457 +               },
56458 +               .inode_ops = &special_file_i_ops,
56459 +               /* file_ops of special files (sockets, block, char, fifo) are
56460 +                  initialized by init_special_inode. */
56461 +               .file_ops = &null_f_ops,
56462 +               .as_ops = &null_a_ops,
56463 +
56464 +               .write_sd_by_inode = write_sd_by_inode_common,
56465 +               .set_plug_in_inode = set_plug_in_inode_common,
56466 +               .adjust_to_parent = adjust_to_parent_common,
56467 +               .create_object = reiser4_create_object_common,
56468 +               .delete_object = reiser4_delete_object_common,
56469 +               .add_link = reiser4_add_link_common,
56470 +               .rem_link = reiser4_rem_link_common,
56471 +               .owns_item = owns_item_common,
56472 +               .can_add_link = can_add_link_common,
56473 +               .detach = dummyop,
56474 +               .bind = dummyop,
56475 +               .safelink = safelink_common,
56476 +               .estimate = {
56477 +                       .create = estimate_create_common,
56478 +                       .update = estimate_update_common,
56479 +                       .unlink = estimate_unlink_common
56480 +               },
56481 +               .init_inode_data = init_inode_ordering,
56482 +               .cut_tree_worker = cut_tree_worker_common,
56483 +               .wire = {
56484 +                       .write = wire_write_common,
56485 +                       .read = wire_read_common,
56486 +                       .get = wire_get_common,
56487 +                       .size = wire_size_common,
56488 +                       .done = wire_done_common
56489 +               }
56490 +       },
56491 +       [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
56492 +               .h = {
56493 +                       .type_id = REISER4_FILE_PLUGIN_TYPE,
56494 +                       .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
56495 +                       .groups = (1 << REISER4_REGULAR_FILE),
56496 +                       .pops = &file_plugin_ops,
56497 +                       .label = "cryptcompress",
56498 +                       .desc = "cryptcompress file",
56499 +                       .linkage = {NULL, NULL}
56500 +               },
56501 +               .inode_ops = &regular_file_i_ops,
56502 +               .file_ops = &regular_file_f_ops,
56503 +               .as_ops = &regular_file_a_ops,
56504 +
56505 +               .setattr = setattr_cryptcompress,
56506 +               .open = open_cryptcompress,
56507 +               .read = read_cryptcompress,
56508 +               .write = write_cryptcompress,
56509 +               .ioctl = ioctl_cryptcompress,
56510 +               .mmap = mmap_cryptcompress,
56511 +               .release = release_cryptcompress,
56512 +
56513 +               .readpage = readpage_cryptcompress,
56514 +               .readpages = readpages_cryptcompress,
56515 +               .writepages = writepages_cryptcompress,
56516 +               .prepare_write = prepare_write_cryptcompress,
56517 +               .commit_write = commit_write_cryptcompress,
56518 +
56519 +               .bmap = bmap_cryptcompress,
56520 +
56521 +               .write_sd_by_inode = write_sd_by_inode_common,
56522 +               .flow_by_inode = flow_by_inode_cryptcompress,
56523 +               .key_by_inode = key_by_inode_cryptcompress,
56524 +               .set_plug_in_inode = set_plug_in_inode_common,
56525 +               .adjust_to_parent = adjust_to_parent_cryptcompress,
56526 +               .create_object = create_object_cryptcompress,
56527 +               .delete_object = delete_object_cryptcompress,
56528 +               .add_link = reiser4_add_link_common,
56529 +               .rem_link = reiser4_rem_link_common,
56530 +               .owns_item = owns_item_common,
56531 +               .can_add_link = can_add_link_common,
56532 +               .detach = dummyop,
56533 +               .bind = dummyop,
56534 +               .safelink = safelink_common,
56535 +               .estimate = {
56536 +                       .create = estimate_create_common,
56537 +                       .update = estimate_update_common,
56538 +                       .unlink = estimate_unlink_common
56539 +               },
56540 +               .init_inode_data = init_inode_data_cryptcompress,
56541 +               .cut_tree_worker = cut_tree_worker_cryptcompress,
56542 +               .destroy_inode = destroy_inode_cryptcompress,
56543 +               .wire = {
56544 +                       .write = wire_write_common,
56545 +                       .read = wire_read_common,
56546 +                       .get = wire_get_common,
56547 +                       .size = wire_size_common,
56548 +                       .done = wire_done_common
56549 +               }
56550 +       }
56551 +};
56552 +
56553 +static int change_dir(struct inode *inode,
56554 +                     reiser4_plugin * plugin,
56555 +                     pset_member memb)
56556 +{
56557 +       /* unconditionally refused (arguments are ignored): cannot change dir plugin of already existing object */
56558 +       return RETERR(-EINVAL);
56559 +}
56560 +
56561 +static reiser4_plugin_ops dir_plugin_ops = {
56562 +       .change = change_dir    /* the only method set here; change_dir refuses every request */
56563 +};
56564 +
56565 +/*
56566 + * definition of directory plugins; the two entries differ only in .h.id, .h.label, .h.desc and .build_entry_key
56567 + */
56568 +
56569 +dir_plugin dir_plugins[LAST_DIR_ID] = {
56570 +       /* standard hashed directory plugin */
56571 +       [HASHED_DIR_PLUGIN_ID] = {
56572 +               .h = {
56573 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
56574 +                       .id = HASHED_DIR_PLUGIN_ID,
56575 +                       .pops = &dir_plugin_ops,
56576 +                       .label = "dir",
56577 +                       .desc = "hashed directory",
56578 +                       .linkage = {NULL, NULL}
56579 +               },
56580 +               .inode_ops = &directory_i_ops,
56581 +               .file_ops = &directory_f_ops,
56582 +               .as_ops = &directory_a_ops,
56583 +
56584 +               .get_parent = get_parent_common,
56585 +               .is_name_acceptable = is_name_acceptable_common,
56586 +               .build_entry_key = build_entry_key_hashed,
56587 +               .build_readdir_key = build_readdir_key_common,
56588 +               .add_entry = reiser4_add_entry_common,
56589 +               .rem_entry = reiser4_rem_entry_common,
56590 +               .init = reiser4_dir_init_common,
56591 +               .done = reiser4_dir_done_common,
56592 +               .attach = reiser4_attach_common,
56593 +               .detach = reiser4_detach_common,
56594 +               .estimate = {
56595 +                       .add_entry = estimate_add_entry_common,
56596 +                       .rem_entry = estimate_rem_entry_common,
56597 +                       .unlink = dir_estimate_unlink_common
56598 +               }
56599 +       },
56600 +       /* hashed directory for which seekdir/telldir are guaranteed to
56601 +        * work. Brain-damage. */
56602 +       [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
56603 +               .h = {
56604 +                       .type_id = REISER4_DIR_PLUGIN_TYPE,
56605 +                       .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
56606 +                       .pops = &dir_plugin_ops,
56607 +                       .label = "dir32",
56608 +                       .desc = "directory hashed with 31 bit hash",
56609 +                       .linkage = {NULL, NULL}
56610 +               },
56611 +               .inode_ops = &directory_i_ops,
56612 +               .file_ops = &directory_f_ops,
56613 +               .as_ops = &directory_a_ops,
56614 +
56615 +               .get_parent = get_parent_common,
56616 +               .is_name_acceptable = is_name_acceptable_common,
56617 +               .build_entry_key = build_entry_key_seekable,
56618 +               .build_readdir_key = build_readdir_key_common,
56619 +               .add_entry = reiser4_add_entry_common,
56620 +               .rem_entry = reiser4_rem_entry_common,
56621 +               .init = reiser4_dir_init_common,
56622 +               .done = reiser4_dir_done_common,
56623 +               .attach = reiser4_attach_common,
56624 +               .detach = reiser4_detach_common,
56625 +               .estimate = {
56626 +                       .add_entry = estimate_add_entry_common,
56627 +                       .rem_entry = estimate_rem_entry_common,
56628 +                       .unlink = dir_estimate_unlink_common
56629 +               }
56630 +       }
56631 +};
56632 +
56633 +/* Make Linus happy.
56634 +   Local variables:
56635 +   c-indentation-style: "K&R"
56636 +   mode-name: "LC"
56637 +   c-basic-offset: 8
56638 +   tab-width: 8
56639 +   fill-column: 120
56640 +   End:
56641 +*/
56642 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/object.h linux-2.6.27/fs/reiser4/plugin/object.h
56643 --- linux-2.6.27.orig/fs/reiser4/plugin/object.h        1970-01-01 03:00:00.000000000 +0300
56644 +++ linux-2.6.27/fs/reiser4/plugin/object.h     2008-10-12 18:20:01.000000000 +0400
56645 @@ -0,0 +1,120 @@
56646 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
56647 + * reiser4/README */
56648 +
56649 +/* Declaration of object plugin functions. */
56650 +
56651 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
56652 +#define __FS_REISER4_PLUGIN_OBJECT_H__
56653 +
56654 +#include "../type_safe_hash.h"
56655 +
56656 +/* common implementations of inode operations */
56657 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
56658 +                         int mode, struct nameidata *);
56659 +struct dentry * reiser4_lookup_common(struct inode *parent,
56660 +                                     struct dentry *dentry,
56661 +                                     struct nameidata *nameidata);
56662 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
56663 +                       struct dentry *newname);
56664 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
56665 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
56666 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
56667 +                  const char *linkname);
56668 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
56669 +                int mode, dev_t rdev);
56670 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
56671 +                         struct inode *new_dir, struct dentry *new_name);
56672 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
56673 +int reiser4_permission_common(struct inode *, int mask);
56674 +int reiser4_setattr_common(struct dentry *, struct iattr *);
56675 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
56676 +                          struct kstat *);
56677 +
56678 +/* common implementations of file operations */
56679 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
56680 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
56681 +int reiser4_release_dir_common(struct inode *, struct file *);
56682 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
56683 +
56684 +/* common implementations of address space operations */
56685 +int prepare_write_common(struct file *, struct page *, unsigned from,
56686 +                        unsigned to);
56687 +
56688 +/* file plugin operations: common implementations */
56689 +int write_sd_by_inode_common(struct inode *);
56690 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
56691 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
56692 +                            reiser4_object_create_data *);
56693 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
56694 +                           struct inode *root);
56695 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
56696 +                               struct inode *root);
56697 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
56698 +                                  struct inode *root);
56699 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
56700 +                                reiser4_object_create_data *);
56701 +int reiser4_delete_object_common(struct inode *);
56702 +int reiser4_delete_dir_common(struct inode *);
56703 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
56704 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
56705 +int rem_link_common_dir(struct inode *object, struct inode *parent);
56706 +int owns_item_common(const struct inode *, const coord_t *);
56707 +int owns_item_common_dir(const struct inode *, const coord_t *);
56708 +int can_add_link_common(const struct inode *);
56709 +int can_rem_link_common_dir(const struct inode *);
56710 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
56711 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
56712 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
56713 +reiser4_block_nr estimate_create_common(const struct inode *);
56714 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
56715 +reiser4_block_nr estimate_update_common(const struct inode *);
56716 +reiser4_block_nr estimate_unlink_common(const struct inode *,
56717 +                                       const struct inode *);
56718 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
56719 +                                           const struct inode *);
56720 +char *wire_write_common(struct inode *, char *start);
56721 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
56722 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
56723 +int wire_size_common(struct inode *);
56724 +void wire_done_common(reiser4_object_on_wire *);
56725 +
56726 +/* dir plugin operations: common implementations */
56727 +struct dentry *get_parent_common(struct inode *child);
56728 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
56729 +void build_entry_key_common(const struct inode *,
56730 +                           const struct qstr *qname, reiser4_key *);
56731 +int build_readdir_key_common(struct file *dir, reiser4_key *);
56732 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
56733 +                    reiser4_object_create_data *, reiser4_dir_entry_desc *);
56734 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
56735 +                    reiser4_dir_entry_desc *);
56736 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
56737 +                           reiser4_object_create_data *);
56738 +int reiser4_dir_done_common(struct inode *);
56739 +int reiser4_attach_common(struct inode *child, struct inode *parent);
56740 +int reiser4_detach_common(struct inode *object, struct inode *parent);
56741 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
56742 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
56743 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
56744 +                                           const struct inode *);
56745 +
56746 +/* these are essential parts of common implementations, they are to make
56747 +   customized implementations easier */
56748 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
56749 +
56750 +/* merely useful functions */
56751 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
56752 +             const reiser4_key *, int silent);
56753 +
56754 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
56755 +#endif
56756 +
56757 +/* Make Linus happy.
56758 +   Local variables:
56759 +   c-indentation-style: "K&R"
56760 +   mode-name: "LC"
56761 +   c-basic-offset: 8
56762 +   tab-width: 8
56763 +   fill-column: 120
56764 +   End:
56765 +*/
56766 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/plugin.c linux-2.6.27/fs/reiser4/plugin/plugin.c
56767 --- linux-2.6.27.orig/fs/reiser4/plugin/plugin.c        1970-01-01 03:00:00.000000000 +0300
56768 +++ linux-2.6.27/fs/reiser4/plugin/plugin.c     2008-10-12 18:20:01.000000000 +0400
56769 @@ -0,0 +1,559 @@
56770 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
56771 + * reiser4/README */
56772 +
56773 +/* Basic plugin infrastructure, lookup etc. */
56774 +
56775 +/* PLUGINS:
56776 +
56777 +   Plugins are internal Reiser4 "modules" or "objects" used to increase
56778 +   extensibility and allow external users to easily adapt reiser4 to
56779 +   their needs.
56780 +
56781 +   Plugins are classified into several disjoint "types". Plugins
56782 +   belonging to the particular plugin type are termed "instances" of
56783 +   this type. Existing types are listed by enum reiser4_plugin_type
56784 +   (see plugin/plugin_header.h)
56785 +
56786 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
56787 +
56788 +   Object (file) plugin determines how given file-system object serves
56789 +   standard VFS requests for read, write, seek, mmap etc. Instances of
56790 +   file plugins are: regular file, directory, symlink. Another example
56791 +   of file plugin is audit plugin, that optionally records accesses to
56792 +   underlying object and forwards requests to it.
56793 +
56794 +   Hash plugins compute hashes used by reiser4 to store and locate
56795 +   files within directories. Instances of hash plugin type are: r5,
56796 +   tea, rupasov.
56797 +
56798 +   Tail plugins (or, more precisely, tail policy plugins) determine
56799 +   when last part of the file should be stored in a formatted item.
56800 +
56801 +   Scope and lookup:
56802 +
56803 +   Each plugin type and each plugin has a label, such that the pair
56804 +   ( type_label, plugin_label ) is unique.  This pair is a globally persistent and user-visible plugin
56805 +   identifier. Internally kernel maintains plugins and plugin types in
56806 +   arrays using an index into those arrays as plugin and plugin type
56807 +   identifiers. File-system in turn, also maintains persistent
56808 +   "dictionary" which is mapping from plugin label to numerical
56809 +   identifier which is stored in file-system objects.  That is, we
56810 +   store the offset into the plugin array for that plugin type as the
56811 +   plugin id in the stat data of the filesystem object.
56812 +
56813 +   Internal kernel plugin type identifier (index in plugins[] array) is
56814 +   of type reiser4_plugin_type. Set of available plugin types is
56815 +   currently static, but dynamic loading doesn't seem to pose
56816 +   insurmountable problems.
56817 +
56818 +   Within each type plugins are addressed by the identifiers of type
56819 +   reiser4_plugin_id (indices in reiser4_plugin_type_data.builtin[]).
56820 +   Such identifiers are only required to be unique within one type,
56821 +   not globally.
56822 +
56823 +   Thus, plugin in memory is uniquely identified by the pair (type_id,
56824 +   id).
56825 +
56826 +   Usage:
56827 +
56828 +   There exists only one instance of each plugin, but this
56829 +   single instance can be associated with many entities (file-system
56830 +   objects, items, nodes, transactions, file-descriptors etc.). Entity
56831 +   to which plugin of given type is attached is termed (due to the lack of
56832 +   imagination) "subject" of this plugin type and, by abuse of
56833 +   terminology, subject of particular instance of this type to which
56834 +   it's attached currently. For example, inode is subject of object
56835 +   plugin type. Inode representing directory is subject of directory
56836 +   plugin, hash plugin type and some particular instance of hash plugin
56837 +   type. Inode, representing regular file is subject of "regular file"
56838 +   plugin, tail-policy plugin type etc.
56839 +
56840 +   With each subject the plugin possibly stores some state. For example,
56841 +   the state of a directory plugin (instance of object plugin type) is pointer
56842 +   to hash plugin (if directories always use hashing that is).
56843 +
56844 +   Interface:
56845 +
56846 +   In addition to a scalar identifier, each plugin type and plugin
56847 +   proper has a "label": short string and a "description"---longer
56848 +   descriptive string. Labels and descriptions of plugin types are
56849 +   hard-coded into plugins[] array, declared and defined in
56850 +   plugin.c. Label and description of plugin are stored in .label and
56851 +   .desc fields of reiser4_plugin_header respectively. It's possible to
56852 +   locate plugin by the pair of labels.
56853 +
56854 +   Features (not implemented):
56855 +
56856 +    . user-level plugin manipulations:
56857 +      + reiser4("filename/..file_plugin<='audit'");
56858 +      + write(open("filename/..file_plugin"), "audit", 8);
56859 +
56860 +    . user level utilities lsplug and chplug to manipulate plugins.
56861 +      Utilities are not of primary priority. Possibly they will not be
56862 +      working in v4.0
56863 +
56864 +   NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount
56865 +   option, do you agree?  I don't think that specifying it at mount time,
56866 +   and then changing it with each mount, is a good model for usage.
56867 +
56868 +    . mount option "plug" to set-up plugins of root-directory.
56869 +      "plug=foo:bar" will set "bar" as default plugin of type "foo".
56870 +
56871 +   Limitations:
56872 +
56873 +    . each plugin type has to provide at least one builtin
56874 +      plugin. This is technical limitation and it can be lifted in the
56875 +      future.
56876 +
56877 +   TODO:
56878 +
56879 +   New plugin types/plugins:
56880 +   Things we should be able to separately choose to inherit:
56881 +
56882 +   security plugins
56883 +
56884 +   stat data
56885 +
56886 +   file bodies
56887 +
56888 +   file plugins
56889 +
56890 +   dir plugins
56891 +
56892 +    . perm:acl
56893 +
56894 +    . audi---audit plugin intercepting and possibly logging all
56895 +      accesses to object. Requires to put stub functions in file_operations
56896 +      instead of generic_file_*.
56897 +
56898 +NIKITA-FIXME-HANS: why make overflows a plugin?
56899 +    . over---handle hash overflows
56900 +
56901 +    . sqnt---handle different access patterns and instruments read-ahead
56902 +
56903 +NIKITA-FIXME-HANS: describe the line below in more detail.
56904 +
56905 +    . hier---handle inheritance of plugins along file-system hierarchy
56906 +
56907 +   Different kinds of inheritance: on creation vs. on access.
56908 +   Compatible/incompatible plugins.
56909 +   Inheritance for multi-linked files.
56910 +   Layered plugins.
56911 +   Notion of plugin context is abandoned.
56912 +
56913 +Each file is associated
56914 +   with one plugin and dependent plugins (hash, etc.) are stored as
56915 +   main plugin state. Now, if we have plugins used for regular files
56916 +   but not for directories, how such plugins would be inherited?
56917 +    . always store them with directories also
56918 +
56919 +NIKITA-FIXME-HANS: Do the line above.  It is not exclusive of doing
56920 +the line below which is also useful.
56921 +
56922 +    . use inheritance hierarchy, independent of file-system namespace
56923 +*/
56924 +
56925 +#include "../debug.h"
56926 +#include "../dformat.h"
56927 +#include "plugin_header.h"
56928 +#include "item/static_stat.h"
56929 +#include "node/node.h"
56930 +#include "security/perm.h"
56931 +#include "space/space_allocator.h"
56932 +#include "disk_format/disk_format.h"
56933 +#include "plugin.h"
56934 +#include "../reiser4.h"
56935 +#include "../jnode.h"
56936 +#include "../inode.h"
56937 +
56938 +#include <linux/fs.h>          /* for struct super_block  */
56939 +
56940 +/*
56941 + * init_plugins - initialize plugin sub-system.
56942 + * Just call this once on reiser4 startup.
56943 + *
56944 + * Part of reiser4 module initialization. For each slot of each plugin type: slots with a NULL
56945 + * label are skipped, h.id is set to the slot index, the optional pops->init() is called and the
56946 + * plugin is appended to the type's plugins_list. Returns 0, or the first non-zero init() result.
56947 + */
56948 +int init_plugins(void)
56949 +{
56950 +       reiser4_plugin_type type_id;
56951 +
56952 +       for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
56953 +               struct reiser4_plugin_type_data *ptype;
56954 +               int i;
56955 +
56956 +               ptype = &plugins[type_id];
56957 +               assert("nikita-3508", ptype->label != NULL);
56958 +               assert("nikita-3509", ptype->type_id == type_id);
56959 +
56960 +               INIT_LIST_HEAD(&ptype->plugins_list);
56961 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
56962 +               for (i = 0; i < ptype->builtin_num; ++i) {
56963 +                       reiser4_plugin *plugin;
56964 +
56965 +                       plugin = plugin_at(ptype, i);
56966 +
56967 +                       if (plugin->h.label == NULL)
56968 +                               /* uninitialized slot encountered */
56969 +                               continue;
56970 +                       assert("nikita-3445", plugin->h.type_id == type_id);
56971 +                       plugin->h.id = i;
56972 +                       if (plugin->h.pops != NULL &&
56973 +                           plugin->h.pops->init != NULL) {
56974 +                               int result;
56975 +
56976 +                               result = plugin->h.pops->init(plugin);
56977 +                               if (result != 0)
56978 +                                       return result;
56979 +                       }
56980 +                       INIT_LIST_HEAD(&plugin->h.linkage);
56981 +                       list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
56982 +               }
56983 +       }
56984 +       return 0;
56985 +}
56986 +
56987 +/* true if plugin type id is valid, i.e. below REISER4_PLUGIN_TYPES */
56988 +int is_plugin_type_valid(reiser4_plugin_type type)
56989 +{
56990 +       /* "type" is unsigned, so no comparison with 0 is
56991 +          necessary */
56992 +       return (type < REISER4_PLUGIN_TYPES);
56993 +}
56994 +
56995 +/* true if @id is below builtin_num of plugin type @type (only this upper bound is checked; @type must be valid) */
56996 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
56997 +{
56998 +       assert("nikita-1653", is_plugin_type_valid(type));
56999 +       return id < plugins[type].builtin_num;
57000 +}
57001 +
57002 +/* return plugin by its @type and @id, or NULL (after logging a warning) if either is out of range.
57003 +
57004 +   Both arguments are checked for validity: this is supposed to be called
57005 +   from user-level.
57006 +
57007 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
57008 +user space, and passed to the filesystem by use of method files? Your
57009 +comment really confused me on the first reading....
57010 +
57011 +*/
57012 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
57013 +                                                                * unchecked */,
57014 +                                   reiser4_plugin_id id        /* plugin id,
57015 +                                                                * unchecked */)
57016 +{
57017 +       if (is_plugin_type_valid(type)) {
57018 +               if (is_plugin_id_valid(type, id))
57019 +                       return plugin_at(&plugins[type], id);
57020 +               else
57021 +                       /* id out of bounds */
57022 +                       warning("nikita-2913",
57023 +                               "Invalid plugin id: [%i:%i]", type, id);
57024 +       } else
57025 +               /* type_id out of bounds */
57026 +               warning("nikita-2914", "Invalid type_id: %i", type);
57027 +       return NULL;
57028 +}
57029 +
57030 +/**
57031 + * save_plugin_id - store plugin id in disk format
57032 + * @plugin: plugin to convert
57033 + * @area: where to store result
57034 + *
57035 + * Puts id of @plugin as a 16-bit little endian value to address @area (unaligned store). Always returns 0.
57036 + */
57037 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
57038 +                  d16 *area /* where to store result */ )
57039 +{
57040 +       assert("nikita-1261", plugin != NULL);
57041 +       assert("nikita-1262", area != NULL);
57042 +
57043 +       put_unaligned(cpu_to_le16(plugin->h.id), area);
57044 +       return 0;
57045 +}
57046 +
57047 +/* head of the list of all plugins of given type (@type must be a valid plugin type) */
57048 +struct list_head *get_plugin_list(reiser4_plugin_type type)
57049 +{
57050 +       assert("nikita-1056", is_plugin_type_valid(type));
57051 +       return &plugins[type].plugins_list;
57052 +}
57053 +
57054 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
57055 +{
57056 +       struct dentry *rootdir;
57057 +       reiser4_inode *root;
57058 +
57059 +       assert("edward-1443", memb != PSET_FILE);
57060 +
57061 +       rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
57062 +       if (rootdir != NULL) {
57063 +               root = reiser4_inode_data(rootdir->d_inode);
57064 +               /*
57065 +                * set the @memb bit of plugin_mask if inode's plugin differs from the
57066 +                * root's one, or if inode is the root itself; clear the bit otherwise
57067 +                */
57068 +               if (aset_get(info->pset, memb) !=
57069 +                   aset_get(root->pset, memb) ||
57070 +                   info == root)
57071 +                       info->plugin_mask |= (1 << memb);
57072 +               else
57073 +                       info->plugin_mask &= ~(1 << memb);
57074 +       }
57075 +}
57076 +
57077 +/* Get specified plugin set member from parent (its hset if set there, its pset
57078 +   otherwise), or from fs-defaults (if no parent is given) and install the result
57079 +   to pset of @self. Does nothing and returns 0 if @self already has this member. */
57080 +int grab_plugin_pset(struct inode *self,
57081 +                    struct inode *ancestor,
57082 +                    pset_member memb)
57083 +{
57084 +       reiser4_plugin *plug;
57085 +       reiser4_inode *info;
57086 +       int result = 0;
57087 +
57088 +       /* Do not grab if initialised already. */
57089 +       info = reiser4_inode_data(self);
57090 +       if (aset_get(info->pset, memb) != NULL)
57091 +               return 0;
57092 +       if (ancestor) {
57093 +               reiser4_inode *parent;
57094 +
57095 +               parent = reiser4_inode_data(ancestor);
57096 +               plug = aset_get(parent->hset, memb) ? :
57097 +                       aset_get(parent->pset, memb);
57098 +       }
57099 +       else
57100 +               plug = get_default_plugin(memb);
57101 +
57102 +       result = set_plugin(&info->pset, memb, plug);
57103 +       if (result == 0) {
57104 +               if (!ancestor || self->i_sb->s_root->d_inode != self)
57105 +                       update_pset_mask(info, memb);
57106 +       }
57107 +       return result;
57108 +}
57109 +
57110 +/* Take missing pset members (those after PSET_DIR) from root inode; returns 0 or the first set_plugin() error */
57111 +int finish_pset(struct inode *inode)
57112 +{
57113 +       reiser4_plugin *plug;
57114 +       reiser4_inode *root;
57115 +       reiser4_inode *info;
57116 +       pset_member memb;
57117 +       int result = 0;
57118 +
57119 +       root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
57120 +       info = reiser4_inode_data(inode);
57121 +
57122 +       assert("edward-1455", root != NULL);
57123 +       assert("edward-1456", info != NULL);
57124 +
57125 +       /* file and directory plugins are already initialized. */
57126 +       for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
57127 +
57128 +               /* Do not grab if initialised already. */
57129 +               if (aset_get(info->pset, memb) != NULL)
57130 +                       continue;
57131 +
57132 +               plug = aset_get(root->pset, memb);
57133 +               result = set_plugin(&info->pset, memb, plug);
57134 +               if (result != 0)
57135 +                       break;
57136 +       }
57137 +       if (result != 0) {
57138 +               warning("nikita-3447",
57139 +                       "Cannot set up plugins for %llu",
57140 +                       (unsigned long long)    /* unsigned conversion to match the cast */
57141 +                       get_inode_oid(inode));
57142 +       }
57143 +       return result;
57144 +}
57145 +
57146 +int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
57147 +{
57148 +       reiser4_inode *info;
57149 +       int result = 0;
57150 +
57151 +       if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
57152 +               /* No root dentry, or @self is the root object: refuse to change pset. */
57153 +               return RETERR(-EINVAL);
57154 +       }
57155 +
57156 +       info = reiser4_inode_data(self);
57157 +       if (plug->h.pops != NULL && plug->h.pops->change != NULL)
57158 +               result = plug->h.pops->change(self, plug, memb);
57159 +       else
57160 +               result = aset_set_unsafe(&info->pset, memb, plug);
57161 +       if (result == 0) {
57162 +               __u16 oldmask = info->plugin_mask;
57163 +
57164 +               update_pset_mask(info, memb);
57165 +               if (oldmask != info->plugin_mask)
57166 +                       reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
57167 +       }
57168 +       return result;
57169 +}
57170 +
57171 +struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
57172 +       /* C99 designated initializers, one entry per plugin type id */
57173 +       [REISER4_FILE_PLUGIN_TYPE] = {
57174 +               .type_id = REISER4_FILE_PLUGIN_TYPE,
57175 +               .label = "file",
57176 +               .desc = "Object plugins",
57177 +               .builtin_num = sizeof_array(file_plugins),
57178 +               .builtin = file_plugins,
57179 +               .plugins_list = {NULL, NULL},
57180 +               .size = sizeof(file_plugin)
57181 +       },
57182 +       [REISER4_DIR_PLUGIN_TYPE] = {
57183 +               .type_id = REISER4_DIR_PLUGIN_TYPE,
57184 +               .label = "dir",
57185 +               .desc = "Directory plugins",
57186 +               .builtin_num = sizeof_array(dir_plugins),
57187 +               .builtin = dir_plugins,
57188 +               .plugins_list = {NULL, NULL},
57189 +               .size = sizeof(dir_plugin)
57190 +       },
57191 +       [REISER4_HASH_PLUGIN_TYPE] = {
57192 +               .type_id = REISER4_HASH_PLUGIN_TYPE,
57193 +               .label = "hash",
57194 +               .desc = "Directory hashes",
57195 +               .builtin_num = sizeof_array(hash_plugins),
57196 +               .builtin = hash_plugins,
57197 +               .plugins_list = {NULL, NULL},
57198 +               .size = sizeof(hash_plugin)
57199 +       },
57200 +       [REISER4_FIBRATION_PLUGIN_TYPE] = {
57201 +               .type_id =
57202 +               REISER4_FIBRATION_PLUGIN_TYPE,
57203 +               .label = "fibration",
57204 +               .desc = "Directory fibrations",
57205 +               .builtin_num = sizeof_array(fibration_plugins),
57206 +               .builtin = fibration_plugins,
57207 +               .plugins_list = {NULL, NULL},
57208 +               .size = sizeof(fibration_plugin)
57209 +       },
57210 +       [REISER4_CIPHER_PLUGIN_TYPE] = {
57211 +               .type_id = REISER4_CIPHER_PLUGIN_TYPE,
57212 +               .label = "cipher",
57213 +               .desc = "Cipher plugins",
57214 +               .builtin_num = sizeof_array(cipher_plugins),
57215 +               .builtin = cipher_plugins,
57216 +               .plugins_list = {NULL, NULL},
57217 +               .size = sizeof(cipher_plugin)
57218 +       },
57219 +       [REISER4_DIGEST_PLUGIN_TYPE] = {
57220 +               .type_id = REISER4_DIGEST_PLUGIN_TYPE,
57221 +               .label = "digest",
57222 +               .desc = "Digest plugins",
57223 +               .builtin_num = sizeof_array(digest_plugins),
57224 +               .builtin = digest_plugins,
57225 +               .plugins_list = {NULL, NULL},
57226 +               .size = sizeof(digest_plugin)
57227 +       },
57228 +       [REISER4_COMPRESSION_PLUGIN_TYPE] = {
57229 +               .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
57230 +               .label = "compression",
57231 +               .desc = "Compression plugins",
57232 +               .builtin_num = sizeof_array(compression_plugins),
57233 +               .builtin = compression_plugins,
57234 +               .plugins_list = {NULL, NULL},
57235 +               .size = sizeof(compression_plugin)
57236 +       },
57237 +       [REISER4_FORMATTING_PLUGIN_TYPE] = {
57238 +               .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
57239 +               .label = "formatting",
57240 +               .desc = "Tail inlining policies",
57241 +               .builtin_num = sizeof_array(formatting_plugins),
57242 +               .builtin = formatting_plugins,
57243 +               .plugins_list = {NULL, NULL},
57244 +               .size = sizeof(formatting_plugin)
57245 +       },
57246 +       [REISER4_PERM_PLUGIN_TYPE] = {
57247 +               .type_id = REISER4_PERM_PLUGIN_TYPE,
57248 +               .label = "perm",
57249 +               .desc = "Permission checks",
57250 +               .builtin_num = sizeof_array(perm_plugins),
57251 +               .builtin = perm_plugins,
57252 +               .plugins_list = {NULL, NULL},
57253 +               .size = sizeof(perm_plugin)
57254 +       },
57255 +       [REISER4_ITEM_PLUGIN_TYPE] = {
57256 +               .type_id = REISER4_ITEM_PLUGIN_TYPE,
57257 +               .label = "item",
57258 +               .desc = "Item handlers",
57259 +               .builtin_num = sizeof_array(item_plugins),
57260 +               .builtin = item_plugins,
57261 +               .plugins_list = {NULL, NULL},
57262 +               .size = sizeof(item_plugin)
57263 +       },
57264 +       [REISER4_NODE_PLUGIN_TYPE] = {
57265 +               .type_id = REISER4_NODE_PLUGIN_TYPE,
57266 +               .label = "node",
57267 +               .desc = "node layout handlers",
57268 +               .builtin_num = sizeof_array(node_plugins),
57269 +               .builtin = node_plugins,
57270 +               .plugins_list = {NULL, NULL},
57271 +               .size = sizeof(node_plugin)
57272 +       },
57273 +       [REISER4_SD_EXT_PLUGIN_TYPE] = {
57274 +               .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
57275 +               .label = "sd_ext",
57276 +               .desc = "Parts of stat-data",
57277 +               .builtin_num = sizeof_array(sd_ext_plugins),
57278 +               .builtin = sd_ext_plugins,
57279 +               .plugins_list = {NULL, NULL},
57280 +               .size = sizeof(sd_ext_plugin)
57281 +       },
57282 +       [REISER4_FORMAT_PLUGIN_TYPE] = {
57283 +               .type_id = REISER4_FORMAT_PLUGIN_TYPE,
57284 +               .label = "disk_layout",
57285 +               .desc = "defines filesystem on disk layout",
57286 +               .builtin_num = sizeof_array(format_plugins),
57287 +               .builtin = format_plugins,
57288 +               .plugins_list = {NULL, NULL},
57289 +               .size = sizeof(disk_format_plugin)
57290 +       },
57291 +       [REISER4_JNODE_PLUGIN_TYPE] = {
57292 +               .type_id = REISER4_JNODE_PLUGIN_TYPE,
57293 +               .label = "jnode",
57294 +               .desc = "defines kind of jnode",
57295 +               .builtin_num = sizeof_array(jnode_plugins),
57296 +               .builtin = jnode_plugins,
57297 +               .plugins_list = {NULL, NULL},
57298 +               .size = sizeof(jnode_plugin)
57299 +       },
57300 +       [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
57301 +               .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
57302 +               .label = "compression_mode",
57303 +               .desc = "Defines compression mode",
57304 +               .builtin_num = sizeof_array(compression_mode_plugins),
57305 +               .builtin = compression_mode_plugins,
57306 +               .plugins_list = {NULL, NULL},
57307 +               .size = sizeof(compression_mode_plugin)
57308 +       },
57309 +       [REISER4_CLUSTER_PLUGIN_TYPE] = {
57310 +               .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
57311 +               .label = "cluster",
57312 +               .desc = "Defines cluster size",
57313 +               .builtin_num = sizeof_array(cluster_plugins),
57314 +               .builtin = cluster_plugins,
57315 +               .plugins_list = {NULL, NULL},
57316 +               .size = sizeof(cluster_plugin)
57317 +       }
57318 +};
57319 +
57320 +/*
57321 + * Local variables:
57322 + * c-indentation-style: "K&R"
57323 + * mode-name: "LC"
57324 + * c-basic-offset: 8
57325 + * tab-width: 8
57326 + * fill-column: 120
57327 + * End:
57328 + */
57329 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/plugin.h linux-2.6.27/fs/reiser4/plugin/plugin.h
57330 --- linux-2.6.27.orig/fs/reiser4/plugin/plugin.h        1970-01-01 03:00:00.000000000 +0300
57331 +++ linux-2.6.27/fs/reiser4/plugin/plugin.h     2008-10-12 18:20:01.000000000 +0400
57332 @@ -0,0 +1,937 @@
57333 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
57334 +
57335 +/* Basic plugin data-types.
57336 +   see fs/reiser4/plugin/plugin.c for details */
57337 +
57338 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
57339 +#define __FS_REISER4_PLUGIN_TYPES_H__
57340 +
57341 +#include "../forward.h"
57342 +#include "../debug.h"
57343 +#include "../dformat.h"
57344 +#include "../key.h"
57345 +#include "compress/compress.h"
57346 +#include "crypto/cipher.h"
57347 +#include "plugin_header.h"
57348 +#include "item/static_stat.h"
57349 +#include "item/internal.h"
57350 +#include "item/sde.h"
57351 +#include "item/cde.h"
57352 +#include "item/item.h"
57353 +#include "node/node.h"
57354 +#include "node/node40.h"
57355 +#include "security/perm.h"
57356 +#include "fibration.h"
57357 +
57358 +#include "space/bitmap.h"
57359 +#include "space/space_allocator.h"
57360 +
57361 +#include "disk_format/disk_format40.h"
57362 +#include "disk_format/disk_format.h"
57363 +
57364 +#include <linux/fs.h>          /* for struct super_block, address_space  */
57365 +#include <linux/mm.h>          /* for struct page */
57366 +#include <linux/buffer_head.h> /* for struct buffer_head */
57367 +#include <linux/dcache.h>      /* for struct dentry */
57368 +#include <linux/types.h>
57369 +#include <linux/crypto.h>
57370 +
57371 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;   /* defined after file_plugin, which refers to it */
57372 +
57373 +/*
57374 + * File plugin.  Defines the set of methods that file plugins implement, some
57375 + * of which are optional.
57376 + *
57377 + * A file plugin offers to the caller an interface for IO (writing to and/or
57378 + * reading from) to what the caller sees as one sequence of bytes.  An IO to it
57379 + * may affect more than one physical sequence of bytes, or no physical sequence
57380 + * of bytes, it may affect sequences of bytes offered by other file plugins to
57381 + * the semantic layer, and the file plugin may invoke other plugins and
57382 + * delegate work to them, but its interface is structured for offering the
57383 + * caller the ability to read and/or write what the caller sees as being a
57384 + * single sequence of bytes.
57385 + *
57386 + * The file plugin must present a sequence of bytes to the caller, but it does
57387 + * not necessarily have to store a sequence of bytes, it does not necessarily
57388 + * have to support efficient tree traversal to any offset in the sequence of
57389 + * bytes (tail and extent items, whose keys contain offsets, do however provide
57390 + * efficient non-sequential lookup of any offset in the sequence of bytes).
57391 + *
57392 + * Directory plugins provide methods for selecting file plugins by resolving a
57393 + * name for them.
57394 + *
57395 + * The functionality other filesystems call an attribute, and rigidly tie
57396 + * together, we decompose into orthogonal selectable features of files.  Using
57397 + * the terminology we will define next, an attribute is a perhaps constrained,
57398 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
57399 + * which might be grandparent-major-packed, and whose parent has a deletion
57400 + * method that deletes it.
57401 + *
57402 + * File plugins can implement constraints.
57403 + *
57404 + * Files can be of variable length (e.g. regular unix files), or of static
57405 + * length (e.g. static sized attributes).
57406 + *
57407 + * An object may have many sequences of bytes, and many file plugins, but, it
57408 + * has exactly one objectid.  It is usually desirable that an object has a
57409 + * deletion method which deletes every item with that objectid.  Items cannot
57410 + * in general be found by just their objectids.  This means that an object must
57411 + * have either a method built into its deletion plugin method for knowing what
57412 + * items need to be deleted, or links stored with the object that provide the
57413 + * plugin with a method for finding those items.  Deleting a file within an
57414 + * object may or may not have the effect of deleting the entire object,
57415 + * depending on the file plugin's deletion method.
57416 + *
57417 + * LINK TAXONOMY:
57418 + *
57419 + * Many objects have a reference count, and when the reference count reaches 0
57420 + * the object's deletion method is invoked.  Some links embody a reference
57421 + * count increase ("countlinks"), and others do not ("nocountlinks").
57422 + *
57423 + * Some links are bi-directional links ("bilinks"), and some are
57424 + * uni-directional ("unilinks").
57425 + *
57426 + * Some links are between parts of the same object ("intralinks"), and some are
57427 + * between different objects ("interlinks").
57428 + *
57429 + * PACKING TAXONOMY:
57430 + *
57431 + * Some items of an object are stored with a major packing locality based on
57432 + * their object's objectid (e.g. unix directory items in plan A), and these are
57433 + * called "self-major-packed".
57434 + *
57435 + * Some items of an object are stored with a major packing locality based on
57436 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
57437 + * and these are called "parent-major-packed".
57438 + *
57439 + * Some items of an object are stored with a major packing locality based on
57440 + * their semantic grandparent, and these are called "grandparent-major-packed".
57441 + * Now carefully notice that we run into trouble with key length if we have to
57442 + * store an 8 byte major+minor grandparent based packing locality, an 8 byte
57443 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
57444 + * a 24 byte key.  One of these fields must be sacrificed if an item is to be
57445 + * grandparent-major-packed, and which to sacrifice is left to the item author
57446 + * choosing to make the item grandparent-major-packed.  You cannot make tail
57447 + * items and extent items grandparent-major-packed, though you could make them
57448 + * self-major-packed (usually they are parent-major-packed).
57449 + *
57450 + * In the case of ACLs (which are composed of fixed length ACEs which consist
57451 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
57452 + * to not have an offset field in the ACE item key, and to allow duplicate keys
57453 + * for ACEs.  Thus, the set of ACEs for a given file is found by looking for a
57454 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
57455 + * a directory together), the minor packing locality of ACE, the objectid of
57456 + * the file, and 0.
57457 + *
57458 + * IO involves moving data from one location to another, which means that two
57459 + * locations must be specified, source and destination.
57460 + *
57461 + * This source and destination can be in the filesystem, or they can be a
57462 + * pointer in the user process address space plus a byte count.
57463 + *
57464 + * If both source and destination are in the filesystem, then at least one of
57465 + * them must be representable as a pure stream of bytes (which we call a flow,
57466 + * and define as a struct containing a key, a data pointer, and a length).
57467 + * This may mean converting one of them into a flow.  We provide a generic
57468 + * cast_into_flow() method, which will work for any plugin supporting
57469 + * read_flow(), though it is inefficiently implemented in that it temporarily
57470 + * stores the flow in a buffer (Question: what to do with huge flows that
57471 + * cannot fit into memory?  Answer: we must not convert them all at once.)
57472 + *
57473 + * Performing a write requires resolving the write request into a flow defining
57474 + * the source, and a method that performs the write, and a key that defines
57475 + * where in the tree the write is to go.
57476 + *
57477 + * Performing a read requires resolving the read request into a flow defining
57478 + * the target, and a method that performs the read, and a key that defines
57479 + * where in the tree the read is to come from.
57480 + *
57481 + * There will exist file plugins which have no pluginid stored on the disk for
57482 + * them, and which are only invoked by other plugins.
57483 + */
57484 +
57485 +/* Plugin library version. It should be incremented with each new
57486 +   contributed pair (plugin type, plugin id).
57487 +   NOTE: Make sure there is a release of reiser4progs
57488 +   with the corresponding version number. */
57489 +#define PLUGIN_LIBRARY_VERSION 0
57490 +
57491 + /* enumeration of the fields (members) of a plugin set */
57492 +typedef enum {
57493 +       PSET_FILE,
57494 +       PSET_DIR,               /* PSET_FILE and PSET_DIR should be first elements:
57495 +                                * inode.c:read_inode() depends on this. */
57496 +       PSET_PERM,
57497 +       PSET_FORMATTING,
57498 +       PSET_HASH,
57499 +       PSET_FIBRATION,
57500 +       PSET_SD,
57501 +       PSET_DIR_ITEM,
57502 +       PSET_CIPHER,
57503 +       PSET_DIGEST,
57504 +       PSET_COMPRESSION,
57505 +       PSET_COMPRESSION_MODE,
57506 +       PSET_CLUSTER,
57507 +       PSET_CREATE,
57508 +       PSET_LAST               /* end marker (presumably the number of pset members) */
57509 +} pset_member;
57510 +
57511 +/* ids of builtin file plugins */
57512 +typedef enum {
57513 +       /* regular file */
57514 +       UNIX_FILE_PLUGIN_ID,
57515 +       /* directory */
57516 +       DIRECTORY_FILE_PLUGIN_ID,
57517 +       /* symlink */
57518 +       SYMLINK_FILE_PLUGIN_ID,
57519 +       /* for objects completely handled by the VFS: fifos, devices,
57520 +          sockets  */
57521 +       SPECIAL_FILE_PLUGIN_ID,
57522 +       /* regular cryptcompress file */
57523 +       CRYPTCOMPRESS_FILE_PLUGIN_ID,
57524 +       /* number of file plugins. Used as size of arrays to hold
57525 +          file plugins (e.g. file_plugins[]). */
57526 +       LAST_FILE_PLUGIN_ID
57527 +} reiser4_file_id;
57528 +
57529 +typedef struct file_plugin {
57530 +
57531 +       /* generic fields */
57532 +       plugin_header h;
57533 +
57534 +       /* VFS methods.
57535 +        * Must be invariant with respect to plugin conversion.
57536 +        * It can be achieved by using "common" methods, which
57537 +        * are the same for all plugins that take part in
57538 +        * conversion, or by using "generic" or "careful" methods,
57539 +        * which provide automatic redirection to proper private
57540 +        * plugin methods ("careful" are the same as "generic",
57541 +        * but with protection of pset and other disk structures
57542 +        * from being rebuilt during conversion).
57543 +        */
57544 +       struct inode_operations * inode_ops;
57545 +       struct file_operations * file_ops;
57546 +       struct address_space_operations * as_ops;
57547 +       /**
57548 +        * Private methods. These are optional. If used they will allow you
57549 +        * to minimize the amount of code needed to implement a deviation
57550 +        * from some other method that also uses them.
57551 +        */
57552 +       /*
57553 +        * private inode_ops
57554 +        */
57555 +       int (*setattr)(struct dentry *, struct iattr *);
57556 +       /*
57557 +        * private file_ops
57558 +        */
57559 +       /* do whatever is necessary to do when object is opened */
57560 +       int (*open) (struct inode * inode, struct file * file);
57561 +       ssize_t (*read) (struct file *, char __user *buf, size_t read_amount,
57562 +                       loff_t *off);
57563 +       /* write as many bytes as possible from nominated @write_amount
57564 +        * before plugin scheduling occurs. Save scheduling state
57565 +        * in @cont */
57566 +       ssize_t (*write) (struct file *, const char __user *buf,
57567 +                         size_t write_amount, loff_t * off,
57568 +                         struct psched_context * cont);
57569 +       int (*ioctl) (struct inode *inode, struct file *filp,
57570 +                     unsigned int cmd, unsigned long arg);
57571 +       int (*mmap) (struct file *, struct vm_area_struct *);
57572 +       int (*release) (struct inode *, struct file *);
57573 +       /*
57574 +        * private a_ops
57575 +        */
57576 +       int (*readpage) (struct file *file, struct page *page);
57577 +       int (*readpages)(struct file *file, struct address_space *mapping,
57578 +                         struct list_head *pages, unsigned nr_pages);
57579 +       int (*writepages)(struct address_space *mapping,
57580 +                         struct writeback_control *wbc);
57581 +       int (*prepare_write)(struct file *file, struct page *page,
57582 +                            unsigned from, unsigned to);
57583 +       int (*commit_write)(struct file *file, struct page *page,
57584 +                           unsigned from, unsigned to);
57585 +       sector_t (*bmap) (struct address_space * mapping, sector_t lblock);
57586 +       /* other private methods */
57587 +       /* save inode cached stat-data onto disk. It was called
57588 +          reiserfs_update_sd() in 3.x */
57589 +       int (*write_sd_by_inode) (struct inode *);
57590 +       /*
57591 +        * Construct flow into @flow according to user-supplied data.
57592 +        *
57593 +        * This is used by read/write methods to construct a flow to
57594 +        * write/read. ->flow_by_inode() is plugin method, rather than single
57595 +        * global implementation, because key in a flow used by plugin may
57596 +        * depend on data in a @buf.
57597 +        *
57598 +        * NIKITA-FIXME-HANS: please create statistics on what functions are
57599 +        * dereferenced how often for the mongo benchmark.  You can supervise
57600 +        * Elena doing this for you if that helps.  Email me the list of the
57601 +        * top 10, with their counts, and an estimate of the total number of
57602 +        * CPU cycles spent dereferencing as a percentage of CPU cycles spent
57603 +        * processing (non-idle processing).  If the total percent is, say,
57604 +        * less than 1%, it will make our coding discussions much easier, and
57605 +        * keep me from questioning whether functions like the below are too
57606 +        * frequently called to be dereferenced.  If the total percent is more
57607 +        * than 1%, perhaps private methods should be listed in a "required"
57608 +        * comment at the top of each plugin (with stern language about how if
57609 +        * the comment is missing it will not be accepted by the maintainer),
57610 +        * and implemented using macros not dereferenced functions.  How about
57611 +        * replacing this whole private methods part of the struct with a
57612 +        * thorough documentation of what the standard helper functions are for
57613 +        * use in constructing plugins?  I think users have been asking for
57614 +        * that, though not in so many words.
57615 +        */
57616 +       int (*flow_by_inode) (struct inode *, const char __user *buf,
57617 +                             int user, loff_t size,
57618 +                             loff_t off, rw_op op, flow_t *);
57619 +       /*
57620 +        * Return the key used to retrieve an offset of a file. It is used by
57621 +        * default implementation of ->flow_by_inode() method
57622 +        * (common_build_flow()) and, among other things, to get to the extent
57623 +        * from jnode of unformatted node.
57624 +        */
57625 +       int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
57626 +
57627 +       /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
57628 +       /*
57629 +        * set the plugin for a file.  Called during file creation in creat()
57630 +        * but not reiser4() unless an inode already exists for the file.
57631 +        */
57632 +       int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
57633 +                                 reiser4_object_create_data *);
57634 +
57635 +       /* NIKITA-FIXME-HANS: comment and name seem to say different things,
57636 +        * are you setting up the object itself also or just adjusting the
57637 +        * parent?.... */
57638 +       /* set up plugins for new @object created in @parent. @root is root
57639 +          directory. */
57640 +       int (*adjust_to_parent) (struct inode *object, struct inode *parent,
57641 +                                struct inode *root);
57642 +       /*
57643 +        * this does whatever is necessary to do when object is created. For
57644 +        * instance, for unix files stat data is inserted. It is supposed to be
57645 +        * called by create of struct inode_operations.
57646 +        */
57647 +       int (*create_object) (struct inode *object, struct inode *parent,
57648 +                             reiser4_object_create_data *);
57649 +       /*
57650 +        * this method should check REISER4_NO_SD and set REISER4_NO_SD on
57651 +        * success. Deletion of an object usually includes removal of items
57652 +        * building file body (for directories this is removal of "." and "..")
57653 +        * and removal of stat-data item.
57654 +        */
57655 +       int (*delete_object) (struct inode *);
57656 +
57657 +       /* add link from @parent to @object */
57658 +       int (*add_link) (struct inode *object, struct inode *parent);
57659 +
57660 +       /* remove link from @parent to @object */
57661 +       int (*rem_link) (struct inode *object, struct inode *parent);
57662 +
57663 +       /*
57664 +        * return true if item addressed by @coord belongs to @inode.  This is
57665 +        * used by read/write to properly slice flow into items in presence of
57666 +        * multiple key assignment policies, because items of a file are not
57667 +        * necessarily contiguous in a key space, for example, in a plan-b.
57668 +        */
57669 +       int (*owns_item) (const struct inode *, const coord_t *);
57670 +
57671 +       /* checks whether yet another hard link to this object can be
57672 +          added  */
57673 +       int (*can_add_link) (const struct inode *);
57674 +
57675 +       /* checks whether hard links to this object can be removed */
57676 +       int (*can_rem_link) (const struct inode *);
57677 +
57678 +       /* not empty for DIRECTORY_FILE_PLUGIN_ID only currently. It calls
57679 +          detach of directory plugin to remove ".." */
57680 +       int (*detach) (struct inode * child, struct inode * parent);
57681 +
57682 +       /* called when @child was just looked up in the @parent. It is not
57683 +          empty for DIRECTORY_FILE_PLUGIN_ID only where it calls attach of
57684 +          directory plugin */
57685 +       int (*bind) (struct inode * child, struct inode * parent);
57686 +
57687 +       /* process safe-link during mount */
57688 +       int (*safelink) (struct inode * object, reiser4_safe_link_t link,
57689 +                        __u64 value);
57690 +
57691 +       /* Estimate methods for file operations; each returns a reiser4_block_nr */
57692 +       struct {
57693 +               reiser4_block_nr(*create) (const struct inode *);
57694 +               reiser4_block_nr(*update) (const struct inode *);
57695 +               reiser4_block_nr(*unlink) (const struct inode *,
57696 +                                          const struct inode *);
57697 +       } estimate;
57698 +
57699 +       /*
57700 +        * reiser4 specific part of inode has a union of structures which are
57701 +        * specific to a plugin. This method is called when inode is read
57702 +        * (read_inode) and when file is created (common_create_child) so that
57703 +        * file plugin could initialize its inode data
57704 +        */
57705 +       void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
57706 +                                int);
57707 +
57708 +       /*
57709 +        * This method performs progressive deletion of items and whole nodes
57710 +        * from right to left.
57711 +        *
57712 +        * @tap: the point deletion process begins from,
57713 +        * @from_key: the beginning of the deleted key range,
57714 +        * @to_key: the end of the deleted key range,
57715 +        * @smallest_removed: the smallest removed key,
57716 +        *
57717 +        * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
57718 +        * cut_tree operation was interrupted to allow an atom commit.
57719 +        */
57720 +       int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
57721 +                               const reiser4_key * to_key,
57722 +                               reiser4_key * smallest_removed, struct inode *,
57723 +                               int, int *);
57724 +
57725 +       /* called from ->destroy_inode() */
57726 +       void (*destroy_inode) (struct inode *);
57727 +
57728 +       /*
57729 +        * methods to serialize object identity. This is used, for example, by
57730 +        * reiser4_{en,de}code_fh().
57731 +        */
57732 +       struct {
57733 +               /* store object's identity at @area */
57734 +               char *(*write) (struct inode * inode, char *area);
57735 +               /* parse object from wire to the @obj */
57736 +               char *(*read) (char *area, reiser4_object_on_wire * obj);
57737 +               /* given object identity in @obj, find or create its dentry */
57738 +               struct dentry *(*get) (struct super_block * s,
57739 +                                      reiser4_object_on_wire * obj);
57740 +               /* how many bytes ->wire.write() consumes */
57741 +               int (*size) (struct inode * inode);
57742 +               /* finish with object identity */
57743 +               void (*done) (reiser4_object_on_wire * obj);
57744 +       } wire;
57745 +} file_plugin;
57746 +
57747 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
57748 +
57749 +struct reiser4_object_on_wire {  /* object identity handled by the file_plugin ->wire methods */
57750 +       file_plugin *plugin;
57751 +       union {
57752 +               struct {
57753 +                       obj_key_id key_id;
57754 +               } std;
57755 +               void *generic;
57756 +       } u;
57757 +};
57758 +
57759 +/* ids of builtin dir plugins */
57760 +typedef enum {
57761 +       HASHED_DIR_PLUGIN_ID,
57762 +       SEEKABLE_HASHED_DIR_PLUGIN_ID,
57763 +       LAST_DIR_ID     /* end marker; used as the size of dir_plugins[] */
57764 +} reiser4_dir_id;
57765 +
57766 +typedef struct dir_plugin {
57767 +       /* generic fields */
57768 +       plugin_header h;
57769 +
57770 +       struct inode_operations * inode_ops;
57771 +       struct file_operations * file_ops;
57772 +       struct address_space_operations * as_ops;
57773 +
57774 +       /*
57775 +        * private methods: These are optional.  If used they will allow you to
57776 +        * minimize the amount of code needed to implement a deviation from
57777 +        * some other method that uses them.  You could logically argue that
57778 +        * they should be a separate type of plugin.
57779 +        */
57780 +
57781 +       struct dentry *(*get_parent) (struct inode * childdir);
57782 +
57783 +       /*
57784 +        * check whether "name" is an acceptable name to be inserted into this
57785 +        * object. Optionally implemented by directory-like objects.  Can check
57786 +        * for maximal length, reserved symbols etc.
57787 +        */
57788 +       int (*is_name_acceptable) (const struct inode * inode, const char *name,
57789 +                                  int len);
57790 +
57791 +       void (*build_entry_key) (const struct inode * dir       /* directory where
57792 +                                                                * entry is (or will
57793 +                                                                * be) in.*/ ,
57794 +                                const struct qstr * name       /* name of file
57795 +                                                                * referenced by this
57796 +                                                                * entry */ ,
57797 +                                reiser4_key * result   /* resulting key of
57798 +                                                        * directory entry */ );
57799 +       int (*build_readdir_key) (struct file * dir, reiser4_key * result);
57800 +       int (*add_entry) (struct inode * object, struct dentry * where,
57801 +                         reiser4_object_create_data * data,
57802 +                         reiser4_dir_entry_desc * entry);
57803 +       int (*rem_entry) (struct inode * object, struct dentry * where,
57804 +                         reiser4_dir_entry_desc * entry);
57805 +
57806 +       /*
57807 +        * initialize directory structure for newly created object. For normal
57808 +        * unix directories, insert dot and dotdot.
57809 +        */
57810 +       int (*init) (struct inode * object, struct inode * parent,
57811 +                    reiser4_object_create_data * data);
57812 +
57813 +       /* destroy directory */
57814 +       int (*done) (struct inode * child);
57815 +
57816 +       /* called when @subdir was just looked up in the @dir */
57817 +       int (*attach) (struct inode * subdir, struct inode * dir);
57818 +       int (*detach) (struct inode * subdir, struct inode * dir);
57819 +
57820 +       struct {        /* estimate methods; each returns a reiser4_block_nr */
57821 +               reiser4_block_nr(*add_entry) (const struct inode *);
57822 +               reiser4_block_nr(*rem_entry) (const struct inode *);
57823 +               reiser4_block_nr(*unlink) (const struct inode *,
57824 +                                          const struct inode *);
57825 +       } estimate;
57826 +} dir_plugin;
57827 +
57828 +extern dir_plugin dir_plugins[LAST_DIR_ID];
57829 +
57830 +typedef struct formatting_plugin {
57831 +       /* generic fields (plugin_header is the first member) */
57832 +       plugin_header h;
57833 +       /* returns non-zero iff file's tail has to be stored in a
57834 +          direct item; the decision is made from @inode and @size. */
57835 +       int (*have_tail) (const struct inode * inode, loff_t size);
57836 +} formatting_plugin;
57837 +
57838 +typedef struct hash_plugin {
57839 +       /* generic fields (plugin_header is the first member) */
57840 +       plugin_header h;
57841 +       /* computes 64-bit hash of the given @name of length @len */
57842 +        __u64(*hash) (const unsigned char *name, int len);
57843 +} hash_plugin;
57844 +
57845 +typedef struct cipher_plugin {
57846 +       /* generic fields; alloc()/free() below return/take a blkcipher tfm */
57847 +       plugin_header h;
57848 +       struct crypto_blkcipher * (*alloc) (void);
57849 +       void (*free) (struct crypto_blkcipher * tfm);
57850 +       /* Offset translator. For each offset this returns (k * offset), where
57851 +          k (k >= 1) is an expansion factor of the cipher algorithm.
57852 +          For all symmetric algorithms k == 1. For asymmetric algorithms (which
57853 +          inflate data) offset translation guarantees that all units of a disk
57854 +          cluster will have keys smaller than those of the next cluster.
57855 +        */
57856 +        loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
57857 +       /* Cipher algorithms can accept data only in chunks of cipher block
57858 +          size. This method aligns any flow up to cipher block size before
57859 +          we pass it to cipher algorithm. To align means to append padding of
57860 +          special format specific to the cipher algorithm */
57861 +       int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
57862 +       /* low-level key manager (check, install, etc..) */
57863 +       int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
57864 +                      unsigned int keylen);
57865 +       /* main text processing procedures: transform @src into @dst */
57866 +       void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57867 +       void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
57868 +} cipher_plugin;
57869 +
57870 +typedef struct digest_plugin {
57871 +       /* generic fields; alloc()/free() below return/take a crypto_hash tfm */
57872 +       plugin_header h;
57873 +       /* fingerprint (digest) size in bytes */
57874 +       int fipsize;
57875 +       struct crypto_hash * (*alloc) (void);
57876 +       void (*free) (struct crypto_hash * tfm);
57877 +} digest_plugin;
57878 +
57879 +typedef struct compression_plugin {
57880 +       /* generic fields (plugin_header is the first member) */
57881 +       plugin_header h;
57882 +       int (*init) (void);
57883 +       /* the maximum number of bytes by which the size of the "compressed"
57884 +        * data can exceed the size of uncompressed input of size @src_len. */
57885 +       int (*overrun) (unsigned src_len);
57886 +        coa_t(*alloc) (tfm_action act);
57887 +       void (*free) (coa_t coa, tfm_action act);
57888 +       /* minimal size of the flow we still try to compress */
57889 +       int (*min_size_deflate) (void);
57890 +        __u32(*checksum) (char *data, __u32 length);
57891 +       /* main transform procedures; NOTE(review): @dst_len looks in/out */
57892 +       void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
57893 +                         __u8 * dst_first, unsigned *dst_len);
57894 +       void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
57895 +                           __u8 * dst_first, unsigned *dst_len);
57896 +} compression_plugin;
57897 +
57898 +typedef struct compression_mode_plugin {
57899 +       /* generic fields (plugin_header is the first member) */
57900 +       plugin_header h;
57901 +       /* this is called when estimating compressibility of a logical
57902 +          cluster by its content; @index presumably selects the cluster */
57903 +       int (*should_deflate) (struct inode * inode, cloff_t index);
57904 +       /* this is called when results of compression should be saved */
57905 +       int (*accept_hook) (struct inode * inode, cloff_t index);
57906 +       /* this is called when results of compression should be discarded */
57907 +       int (*discard_hook) (struct inode * inode, cloff_t index);
57908 +} compression_mode_plugin;
57909 +
57910 +typedef struct cluster_plugin {
57911 +       /* generic fields (plugin_header is the first member) */
57912 +       plugin_header h;
57913 +       int shift; /* NOTE(review): presumably log2 of cluster size -- confirm */
57914 +} cluster_plugin;
57915 +
57916 +typedef struct sd_ext_plugin {
57917 +       /* generic fields (plugin_header is the first member) */
57918 +       plugin_header h;
57919 +       int (*present) (struct inode * inode, char **area, int *len);
57920 +       int (*absent) (struct inode * inode);
57921 +       int (*save_len) (struct inode * inode); /* presumably size ->save() needs */
57922 +       int (*save) (struct inode * inode, char **area);
57923 +       /* alignment requirement for this stat-data part */
57924 +       int alignment;
57925 +} sd_ext_plugin;
57926 +
57927 +/* this plugin contains methods to allocate an objectid for a newly created
57928 +   file, to deallocate the objectid when the file gets removed, and to report
57929 +   the number of used and free objectids */
57930 +typedef struct oid_allocator_plugin {
57931 +       /* generic fields (plugin_header is the first member) */
57932 +       plugin_header h;
57933 +       int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
57934 +                                  __u64 oids);
57935 +       /* used to report statfs->f_files */
57936 +        __u64(*oids_used) (reiser4_oid_allocator * map);
57937 +       /* get next oid to use */
57938 +        __u64(*next_oid) (reiser4_oid_allocator * map);
57939 +       /* used to report statfs->f_ffree */
57940 +        __u64(*oids_free) (reiser4_oid_allocator * map);
57941 +       /* allocate new objectid (presumably stored via the oid_t pointer) */
57942 +       int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
57943 +       /* release given objectid */
57944 +       int (*release_oid) (reiser4_oid_allocator * map, oid_t);
57945 +       /* how many pages to reserve in transaction for allocation of new
57946 +          objectid */
57947 +       int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
57948 +       /* how many pages to reserve in transaction for freeing of an
57949 +          objectid */
57950 +       int (*oid_reserve_release) (reiser4_oid_allocator * map);
57951 +       void (*print_info) (const char *, reiser4_oid_allocator *);
57952 +} oid_allocator_plugin;
57953 +
57954 +/* disk layout plugin: this specifies locations of super block, journal,
57955 +   bitmap (if there are any), etc */
57956 +typedef struct disk_format_plugin {
57957 +       /* generic fields (plugin_header is the first member) */
57958 +       plugin_header h;
57959 +       /* replay journal, initialize super_info_data, etc; @data is opaque */
57960 +       int (*init_format) (struct super_block *, void *data);
57961 +
57962 +       /* returns key of root directory stat data */
57963 +       const reiser4_key *(*root_dir_key) (const struct super_block *);
57964 +
57965 +       int (*release) (struct super_block *);
57966 +       jnode *(*log_super) (struct super_block *);
57967 +       int (*check_open) (const struct inode * object);
57968 +       int (*version_update) (struct super_block *);
57969 +} disk_format_plugin;
57970 +
57971 +struct jnode_plugin {
57972 +       /* generic fields; jnode_plugin typedef itself is declared elsewhere */
57973 +       plugin_header h;
57974 +       int (*init) (jnode * node);
57975 +       int (*parse) (jnode * node);
57976 +       struct address_space *(*mapping) (const jnode * node);
57977 +       unsigned long (*index) (const jnode * node);
57978 +       jnode *(*clone) (jnode * node);
57979 +};
57980 +
57981 +/* plugin instance.                                                         */
57982 +/*                                                                          */
57983 +/* This is "wrapper" union for all types of plugins. Most of the code uses  */
57984 +/* plugins of particular type (file_plugin, dir_plugin, etc.)  rather than  */
57985 +/* operating on pointers to reiser4_plugin. This union is only used in      */
57986 +/* some generic code in plugin/plugin.c that operates on all                */
57987 +/* plugins. Technically speaking purpose of this union is to add type       */
57988 +/* safety to said generic code: each plugin type (file_plugin, for          */
57989 +/* example) contains plugin_header as its first member. This first member   */
57990 +/* is located at the same place in memory as .h member of                   */
57991 +/* reiser4_plugin. Generic code obtains pointer to reiser4_plugin and       */
57992 +/* looks in the .h which is header of plugin type located in union. This    */
57993 +/* allows avoiding type-casts.                                              */
57994 +union reiser4_plugin {
57995 +       /* generic fields */
57996 +       plugin_header h;
57997 +       /* file plugin */
57998 +       file_plugin file;
57999 +       /* directory plugin */
58000 +       dir_plugin dir;
58001 +       /* hash plugin, used by directory plugin */
58002 +       hash_plugin hash;
58003 +       /* fibration plugin used by directory plugin */
58004 +       fibration_plugin fibration;
58005 +       /* cipher transform plugin, used by file plugin */
58006 +       cipher_plugin cipher;
58007 +       /* digest transform plugin, used by file plugin */
58008 +       digest_plugin digest;
58009 +       /* compression transform plugin, used by file plugin */
58010 +       compression_plugin compression;
58011 +       /* formatting (tail policy) plugin, used by file plugin */
58012 +       formatting_plugin formatting;
58013 +       /* permission plugin */
58014 +       perm_plugin perm;
58015 +       /* node plugin */
58016 +       node_plugin node;
58017 +       /* item plugin */
58018 +       item_plugin item;
58019 +       /* stat-data extension plugin */
58020 +       sd_ext_plugin sd_ext;
58021 +       /* disk layout plugin */
58022 +       disk_format_plugin format;
58023 +       /* object id allocator plugin */
58024 +       oid_allocator_plugin oid_allocator;
58025 +       /* plugin for different jnode types */
58026 +       jnode_plugin jnode;
58027 +       /* compression mode plugin, used by object plugin */
58028 +       compression_mode_plugin compression_mode;
58029 +       /* cluster plugin, used by object plugin */
58030 +       cluster_plugin clust;
58031 +       /* place-holder for new plugin types that can be registered
58032 +          dynamically, and used by other dynamically loaded plugins.  */
58033 +       void *generic;
58034 +};
58035 +
58036 +struct reiser4_plugin_ops {
58037 +       /* called when plugin is initialized */
58038 +       int (*init) (reiser4_plugin * plugin);
58039 +       /* called when plugin is unloaded */
58040 +       int (*done) (reiser4_plugin * plugin);
58041 +       /* load given plugin from disk */
58042 +       int (*load) (struct inode * inode,
58043 +                    reiser4_plugin * plugin, char **area, int *len);
58044 +       /* how much space is required to store this plugin's state
58045 +          in stat-data */
58046 +       int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
58047 +       /* save persistent plugin-data to disk */
58048 +       int (*save) (struct inode * inode, reiser4_plugin * plugin,
58049 +                    char **area);
58050 +       /* alignment requirement for on-disk state of this plugin
58051 +          in number of bytes */
58052 +       int alignment;
58053 +       /* install itself into given inode. This can return error
58054 +          (e.g., you cannot change hash of non-empty directory). */
58055 +       int (*change) (struct inode * inode, reiser4_plugin * plugin,
58056 +                      pset_member memb);
58057 +       /* NOTE(review): comment here was a copy of the ->change() one;
58058 +          presumably installs @plugin into @inode as inherited from @parent */
58059 +       int (*inherit) (struct inode * inode, struct inode * parent,
58060 +                       reiser4_plugin * plugin);
58061 +};
58062 +
58063 +/* functions implemented in fs/reiser4/plugin/plugin.c */
58064 +
58065 +/* stores plugin reference in reiser4-specific part of inode */
58066 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
58067 +extern int init_plugins(void);
58068 +
58069 +/* builtin plugins */
58070 +
58071 +/* builtin hash plugins; LAST_HASH_ID is the size of hash_plugins[] */
58072 +
58073 +typedef enum {
58074 +       RUPASOV_HASH_ID,
58075 +       R5_HASH_ID,
58076 +       TEA_HASH_ID,
58077 +       FNV1_HASH_ID,
58078 +       DEGENERATE_HASH_ID,
58079 +       LAST_HASH_ID
58080 +} reiser4_hash_id;
58081 +
58082 +/* builtin cipher plugins; LAST_CIPHER_ID is the size of cipher_plugins[] */
58083 +
58084 +typedef enum {
58085 +       NONE_CIPHER_ID,
58086 +       LAST_CIPHER_ID
58087 +} reiser4_cipher_id;
58088 +
58089 +/* builtin digest plugins; LAST_DIGEST_ID is the size of digest_plugins[] */
58090 +
58091 +typedef enum {
58092 +       SHA256_32_DIGEST_ID,
58093 +       LAST_DIGEST_ID
58094 +} reiser4_digest_id;
58095 +
58096 +/* builtin compression mode plugins; LAST_* sizes compression_mode_plugins[] */
58097 +typedef enum {
58098 +       NONE_COMPRESSION_MODE_ID,
58099 +       LATTD_COMPRESSION_MODE_ID,
58100 +       ULTIM_COMPRESSION_MODE_ID,
58101 +       FORCE_COMPRESSION_MODE_ID,
58102 +       CONVX_COMPRESSION_MODE_ID,
58103 +       LAST_COMPRESSION_MODE_ID
58104 +} reiser4_compression_mode_id;
58105 +
58106 +/* builtin cluster plugins; LAST_CLUSTER_ID is the size of cluster_plugins[] */
58107 +typedef enum {
58108 +       CLUSTER_64K_ID,
58109 +       CLUSTER_32K_ID,
58110 +       CLUSTER_16K_ID,
58111 +       CLUSTER_8K_ID,
58112 +       CLUSTER_4K_ID,
58113 +       LAST_CLUSTER_ID
58114 +} reiser4_cluster_id;
58115 +
58116 +/* builtin formatting (tail policy) plugins; last id sizes formatting_plugins[] */
58117 +
58118 +typedef enum {
58119 +       NEVER_TAILS_FORMATTING_ID,
58120 +       ALWAYS_TAILS_FORMATTING_ID,
58121 +       SMALL_FILE_FORMATTING_ID,
58122 +       LAST_TAIL_FORMATTING_ID
58123 +} reiser4_formatting_id;
58124 +
58125 +/* data type used to pack the parameters that we pass to the vfs object
58126 +   creation function create_object() */
58127 +struct reiser4_object_create_data {
58128 +       /* id of the file plugin to control created object */
58129 +       reiser4_file_id id;
58130 +       /* mode of regular file, directory or special file */
58131 +/* what happens if some other sort of perm plugin is in use? */
58132 +       int mode;
58133 +       /* rdev of special file */
58134 +       dev_t rdev;
58135 +       /* symlink target */
58136 +       const char *name;
58137 +       /* add here something for non-standard objects you invent, like
58138 +          query for interpolation file etc. */
58139 +
58140 +       struct reiser4_crypto_info * crypto;
58141 +
58142 +       struct inode *parent;
58143 +       struct dentry *dentry;
58144 +};
58145 +
58146 +/* description of directory entry being created/destroyed/sought for
58147 +
58148 +   It is passed down to the directory plugin and farther to the
58149 +   directory item plugin methods. Creation of a new directory entry is done
58150 +   in several stages: first we search for an entry with the same name, then
58151 +   create new one. reiser4_dir_entry_desc is used to store some information
58152 +   collected at some stage of this process and required later: key of
58153 +   item that we want to insert/delete and pointer to an object that will
58154 +   be bound by the new directory entry. Probably some more fields will
58155 +   be added there.
58156 +
58157 +*/
58158 +struct reiser4_dir_entry_desc {
58159 +       /* key of directory entry */
58160 +       reiser4_key key;
58161 +       /* object bound by this entry. */
58162 +       struct inode *obj;
58163 +};
58164 +
58165 +#define MAX_PLUGIN_TYPE_LABEL_LEN  32
58166 +#define MAX_PLUGIN_PLUG_LABEL_LEN  32
58167 +/* define TYPE_by_id(), TYPE_by_disk_id(), TYPE_by_unsafe_id(), TYPE_to_plugin(), TYPE_id(); dummy typedef eats ';' */
58168 +#define PLUGIN_BY_ID(TYPE,ID,FIELD)                                    \
58169 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id )             \
58170 +{                                                                      \
58171 +       reiser4_plugin *plugin = plugin_by_id ( ID, id );               \
58172 +       return plugin ? & plugin -> FIELD : NULL;                       \
58173 +}                                                                      \
58174 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
58175 +{                                                                      \
58176 +       reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id );    \
58177 +       return plugin ? & plugin -> FIELD : NULL;                       \
58178 +}                                                                      \
58179 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id )      \
58180 +{                                                                      \
58181 +       reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id );        \
58182 +       return plugin ? & plugin -> FIELD : NULL;                       \
58183 +}                                                                      \
58184 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin )       \
58185 +{                                                                      \
58186 +       return ( reiser4_plugin * ) plugin;                             \
58187 +}                                                                      \
58188 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin )            \
58189 +{                                                                      \
58190 +       return TYPE ## _to_plugin (plugin) -> h.id;                     \
58191 +}                                                                      \
58192 +typedef struct { int foo; } TYPE ## _plugin_dummy
58193 +
58194 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
58195 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
58196 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
58197 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
58198 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
58199 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
58200 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
58201 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
58202 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
58203 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
58204 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
58205 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
58206 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
58207 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
58208 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58209 +            compression_mode);
58210 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
58211 +
58212 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
58213 +
58214 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
58215 +/* loop header: walk @plugin (reiser4_plugin *) over get_plugin_list(@ptype) via h.linkage */
58216 +#define for_all_plugins(ptype, plugin)                                                 \
58217 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage);     \
58218 +     get_plugin_list(ptype) != &plugin->h.linkage;                                     \
58219 +     plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
58220 +
58221 +
58222 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
58223 +extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
58224 +extern int finish_pset(struct inode *inode);
58225 +
58226 +/* defined in fs/reiser4/plugin/object.c */
58227 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
58228 +/* defined in fs/reiser4/plugin/object.c */
58229 +extern dir_plugin dir_plugins[LAST_DIR_ID];
58230 +/* defined in fs/reiser4/plugin/item/static_stat.c */
58231 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
58232 +/* defined in fs/reiser4/plugin/hash.c */
58233 +extern hash_plugin hash_plugins[LAST_HASH_ID];
58234 +/* defined in fs/reiser4/plugin/fibration.c */
58235 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
58236 +/* defined in fs/reiser4/plugin/crypt.c */
58237 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
58238 +/* defined in fs/reiser4/plugin/digest.c */
58239 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
58240 +/* defined in fs/reiser4/plugin/compress/compress.c */
58241 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
58242 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
58243 +extern compression_mode_plugin
58244 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
58245 +/* defined in fs/reiser4/plugin/cluster.c */
58246 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
58247 +/* defined in fs/reiser4/plugin/tail.c */
58248 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
58249 +/* defined in fs/reiser4/plugin/security/security.c */
58250 +extern perm_plugin perm_plugins[LAST_PERM_ID];
58251 +/* defined in fs/reiser4/plugin/item/item.c */
58252 +extern item_plugin item_plugins[LAST_ITEM_ID];
58253 +/* defined in fs/reiser4/plugin/node/node.c */
58254 +extern node_plugin node_plugins[LAST_NODE_ID];
58255 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
58256 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
58257 +
58258 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
58259 +#endif
58260 +
58261 +/* Make Linus happy.
58262 +   Local variables:
58263 +   c-indentation-style: "K&R"
58264 +   mode-name: "LC"
58265 +   c-basic-offset: 8
58266 +   tab-width: 8
58267 +   fill-column: 120
58268 +   End:
58269 +*/
58270 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/plugin_header.h linux-2.6.27/fs/reiser4/plugin/plugin_header.h
58271 --- linux-2.6.27.orig/fs/reiser4/plugin/plugin_header.h 1970-01-01 03:00:00.000000000 +0300
58272 +++ linux-2.6.27/fs/reiser4/plugin/plugin_header.h      2008-10-12 18:20:01.000000000 +0400
58273 @@ -0,0 +1,155 @@
58274 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58275 +
58276 +/* plugin header. Data structures required by all plugin types. */
58277 +
58278 +#if !defined( __PLUGIN_HEADER_H__ )
58279 +#define __PLUGIN_HEADER_H__
58280 +
58281 +/* plugin data-types and constants */
58282 +
58283 +#include "../debug.h"
58284 +#include "../dformat.h"
58285 +
58286 +/* Every plugin type can be considered as a class of virtual objects
58287 +   {(type, i) | i = 0, 1, ...}, which has one of the following categories
58288 +   of virtualization:
58289 +   A - no virtualization;
58290 +   F - per-file virtualization;
58291 +   S - per-superblock virtualization;
58292 +   FIXME-EDWARD: Define every such category */
58293 +
58294 +/* Supported plugin types: (id, (virtualization category), short description) */
58295 +typedef enum {
58296 +       REISER4_FILE_PLUGIN_TYPE,             /* (F) service VFS entry-points */
58297 +       REISER4_DIR_PLUGIN_TYPE,              /* (F) service VFS entry-points */
58298 +       REISER4_ITEM_PLUGIN_TYPE,             /* (F) manage items */
58299 +       REISER4_NODE_PLUGIN_TYPE,             /* (S) manage formatted nodes */
58300 +       REISER4_HASH_PLUGIN_TYPE,             /* (F) compute hash */
58301 +       REISER4_FIBRATION_PLUGIN_TYPE,        /* (F) directory fibrations */
58302 +       REISER4_FORMATTING_PLUGIN_TYPE,       /* (F) tail-packing policy */
58303 +       REISER4_PERM_PLUGIN_TYPE,             /*       stub (vacancy)     */
58304 +       REISER4_SD_EXT_PLUGIN_TYPE,           /* (A) stat-data extensions */
58305 +       REISER4_FORMAT_PLUGIN_TYPE,           /* (S) specify disk format */
58306 +       REISER4_JNODE_PLUGIN_TYPE,            /* (A) in-memory node headers */
58307 +       REISER4_CIPHER_PLUGIN_TYPE,           /* (F) cipher transform algs */
58308 +       REISER4_DIGEST_PLUGIN_TYPE,           /* (F) digest transform algs */
58309 +       REISER4_COMPRESSION_PLUGIN_TYPE,      /* (F) compression tfm algs */
58310 +       REISER4_COMPRESSION_MODE_PLUGIN_TYPE, /* (F) compression heuristic */
58311 +       REISER4_CLUSTER_PLUGIN_TYPE,          /* (F) size of logical cluster */
58312 +       REISER4_PLUGIN_TYPES                  /* number of types, sizes plugins[] */
58313 +} reiser4_plugin_type;
58314 +
58315 +/* Supported plugin groups (presumably bit numbers in plugin_header.groups) */
58316 +typedef enum {
58317 +       REISER4_DIRECTORY_FILE,
58318 +       REISER4_REGULAR_FILE,
58319 +       REISER4_SYMLINK_FILE,
58320 +       REISER4_SPECIAL_FILE,
58321 +} file_plugin_group;
58322 +
58323 +struct reiser4_plugin_ops;
58324 +/* generic plugin operations, supported by each
58325 +    plugin type. */
58326 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
58327 +
58328 +/* the common part of all plugin instances. */
58329 +typedef struct plugin_header {
58330 +       /* plugin type */
58331 +       reiser4_plugin_type type_id;
58332 +       /* id of this plugin within its type */
58333 +       reiser4_plugin_id id;
58334 +       /* bitmask of groups the plugin belongs to, see plugin_of_group() */
58335 +       reiser4_plugin_groups groups;
58336 +       /* generic plugin operations (struct reiser4_plugin_ops) */
58337 +       reiser4_plugin_ops *pops;
58338 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
58339 +       /* short label of this plugin */
58340 +       const char *label;
58341 +       /* descriptive string.. */
58342 +       const char *desc;
58343 +       /* list linkage */
58344 +       struct list_head linkage;
58345 +} plugin_header;
58346 +/* non-zero iff @plug belongs to @group: tests bit @group of plug->h.groups */
58347 +#define plugin_of_group(plug, group) ((plug)->h.groups & (1 << (group)))
58348 +
58349 +/* PRIVATE INTERFACES */
58350 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
58351 +/* plugin type representation; one element of plugins[] per plugin type. */
58352 +struct reiser4_plugin_type_data {
58353 +       /* internal plugin type identifier. Should coincide with
58354 +          index of this item in plugins[] array. */
58355 +       reiser4_plugin_type type_id;
58356 +       /* short symbolic label of this plugin type. Should be no longer
58357 +          than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
58358 +       const char *label;
58359 +       /* plugin type description longer than .label */
58360 +       const char *desc;
58361 +
58362 +/* NIKITA-FIXME-HANS: define built-in */
58363 +       /* number of built-in plugin instances of this type */
58364 +       int builtin_num;
58365 +       /* array of built-in plugins, indexed by plugin_at() in ->size steps */
58366 +       void *builtin;
58367 +       struct list_head plugins_list;
58368 +       size_t size; /* size in bytes of one element of ->builtin */
58369 +};
58370 +
58371 +extern struct reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
58372 +
58373 +int is_plugin_type_valid(reiser4_plugin_type type);
58374 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
58375 +/* return @i-th element of @ptype->builtin, elements being ->size bytes apart; @i is not range-checked here */
58376 +static inline reiser4_plugin *plugin_at(struct reiser4_plugin_type_data * ptype,
58377 +                                       int i)
58378 +{
58379 +       char *builtin;
58380 +
58381 +       builtin = ptype->builtin;
58382 +       return (reiser4_plugin *) (builtin + i * ptype->size);
58383 +}
58384 +
58385 +/* return built-in plugin by its @type and @id; both are checked by assertions only */
58386 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
58387 +                                          reiser4_plugin_id id)
58388 +{
58389 +       assert("nikita-1651", is_plugin_type_valid(type));
58390 +       assert("nikita-1652", is_plugin_id_valid(type, id));
58391 +       return plugin_at(&plugins[type], id);
58392 +}
58393 +
58394 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
58395 +                                          reiser4_plugin_id id);
58396 +
58397 +/**
58398 + * plugin_by_disk_id - get reiser4_plugin by its on-disk id
58399 + * @tree: not used by the current implementation
58400 + * @type_id: plugin type id
58401 + * @plugin_id: plugin id in disk format (little-endian 16 bit)
58402 + * Returns plugin found by plugin_by_unsafe_id() for @type_id and @plugin_id.
58403 + */
58404 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
58405 +                                               reiser4_plugin_type type_id,
58406 +                                               __le16 *plugin_id)
58407 +{
58408 +       /*
58409 +        * Proper solution would be a per-file-system dictionary, resolved at
58410 +        * mount time, mapping on-disk plugin ids to "universal" ids (one more
58411 +        * array lookup here). For now the on-disk id is converted to CPU byte
58412 +        * order and used as the plugin id directly.
58413 +        */
58414 +       return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
58415 +}
58416 +
58417 +/* __PLUGIN_HEADER_H__ */
58418 +#endif
58419 +
58420 +/*
58421 + * Local variables:
58422 + * c-indentation-style: "K&R"
58423 + * mode-name: "LC"
58424 + * c-basic-offset: 8
58425 + * tab-width: 8
58426 + * fill-column: 79
58427 + * End:
58428 + */
58429 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/plugin_set.c linux-2.6.27/fs/reiser4/plugin/plugin_set.c
58430 --- linux-2.6.27.orig/fs/reiser4/plugin/plugin_set.c    1970-01-01 03:00:00.000000000 +0300
58431 +++ linux-2.6.27/fs/reiser4/plugin/plugin_set.c 2008-10-12 18:20:01.000000000 +0400
58432 @@ -0,0 +1,379 @@
58433 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
58434 + * reiser4/README */
58435 +/* This file contains Reiser4 plugin set operations */
58436 +
58437 +/* plugin sets
58438 + *
58439 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
58440 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
58441 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
58442 + * set of plugins (so called pset) is described by structure plugin_set (see
58443 + * plugin/plugin_set.h), which contains pointers to all required plugins.
58444 + *
58445 + * Children can inherit some pset members from their parent, however sometimes
58446 + * it is useful to specify members different from parent ones. Since object's
58447 + * pset can not be easily changed without fatal consequences, we use for this
58448 + * purpose another special plugin table (so called hset, or heir set) described
58449 + * by the same structure.
58450 + *
58451 + * Inode only stores pointers to pset and hset. Different inodes with the
58452 + * same set of pset (hset) members point to the same pset (hset). This is
58453 + * achieved by storing psets and hsets in global hash table. Races are avoided
58454 + * by simple (and efficient so far) solution of never recycling psets, even
58455 + * when last inode pointing to it is destroyed.
58456 + */
58457 +
58458 +#include "../debug.h"
58459 +#include "../super.h"
58460 +#include "plugin_set.h"
58461 +
58462 +#include <linux/slab.h>
58463 +#include <linux/stddef.h>
58464 +
58465 +/* slab for plugin sets */
58466 +static struct kmem_cache *plugin_set_slab;
58467 +/* 8 spinlocks, all initially unlocked; NOTE(review): presumably striped by pset hash -- confirm */
58468 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
58469 +       [0 ... 7] = SPIN_LOCK_UNLOCKED
58470 +};
58471 +
58472 +/* hash table support */
58473 +
58474 +#define PS_TABLE_SIZE (32)
58475 +/* map pointer to the ->hashval field back to the plugin_set containing it */
58476 +static inline plugin_set *cast_to(const unsigned long *a)
58477 +{
58478 +       return container_of(a, plugin_set, hashval);
58479 +}
58480 +/* compare plugin sets given by pointers to their ->hashval; ->link is the only field not compared */
58481 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
58482 +{
58483 +       plugin_set *set1;
58484 +       plugin_set *set2;
58485 +
58486 +       /* field sizes must sum to sizeof *set1, so no field is missed below */
58487 +       cassert(sizeof *set1 ==
58488 +               sizeof set1->hashval +
58489 +               sizeof set1->link +
58490 +               sizeof set1->file +
58491 +               sizeof set1->dir +
58492 +               sizeof set1->perm +
58493 +               sizeof set1->formatting +
58494 +               sizeof set1->hash +
58495 +               sizeof set1->fibration +
58496 +               sizeof set1->sd +
58497 +               sizeof set1->dir_item +
58498 +               sizeof set1->cipher +
58499 +               sizeof set1->digest +
58500 +               sizeof set1->compression +
58501 +               sizeof set1->compression_mode +
58502 +               sizeof set1->cluster +
58503 +               sizeof set1->create);
58504 +
58505 +       set1 = cast_to(a1);
58506 +       set2 = cast_to(a2);
58507 +       return
58508 +           set1->hashval == set2->hashval &&
58509 +           set1->file == set2->file &&
58510 +           set1->dir == set2->dir &&
58511 +           set1->perm == set2->perm &&
58512 +           set1->formatting == set2->formatting &&
58513 +           set1->hash == set2->hash &&
58514 +           set1->fibration == set2->fibration &&
58515 +           set1->sd == set2->sd &&
58516 +           set1->dir_item == set2->dir_item &&
58517 +           set1->cipher == set2->cipher &&
58518 +           set1->digest == set2->digest &&
58519 +           set1->compression == set2->compression &&
58520 +           set1->compression_mode == set2->compression_mode &&
58521 +           set1->cluster == set2->cluster &&
58522 +           set1->create == set2->create;
58523 +}
58524 +
58525 +#define HASH_FIELD(hash, set, field)           \
58526 +({                                             \
58527 +        (hash) += (unsigned long)(set)->field >> 2;    \
58528 +})
58529 +
58530 +static inline unsigned long calculate_hash(const plugin_set * set)
58531 +{
58532 +       unsigned long result;
58533 +
58534 +       result = 0;
58535 +       HASH_FIELD(result, set, file);
58536 +       HASH_FIELD(result, set, dir);
58537 +       HASH_FIELD(result, set, perm);
58538 +       HASH_FIELD(result, set, formatting);
58539 +       HASH_FIELD(result, set, hash);
58540 +       HASH_FIELD(result, set, fibration);
58541 +       HASH_FIELD(result, set, sd);
58542 +       HASH_FIELD(result, set, dir_item);
58543 +       HASH_FIELD(result, set, cipher);
58544 +       HASH_FIELD(result, set, digest);
58545 +       HASH_FIELD(result, set, compression);
58546 +       HASH_FIELD(result, set, compression_mode);
58547 +       HASH_FIELD(result, set, cluster);
58548 +       HASH_FIELD(result, set, create);
58549 +       return result & (PS_TABLE_SIZE - 1);
58550 +}
58551 +
58552 +static inline unsigned long
58553 +pshash(ps_hash_table * table, const unsigned long *a)
58554 +{
58555 +       return *a;
58556 +}
58557 +
58558 +/* The hash table definition */
58559 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
58560 +#define KFREE(ptr, size) kfree(ptr)
58561 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
58562 +                     pseq);
58563 +#undef KFREE
58564 +#undef KMALLOC
58565 +
58566 +static ps_hash_table ps_table;
58567 +static plugin_set empty_set = {
58568 +       .hashval = 0,
58569 +       .file = NULL,
58570 +       .dir = NULL,
58571 +       .perm = NULL,
58572 +       .formatting = NULL,
58573 +       .hash = NULL,
58574 +       .fibration = NULL,
58575 +       .sd = NULL,
58576 +       .dir_item = NULL,
58577 +       .cipher = NULL,
58578 +       .digest = NULL,
58579 +       .compression = NULL,
58580 +       .compression_mode = NULL,
58581 +       .cluster = NULL,
58582 +       .create = NULL,
58583 +       .link = {NULL}
58584 +};
58585 +
58586 +plugin_set *plugin_set_get_empty(void)
58587 +{
58588 +       return &empty_set;
58589 +}
58590 +
58591 +void plugin_set_put(plugin_set * set)
58592 +{
58593 +}
58594 +
58595 +static inline unsigned long *pset_field(plugin_set * set, int offset)
58596 +{
58597 +       return (unsigned long *)(((char *)set) + offset);
58598 +}
58599 +
58600 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
58601 +                           const int offset)
58602 +{
58603 +       unsigned long *spot;
58604 +       spinlock_t *lock;
58605 +       plugin_set replica;
58606 +       plugin_set *twin;
58607 +       plugin_set *psal;
58608 +       plugin_set *orig;
58609 +
58610 +       assert("nikita-2902", set != NULL);
58611 +       assert("nikita-2904", *set != NULL);
58612 +
58613 +       spot = pset_field(*set, offset);
58614 +       if (unlikely(*spot == val))
58615 +               return 0;
58616 +
58617 +       replica = *(orig = *set);
58618 +       *pset_field(&replica, offset) = val;
58619 +       replica.hashval = calculate_hash(&replica);
58620 +       rcu_read_lock();
58621 +       twin = ps_hash_find(&ps_table, &replica.hashval);
58622 +       if (unlikely(twin == NULL)) {
58623 +               rcu_read_unlock();
58624 +               psal = kmem_cache_alloc(plugin_set_slab,
58625 +                                       reiser4_ctx_gfp_mask_get());
58626 +               if (psal == NULL)
58627 +                       return RETERR(-ENOMEM);
58628 +               *psal = replica;
58629 +               lock = &plugin_set_lock[replica.hashval & 7];
58630 +               spin_lock(lock);
58631 +               twin = ps_hash_find(&ps_table, &replica.hashval);
58632 +               if (likely(twin == NULL)) {
58633 +                       *set = psal;
58634 +                       ps_hash_insert_rcu(&ps_table, psal);
58635 +               } else {
58636 +                       *set = twin;
58637 +                       kmem_cache_free(plugin_set_slab, psal);
58638 +               }
58639 +               spin_unlock(lock);
58640 +       } else {
58641 +               rcu_read_unlock();
58642 +               *set = twin;
58643 +       }
58644 +       return 0;
58645 +}
58646 +
58647 +static struct {
58648 +       int offset;
58649 +       reiser4_plugin_groups groups;
58650 +       reiser4_plugin_type type;
58651 +} pset_descr[PSET_LAST] = {
58652 +       [PSET_FILE] = {
58653 +               .offset = offsetof(plugin_set, file),
58654 +               .type = REISER4_FILE_PLUGIN_TYPE,
58655 +               .groups = 0
58656 +       },
58657 +       [PSET_DIR] = {
58658 +               .offset = offsetof(plugin_set, dir),
58659 +               .type = REISER4_DIR_PLUGIN_TYPE,
58660 +               .groups = 0
58661 +       },
58662 +       [PSET_PERM] = {
58663 +               .offset = offsetof(plugin_set, perm),
58664 +               .type = REISER4_PERM_PLUGIN_TYPE,
58665 +               .groups = 0
58666 +       },
58667 +       [PSET_FORMATTING] = {
58668 +               .offset = offsetof(plugin_set, formatting),
58669 +               .type = REISER4_FORMATTING_PLUGIN_TYPE,
58670 +               .groups = 0
58671 +       },
58672 +       [PSET_HASH] = {
58673 +               .offset = offsetof(plugin_set, hash),
58674 +               .type = REISER4_HASH_PLUGIN_TYPE,
58675 +               .groups = 0
58676 +       },
58677 +       [PSET_FIBRATION] = {
58678 +               .offset = offsetof(plugin_set, fibration),
58679 +               .type = REISER4_FIBRATION_PLUGIN_TYPE,
58680 +               .groups = 0
58681 +       },
58682 +       [PSET_SD] = {
58683 +               .offset = offsetof(plugin_set, sd),
58684 +               .type = REISER4_ITEM_PLUGIN_TYPE,
58685 +               .groups = (1 << STAT_DATA_ITEM_TYPE)
58686 +       },
58687 +       [PSET_DIR_ITEM] = {
58688 +               .offset = offsetof(plugin_set, dir_item),
58689 +               .type = REISER4_ITEM_PLUGIN_TYPE,
58690 +               .groups = (1 << DIR_ENTRY_ITEM_TYPE)
58691 +       },
58692 +       [PSET_CIPHER] = {
58693 +               .offset = offsetof(plugin_set, cipher),
58694 +               .type = REISER4_CIPHER_PLUGIN_TYPE,
58695 +               .groups = 0
58696 +       },
58697 +       [PSET_DIGEST] = {
58698 +               .offset = offsetof(plugin_set, digest),
58699 +               .type = REISER4_DIGEST_PLUGIN_TYPE,
58700 +               .groups = 0
58701 +       },
58702 +       [PSET_COMPRESSION] = {
58703 +               .offset = offsetof(plugin_set, compression),
58704 +               .type = REISER4_COMPRESSION_PLUGIN_TYPE,
58705 +               .groups = 0
58706 +       },
58707 +       [PSET_COMPRESSION_MODE] = {
58708 +               .offset = offsetof(plugin_set, compression_mode),
58709 +               .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
58710 +               .groups = 0
58711 +       },
58712 +       [PSET_CLUSTER] = {
58713 +               .offset = offsetof(plugin_set, cluster),
58714 +               .type = REISER4_CLUSTER_PLUGIN_TYPE,
58715 +               .groups = 0
58716 +       },
58717 +       [PSET_CREATE] = {
58718 +               .offset = offsetof(plugin_set, create),
58719 +               .type = REISER4_FILE_PLUGIN_TYPE,
58720 +               .groups = (1 << REISER4_REGULAR_FILE)
58721 +       }
58722 +};
58723 +
58724 +#define DEFINE_PSET_OPS(PREFIX)                                                       \
58725 +       reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb)   \
58726 +{                                                                             \
58727 +       if (memb >= PSET_LAST) /* pset_descr[] has PSET_LAST entries */        \
58728 +               return REISER4_PLUGIN_TYPES;                                   \
58729 +       return pset_descr[memb].type;                                          \
58730 +}                                                                             \
58731 +                                                                              \
58732 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb,                  \
58733 +                    reiser4_plugin * plugin)                                  \
58734 +{                                                                             \
58735 +       assert("nikita-3492", set != NULL);                                    \
58736 +       assert("nikita-3493", *set != NULL);                                   \
58737 +       assert("nikita-3494", plugin != NULL);                                 \
58738 +       assert("nikita-3495", 0 <= memb && memb < PSET_LAST);                  \
58739 +       assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type);     \
58740 +                                                                              \
58741 +       if (pset_descr[memb].groups)                                           \
58742 +               if (!(pset_descr[memb].groups & plugin->h.groups))             \
58743 +                       return -EINVAL;                                        \
58744 +                                                                              \
58745 +       return plugin_set_field(set,                                           \
58746 +                       (unsigned long)plugin, pset_descr[memb].offset);       \
58747 +}                                                                             \
58748 +                                                                              \
58749 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb)              \
58750 +{                                                                             \
58751 +       assert("nikita-3497", set != NULL);                                    \
58752 +       assert("nikita-3498", 0 <= memb && memb < PSET_LAST);                  \
58753 +                                                                              \
58754 +       return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
58755 +}
58756 +
58757 +DEFINE_PSET_OPS(aset);
58758 +
58759 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin) {
58760 +       return plugin_set_field(set,
58761 +               (unsigned long)plugin, pset_descr[memb].offset);
58762 +}
58763 +
58764 +/**
58765 + * init_plugin_set - create plugin set cache and hash table
58766 + *
58767 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
58768 + * reiser4 module initialization.
58769 + */
58770 +int init_plugin_set(void)
58771 +{
58772 +       int result;
58773 +
58774 +       result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
58775 +       if (result == 0) {
58776 +               plugin_set_slab = kmem_cache_create("plugin_set",
58777 +                                                   sizeof(plugin_set), 0,
58778 +                                                   SLAB_HWCACHE_ALIGN,
58779 +                                                   NULL);
58780 +               if (plugin_set_slab == NULL)
58781 +                       result = RETERR(-ENOMEM);
58782 +       }
58783 +       return result;
58784 +}
58785 +
58786 +/**
58787 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
58788 + *
58789 + * This is called on reiser4 module unloading or system shutdown.
58790 + */
58791 +void done_plugin_set(void)
58792 +{
58793 +       plugin_set *cur, *next;
58794 +
58795 +       for_all_in_htable(&ps_table, ps, cur, next) {
58796 +               ps_hash_remove(&ps_table, cur);
58797 +               kmem_cache_free(plugin_set_slab, cur);
58798 +       }
58799 +       destroy_reiser4_cache(&plugin_set_slab);
58800 +       ps_hash_done(&ps_table);
58801 +}
58802 +
58803 +/*
58804 + * Local variables:
58805 + * c-indentation-style: "K&R"
58806 + * mode-name: "LC"
58807 + * c-basic-offset: 8
58808 + * tab-width: 8
58809 + * fill-column: 120
58810 + * End:
58811 + */
58812 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/plugin_set.h linux-2.6.27/fs/reiser4/plugin/plugin_set.h
58813 --- linux-2.6.27.orig/fs/reiser4/plugin/plugin_set.h    1970-01-01 03:00:00.000000000 +0300
58814 +++ linux-2.6.27/fs/reiser4/plugin/plugin_set.h 2008-10-12 18:20:01.000000000 +0400
58815 @@ -0,0 +1,77 @@
58816 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58817 +
58818 +/* Reiser4 plugin set definition.
58819 +   See fs/reiser4/plugin/plugin_set.c for details */
58820 +
58821 +#if !defined( __PLUGIN_SET_H__ )
58822 +#define __PLUGIN_SET_H__
58823 +
58824 +#include "../type_safe_hash.h"
58825 +#include "plugin.h"
58826 +
58827 +#include <linux/rcupdate.h>
58828 +
58829 +struct plugin_set;
58830 +typedef struct plugin_set plugin_set;
58831 +
58832 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
58833 +
58834 +struct plugin_set {
58835 +       unsigned long hashval;
58836 +       /* plugin of file */
58837 +       file_plugin *file;
58838 +       /* plugin of dir */
58839 +       dir_plugin *dir;
58840 +       /* perm plugin for this file */
58841 +       perm_plugin *perm;
58842 +       /* tail policy plugin. Only meaningful for regular files */
58843 +       formatting_plugin *formatting;
58844 +       /* hash plugin. Only meaningful for directories. */
58845 +       hash_plugin *hash;
58846 +       /* fibration plugin. Only meaningful for directories. */
58847 +       fibration_plugin *fibration;
58848 +       /* plugin of stat-data */
58849 +       item_plugin *sd;
58850 +       /* plugin of items a directory is built of */
58851 +       item_plugin *dir_item;
58852 +       /* cipher plugin */
58853 +       cipher_plugin *cipher;
58854 +       /* digest plugin */
58855 +       digest_plugin *digest;
58856 +       /* compression plugin */
58857 +       compression_plugin *compression;
58858 +       /* compression mode plugin */
58859 +       compression_mode_plugin *compression_mode;
58860 +       /* cluster plugin */
58861 +       cluster_plugin *cluster;
58862 +       /* this specifies file plugin of regular children.
58863 +          only meaningful for directories */
58864 +       file_plugin *create;
58865 +       ps_hash_link link;
58866 +};
58867 +
58868 +extern plugin_set *plugin_set_get_empty(void);
58869 +extern void plugin_set_put(plugin_set * set);
58870 +
58871 +extern int init_plugin_set(void);
58872 +extern void done_plugin_set(void);
58873 +
58874 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
58875 +extern int set_plugin(plugin_set ** set, pset_member memb,
58876 +                     reiser4_plugin * plugin);
58877 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
58878 +                          reiser4_plugin * plugin);
58879 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
58880 +
58881 +/* __PLUGIN_SET_H__ */
58882 +#endif
58883 +
58884 +/* Make Linus happy.
58885 +   Local variables:
58886 +   c-indentation-style: "K&R"
58887 +   mode-name: "LC"
58888 +   c-basic-offset: 8
58889 +   tab-width: 8
58890 +   fill-column: 120
58891 +   End:
58892 +*/
58893 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/security/Makefile linux-2.6.27/fs/reiser4/plugin/security/Makefile
58894 --- linux-2.6.27.orig/fs/reiser4/plugin/security/Makefile       1970-01-01 03:00:00.000000000 +0300
58895 +++ linux-2.6.27/fs/reiser4/plugin/security/Makefile    2008-10-12 18:20:01.000000000 +0400
58896 @@ -0,0 +1,4 @@
58897 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
58898 +
58899 +security_plugins-objs :=       \
58900 +       perm.o
58901 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/security/perm.c linux-2.6.27/fs/reiser4/plugin/security/perm.c
58902 --- linux-2.6.27.orig/fs/reiser4/plugin/security/perm.c 1970-01-01 03:00:00.000000000 +0300
58903 +++ linux-2.6.27/fs/reiser4/plugin/security/perm.c      2008-10-12 18:20:01.000000000 +0400
58904 @@ -0,0 +1,33 @@
58905 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58906 +
58907 +/*
58908 + * This file contains implementation of permission plugins.
58909 + * See the comments in perm.h
58910 + */
58911 +
58912 +#include "../plugin.h"
58913 +#include "../plugin_header.h"
58914 +#include "../../debug.h"
58915 +
58916 +perm_plugin perm_plugins[LAST_PERM_ID] = {
58917 +       [NULL_PERM_ID] = {
58918 +               .h = {
58919 +                       .type_id = REISER4_PERM_PLUGIN_TYPE,
58920 +                       .id = NULL_PERM_ID,
58921 +                       .pops = NULL,
58922 +                       .label = "null",
58923 +                       .desc = "stub permission plugin",
58924 +                       .linkage = {NULL, NULL}
58925 +               }
58926 +       }
58927 +};
58928 +
58929 +/*
58930 + * Local variables:
58931 + * c-indentation-style: "K&R"
58932 + * mode-name: "LC"
58933 + * c-basic-offset: 8
58934 + * tab-width: 8
58935 + * fill-column: 79
58936 + * End:
58937 + */
58938 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/security/perm.h linux-2.6.27/fs/reiser4/plugin/security/perm.h
58939 --- linux-2.6.27.orig/fs/reiser4/plugin/security/perm.h 1970-01-01 03:00:00.000000000 +0300
58940 +++ linux-2.6.27/fs/reiser4/plugin/security/perm.h      2008-10-12 18:20:01.000000000 +0400
58941 @@ -0,0 +1,38 @@
58942 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58943 +
58944 +/* Perm (short for "permissions") plugins common stuff. */
58945 +
58946 +#if !defined( __REISER4_PERM_H__ )
58947 +#define __REISER4_PERM_H__
58948 +
58949 +#include "../../forward.h"
58950 +#include "../plugin_header.h"
58951 +
58952 +#include <linux/types.h>
58953 +
58954 +/* Definition of permission plugin */
58955 +/* NIKITA-FIXME-HANS: define what this is targeted for.
58956 +   It does not seem to be intended for use with sys_reiser4.  Explain. */
58957 +
58958 +/* NOTE-EDWARD: This seems to be intended for deprecated sys_reiser4.
58959 +   Consider it like a temporary "seam" and reserved pset member.
58960 +   If you have something useful to add, then rename this plugin and add here */
58961 +typedef struct perm_plugin {
58962 +       /* generic plugin fields */
58963 +       plugin_header h;
58964 +} perm_plugin;
58965 +
58966 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
58967 +
58968 +/* __REISER4_PERM_H__ */
58969 +#endif
58970 +
58971 +/* Make Linus happy.
58972 +   Local variables:
58973 +   c-indentation-style: "K&R"
58974 +   mode-name: "LC"
58975 +   c-basic-offset: 8
58976 +   tab-width: 8
58977 +   fill-column: 120
58978 +   End:
58979 +*/
58980 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/space/bitmap.c linux-2.6.27/fs/reiser4/plugin/space/bitmap.c
58981 --- linux-2.6.27.orig/fs/reiser4/plugin/space/bitmap.c  1970-01-01 03:00:00.000000000 +0300
58982 +++ linux-2.6.27/fs/reiser4/plugin/space/bitmap.c       2008-10-12 18:20:01.000000000 +0400
58983 @@ -0,0 +1,1585 @@
58984 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58985 +
58986 +#include "../../debug.h"
58987 +#include "../../dformat.h"
58988 +#include "../../txnmgr.h"
58989 +#include "../../jnode.h"
58990 +#include "../../block_alloc.h"
58991 +#include "../../tree.h"
58992 +#include "../../super.h"
58993 +#include "../plugin.h"
58994 +#include "space_allocator.h"
58995 +#include "bitmap.h"
58996 +
58997 +#include <linux/types.h>
58998 +#include <linux/fs.h>          /* for struct super_block  */
58999 +#include <linux/mutex.h>
59000 +#include <asm/div64.h>
59001 +
59002 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
59003 + * blocks
59004 +
59005 +   A useful optimization of reiser4 bitmap handling would be dynamic bitmap
59006 +   blocks loading/unloading which is different from v3.x where all bitmap
59007 +   blocks are loaded at mount time.
59008 +
59009 +   To implement bitmap blocks unloading we need to count bitmap block usage
59010 +   and detect currently unused blocks allowing them to be unloaded. It is not
59011 +   a simple task since we allow several threads to modify one bitmap block
59012 +   simultaneously.
59013 +
59014 +   Briefly speaking, the following schema is proposed: we count in special
59015 +   variable associated with each bitmap block. That is for counting of block
59016 +   alloc/dealloc operations on that bitmap block. With a deferred block
59017 +   deallocation feature of reiser4 all those operation will be represented in
59018 +   atom dirty/deleted lists as jnodes for freshly allocated or deleted
59019 +   nodes.
59020 +
59021 +   So, we increment usage counter for each new node allocated or deleted, and
59022 +   decrement it at atom commit one time for each node from the dirty/deleted
59023 +   atom's list.  Of course, freshly allocated node deletion and node reusing
59024 +   from atom deleted (if we do so) list should decrement bitmap usage counter
59025 +   also.
59026 +
59027 +   This schema seems to be working but that reference counting is
59028 +   not easy to debug. I think we should agree with Hans and do not implement
59029 +   it in v4.0. Current code implements "on-demand" bitmap blocks loading only.
59030 +
59031 +   For simplicity all bitmap nodes (both commit and working bitmap blocks) are
59032 +   either loaded into memory at fs mount time, or each bitmap node is loaded at
59033 +   its first access; the "dont_load_bitmap" mount option controls whether
59034 +   bitmap nodes should be loaded at mount time. Dynamic unloading of bitmap
59035 +   nodes currently is not supported. */
59036 +
59037 +#define CHECKSUM_SIZE    4
59038 +
59039 +#define BYTES_PER_LONG   (sizeof(long))
59040 +
59041 +#if BITS_PER_LONG == 64
59042 +#  define LONG_INT_SHIFT (6)
59043 +#else
59044 +#  define LONG_INT_SHIFT (5)
59045 +#endif
59046 +
59047 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
59048 +
59049 +typedef unsigned long ulong_t;
59050 +
59051 +#define bmap_size(blocksize)       ((blocksize) - CHECKSUM_SIZE)
59052 +#define bmap_bit_count(blocksize)   (bmap_size(blocksize) << 3)
59053 +
59054 +/* Block allocation/deallocation are done through special bitmap objects which
59055 +   are allocated in an array at fs mount. */
59056 +struct bitmap_node {
59057 +       struct mutex mutex;     /* long term lock object */
59058 +
59059 +       jnode *wjnode;          /* j-nodes for WORKING ... */
59060 +       jnode *cjnode;          /* ... and COMMIT bitmap blocks */
59061 +
59062 +       bmap_off_t first_zero_bit;      /* for skip_busy option implementation */
59063 +
59064 +       atomic_t loaded;        /* a flag which shows that bnode is loaded
59065 +                                * already */
59066 +};
59067 +
59068 +static inline char *bnode_working_data(struct bitmap_node *bnode)
59069 +{
59070 +       char *data;
59071 +
59072 +       data = jdata(bnode->wjnode);
59073 +       assert("zam-429", data != NULL);
59074 +
59075 +       return data + CHECKSUM_SIZE;
59076 +}
59077 +
59078 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
59079 +{
59080 +       char *data;
59081 +
59082 +       data = jdata(bnode->cjnode);
59083 +       assert("zam-430", data != NULL);
59084 +
59085 +       return data + CHECKSUM_SIZE;
59086 +}
59087 +
59088 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
59089 +{
59090 +       char *data;
59091 +
59092 +       data = jdata(bnode->cjnode);
59093 +       assert("vpf-261", data != NULL);
59094 +
59095 +       return le32_to_cpu(get_unaligned((d32 *)data));
59096 +}
59097 +
59098 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
59099 +{
59100 +       char *data;
59101 +
59102 +       data = jdata(bnode->cjnode);
59103 +       assert("vpf-261", data != NULL);
59104 +
59105 +       put_unaligned(cpu_to_le32(crc), (d32 *)data);
59106 +}
59107 +
59108 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
59109 + * written the code, does this added abstraction still have */
59110 +/* ANSWER(Zam): No, the abstractions is in the level above (exact place is the
59111 + * reiser4_space_allocator structure) */
59112 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
59113 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
59114 + * someday?". What they about?  If there is a reason to have a union, it should
59115 + * be a union, if not, it should not be a union.  "..might be someday" means no
59116 + * reason. */
59117 +struct bitmap_allocator_data {
59118 +       /* an array for bitmap blocks direct access */
59119 +       struct bitmap_node *bitmap;
59120 +};
59121 +
59122 +#define get_barray(super) \
59123 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
59124 +
59125 +#define get_bnode(super, i) (get_barray(super) + (i)) /* &bitmap[i] */
59126 +
59127 +/* allocate and initialize jnode with JNODE_BITMAP type */
59128 +static jnode *bnew(void)
59129 +{
59130 +       jnode *jal = jalloc();
59131 +
59132 +       if (jal)
59133 +               jnode_init(jal, current_tree, JNODE_BITMAP);
59134 +
59135 +       return jal;
59136 +}
59137 +
59138 +/* this file contains:
59139 +   - bitmap based implementation of space allocation plugin
59140 +   - all the helper functions like set bit, find_first_zero_bit, etc */
59141 +
59142 +/* Audited by: green(2002.06.12) */
59143 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
59144 +{
59145 +       ulong_t mask = 1UL << start_bit;
59146 +       int i = start_bit;
59147 +
59148 +       while ((word & mask) != 0) {
59149 +               mask <<= 1;
59150 +               if (++i >= BITS_PER_LONG)
59151 +                       break;
59152 +       }
59153 +
59154 +       return i;
59155 +}
59156 +
59157 +#include <linux/bitops.h>
59158 +
59159 +#if BITS_PER_LONG == 64
59160 +
59161 +#define OFF(addr)  (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
59162 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
59163 +
59164 +static inline void reiser4_set_bit(int nr, void *addr)
59165 +{
59166 +       ext2_set_bit(nr + OFF(addr), BASE(addr));
59167 +}
59168 +
59169 +static inline void reiser4_clear_bit(int nr, void *addr)
59170 +{
59171 +       ext2_clear_bit(nr + OFF(addr), BASE(addr));
59172 +}
59173 +
59174 +static inline int reiser4_test_bit(int nr, void *addr)
59175 +{
59176 +       return ext2_test_bit(nr + OFF(addr), BASE(addr));
59177 +}
59178 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
59179 +                                            int offset)
59180 +{
59181 +       int off = OFF(addr);
59182 +
59183 +       return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
59184 +                                      offset + off) - off;
59185 +}
59186 +
59187 +#else
59188 +
59189 +#define reiser4_set_bit(nr, addr)    ext2_set_bit(nr, addr)
59190 +#define reiser4_clear_bit(nr, addr)  ext2_clear_bit(nr, addr)
59191 +#define reiser4_test_bit(nr, addr)  ext2_test_bit(nr, addr)
59192 +
59193 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
59194 +ext2_find_next_zero_bit(addr, maxoffset, offset)
59195 +#endif
59196 +
59197 +/* Search for a set bit in the bit array [@start_offset, @max_offset[, offsets
59198 + * are counted from @addr. Return the offset of the first set bit found (not
59199 + * clamped: may be >= @max_offset in the last word), @max_offset otherwise. */
59200 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59201 +                                             bmap_off_t start_offset)
59202 +{
59203 +       ulong_t *base = addr;
59204 +       /* start_offset is in bits, convert it to word index within bitmap. */
59205 +       int word_nr = start_offset >> LONG_INT_SHIFT;
59206 +       /* bit number within the word. */
59207 +       int bit_nr = start_offset & LONG_INT_MASK;
59208 +       int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
59209 +
59210 +       assert("zam-387", max_offset != 0);
59211 +
59212 +       /* Unaligned @start_offset case.  */
59213 +       if (bit_nr != 0) {
59214 +               bmap_nr_t nr;
59215 +
59216 +               nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
59217 +
59218 +               if (nr < BITS_PER_LONG)
59219 +                       return (word_nr << LONG_INT_SHIFT) + nr;
59220 +
59221 +               ++word_nr;
59222 +       }
59223 +
59224 +       /* Fast scan through aligned words. */
59225 +       while (word_nr <= max_word_nr) {
59226 +               if (base[word_nr] != 0) {
59227 +                       return (word_nr << LONG_INT_SHIFT)
59228 +                           + find_next_zero_bit_in_word(~(base[word_nr]), 0);
59229 +               }
59230 +
59231 +               ++word_nr;
59232 +       }
59233 +
59234 +       return max_offset;
59235 +}
59236 +
59237 +#if BITS_PER_LONG == 64
59238 +
59239 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
59240 +                                           bmap_off_t start_offset)
59241 +{
59242 +       bmap_off_t off = OFF(addr);
59243 +
59244 +       return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
59245 +                                          start_offset + off) - off;
59246 +}
59247 +
59248 +#else
59249 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
59250 +  __reiser4_find_next_set_bit(addr, max_offset, start_offset)
59251 +#endif
59252 +
59253 +/* search for the last set bit at or below @start_bit in a single word. */
59254 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
59255 +{
59256 +       ulong_t bit_mask;
59257 +       int nr = start_bit;
59258 +
59259 +       assert("zam-965", start_bit < BITS_PER_LONG);
59260 +       assert("zam-966", start_bit >= 0);
59261 +
59262 +       bit_mask = (1UL << nr);
59263 +
59264 +       while (bit_mask != 0) {
59265 +               if (bit_mask & word)
59266 +                       return nr;
59267 +               bit_mask >>= 1;
59268 +               nr--;
59269 +       }
59270 +       return BITS_PER_LONG;
59271 +}
59272 +
59273 +/* Search the bitmap backwards, from @high_off down towards @low_off, for the
59274 + * highest set bit
59275 + *
59276 + * @result: where the offset of the found bit is stored (untouched on failure),
59277 + * @addr:   base memory address, accessed as an array of ulong_t words,
59278 + * @low_off:  low end of the search region; only its word index bounds the scan,
59279 + * @high_off: high end of the search region, edge bit included into the region,
59280 + *
59281 + * @return: 0 - set bit was found (it may lie below @low_off), -1 otherwise.
59282 + */
59283 +static int
59284 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59285 +                         bmap_off_t high_off)
59286 +{
59287 +       ulong_t *base = addr;
59288 +       int last_word;
59289 +       int first_word;
59290 +       int last_bit;
59291 +       int nr;
59292 +
59293 +       assert("zam-962", high_off >= low_off);
59294 +       /* word index and in-word bit of @high_off; word index of @low_off */
59295 +       last_word = high_off >> LONG_INT_SHIFT;
59296 +       last_bit = high_off & LONG_INT_MASK;
59297 +       first_word = low_off >> LONG_INT_SHIFT;
59298 +       /* the top word is scanned from @last_bit downwards only */
59299 +       if (last_bit < BITS_PER_LONG) {
59300 +               nr = find_last_set_bit_in_word(base[last_word], last_bit);
59301 +               if (nr < BITS_PER_LONG) {
59302 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
59303 +                       return 0;
59304 +               }
59305 +               --last_word;
59306 +       }
59307 +       while (last_word >= first_word) {       /* then whole words */
59308 +               if (base[last_word] != 0x0) {
59309 +                       last_bit =
59310 +                           find_last_set_bit_in_word(base[last_word],
59311 +                                                     BITS_PER_LONG - 1);
59312 +                       assert("zam-972", last_bit < BITS_PER_LONG);
59313 +                       *result = (last_word << LONG_INT_SHIFT) + last_bit;
59314 +                       return 0;
59315 +               }
59316 +               --last_word;
59317 +       }
59318 +
59319 +       return -1;              /* set bit not found */
59320 +}
59321 +
59322 +/* Backward search for the highest clear bit from @high_off down to the word
59323 + * of @low_off; arguments and 0 / -1 result as reiser4_find_last_set_bit() */
59324 +static int
59325 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
59326 +                          bmap_off_t high_off)
59327 +{
59328 +       ulong_t *base = addr;
59329 +       int last_word;
59330 +       int first_word;
59331 +       int last_bit;
59332 +       int nr;
59333 +       /* inverting a word turns the zero-bit search into a set-bit search */
59334 +       last_word = high_off >> LONG_INT_SHIFT;
59335 +       last_bit = high_off & LONG_INT_MASK;
59336 +       first_word = low_off >> LONG_INT_SHIFT;
59337 +       /* the top word is scanned from @last_bit down, then whole words */
59338 +       if (last_bit < BITS_PER_LONG) {
59339 +               nr = find_last_set_bit_in_word(~base[last_word], last_bit);
59340 +               if (nr < BITS_PER_LONG) {
59341 +                       *result = (last_word << LONG_INT_SHIFT) + nr;
59342 +                       return 0;
59343 +               }
59344 +               --last_word;
59345 +       }
59346 +       while (last_word >= first_word) {
59347 +               if (base[last_word] != (ulong_t) (-1)) {
59348 +                       *result = (last_word << LONG_INT_SHIFT) +
59349 +                           find_last_set_bit_in_word(~base[last_word],
59350 +                                                     BITS_PER_LONG - 1);
59351 +                       return 0;
59352 +               }
59353 +               --last_word;
59354 +       }
59355 +
59356 +       return -1;              /* zero bit not found */
59357 +}
59358 +
59359 +/* Clear bits [start, end) of byte array @addr. Audited by: green(2002.06.12) */
59360 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
59361 +{
59362 +       int first_byte;
59363 +       int last_byte;
59364 +
59365 +       unsigned char first_byte_mask = 0xFF;
59366 +       unsigned char last_byte_mask = 0xFF;
59367 +
59368 +       assert("zam-410", start < end);
59369 +       /* bytes holding the first and the last bit of the range */
59370 +       first_byte = start >> 3;
59371 +       last_byte = (end - 1) >> 3;
59372 +       /* bytes strictly between those two are cleared wholesale */
59373 +       if (last_byte > first_byte + 1)
59374 +               memset(addr + first_byte + 1, 0,
59375 +                      (size_t) (last_byte - first_byte - 1));
59376 +       /* masks keep the bits below @start and the bits at or above @end */
59377 +       first_byte_mask >>= 8 - (start & 0x7);
59378 +       last_byte_mask <<= ((end - 1) & 0x7) + 1;
59379 +
59380 +       if (first_byte == last_byte) {
59381 +               addr[first_byte] &= (first_byte_mask | last_byte_mask);
59382 +       } else {
59383 +               addr[first_byte] &= first_byte_mask;
59384 +               addr[last_byte] &= last_byte_mask;
59385 +       }
59386 +}
59387 +
59388 +/* Audited by: green(2002.06.12) */
59389 +/* Set bits [start, end) of byte array @addr; bit 0 is the LSB of a byte */
59390 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
59391 +{
59392 +       int first_byte;
59393 +       int last_byte;
59394 +
59395 +       unsigned char first_byte_mask = 0xFF;
59396 +       unsigned char last_byte_mask = 0xFF;
59397 +
59398 +       assert("zam-386", start < end);
59399 +       /* bytes holding the first and the last bit of the range */
59400 +       first_byte = start >> 3;
59401 +       last_byte = (end - 1) >> 3;
59402 +       /* bytes strictly between those two are filled wholesale */
59403 +       if (last_byte > first_byte + 1)
59404 +               memset(addr + first_byte + 1, 0xFF,
59405 +                      (size_t) (last_byte - first_byte - 1));
59406 +       /* masks select the bits at or above @start and the bits below @end */
59407 +       first_byte_mask <<= start & 0x7;
59408 +       last_byte_mask >>= 7 - ((end - 1) & 0x7);
59409 +
59410 +       if (first_byte == last_byte) {
59411 +               addr[first_byte] |= (first_byte_mask & last_byte_mask);
59412 +       } else {
59413 +               addr[first_byte] |= first_byte_mask;
59414 +               addr[last_byte] |= last_byte_mask;
59415 +       }
59416 +}
59417 +
59418 +#define ADLER_BASE    65521   /* largest prime smaller than 65536 */
59419 +#define ADLER_NMAX    5552    /* bytes summed between two modulo reductions */
59420 +
59421 +/* Calculates the adler32 checksum for the data pointed by `data` of the
59422 +    length `len`. This function was originally taken from zlib, version 1.1.3,
59423 +    July 9th, 1998.
59424 +
59425 +    Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
59426 +
59427 +    This software is provided 'as-is', without any express or implied
59428 +    warranty.  In no event will the authors be held liable for any damages
59429 +    arising from the use of this software.
59430 +
59431 +    Permission is granted to anyone to use this software for any purpose,
59432 +    including commercial applications, and to alter it and redistribute it
59433 +    freely, subject to the following restrictions:
59434 +
59435 +    1. The origin of this software must not be misrepresented; you must not
59436 +       claim that you wrote the original software. If you use this software
59437 +       in a product, an acknowledgment in the product documentation would be
59438 +       appreciated but is not required.
59439 +    2. Altered source versions must be plainly marked as such, and must not be
59440 +       misrepresented as being the original software.
59441 +    3. This notice may not be removed or altered from any source distribution.
59442 +
59443 +    Jean-loup Gailly        Mark Adler
59444 +    jloup@gzip.org          madler@alumni.caltech.edu
59445 +
59446 +    The above comment applies only to the reiser4_adler32 function.
59447 +*/
59448 +
59449 +__u32 reiser4_adler32(char *data, __u32 len)
59450 +{
59451 +       unsigned char *t = (unsigned char *)data;       /* sum bytes unsigned */
59452 +       __u32 s1 = 1;
59453 +       __u32 s2 = 0;
59454 +       int k;
59455 +       /* sum in chunks of at most ADLER_NMAX bytes, reducing after each one */
59456 +       while (len > 0) {
59457 +               k = len < ADLER_NMAX ? len : ADLER_NMAX;
59458 +               len -= k;
59459 +
59460 +               while (k--) {
59461 +                       s1 += *t++;
59462 +                       s2 += s1;
59463 +               }
59464 +               /* both running sums are kept modulo ADLER_BASE */
59465 +               s1 %= ADLER_BASE;
59466 +               s2 %= ADLER_BASE;
59467 +       }
59468 +       return (s2 << 16) | s1; /* s2 in the high 16 bits, s1 in the low 16 */
59469 +}
59470 +
59471 +#define sb_by_bnode(bnode) /* super block of the tree of @bnode's wjnode */ \
59472 +       ((struct super_block *)jnode_get_tree((bnode)->wjnode)->super)
59473 +
59474 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
59475 +{      /* adler32 of the commit bitmap data; @size goes through bmap_size() here */
59476 +       return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
59477 +}
59478 +
59479 +static int
59480 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
59481 +{      /* 0 if the recomputed checksum matches the stored one, else -EIO */
59482 +       if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
59483 +               bmap_nr_t bmap;
59484 +               /* index of @bnode in the bnode array, used for the message */
59485 +               bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
59486 +               /* cast: %llu needs unsigned long long whatever bmap_nr_t is */
59487 +               warning("vpf-263",
59488 +                       "Checksum for the bitmap block %llu is incorrect",
59489 +                       (unsigned long long)bmap);
59490 +
59491 +               return RETERR(-EIO);
59492 +       }
59493 +
59494 +       return 0;
59495 +}
59496 +
59497 +#define REISER4_CHECK_BMAP_CRC (0)
59498 +/* when enabled, bnode_check_crc() verifies the commit bitmap checksum */
59499 +#if REISER4_CHECK_BMAP_CRC
59500 +static int bnode_check_crc(const struct bitmap_node *bnode)
59501 +{
59502 +       /* pass the raw block size: bnode_calc_crc() applies bmap_size() itself */
59503 +       return bnode_check_adler32(bnode, sb_by_bnode(bnode)->s_blocksize);
59504 +}
59505 +
59506 +/* REISER4_CHECK_BMAP_CRC */
59507 +#else
59508 +
59509 +#define bnode_check_crc(bnode) (0)
59510 +
59511 +/* REISER4_CHECK_BMAP_CRC */
59512 +#endif
59513 +
59514 +/* Recalculates the adler32 checksum after a change of exactly one byte.
59515 +    adler - previous adler32 checksum of the chunk
59516 +    old_data, data - old and new values of the changed byte
59517 +    tail == (chunk - offset): length of the chunk the checksum was calculated
59518 +    for, minus the offset of the changed byte within this chunk.
59519 +    Saves re-summing the chunk; note delta * tail is computed in 32 bits.
59520 +*/
59521 +
59522 +static __u32
59523 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
59524 +              __u32 tail)
59525 +{
59526 +       __u32 delta = data - old_data + 2 * ADLER_BASE; /* biased, stays >= 0 */
59527 +       __u32 s1 = adler & 0xffff;
59528 +       __u32 s2 = (adler >> 16) & 0xffff;
59529 +       /* s1 moves by delta once, s2 by delta for each of the @tail positions */
59530 +       s1 = (delta + s1) % ADLER_BASE;
59531 +       s2 = (delta * tail + s2) % ADLER_BASE;
59532 +
59533 +       return (s2 << 16) | s1;
59534 +}
59535 +
59536 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val)) /* min; args evaluated twice */
59537 +
59538 +/**
59539 + * get_nr_bmap - calculate number of bitmap blocks
59540 + * @super: super block with initialized blocksize and block count
59541 + *
59542 + * Calculates number of bitmap blocks of a filesystem which uses bitmaps to
59543 + * maintain free disk space. It assumes that each bitmap addresses the same
59544 + * number of blocks, which is calculated by the bmap_bit_count macro defined
59545 + * above. Number of blocks in the filesystem has to be initialized in reiser4
59546 + * private data of super block already so that it can be obtained via
59547 + * reiser4_block_count(). Unfortunately, number of blocks addressed by a bitmap
59548 + * is not a power of 2 because 4 bytes are used for checksum. Therefore, we
59549 + * have to use a special function (do_div) for division and modulo of 64-bit
59550 + * filesystem block counters.
59551 + *
59552 + * Example: suppose filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
59553 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
59554 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
59555 + */
59556 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
59557 +{
59558 +       u64 quotient;
59559 +
59560 +       assert("zam-393", reiser4_block_count(super) != 0);
59561 +       /* (count - 1) / bits per bitmap + 1; do_div() divides @quotient in place */
59562 +       quotient = reiser4_block_count(super) - 1;
59563 +       do_div(quotient, bmap_bit_count(super->s_blocksize));
59564 +       return quotient + 1;
59565 +}
59566 +
59567 +/**
59568 + * parse_blocknr - calculate bitmap number and offset in it by block number
59569 + * @block: pointer to block number to calculate location in bitmap of
59570 + * @bmap: pointer where to store bitmap block number
59571 + * @offset: pointer where to store offset within bitmap block
59572 + *
59573 + * Calculates location of the bit which is responsible for allocation/freeing
59574 + * of block @*block, using the block size of the current context's super block.
59575 + * That location is the bitmap block number and the bit offset within it.
59576 + */
59577 +static void
59578 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
59579 +             bmap_off_t *offset)
59580 +{
59581 +       struct super_block *super = get_current_context()->super;
59582 +       u64 quotient = *block;
59583 +       /* do_div() leaves the quotient in @quotient and returns the remainder */
59584 +       *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
59585 +       *bmap = quotient;
59586 +       /* NOTE(review): the second assertion below has an empty label */
59587 +       assert("zam-433", *bmap < get_nr_bmap(super));
59588 +       assert("", *offset < bmap_bit_count(super->s_blocksize));
59589 +}
59590 +
59591 +#if REISER4_DEBUG
59592 +/* Debug-only checks of a block range. Audited by: green(2002.06.12) */
59593 +static void
59594 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
59595 +{
59596 +       struct super_block *sb = reiser4_get_current_sb();
59597 +
59598 +       assert("zam-436", sb != NULL);
59599 +       /* @start must be a real, non-zero block number inside the filesystem */
59600 +       assert("zam-455", start != NULL);
59601 +       assert("zam-437", *start != 0);
59602 +       assert("zam-541", !reiser4_blocknr_is_fake(start));
59603 +       assert("zam-441", *start < reiser4_block_count(sb));
59604 +       /* @len is optional; if given, the range must end inside the filesystem */
59605 +       if (len != NULL) {
59606 +               assert("zam-438", *len != 0);
59607 +               assert("zam-442", *start + *len <= reiser4_block_count(sb));
59608 +       }
59609 +}
59610 +
59611 +static void check_bnode_loaded(const struct bitmap_node *bnode)
59612 +{      /* debug check: both jnodes of @bnode have a page and are loaded */
59613 +       assert("zam-485", bnode != NULL);
59614 +       assert("zam-483", jnode_page(bnode->wjnode) != NULL);
59615 +       assert("zam-484", jnode_page(bnode->cjnode) != NULL);
59616 +       assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
59617 +       assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
59618 +}
59619 +
59620 +#else /* !REISER4_DEBUG: both checks compile to nothing */
59621 +
59622 +#  define check_block_range(start, len) do { /* nothing */} while(0)
59623 +#  define check_bnode_loaded(bnode)     do { /* nothing */} while(0)
59624 +
59625 +#endif /* REISER4_DEBUG */
59626 +
59627 +/* lower bnode->first_zero_bit to @offset (if we free bits before it); bnode
59628 +   should be locked by the caller (its lock is a mutex, see init_bnode()) */
59629 +static inline void
59630 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
59631 +{
59632 +       if (offset < bnode->first_zero_bit)
59633 +               bnode->first_zero_bit = offset;
59634 +}
59635 +
59636 +/* return a physical disk address for logical bitmap number @bmap */
59637 +/* FIXME-VS: this is somehow related to disk layout? */
59638 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
59639 + * per block allocation so that performance is not affected.  Probably this
59640 + * whole file should be considered part of the disk layout plugin, and other
59641 + * disk layouts can use other defines and efficiency will not be significantly
59642 + * affected.  */
59643 +
59644 +#define REISER4_FIRST_BITMAP_BLOCK \
59645 +       ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2) /* block of bitmap 0 */
59646 +
59647 +/* Store in @bnr the disk block of bitmap @bmap. Audited by: green(2002.06.12) */
59648 +static void
59649 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
59650 +                  reiser4_block_nr * bnr)
59651 +{
59652 +
59653 +       assert("zam-390", bmap < get_nr_bmap(super));
59654 +
59655 +#ifdef CONFIG_REISER4_BADBLOCKS
59656 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
59657 +       /* Check if the diskmap has this already, first. */
59658 +       if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
59659 +               return;         /* Found it in diskmap */
59660 +#endif
59661 +       /* FIXME_ZAM: before discussing of disk layouts and disk format
59662 +          plugins I implement bitmap location scheme which is close to scheme
59663 +          used in reiser 3.6 */
59664 +       if (bmap == 0) {
59665 +               *bnr = REISER4_FIRST_BITMAP_BLOCK;
59666 +       } else {        /* other bitmaps: first block of the zone they map */
59667 +               *bnr = bmap * bmap_bit_count(super->s_blocksize);
59668 +       }
59669 +}
59670 +
59671 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block @bmap */
59672 +/* Audited by: green(2002.06.12) */
59673 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
59674 +{      /* status bits of @bmap are replaced by the bitmap-blocks status value */
59675 +       *bnr =
59676 +           (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
59677 +                               REISER4_BITMAP_BLOCKS_STATUS_VALUE);
59678 +}
59679 +
59680 +/* bnode structure initialization: zeroed, mutex set up, marked not loaded */
59681 +static void
59682 +init_bnode(struct bitmap_node *bnode,
59683 +          struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
59684 +{
59685 +       memset(bnode, 0, sizeof(struct bitmap_node));
59686 +       /* @super and @bmap are not used here */
59687 +       mutex_init(&bnode->mutex);
59688 +       atomic_set(&bnode->loaded, 0);
59689 +}
59690 +
59691 +static void release(jnode * node)
59692 +{      /* give up @node: jrelse(), flag JNODE_HEARD_BANSHEE, then jput() */
59693 +       jrelse(node);
59694 +       JF_SET(node, JNODE_HEARD_BANSHEE);      /* presumably lets jput() destroy it -- confirm */
59695 +       jput(node);
59696 +}
59697 +
59698 +/* Marks @bnode not loaded and releases its jnodes; NULL @bnode is ignored. For
59699 +   internal bitmap.c use: assumes the jnodes are under full control of this thread */
59700 +static void done_bnode(struct bitmap_node *bnode)
59701 +{
59702 +       if (bnode) {
59703 +               atomic_set(&bnode->loaded, 0);
59704 +               if (bnode->wjnode != NULL)
59705 +                       release(bnode->wjnode);
59706 +               if (bnode->cjnode != NULL)
59707 +                       release(bnode->cjnode);
59708 +               bnode->wjnode = bnode->cjnode = NULL;
59709 +       }
59710 +}
59711 +
59712 +/* Creates and loads both jnodes of @bnode. Called only by load_and_lock_bnode()*/
59713 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
59714 +                        jnode **wjnode_ret)
59715 +{
59716 +       struct super_block *super;
59717 +       jnode *cjnode;
59718 +       jnode *wjnode;
59719 +       bmap_nr_t bmap;
59720 +       int ret;
59721 +       /* returns 0 with both jnodes stored in *cjnode_ret / *wjnode_ret */
59722 +       super = reiser4_get_current_sb();
59723 +
59724 +       *wjnode_ret = wjnode = bnew();
59725 +       if (wjnode == NULL) {
59726 +               *cjnode_ret = NULL;
59727 +               return RETERR(-ENOMEM);
59728 +       }
59729 +
59730 +       *cjnode_ret = cjnode = bnew();
59731 +       if (cjnode == NULL)
59732 +               return RETERR(-ENOMEM);
59733 +       /* NOTE(review): on the failure above *wjnode_ret keeps an un-jref()ed jnode */
59734 +       bmap = bnode - get_bnode(super, 0);
59735 +       /* @bmap is the index of @bnode in the bnode array */
59736 +       get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
59737 +       get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
59738 +
59739 +       jref(cjnode);
59740 +       jref(wjnode);
59741 +
59742 +       /* load commit bitmap */
59743 +       ret = jload_gfp(cjnode, GFP_NOFS, 1);
59744 +
59745 +       if (ret)
59746 +               goto error;
59747 +
59748 +       /* allocate memory for working bitmap block. Note that for
59749 +        * bitmaps jinit_new() doesn't actually modify node content,
59750 +        * so parallel calls to this are ok. */
59751 +       ret = jinit_new(wjnode, GFP_NOFS);
59752 +
59753 +       if (ret != 0) {
59754 +               jrelse(cjnode);
59755 +               goto error;
59756 +       }
59757 +
59758 +       return 0;
59759 +
59760 +      error:
59761 +       jput(cjnode);
59762 +       jput(wjnode);
59763 +       *wjnode_ret = *cjnode_ret = NULL;
59764 +       return ret;
59765 +
59766 +}
59767 +
59768 +/* Check the bnode data on read: the checksum, then the first bitmap bit. */
59769 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
59770 +{
59771 +       void *data;
59772 +       int ret;
59773 +
59774 +       /* Check CRC */
59775 +       ret = bnode_check_adler32(bnode, blksize);
59776 +
59777 +       if (ret) {
59778 +               return ret;
59779 +       }
59780 +       /* the bitmap bits follow the checksum in the commit block */
59781 +       data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
59782 +
59783 +       /* Check the very first bit -- it must be busy. */
59784 +       if (!reiser4_test_bit(0, data)) {
59785 +               warning("vpf-1362", "The allocator block %llu is not marked "
59786 +                       "as used.", (unsigned long long)bnode->cjnode->blocknr);
59787 +               /* NOTE(review): plain -EINVAL, while the CRC path uses RETERR() */
59788 +               return -EINVAL;
59789 +       }
59790 +
59791 +       return 0;
59792 +}
59793 +
59794 +/* load bitmap blocks "on-demand"; returns 0 with bnode->mutex held */
59795 +static int load_and_lock_bnode(struct bitmap_node *bnode)
59796 +{
59797 +       int ret;
59798 +
59799 +       jnode *cjnode;
59800 +       jnode *wjnode;
59801 +
59802 +       assert("nikita-3040", reiser4_schedulable());
59803 +
59804 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
59805 + * need to be atomic, right? Just leave a comment that if bitmaps were
59806 + * unloadable, this would need to be atomic.  */
59807 +       if (atomic_read(&bnode->loaded)) {
59808 +               /* bitmap is already loaded, nothing to do */
59809 +               check_bnode_loaded(bnode);
59810 +               mutex_lock(&bnode->mutex);
59811 +               assert("nikita-2827", atomic_read(&bnode->loaded));
59812 +               return 0;
59813 +       }
59814 +
59815 +       ret = prepare_bnode(bnode, &cjnode, &wjnode);
59816 +       if (ret == 0) {
59817 +               mutex_lock(&bnode->mutex);
59818 +
59819 +               if (!atomic_read(&bnode->loaded)) {
59820 +                       assert("nikita-2822", cjnode != NULL);
59821 +                       assert("nikita-2823", wjnode != NULL);
59822 +                       assert("nikita-2824", jnode_is_loaded(cjnode));
59823 +                       assert("nikita-2825", jnode_is_loaded(wjnode));
59824 +
59825 +                       bnode->wjnode = wjnode;
59826 +                       bnode->cjnode = cjnode;
59827 +
59828 +                       ret = check_struct_bnode(bnode, current_blocksize);
59829 +                       if (!ret) {
59830 +                               cjnode = wjnode = NULL;
59831 +                               atomic_set(&bnode->loaded, 1);
59832 +                               /* working bitmap is initialized by on-disk
59833 +                                * commit bitmap. This should be performed
59834 +                                * under mutex. */
59835 +                               memcpy(bnode_working_data(bnode),
59836 +                                      bnode_commit_data(bnode),
59837 +                                      bmap_size(current_blocksize));
59838 +                       } else {
59839 +                               /* detach the bad jnodes while still locked */
59840 +                               bnode->wjnode = bnode->cjnode = NULL;
59841 +                               mutex_unlock(&bnode->mutex);
59842 +                       }
59843 +               } else
59844 +                       /* race: someone already loaded bitmap while we were
59845 +                        * busy initializing data. */
59846 +                       check_bnode_loaded(bnode);
59847 +       }
59848 +
59849 +       /* Drop jnodes that were not installed. Never reset bnode->wjnode or
59850 +        * bnode->cjnode here: after a lost race they belong to the winner. */
59851 +       if (wjnode != NULL)
59852 +               release(wjnode);
59853 +       if (cjnode != NULL)
59854 +               release(cjnode);
59855 +       return ret;
59856 +}
59857 +
59858 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
59859 +{      /* counterpart of load_and_lock_bnode(): drops bnode->mutex */
59860 +       check_bnode_loaded(bnode);
59861 +       mutex_unlock(&bnode->mutex);
59862 +}
59863 +
59864 +/* Forward search of bitmap block @bmap for a free run of @min_len..@max_len
59865 +   bits in [*offset, @max_offset); marks it busy and returns its length, or 0.*/
59866 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
59867 +   block responsibility zone boundaries. This had no sense in v3.6 but may
59868 +   have it in v4.x */
59869 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
59870 +static int
59871 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
59872 +                         bmap_off_t max_offset, int min_len, int max_len)
59873 +{
59874 +       struct super_block *super = get_current_context()->super;
59875 +       struct bitmap_node *bnode = get_bnode(super, bmap);
59876 +
59877 +       char *data;
59878 +
59879 +       bmap_off_t search_end;
59880 +       bmap_off_t start;
59881 +       bmap_off_t end;
59882 +
59883 +       int set_first_zero_bit = 0;
59884 +
59885 +       int ret;
59886 +
59887 +       assert("zam-364", min_len > 0);
59888 +       assert("zam-365", max_len >= min_len);
59889 +       assert("zam-366", *offset <= max_offset);
59890 +
59891 +       ret = load_and_lock_bnode(bnode);
59892 +
59893 +       if (ret)
59894 +               return ret;
59895 +       /* allocation is done on the working bitmap, with bnode->mutex held */
59896 +       data = bnode_working_data(bnode);
59897 +
59898 +       start = *offset;
59899 +       /* presumably no zero bit lies below first_zero_bit -- TODO confirm */
59900 +       if (bnode->first_zero_bit >= start) {
59901 +               start = bnode->first_zero_bit;
59902 +               set_first_zero_bit = 1;
59903 +       }
59904 +
59905 +       while (start + min_len < max_offset) {
59906 +               /* NOTE(review): '<' skips a run ending exactly at @max_offset */
59907 +               start =
59908 +                   reiser4_find_next_zero_bit((long *)data, max_offset, start);
59909 +               if (set_first_zero_bit) {
59910 +                       bnode->first_zero_bit = start;
59911 +                       set_first_zero_bit = 0;
59912 +               }
59913 +               if (start >= max_offset)
59914 +                       break;
59915 +
59916 +               search_end = LIMIT(start + max_len, max_offset);
59917 +               end =
59918 +                   reiser4_find_next_set_bit((long *)data, search_end, start);
59919 +               if (end >= start + min_len) {
59920 +                       /* we can't trust find_next_set_bit result if set bit
59921 +                          was not found, result may be bigger than
59922 +                          max_offset */
59923 +                       if (end > search_end)
59924 +                               end = search_end;
59925 +                       /* success: report the extent and mark it busy */
59926 +                       ret = end - start;
59927 +                       *offset = start;
59928 +
59929 +                       reiser4_set_bits(data, start, end);
59930 +
59931 +                       /* FIXME: we may advance first_zero_bit if [start,
59932 +                          end] region overlaps the first_zero_bit point */
59933 +
59934 +                       break;
59935 +               }
59936 +
59937 +               start = end + 1;
59938 +       }
59939 +
59940 +       release_and_unlock_bnode(bnode);
59941 +
59942 +       return ret;
59943 +}
59944 +
59945 +static int
59946 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
59947 +                          bmap_off_t end_offset, int min_len, int max_len)
59948 +{
59949 +       struct super_block *super = get_current_context()->super;
59950 +       struct bitmap_node *bnode = get_bnode(super, bmap);
59951 +       char *data;
59952 +       bmap_off_t start;
59953 +       int ret;
59954 +       /* backward twin of search_one_bitmap_forward() over [end_offset, *start_offset] */
59955 +       assert("zam-958", min_len > 0);
59956 +       assert("zam-959", max_len >= min_len);
59957 +       assert("zam-960", *start_offset >= end_offset);
59958 +       /* returns the allocated length, 0 if nothing fits, or a load error */
59959 +       ret = load_and_lock_bnode(bnode);
59960 +       if (ret)
59961 +               return ret;
59962 +       /* allocation is done on the working bitmap, with bnode->mutex held */
59963 +       data = bnode_working_data(bnode);
59964 +       start = *start_offset;
59965 +
59966 +       while (1) {
59967 +               bmap_off_t end, search_end;
59968 +
59969 +               /* Find the beginning (highest bit) of the zero filled region */
59970 +               if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
59971 +                       break;
59972 +               /* Are there at least `min_len' bits from `end_offset' to
59973 +                * `start'?  */
59974 +               if (start < end_offset + min_len - 1)
59975 +                       break;
59976 +
59977 +               /* Do not search down to `end_offset' when more than `max_len'
59978 +                * bits lie between it and `start'. */
59979 +               if (end_offset + max_len - 1 < start)
59980 +                       search_end = start - max_len + 1;
59981 +               else
59982 +                       search_end = end_offset;
59983 +               /* `end' becomes the lowest bit of the zero run ending at `start' */
59984 +               if (reiser4_find_last_set_bit(&end, data, search_end, start))
59985 +                       end = search_end;
59986 +               else
59987 +                       end++;
59988 +
59989 +               if (end + min_len <= start + 1) {
59990 +                       if (end < search_end)
59991 +                               end = search_end;
59992 +                       ret = start - end + 1;
59993 +                       *start_offset = end;    /* `end' is lowest offset */
59994 +                       assert("zam-987",
59995 +                              reiser4_find_next_set_bit(data, start + 1,
59996 +                                                        end) >= start + 1);
59997 +                       reiser4_set_bits(data, end, start + 1);
59998 +                       break;
59999 +               }
60000 +
60001 +               if (end <= end_offset)
60002 +                       /* left search boundary reached. */
60003 +                       break;
60004 +               start = end - 1;
60005 +       }
60006 +
60007 +       release_and_unlock_bnode(bnode);
60008 +       return ret;
60009 +}
60010 +
60011 +/* allocate a contiguous range of @min_len..@max_len blocks in [*start, *end) */
60012 +static int bitmap_alloc_forward(reiser4_block_nr * start,
60013 +                               const reiser4_block_nr * end, int min_len,
60014 +                               int max_len)
60015 +{
60016 +       bmap_nr_t bmap, end_bmap;
60017 +       bmap_off_t offset, end_offset;
60018 +       int len;
60019 +
60020 +       reiser4_block_nr tmp;
60021 +
60022 +       struct super_block *super = get_current_context()->super;
60023 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60024 +
60025 +       parse_blocknr(start, &bmap, &offset);
60026 +       /* @end is exclusive: locate the last block, then step one bit past it */
60027 +       tmp = *end - 1;
60028 +       parse_blocknr(&tmp, &end_bmap, &end_offset);
60029 +       ++end_offset;
60030 +
60031 +       assert("zam-358", end_bmap >= bmap);
60032 +       assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
60033 +       /* whole bitmaps before the last one, then the last one up to @end_offset */
60034 +       for (; bmap < end_bmap; bmap++, offset = 0) {
60035 +               len =
60036 +                   search_one_bitmap_forward(bmap, &offset, max_offset,
60037 +                                             min_len, max_len);
60038 +               if (len != 0)
60039 +                       goto out;
60040 +       }
60041 +
60042 +       len =
60043 +           search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
60044 +                                     max_len);
60045 +      out:
60046 +       *start = bmap * max_offset + offset;    /* written even if len <= 0 */
60047 +       return len;
60048 +}
60049 +
60050 +/* allocate contiguous range of blocks in bitmap, scanning backwards from
60051 + * @start down to @end; both bounds are parsed as given (inclusive) */
60052 +static int bitmap_alloc_backward(reiser4_block_nr * start,
60053 +                                const reiser4_block_nr * end, int min_len,
60054 +                                int max_len)
60055 +{
60056 +       bmap_nr_t bmap, end_bmap;
60057 +       bmap_off_t offset, end_offset;
60058 +       int len;
60059 +       struct super_block *super = get_current_context()->super;
60060 +       const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
60061 +
60062 +       parse_blocknr(start, &bmap, &offset);
60063 +       parse_blocknr(end, &end_bmap, &end_offset);
60064 +       /* NOTE(review): label "zam-962" is also used in reiser4_find_last_set_bit() */
60065 +       assert("zam-961", end_bmap <= bmap);
60066 +       assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
60067 +       /* higher bitmaps are searched down to bit 0, the last one to @end_offset */
60068 +       for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
60069 +               len =
60070 +                   search_one_bitmap_backward(bmap, &offset, 0, min_len,
60071 +                                              max_len);
60072 +               if (len != 0)
60073 +                       goto out;
60074 +       }
60075 +
60076 +       len =
60077 +           search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
60078 +                                      max_len);
60079 +      out:
60080 +       *start = bmap * max_offset + offset;    /* written even if len <= 0 */
60081 +       return len;
60082 +}
60083 +
60084 +/* plugin->u.space_allocator.alloc_blocks(): forward search, 1..@needed blocks */
60085 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
60086 +                               reiser4_block_nr *start, reiser4_block_nr *len)
60087 +{
60088 +       struct super_block *super = get_current_context()->super;
60089 +       int actual_len;
60090 +
60091 +       reiser4_block_nr search_start;
60092 +       reiser4_block_nr search_end;
60093 +       /* returns 0 and fills *start / *len, or -ENOSPC, or a search error */
60094 +       assert("zam-398", super != NULL);
60095 +       assert("zam-412", hint != NULL);
60096 +       assert("zam-397", hint->blk <= reiser4_block_count(super));
60097 +       /* upper bound: end of disk, or hint->blk + hint->max_dist if that is set */
60098 +       if (hint->max_dist == 0)
60099 +               search_end = reiser4_block_count(super);
60100 +       else
60101 +               search_end =
60102 +                   LIMIT(hint->blk + hint->max_dist,
60103 +                         reiser4_block_count(super));
60104 +
60105 +       /* We use @hint -> blk as a search start and search from it to the end
60106 +          of the disk or in given region if @hint -> max_dist is not zero */
60107 +       search_start = hint->blk;
60108 +
60109 +       actual_len =
60110 +           bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60111 +       /* NOTE(review): bitmap_alloc_forward() rewrites search_start even on failure */
60112 +       /* There is only one bitmap search if max_dist was specified or first
60113 +          pass was from the beginning of the bitmap. We also do one pass for
60114 +          scanning bitmap in backward direction. */
60115 +       if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
60116 +               /* next step is a scanning from 0 to search_start */
60117 +               search_end = search_start;
60118 +               search_start = 0;
60119 +               actual_len =
60120 +                   bitmap_alloc_forward(&search_start, &search_end, 1, needed);
60121 +       }
60122 +       if (actual_len == 0)
60123 +               return RETERR(-ENOSPC);
60124 +       if (actual_len < 0)
60125 +               return RETERR(actual_len);
60126 +       *len = actual_len;
60127 +       *start = search_start;
60128 +       return 0;
60129 +}
60130 +
60131 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
60132 +                                reiser4_block_nr * start,
60133 +                                reiser4_block_nr * len)
60134 +{
60135 +       reiser4_block_nr search_start;
60136 +       reiser4_block_nr search_end;
60137 +       int actual_len;
60138 +       /* super is declared under ON_DEBUG: only the assertions below use it */
60139 +       ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
60140 +
60141 +       assert("zam-969", super != NULL);
60142 +       assert("zam-970", hint != NULL);
60143 +       assert("zam-971", hint->blk <= reiser4_block_count(super));
60144 +       /* search from @hint->blk down to block 0, or max_dist blocks back */
60145 +       search_start = hint->blk;
60146 +       if (hint->max_dist == 0 || search_start <= hint->max_dist)
60147 +               search_end = 0;
60148 +       else
60149 +               search_end = search_start - hint->max_dist;
60150 +       /* single pass: 0 is reported as -ENOSPC, a negative value as is */
60151 +       actual_len =
60152 +           bitmap_alloc_backward(&search_start, &search_end, 1, needed);
60153 +       if (actual_len == 0)
60154 +               return RETERR(-ENOSPC);
60155 +       if (actual_len < 0)
60156 +               return RETERR(actual_len);
60157 +       *len = actual_len;
60158 +       *start = search_start;
60159 +       return 0;
60160 +}
60161 +
60162 +/* plugin->u.space_allocator.alloc_blocks(): @hint->backward picks direction */
60163 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
60164 +                               reiser4_blocknr_hint * hint, int needed,
60165 +                               reiser4_block_nr * start, reiser4_block_nr * len)
60166 +{
60167 +       return hint->backward ?
60168 +               alloc_blocks_backward(hint, needed, start, len) :
60169 +               alloc_blocks_forward(hint, needed, start, len);
60170 +}
60171 +
60172 +/* plugin->u.space_allocator.dealloc_blocks(). */
60173 +/* It just frees blocks in WORKING BITMAP. Usually formatted and unformatted
60174 +   nodes deletion is deferred until transaction commit.  However, deallocation
60175 +   of temporary objects like wandered blocks and transaction commit records
60176 +   requires immediate node deletion from WORKING BITMAP. */
60177 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
60178 +                                  reiser4_block_nr start, reiser4_block_nr len)
60179 +{
60180 +       struct super_block *super = reiser4_get_current_sb();
60181 +
60182 +       bmap_nr_t bmap;
60183 +       bmap_off_t offset;
60184 +
60185 +       struct bitmap_node *bnode;
60186 +       int ret;
60187 +
60188 +       assert("zam-468", len != 0);
60189 +       check_block_range(&start, &len);
60190 +
60191 +       parse_blocknr(&start, &bmap, &offset);
60192 +       /* the whole range is expected to fall into a single bitmap block */
60193 +       assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
60194 +
60195 +       bnode = get_bnode(super, bmap);
60196 +
60197 +       assert("zam-470", bnode != NULL);
60198 +
60199 +       ret = load_and_lock_bnode(bnode);
60200 +       assert("zam-481", ret == 0);
60201 +       /* NOTE(review): a load failure is caught by the assertion above only */
60202 +       reiser4_clear_bits(bnode_working_data(bnode), offset,
60203 +                          (bmap_off_t) (offset + len));
60204 +
60205 +       adjust_first_zero_bit(bnode, offset);
60206 +
60207 +       release_and_unlock_bnode(bnode);
60208 +}
60209 +
60210 +/* plugin->u.space_allocator.check_blocks(): debug-only WORKING BITMAP check */
60211 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
60212 +                                const reiser4_block_nr * len, int desired)
60213 +{
60214 +#if REISER4_DEBUG
60215 +       struct super_block *super = reiser4_get_current_sb();
60216 +
60217 +       bmap_nr_t bmap;
60218 +       bmap_off_t start_offset;
60219 +       bmap_off_t end_offset;
60220 +
60221 +       struct bitmap_node *bnode;
60222 +       int ret;
60223 +
60224 +       assert("zam-622", len != NULL);
60225 +       check_block_range(start, len);
60226 +       parse_blocknr(start, &bmap, &start_offset);
60227 +
60228 +       end_offset = start_offset + *len;
60229 +       assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
60230 +
60231 +       bnode = get_bnode(super, bmap);
60232 +
60233 +       assert("nikita-2215", bnode != NULL);
60234 +
60235 +       ret = load_and_lock_bnode(bnode);
60236 +       assert("zam-626", ret == 0);
60237 +
60238 +       assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
60239 +       /* @desired != 0: no zero bit may be in the range; else no set bit */
60240 +       if (desired) {
60241 +               assert("zam-623",
60242 +                      reiser4_find_next_zero_bit(bnode_working_data(bnode),
60243 +                                                 end_offset, start_offset)
60244 +                      >= end_offset);
60245 +       } else {
60246 +               assert("zam-624",
60247 +                      reiser4_find_next_set_bit(bnode_working_data(bnode),
60248 +                                                end_offset, start_offset)
60249 +                      >= end_offset);
60250 +       }
60251 +
60252 +       release_and_unlock_bnode(bnode);
60253 +#endif
60254 +}
60255 +
60256 +/* conditional insertion of @node into atom's overwrite set if it was not there */
60257 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
60258 +{
60259 +       assert("zam-546", atom != NULL);
60260 +       assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
60261 +       assert("zam-548", node != NULL);
60262 +       /* atom lock is taken before the jnode lock, released in reverse order */
60263 +       spin_lock_atom(atom);
60264 +       spin_lock_jnode(node);
60265 +       /* a node that belongs to no atom yet is flagged and inserted */
60266 +       if (node->atom == NULL) {
60267 +               JF_SET(node, JNODE_OVRWR);
60268 +               insert_into_atom_ovrwr_list(atom, node);
60269 +       } else {
60270 +               assert("zam-549", node->atom == atom);
60271 +       }
60272 +
60273 +       spin_unlock_jnode(node);
60274 +       spin_unlock_atom(atom);
60275 +}
60276 +
60277 +/* an actor which clears the bits of one delete set extent in COMMIT bitmap and
60278 +   adds the number of blocks freed to the long long counter passed via @data */
60279 +static int
60280 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
60281 +                         const reiser4_block_nr * len, void *data)
60282 +{
60283 +
60284 +       bmap_nr_t bmap;
60285 +       bmap_off_t offset;
60286 +       int ret;
60287 +       /* on entry @data points to the caller's counter of freed blocks */
60288 +       long long *blocks_freed_p = data;
60289 +
60290 +       struct bitmap_node *bnode;
60291 +
60292 +       struct super_block *sb = reiser4_get_current_sb();
60293 +
60294 +       check_block_range(start, len);
60295 +
60296 +       parse_blocknr(start, &bmap, &offset);
60297 +
60298 +       /* FIXME-ZAM: we assume that all block ranges are allocated by this
60299 +          bitmap-based allocator and each block range can't go over a zone of
60300 +          responsibility of one bitmap block; same assumption is used in
60301 +          other journal hooks in bitmap code. */
60302 +       bnode = get_bnode(sb, bmap);
60303 +       assert("zam-448", bnode != NULL);
60304 +
60305 +       /* it is safe to unlock atom which is in ASTAGE_PRE_COMMIT */
60306 +       assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
60307 +       ret = load_and_lock_bnode(bnode);
60308 +       if (ret)
60309 +               return ret;
60310 +
60311 +       /* put bnode into atom's overwrite set */
60312 +       cond_add_to_overwrite_set(atom, bnode->cjnode);
60313 +       data = bnode_commit_data(bnode);        /* reused: now the bitmap */
60314 +       ret = bnode_check_crc(bnode);
60315 +       if (ret != 0) {
60316 +               /* undo load_and_lock_bnode() before bailing out */
60317 +               release_and_unlock_bnode(bnode);
60318 +               return ret;
60319 +       }
60320 +       if (len != NULL) {
60321 +               /* FIXME-ZAM: a check that all bits are set should be there */
60322 +               assert("zam-443",
60323 +                      offset + *len <= bmap_bit_count(sb->s_blocksize));
60324 +               reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
60325 +
60326 +               (*blocks_freed_p) += *len;
60327 +       } else {
60328 +               reiser4_clear_bit(offset, data);
60329 +               (*blocks_freed_p)++;
60330 +       }
60331 +
60332 +       bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
60333 +
60334 +       release_and_unlock_bnode(bnode);
60335 +
60336 +       return 0;
60337 +}
60338 +
60339 +/* plugin->u.space_allocator.pre_commit_hook(). */
60340 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
60341 +   rest is done by transaction manager (allocate wandered locations for COMMIT
60342 +   BITMAP blocks, copy COMMIT BITMAP blocks data). */
60343 +/* Only one instance of this function can be running at one given time, because
60344 +   only one transaction can be committed at a time, therefore it is safe to
60345 +   access some global variables without any locking */
60346 +
60347 +int reiser4_pre_commit_hook_bitmap(void)
60348 +{
60349 +       struct super_block *super = reiser4_get_current_sb();
60350 +       txn_atom *atom;
60351 +
60352 +       long long blocks_freed = 0;
60353 +
60354 +       atom = get_current_atom_locked();
60355 +       assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
60356 +       spin_unlock_atom(atom);
60357 +
60358 +       {                       /* scan atom's captured list and find all freshly allocated nodes,
60359 +                                * mark corresponded bits in COMMIT BITMAP as used */
60360 +               struct list_head *head = ATOM_CLEAN_LIST(atom);
60361 +               jnode *node = list_entry(head->next, jnode, capture_link);
60362 +
60363 +               while (head != &node->capture_link) {
60364 +                       /* we detect freshly allocated jnodes */
60365 +                       if (JF_ISSET(node, JNODE_RELOC)) {
60366 +                               int ret;
60367 +                               bmap_nr_t bmap;
60368 +
60369 +                               bmap_off_t offset;
60370 +                               bmap_off_t index;
60371 +                               struct bitmap_node *bn;
60372 +                               __u32 size = bmap_size(super->s_blocksize);
60373 +                               __u32 crc;
60374 +                               char byte;
60375 +
60376 +                               assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
60377 +                               assert("zam-460",
60378 +                                      !reiser4_blocknr_is_fake(&node->blocknr));
60379 +
60380 +                               parse_blocknr(&node->blocknr, &bmap, &offset);
60381 +                               bn = get_bnode(super, bmap);
60382 +
60383 +                               index = offset >> 3;    /* byte holding the bit */
60384 +                               assert("vpf-276", index < size);
60385 +
60386 +                               ret = bnode_check_crc(bn);
60387 +                               if (ret != 0)
60388 +                                       return ret;
60389 +                               check_bnode_loaded(bn);
60390 +                               ret = load_and_lock_bnode(bn);
60391 +                               if (ret != 0)
60392 +                                       return ret;
60393 +                               byte = *(bnode_commit_data(bn) + index);
60394 +                               reiser4_set_bit(offset, bnode_commit_data(bn));
60395 +
60396 +                               crc = adler32_recalc(bnode_commit_crc(bn), byte,
60397 +                                                    *(bnode_commit_data(bn) +
60398 +                                                      index),
60399 +                                                    size - index);
60400 +                               bnode_set_commit_crc(bn, crc);
60401 +
60402 +                               release_and_unlock_bnode(bn);
60403 +
60404 +                               ret = bnode_check_crc(bn);
60405 +                               if (ret != 0)
60406 +                                       return ret;
60407 +
60408 +                               /* working of this depends on how it inserts
60409 +                                  new j-node into clean list, because we are
60410 +                                  scanning the same list now. It is OK, if
60411 +                                  insertion is done to the list front */
60412 +                               cond_add_to_overwrite_set(atom, bn->cjnode);
60413 +                       }
60414 +
60415 +                       node = list_entry(node->capture_link.next, jnode, capture_link);
60416 +               }
60417 +       }
60418 +
60419 +       blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
60420 +                            &blocks_freed, 0);
60421 +       /* NOTE(review): the result of the iteration above is not examined */
60422 +       blocks_freed -= atom->nr_blocks_allocated;
60423 +
60424 +       {
60425 +               reiser4_super_info_data *sbinfo;
60426 +
60427 +               sbinfo = get_super_private(super);
60428 +
60429 +               spin_lock_reiser4_super(sbinfo);
60430 +               sbinfo->blocks_free_committed += blocks_freed;
60431 +               spin_unlock_reiser4_super(sbinfo);
60432 +       }
60433 +
60434 +       return 0;
60435 +}
60436 +
60437 +/* plugin->u.space_allocator.init_allocator: constructor, called on fs mount.
60438 +   Returns 0, RETERR(-ENOMEM), or the error of a failed bitmap block load */
60439 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
60440 +                                 struct super_block *super, void *arg)
60441 +{
60442 +       struct bitmap_allocator_data *data = NULL;
60443 +       bmap_nr_t bitmap_blocks_nr;
60444 +       bmap_nr_t i;
60445 +
60446 +       assert("nikita-3039", reiser4_schedulable());
60447 +
60448 +       /* getting memory for bitmap allocator private data holder */
60449 +       data =
60450 +               kmalloc(sizeof(struct bitmap_allocator_data),
60451 +                       reiser4_ctx_gfp_mask_get());
60452 +
60453 +       if (data == NULL)
60454 +               return RETERR(-ENOMEM);
60455 +
60456 +       /* allocation and initialization for the array of bnodes */
60457 +       bitmap_blocks_nr = get_nr_bmap(super);
60458 +
60459 +       /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
60460 +          which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
60461 +          may I never meet someone who still uses the ia32 architecture when
60462 +          storage devices of that size enter the market, and wants to use ia32
60463 +          with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
60464 +          probably, another dynamic data structure should replace a static
60465 +          array of bnodes. */
60466 +       /* one struct bitmap_node per bitmap block, vmalloc'ed (see FIXME above) */
60467 +       data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
60468 +       if (data->bitmap == NULL) {
60469 +               kfree(data);
60470 +               return RETERR(-ENOMEM);
60471 +       }
60472 +
60473 +       for (i = 0; i < bitmap_blocks_nr; i++)
60474 +               init_bnode(data->bitmap + i, super, i);
60475 +
60476 +       allocator->u.generic = data;
60477 +
60478 +#if REISER4_DEBUG
60479 +       get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
60480 +#endif
60481 +
60482 +       /* Load all bitmap blocks now unless REISER4_DONT_LOAD_BITMAP is set. */
60483 +       if (!test_bit
60484 +           (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
60485 +               __u64 start_time, elapsed_time;
60486 +               struct bitmap_node *bnode;
60487 +               int ret;
60488 +
60489 +               if (REISER4_DEBUG)
60490 +                       printk(KERN_INFO "loading reiser4 bitmap...");
60491 +               start_time = jiffies;
60492 +
60493 +               for (i = 0; i < bitmap_blocks_nr; i++) {
60494 +                       bnode = data->bitmap + i;
60495 +                       ret = load_and_lock_bnode(bnode);
60496 +                       if (ret) {
60497 +                               reiser4_destroy_allocator_bitmap(allocator,
60498 +                                                                super);
60499 +                               return ret;
60500 +                       }
60501 +                       release_and_unlock_bnode(bnode);
60502 +               }
60503 +
60504 +               elapsed_time = jiffies - start_time;
60505 +               if (REISER4_DEBUG)
60506 +                       printk("...done (%llu jiffies)\n",
60507 +                              (unsigned long long)elapsed_time);
60508 +       }
60509 +
60510 +       return 0;
60511 +}
60512 +
60513 +/* plugin->u.space_allocator.destroy_allocator
60514 +   destructor. It is called on fs unmount */
60515 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
60516 +                                    struct super_block *super)
60517 +{
60518 +       bmap_nr_t bitmap_blocks_nr;
60519 +       bmap_nr_t i;
60520 +
60521 +       struct bitmap_allocator_data *data = allocator->u.generic;
60522 +
60523 +       assert("zam-414", data != NULL);
60524 +       assert("zam-376", data->bitmap != NULL);
60525 +
60526 +       bitmap_blocks_nr = get_nr_bmap(super);
60527 +
60528 +       for (i = 0; i < bitmap_blocks_nr; i++) {
60529 +               struct bitmap_node *bnode = data->bitmap + i;
60530 +
60531 +               mutex_lock(&bnode->mutex);
60532 +
60533 +#if REISER4_DEBUG
60534 +               if (atomic_read(&bnode->loaded)) {
60535 +                       jnode *wj = bnode->wjnode;
60536 +                       jnode *cj = bnode->cjnode;
60537 +
60538 +                       assert("zam-480", jnode_page(cj) != NULL);
60539 +                       assert("zam-633", jnode_page(wj) != NULL);
60540 +
60541 +                       assert("zam-634",
60542 +                              memcmp(jdata(wj), jdata(wj),
60543 +                                     bmap_size(super->s_blocksize)) == 0);
60544 +
60545 +               }
60546 +#endif
60547 +               done_bnode(bnode);
60548 +               mutex_unlock(&bnode->mutex);
60549 +       }
60550 +
60551 +       vfree(data->bitmap);
60552 +       kfree(data);
60553 +
60554 +       allocator->u.generic = NULL;
60555 +
60556 +       return 0;
60557 +}
60558 +
60559 +/*
60560 + * Local variables:
60561 + * c-indentation-style: "K&R"
60562 + * mode-name: "LC"
60563 + * c-basic-offset: 8
60564 + * tab-width: 8
60565 + * fill-column: 79
60566 + * scroll-step: 1
60567 + * End:
60568 + */
60569 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/space/bitmap.h linux-2.6.27/fs/reiser4/plugin/space/bitmap.h
60570 --- linux-2.6.27.orig/fs/reiser4/plugin/space/bitmap.h  1970-01-01 03:00:00.000000000 +0300
60571 +++ linux-2.6.27/fs/reiser4/plugin/space/bitmap.h       2008-10-12 18:20:01.000000000 +0400
60572 @@ -0,0 +1,47 @@
60573 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60574 +
60575 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
60576 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
60577 +
60578 +#include "../../dformat.h"
60579 +#include "../../block_alloc.h"
60580 +
60581 +#include <linux/types.h>       /* for __u??  */
60582 +#include <linux/fs.h>          /* for struct super_block  */
60583 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
60584 +/* declarations of functions implementing methods of space allocator plugin for
60585 +   bitmap based allocator. The functions themselves are in bitmap.c */
60586 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
60587 +                                        struct super_block *, void *);
60588 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
60589 +                                           struct super_block *);
60590 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
60591 +                                      reiser4_blocknr_hint *, int needed,
60592 +                                      reiser4_block_nr * start,
60593 +                                      reiser4_block_nr * len);
60594 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
60595 +                                       const reiser4_block_nr *, int);
60596 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
60597 +                                         reiser4_block_nr,
60598 +                                         reiser4_block_nr);
60599 +extern int reiser4_pre_commit_hook_bitmap(void);
60600 +
60601 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
60602 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
60603 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
60604 +
60605 +typedef __u64 bmap_nr_t;
60606 +typedef __u32 bmap_off_t;
60607 +
60608 +#endif                         /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
60609 +
60610 +/* Make Linus happy.
60611 +   Local variables:
60612 +   c-indentation-style: "K&R"
60613 +   mode-name: "LC"
60614 +   c-basic-offset: 8
60615 +   tab-width: 8
60616 +   fill-column: 120
60617 +   scroll-step: 1
60618 +   End:
60619 +*/
60620 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/space/Makefile linux-2.6.27/fs/reiser4/plugin/space/Makefile
60621 --- linux-2.6.27.orig/fs/reiser4/plugin/space/Makefile  1970-01-01 03:00:00.000000000 +0300
60622 +++ linux-2.6.27/fs/reiser4/plugin/space/Makefile       2008-10-12 18:20:01.000000000 +0400
60623 @@ -0,0 +1,4 @@
60624 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
60625 +
60626 +space_plugins-objs := \
60627 +       bitmap.o
60628 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/space/space_allocator.h linux-2.6.27/fs/reiser4/plugin/space/space_allocator.h
60629 --- linux-2.6.27.orig/fs/reiser4/plugin/space/space_allocator.h 1970-01-01 03:00:00.000000000 +0300
60630 +++ linux-2.6.27/fs/reiser4/plugin/space/space_allocator.h      2008-10-12 18:20:01.000000000 +0400
60631 @@ -0,0 +1,80 @@
60632 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60633 +
60634 +#ifndef __SPACE_ALLOCATOR_H__
60635 +#define __SPACE_ALLOCATOR_H__
60636 +
60637 +#include "../../forward.h"
60638 +#include "bitmap.h"
60639 +/* DEF_SPACE_ALLOCATOR(allocator) defines the static inline sa_*() wrappers, each of which calls the matching
60640 + * reiser4_*_##allocator() function. It is instantiated below for bitmap, the only space allocator so far. */
60641 +#define DEF_SPACE_ALLOCATOR(allocator)                                                                                 \
60642 +                                                                                                                       \
60643 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque)               \
60644 +{                                                                                                                      \
60645 +       return reiser4_init_allocator_##allocator (al, s, opaque);                                                      \
60646 +}                                                                                                                      \
60647 +                                                                                                                       \
60648 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s)                           \
60649 +{                                                                                                                      \
60650 +       reiser4_destroy_allocator_##allocator (al, s);                                                                  \
60651 +}                                                                                                                      \
60652 +                                                                                                                       \
60653 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint,                           \
60654 +                                  int needed, reiser4_block_nr * start, reiser4_block_nr * len)                        \
60655 +{                                                                                                                      \
60656 +       return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len);                                         \
60657 +}                                                                                                                      \
60658 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len)      \
60659 +{                                                                                                                      \
60660 +       reiser4_dealloc_blocks_##allocator (al, start, len);                                                            \
60661 +}                                                                                                                      \
60662 +                                                                                                                       \
60663 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired)                 \
60664 +{                                                                                                                      \
60665 +       reiser4_check_blocks_##allocator (start, end, desired);                                                         \
60666 +}                                                                                                                      \
60667 +                                                                                                                       \
60668 +static inline void sa_pre_commit_hook (void)                                                                           \
60669 +{                                                                                                                      \
60670 +       reiser4_pre_commit_hook_##allocator ();                                                                         \
60671 +}                                                                                                                      \
60672 +                                                                                                                       \
60673 +static inline void sa_post_commit_hook (void)                                                                          \
60674 +{                                                                                                                      \
60675 +       reiser4_post_commit_hook_##allocator ();                                                                        \
60676 +}                                                                                                                      \
60677 +                                                                                                                       \
60678 +static inline void sa_post_write_back_hook (void)                                                                      \
60679 +{                                                                                                                      \
60680 +       reiser4_post_write_back_hook_##allocator();                                                                     \
60681 +}                                                                                                                      \
60682 +                                                                                                                       \
60683 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al)                                    \
60684 +{                                                                                                                      \
60685 +       reiser4_print_info_##allocator (prefix, al);                                                                    \
60686 +}
60687 +
60688 +DEF_SPACE_ALLOCATOR(bitmap)
60689 +
60690 +/* this object is part of reiser4 private in-core super block */
60691 +struct reiser4_space_allocator {
60692 +       union {
60693 +               /* space allocators might use this pointer to reference their
60694 +                * data. */
60695 +               void *generic;
60696 +       } u;
60697 +};
60698 +
60699 +/* __SPACE_ALLOCATOR_H__ */
60700 +#endif
60701 +
60702 +/* Make Linus happy.
60703 +   Local variables:
60704 +   c-indentation-style: "K&R"
60705 +   mode-name: "LC"
60706 +   c-basic-offset: 8
60707 +   tab-width: 8
60708 +   fill-column: 120
60709 +   scroll-step: 1
60710 +   End:
60711 +*/
60712 diff -urN linux-2.6.27.orig/fs/reiser4/plugin/tail_policy.c linux-2.6.27/fs/reiser4/plugin/tail_policy.c
60713 --- linux-2.6.27.orig/fs/reiser4/plugin/tail_policy.c   1970-01-01 03:00:00.000000000 +0300
60714 +++ linux-2.6.27/fs/reiser4/plugin/tail_policy.c        2008-10-12 18:20:01.000000000 +0400
60715 @@ -0,0 +1,113 @@
60716 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60717 + * reiser4/README */
60718 +
60719 +/* Formatting policy plugins */
60720 +
60721 +/*
60722 + * Formatting policy plugin is used by object plugin (of regular file) to
60723 + * convert file between two representations.
60724 + *
60725 + * Currently following policies are implemented:
60726 + *  never store file in formatted nodes
60727 + *  always store file in formatted nodes
60728 + *  store file in formatted nodes if file is smaller than 4 blocks (default)
60729 + */
60730 +
60731 +#include "../tree.h"
60732 +#include "../inode.h"
60733 +#include "../super.h"
60734 +#include "object.h"
60735 +#include "plugin.h"
60736 +#include "node/node.h"
60737 +#include "plugin_header.h"
60738 +
60739 +#include <linux/pagemap.h>
60740 +#include <linux/fs.h>          /* For struct inode */
60741 +
60742 +/**
60743 + * have_formatting_never - formatting policy that never asks for a tail
60744 + * @inode: inode to operate on (not used)
60745 + * @size: new object size (not used)
60746 + *
60747 + * Returns 0 unconditionally.
60748 + */
60749 +/* Never store file's tail as direct item */
60750 +/* Audited by: green(2002.06.12) */
60751 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
60752 +                     /* inode to operate on */ ,
60753 +                     loff_t size UNUSED_ARG /* new object size */ )
60754 +{
60755 +       return 0;
60756 +}
60757 +
60758 +/* Always store file's tail as direct item: returns 1 whatever @inode and @size are */
60759 +/* Audited by: green(2002.06.12) */
60760 +static int
60761 +have_formatting_always(const struct inode *inode UNUSED_ARG
60762 +                      /* inode to operate on */ ,
60763 +                      loff_t size UNUSED_ARG /* new object size */ )
60764 +{
60765 +       return 1;
60766 +}
60767 +
60768 +/* Default policy: returns 1 (store file denoted @inode as tails only) while @size
60769 +   is at most 4 blocks of the inode's super block, 0 (extents only) otherwise. */
60770 +static int
60771 +have_formatting_default(const struct inode *inode UNUSED_ARG
60772 +                       /* inode to operate on; NOTE(review): tagged UNUSED_ARG yet dereferenced below */ ,
60773 +                       loff_t size /* new object size */ )
60774 +{
60775 +       assert("umka-1253", inode != NULL);
60776 +
60777 +       if (size > inode->i_sb->s_blocksize * 4)
60778 +               return 0;
60779 +
60780 +       return 1;
60781 +}
60782 +
60783 +/* tail (formatting) plugins: one entry per formatting policy, indexed by the policy's id */
60784 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
60785 +       [NEVER_TAILS_FORMATTING_ID] = {
60786 +               .h = {
60787 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60788 +                       .id = NEVER_TAILS_FORMATTING_ID,
60789 +                       .pops = NULL,
60790 +                       .label = "never",
60791 +                       .desc = "Never store file's tail",
60792 +                       .linkage = {NULL, NULL}
60793 +               },
60794 +               .have_tail = have_formatting_never
60795 +       },
60796 +       [ALWAYS_TAILS_FORMATTING_ID] = {
60797 +               .h = {
60798 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60799 +                       .id = ALWAYS_TAILS_FORMATTING_ID,
60800 +                       .pops = NULL,
60801 +                       .label = "always",
60802 +                       .desc = "Always store file's tail",
60803 +                       .linkage = {NULL, NULL}
60804 +               },
60805 +               .have_tail = have_formatting_always
60806 +       },
60807 +       [SMALL_FILE_FORMATTING_ID] = {
60808 +               .h = {
60809 +                       .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60810 +                       .id = SMALL_FILE_FORMATTING_ID,
60811 +                       .pops = NULL,
60812 +                       .label = "4blocks",
60813 +                       .desc = "store files shorter than 4 blocks in tail items",
60814 +                       .linkage = {NULL, NULL}
60815 +               },
60816 +               .have_tail = have_formatting_default
60817 +       }
60818 +};
60819 +
60820 +/*
60821 + * Local variables:
60822 + * c-indentation-style: "K&R"
60823 + * mode-name: "LC"
60824 + * c-basic-offset: 8
60825 + * tab-width: 8
60826 + * fill-column: 79
60827 + * End:
60828 + */
60829 diff -urN linux-2.6.27.orig/fs/reiser4/pool.c linux-2.6.27/fs/reiser4/pool.c
60830 --- linux-2.6.27.orig/fs/reiser4/pool.c 1970-01-01 03:00:00.000000000 +0300
60831 +++ linux-2.6.27/fs/reiser4/pool.c      2008-10-12 18:20:01.000000000 +0400
60832 @@ -0,0 +1,231 @@
60833 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
60834 + * reiser4/README */
60835 +
60836 +/* Fast pool allocation.
60837 +
60838 +   There are situations when some sub-system normally asks memory allocator
60839 +   for only few objects, but under some circumstances could require much
60840 +   more. Typical and actually motivating example is tree balancing. It needs
60841 +   to keep track of nodes that were involved into it, and it is well-known
60842 +   that in a reasonably packed balanced tree most (92.938121%) of all
60843 +   balancings end after working with only a few nodes (3.141592 on
60844 +   average). But in rare cases balancing can involve many more nodes
60845 +   (3*tree_height+1 in the extreme case).
60846 +
60847 +   On the one hand, we don't want to resort to dynamic allocation (slab,
60848 +    malloc(), etc.) to allocate data structures required to keep track of
60849 +   nodes during balancing. On the other hand, we cannot statically allocate
60850 +   required amount of space on the stack, because first: it is useless wastage
60851 +   of precious resource, and second: this amount is unknown in advance (tree
60852 +   height can change).
60853 +
60854 +   Pools, implemented in this file, are a solution to this problem:
60855 +
60856 +    - some configurable number of objects is statically preallocated on the
60857 +    stack
60858 +
60859 +    - if this preallocated pool is exhausted and more objects are requested,
60860 +    they are allocated dynamically.
60861 +
60862 +   Pools encapsulate distinction between statically and dynamically allocated
60863 +   objects. Both allocation and recycling look exactly the same.
60864 +
60865 +   To keep track of dynamically allocated objects, pool adds its own linkage
60866 +   to each object.
60867 +
60868 +   NOTE-NIKITA This linkage also contains some balancing-specific data. This
60869 +   is not perfect. On the other hand, balancing is currently the only client
60870 +   of pool code.
60871 +
60872 +   NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
60873 +   functions in the style of tslist/tshash, i.e., make them unreadable, but
60874 +   type-safe.
60875 +
60876 +*/
60877 +
60878 +#include "debug.h"
60879 +#include "pool.h"
60880 +#include "super.h"
60881 +
60882 +#include <linux/types.h>
60883 +#include <linux/err.h>
60884 +
60885 +/* initialize new pool object @h: make each of its three list linkages an empty list */
60886 +static void reiser4_init_pool_obj(struct reiser4_pool_header * h)
60887 +{
60888 +       INIT_LIST_HEAD(&h->usage_linkage);
60889 +       INIT_LIST_HEAD(&h->level_linkage);
60890 +       INIT_LIST_HEAD(&h->extra_linkage);
60891 +}
60892 +
60893 +/* initialize new pool: zero @pool and @data, then put all @num_of_objs objects on the free list */
60894 +void reiser4_init_pool(struct reiser4_pool * pool /* pool to initialize */ ,
60895 +                      size_t obj_size /* size of objects in @pool */ ,
60896 +                      int num_of_objs /* number of preallocated objects */ ,
60897 +                      char *data /* area for preallocated objects */ )
60898 +{
60899 +       struct reiser4_pool_header *h;
60900 +       int i;
60901 +
60902 +       assert("nikita-955", pool != NULL);
60903 +       assert("nikita-1044", obj_size > 0);
60904 +       assert("nikita-956", num_of_objs >= 0);
60905 +       assert("nikita-957", data != NULL);
60906 +
60907 +       memset(pool, 0, sizeof *pool);
60908 +       pool->obj_size = obj_size;
60909 +       pool->data = data;
60910 +       INIT_LIST_HEAD(&pool->free);
60911 +       INIT_LIST_HEAD(&pool->used);
60912 +       INIT_LIST_HEAD(&pool->extra);
60913 +       memset(data, 0, obj_size * num_of_objs);
60914 +       for (i = 0; i < num_of_objs; ++i) {
60915 +               h = (struct reiser4_pool_header *) (data + i * obj_size);
60916 +               reiser4_init_pool_obj(h);
60917 +               /* add pool header of the i-th object to the end of pool's free list */
60918 +               list_add_tail(&h->usage_linkage, &pool->free);
60919 +       }
60920 +}
60921 +
60922 +/* release pool resources
60923 +
60924 +   Meant to release all resources acquired by this pool, specifically,
60925 +   dynamically allocated objects. NOTE(review): the body is empty, @pool is
60926 +   unused; extra objects are kfree()d one at a time by reiser4_pool_free().
60927 +*/
60928 +void reiser4_done_pool(struct reiser4_pool * pool UNUSED_ARG)
60929 +{
60930 +}
60931 +
60932 +/* allocate carry object from @pool
60933 +
60934 +   First, try to get preallocated object. If this fails, resort to dynamic
60935 +   allocation. Returns the object, zeroed past its pool header and linked
60936 +   into @pool's used list, or an ERR_PTR() built from -ENOMEM.
60937 +*/
60938 +static void *reiser4_pool_alloc(struct reiser4_pool * pool)
60939 +{
60940 +       struct reiser4_pool_header *result;
60941 +
60942 +       assert("nikita-959", pool != NULL);
60943 +
60944 +       if (!list_empty(&pool->free)) {
60945 +               struct list_head *linkage;
60946 +
60947 +               linkage = pool->free.next;
60948 +               list_del(linkage);
60949 +               INIT_LIST_HEAD(linkage);
60950 +               result = list_entry(linkage, struct reiser4_pool_header,
60951 +                                   usage_linkage);
60952 +               BUG_ON(!list_empty(&result->level_linkage) ||
60953 +                      !list_empty(&result->extra_linkage));
60954 +       } else {
60955 +               /* free list is empty. Extra allocations don't deserve dedicated
60956 +                  slab to be served from, as they are expected to be rare. */
60957 +               result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
60958 +               if (result != 0) {
60959 +                       reiser4_init_pool_obj(result);
60960 +                       list_add(&result->extra_linkage, &pool->extra);
60961 +               } else
60962 +                       return ERR_PTR(RETERR(-ENOMEM));
60963 +               BUG_ON(!list_empty(&result->usage_linkage) ||
60964 +                      !list_empty(&result->level_linkage));
60965 +       }
60966 +       ++pool->objs;
60967 +       list_add(&result->usage_linkage, &pool->used);
60968 +       memset(result + 1, 0, pool->obj_size - sizeof *result); /* zero everything past the pool header */
60969 +       return result;
60970 +}
60971 +
60972 +/* return object @h back to @pool: preallocated ones go to the free list, extra ones are kfree()d */
60973 +void reiser4_pool_free(struct reiser4_pool * pool,
60974 +                      struct reiser4_pool_header * h)
60975 +{
60976 +       assert("nikita-961", h != NULL);
60977 +       assert("nikita-962", pool != NULL);
60978 +
60979 +       --pool->objs;
60980 +       assert("nikita-963", pool->objs >= 0);
60981 +
60982 +       list_del_init(&h->usage_linkage);
60983 +       list_del_init(&h->level_linkage);
60984 +
60985 +       if (list_empty(&h->extra_linkage))
60986 +               /*
60987 +                * pool header is not an extra one. Push it onto free list
60988 +                * using usage_linkage
60989 +                */
60990 +               list_add(&h->usage_linkage, &pool->free);
60991 +       else {
60992 +               /* remove pool header from pool's extra list and kfree it */
60993 +               list_del(&h->extra_linkage);
60994 +               kfree(h);
60995 +       }
60996 +}
60997 +
60998 +/* add new object to the carry level list
60999 +
61000 +   Carry level is FIFO most of the time, but not always. Complications arise
61001 +   when make_space() function tries to go to the left neighbor and thus adds
61002 +   carry node before existing nodes, and also, when updating delimiting keys
61003 +   after moving data between two nodes, we want left node to be locked before
61004 +   right node.
61005 +
61006 +   The latter case is confusing at first glance. Problem is that COP_UPDATE
61007 +   operation that updates delimiting keys is sometimes called with two nodes
61008 +   (when data are moved between two nodes) and sometimes with only one node
61009 +   (when leftmost item is deleted in a node). In any case operation is
61010 +   supplied with at least node whose left delimiting key is to be updated
61011 +   (that is "right" node).
61012 +
61013 +   @pool - from which to allocate new object; @order - where to place it;
61014 +   @list - where to add object (used by POOLO_LAST and POOLO_FIRST);
61015 +   @reference - after (or before) which existing object to add
61016 +*/
61017 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61018 +                                        struct list_head *list,
61019 +                                        pool_ordering order,
61020 +                                        struct reiser4_pool_header * reference)
61021 +{
61022 +       struct reiser4_pool_header *result;
61023 +
61024 +       assert("nikita-972", pool != NULL);
61025 +
61026 +       result = reiser4_pool_alloc(pool);
61027 +       if (IS_ERR(result))
61028 +               return result;
61029 +
61030 +       assert("nikita-973", result != NULL);
61031 +
61032 +       switch (order) {
61033 +       case POOLO_BEFORE:
61034 +               __list_add(&result->level_linkage,
61035 +                          reference->level_linkage.prev,
61036 +                          &reference->level_linkage);
61037 +               break;
61038 +       case POOLO_AFTER:
61039 +               __list_add(&result->level_linkage,
61040 +                          &reference->level_linkage,
61041 +                          reference->level_linkage.next);
61042 +               break;
61043 +       case POOLO_LAST:
61044 +               list_add_tail(&result->level_linkage, list);
61045 +               break;
61046 +       case POOLO_FIRST:
61047 +               list_add(&result->level_linkage, list);
61048 +               break;
61049 +       default:
61050 +               wrong_return_value("nikita-927", "order");
61051 +       }
61052 +       return result;
61053 +}
61054 +
61055 +/* Make Linus happy.
61056 +   Local variables:
61057 +   c-indentation-style: "K&R"
61058 +   mode-name: "LC"
61059 +   c-basic-offset: 8
61060 +   tab-width: 8
61061 +   fill-column: 120
61062 +   End:
61063 +*/
61064 diff -urN linux-2.6.27.orig/fs/reiser4/pool.h linux-2.6.27/fs/reiser4/pool.h
61065 --- linux-2.6.27.orig/fs/reiser4/pool.h 1970-01-01 03:00:00.000000000 +0300
61066 +++ linux-2.6.27/fs/reiser4/pool.h      2008-10-12 18:20:01.000000000 +0400
61067 @@ -0,0 +1,56 @@
61068 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61069 +
61070 +/* Fast pool allocation */
61071 +
61072 +#ifndef __REISER4_POOL_H__
61073 +#define __REISER4_POOL_H__
61074 +
61075 +#include <linux/types.h>
61076 +
61077 +struct reiser4_pool {
61078 +       size_t obj_size;        /* size of one object, pool header included */
61079 +       int objs;               /* number of objects currently allocated from the pool */
61080 +       char *data;             /* area holding the preallocated objects */
61081 +       struct list_head free;  /* preallocated objects available for allocation */
61082 +       struct list_head used;  /* objects currently allocated from the pool */
61083 +       struct list_head extra; /* objects that had to be kmalloc()ed */
61084 +};
61085 +
61086 +struct reiser4_pool_header {
61087 +       /* object is either on free or "used" lists */
61088 +       struct list_head usage_linkage;
61089 +       struct list_head level_linkage; /* links object where reiser4_add_obj() was asked to put it */
61090 +       struct list_head extra_linkage; /* on pool's extra list if object was kmalloc()ed, empty otherwise */
61091 +};
61092 +
61093 +typedef enum {
61094 +       POOLO_BEFORE,   /* add new object right before the reference object */
61095 +       POOLO_AFTER,    /* add new object right after the reference object */
61096 +       POOLO_LAST,     /* add new object to the tail of the list */
61097 +       POOLO_FIRST     /* add new object to the head of the list */
61098 +} pool_ordering;
61099 +
61100 +/* pool manipulation functions */
61101 +
61102 +extern void reiser4_init_pool(struct reiser4_pool * pool, size_t obj_size,
61103 +                             int num_of_objs, char *data);
61104 +extern void reiser4_done_pool(struct reiser4_pool * pool);
61105 +extern void reiser4_pool_free(struct reiser4_pool * pool,
61106 +                             struct reiser4_pool_header * h);
61107 +struct reiser4_pool_header *reiser4_add_obj(struct reiser4_pool * pool,
61108 +                                        struct list_head * list,
61109 +                                        pool_ordering order,
61110 +                                        struct reiser4_pool_header *reference);
61111 +
61112 +/* __REISER4_POOL_H__ */
61113 +#endif
61114 +
61115 +/* Make Linus happy.
61116 +   Local variables:
61117 +   c-indentation-style: "K&R"
61118 +   mode-name: "LC"
61119 +   c-basic-offset: 8
61120 +   tab-width: 8
61121 +   fill-column: 120
61122 +   End:
61123 +*/
61124 diff -urN linux-2.6.27.orig/fs/reiser4/readahead.c linux-2.6.27/fs/reiser4/readahead.c
61125 --- linux-2.6.27.orig/fs/reiser4/readahead.c    1970-01-01 03:00:00.000000000 +0300
61126 +++ linux-2.6.27/fs/reiser4/readahead.c 2008-10-12 18:20:01.000000000 +0400
61127 @@ -0,0 +1,138 @@
61128 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61129 + * reiser4/README */
61130 +
61131 +#include "forward.h"
61132 +#include "tree.h"
61133 +#include "tree_walk.h"
61134 +#include "super.h"
61135 +#include "inode.h"
61136 +#include "key.h"
61137 +#include "znode.h"
61138 +
61139 +#include <linux/swap.h>                /* for totalram_pages */
61140 +
61141 +void reiser4_init_ra_info(ra_info_t * rai)
61142 +{
61143 +       rai->key_to_stop = *reiser4_min_key();  /* stop key starts as a copy of the key reiser4_min_key() points to */
61144 +}
61145 +
61146 +/* test global formatted node readahead flag RA_ADJACENT_ONLY in @flags. It can be set by mount option -o readahead:NUM:1 */
61147 +static inline int ra_adjacent_only(int flags)
61148 +{
61149 +       return flags & RA_ADJACENT_ONLY;
61150 +}
61151 +
61152 +/* this is used by formatted_readahead to decide whether read for right neighbor of node is to be issued. It returns the
61153 +   result of keyle(), taken under the dk read lock: true if right neighbor's first key is <= readahead's stop key */
61154 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
61155 +{
61156 +       int result;
61157 +
61158 +       read_lock_dk(znode_get_tree(node));
61159 +       result = keyle(znode_get_rd_key(node), &info->key_to_stop);
61160 +       read_unlock_dk(znode_get_tree(node));
61161 +       return result;
61162 +}
61163 +
61164 +#define LOW_MEM_PERCENTAGE (5) /* percentage of totalram_pages used as threshold by low_on_memory() */
61165 +
61166 +static int low_on_memory(void)
61167 +{
61168 +       unsigned int freepages;
61169 +
61170 +       freepages = nr_free_pages();
61171 +       return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100); /* nonzero when free pages drop below the threshold */
61172 +}
61173 +
61174 +/* start read for @node and, if it is on the leaf level, for up to ra_params->max of its right neighbors */
61175 +void formatted_readahead(znode * node, ra_info_t * info)
61176 +{
61177 +       struct formatted_ra_params *ra_params;
61178 +       znode *cur;
61179 +       int i;
61180 +       int grn_flags;
61181 +       lock_handle next_lh;
61182 +
61183 +       /* do nothing if node block number has not been assigned to node (which means it is still in cache). */
61184 +       if (reiser4_blocknr_is_fake(znode_get_block(node)))
61185 +               return;
61186 +
61187 +       ra_params = get_current_super_ra_params();
61188 +
61189 +       if (znode_page(node) == NULL)
61190 +               jstartio(ZJNODE(node));
61191 +
61192 +       if (znode_get_level(node) != LEAF_LEVEL)
61193 +               return;
61194 +
61195 +       /* don't waste memory for read-ahead when low on memory */
61196 +       if (low_on_memory())
61197 +               return;
61198 +
61199 +       /* We can have locked nodes on upper tree levels, in this situation lock
61200 +          priorities do not help to resolve deadlocks, we have to use TRY_LOCK
61201 +          here. */
61202 +       grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
61203 +
61204 +       i = 0;
61205 +       cur = zref(node);
61206 +       init_lh(&next_lh);
61207 +       while (i < ra_params->max) {
61208 +               const reiser4_block_nr *nextblk;
61209 +
61210 +               if (!should_readahead_neighbor(cur, info))
61211 +                       break;
61212 +
61213 +               if (reiser4_get_right_neighbor
61214 +                   (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
61215 +                       break;
61216 +
61217 +               nextblk = znode_get_block(next_lh.node);
61218 +               if (reiser4_blocknr_is_fake(nextblk) ||
61219 +                   (ra_adjacent_only(ra_params->flags)
61220 +                    && *nextblk != *znode_get_block(cur) + 1)) {
61221 +                       break;
61222 +               }
61223 +
61224 +               zput(cur);
61225 +               cur = zref(next_lh.node);
61226 +               done_lh(&next_lh);
61227 +               if (znode_page(cur) == NULL)
61228 +                       jstartio(ZJNODE(cur));
61229 +               else
61230 +                       /* Do not scan read-ahead window if pages already
61231 +                        * allocated (and i/o already started). */
61232 +                       break;
61233 +
61234 +               i++;
61235 +       }
61236 +       zput(cur);
61237 +       done_lh(&next_lh);
61238 +}
61239 +
61240 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
61241 +{
61242 +       reiser4_key *stop_key;
61243 +
61244 +       assert("nikita-3542", dir != NULL);
61245 +       assert("nikita-3543", tap != NULL);
61246 +
61247 +       stop_key = &tap->ra_info.key_to_stop;
61248 +       /* initialize readdir readahead information: include into readahead
61249 +        * stat data of all files of the directory */
61250 +       set_key_locality(stop_key, get_inode_oid(dir)); /* locality is @dir's oid */
61251 +       set_key_type(stop_key, KEY_SD_MINOR);
61252 +       set_key_ordering(stop_key, get_key_ordering(reiser4_max_key())); /* remaining components come from reiser4_max_key() */
61253 +       set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
61254 +       set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
61255 +}
61256 +
61257 +/*
61258 +   Local variables:
61259 +   c-indentation-style: "K&R"
61260 +   mode-name: "LC"
61261 +   c-basic-offset: 8
61262 +   tab-width: 8
61263 +   fill-column: 80
61264 +   End:
61265 +*/
61266 diff -urN linux-2.6.27.orig/fs/reiser4/readahead.h linux-2.6.27/fs/reiser4/readahead.h
61267 --- linux-2.6.27.orig/fs/reiser4/readahead.h    1970-01-01 03:00:00.000000000 +0300
61268 +++ linux-2.6.27/fs/reiser4/readahead.h 2008-10-12 18:20:01.000000000 +0400
61269 @@ -0,0 +1,51 @@
61270 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61271 +
61272 +#ifndef __READAHEAD_H__
61273 +#define __READAHEAD_H__
61274 +
61275 +#include "key.h"
61276 +
61277 +typedef enum {
61278 +       RA_ADJACENT_ONLY = 1,   /* only request a node whose block number is previous one's + 1.
61279 +                                  Default is NO (not only adjacent) */
61280 +} ra_global_flags;
61281 +
61282 +/* reiser4 super block has a field of this type.
61283 +   It controls readahead during tree traversals */
61284 +struct formatted_ra_params {
61285 +       unsigned long max;      /* request not more than this number of nodes.
61286 +                                  Default is totalram_pages / 4 */
61287 +       int flags;              /* bits from ra_global_flags */
61288 +};
61289 +
61290 +typedef struct {
61291 +       reiser4_key key_to_stop;        /* formatted_readahead() stops once znode_get_rd_key() of a node exceeds this key */
61292 +} ra_info_t;
61293 +
61294 +void formatted_readahead(znode *, ra_info_t *);
61295 +void reiser4_init_ra_info(ra_info_t * rai);
61296 +
61297 +struct reiser4_file_ra_state {
61298 +       loff_t start;           /* Current window */
61299 +       loff_t size;            /* presumably size of the current window - TODO confirm */
61300 +       loff_t next_size;       /* Next window size */
61301 +       loff_t ahead_start;     /* Ahead window */
61302 +       loff_t ahead_size;      /* presumably size of the ahead window - TODO confirm */
61303 +       loff_t max_window_size; /* Maximum readahead window */
61304 +       loff_t slow_start;      /* enlarging r/a size algorithm. */
61305 +};
61306 +
61307 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
61308 +
61309 +/* __READAHEAD_H__ */
61310 +#endif
61311 +
61312 +/*
61313 +   Local variables:
61314 +   c-indentation-style: "K&R"
61315 +   mode-name: "LC"
61316 +   c-basic-offset: 8
61317 +   tab-width: 8
61318 +   fill-column: 120
61319 +   End:
61320 +*/
61321 diff -urN linux-2.6.27.orig/fs/reiser4/README linux-2.6.27/fs/reiser4/README
61322 --- linux-2.6.27.orig/fs/reiser4/README 1970-01-01 03:00:00.000000000 +0300
61323 +++ linux-2.6.27/fs/reiser4/README      2008-10-12 18:20:01.000000000 +0400
61324 @@ -0,0 +1,128 @@
61325 +[LICENSING]
61326 +
61327 +Reiser4 is hereby licensed under the GNU General
61328 +Public License version 2.
61329 +
61330 +Source code files that contain the phrase "licensing governed by
61331 +reiser4/README" are "governed files" throughout this file.  Governed
61332 +files are licensed under the GPL.  The portions of them owned by Hans
61333 +Reiser, or authorized to be licensed by him, have been in the past,
61334 +and likely will be in the future, licensed to other parties under
61335 +other licenses.  If you add your code to governed files, and don't
61336 +want it to be owned by Hans Reiser, put your copyright label on that
61337 +code so the poor blight and his customers can keep things straight.
61338 +All portions of governed files not labeled otherwise are owned by Hans
61339 +Reiser, and by adding your code to it, widely distributing it to
61340 +others or sending us a patch, and leaving the sentence in stating that
61341 +licensing is governed by the statement in this file, you accept this.
61342 +It will be a kindness if you identify whether Hans Reiser is allowed
61343 +to license code labeled as owned by you on your behalf other than
61344 +under the GPL, because he wants to know if it is okay to do so and put
61345 +a check in the mail to you (for non-trivial improvements) when he
61346 +makes his next sale.  He makes no guarantees as to the amount if any,
61347 +though he feels motivated to motivate contributors, and you can surely
61348 +discuss this with him before or after contributing.  You have the
61349 +right to decline to allow him to license your code contribution other
61350 +than under the GPL.
61351 +
61352 +Further licensing options are available for commercial and/or other
61353 +interests directly from Hans Reiser: reiser@namesys.com.  If you interpret
61354 +the GPL as not allowing those additional licensing options, you read
61355 +it wrongly, and Richard Stallman agrees with me, when carefully read
61356 +you can see that those restrictions on additional terms do not apply
61357 +to the owner of the copyright, and my interpretation of this shall
61358 +govern for this license.
61359 +
61360 +[END LICENSING]
61361 +
61362 +Reiser4 is a file system based on dancing tree algorithms, and is
61363 +described at http://www.namesys.com
61364 +
61365 +mkfs.reiser4 and other utilities are on our webpage or wherever your
61366 +Linux provider put them.  You really want to be running the latest
61367 +version off the website if you use fsck.
61368 +
61369 +Yes, if you update your reiser4 kernel module you do have to
61370 +recompile your kernel, most of the time.  The errors you get will be
61371 +quite cryptic if you forget to do so.
61372 +
61373 +Hideous Commercial Pitch: Spread your development costs across other OS
61374 +vendors.  Select from the best in the world, not the best in your
61375 +building, by buying from third party OS component suppliers.  Leverage
61376 +the software component development power of the internet.  Be the most
61377 +aggressive in taking advantage of the commercial possibilities of
61378 +decentralized internet development, and add value through your branded
61379 +integration that you sell as an operating system.  Let your competitors
61380 +be the ones to compete against the entire internet by themselves.  Be
61381 +hip, get with the new economic trend, before your competitors do.  Send
61382 +email to reiser@namesys.com
61383 +
61384 +Hans Reiser was the primary architect of Reiser4, but a whole team
61385 +chipped their ideas in.  He invested everything he had into Namesys
61386 +for 5.5 dark years of no money before Reiser3 finally started to work well
61387 +enough to bring in money.  He owns the copyright.
61388 +
61389 +DARPA was the primary sponsor of Reiser4.  DARPA does not endorse
61390 +Reiser4, it merely sponsors it.  DARPA is, in solely Hans's personal
61391 +opinion, unique in its willingness to invest into things more
61392 +theoretical than the VC community can readily understand, and more
61393 +longterm than allows them to be sure that they will be the ones to
61394 +extract the economic benefits from.  DARPA also integrated us into a
61395 +security community that transformed our security worldview.
61396 +
61397 +Vladimir Saveliev is our lead programmer, with us from the beginning,
61398 +and he worked long hours writing the cleanest code.  This is why he is
61399 +now the lead programmer after years of commitment to our work.  He
61400 +always made the effort to be the best he could be, and to make his
61401 +code the best that it could be.  What resulted was quite remarkable. I
61402 +don't think that money can ever motivate someone to work the way he
61403 +did, he is one of the most selfless men I know.
61404 +
61405 +Alexander Lyamin was our sysadmin, and helped to educate us in
61406 +security issues.  Moscow State University and IMT were very generous
61407 +in the internet access they provided us, and in lots of other little
61408 +ways that a generous institution can be.
61409 +
61410 +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
61411 +locking code, the block allocator, and finished the flushing code.
61412 +His code is always crystal clean and well structured.
61413 +
61414 +Nikita Danilov wrote the core of the balancing code, the core of the
61415 +plugins code, and the directory code.  He worked a steady pace of long
61416 +hours that produced a whole lot of well abstracted code.  He is our
61417 +senior computer scientist.
61418 +
61419 +Vladimir Demidov wrote the parser.  Writing an in kernel parser is
61420 +something very few persons have the skills for, and it is thanks to
61421 +him that we can say that the parser is really not so big compared to
61422 +various bits of our other code, and making a parser work in the kernel
61423 +was not so complicated as everyone would imagine mainly because it was
61424 +him doing it...
61425 +
61426 +Joshua McDonald wrote the transaction manager, and the flush code.
61427 +The flush code unexpectedly turned out to be extremely hairy for reasons
61428 +you can read about on our web page, and he did a great job on an
61429 +extremely difficult task.
61430 +
61431 +Nina Reiser handled our accounting, government relations, and much
61432 +more.
61433 +
61434 +Ramon Reiser developed our website.
61435 +
61436 +Beverly Palmer drew our graphics.
61437 +
61438 +Vitaly Fertman developed librepair, userspace plugins repair code, fsck
61439 +and worked with Umka on developing libreiser4 and userspace plugins.
61440 +
61441 +Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
61442 +userspace tools (reiser4progs).
61443 +
61444 +Oleg Drokin (aka Green) is the release manager who fixes everything.
61445 +It is so nice to have someone like that on the team.  He (plus Chris
61446 +and Jeff) make it possible for the entire rest of the Namesys team to
61447 +focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also.  It
61448 +is just amazing to watch his talent for spotting bugs in action.
61449 +
61450 +Edward Shishkin wrote cryptcompress file plugin (which manages files
61451 +built of encrypted and(or) compressed bodies) and other plugins related
61452 +to transparent encryption and compression support.
61453 diff -urN linux-2.6.27.orig/fs/reiser4/reiser4.h linux-2.6.27/fs/reiser4/reiser4.h
61454 --- linux-2.6.27.orig/fs/reiser4/reiser4.h      1970-01-01 03:00:00.000000000 +0300
61455 +++ linux-2.6.27/fs/reiser4/reiser4.h   2008-10-12 18:20:01.000000000 +0400
61456 @@ -0,0 +1,259 @@
61457 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61458 + * reiser4/README */
61459 +
61460 +/* definitions of common constants used by reiser4 */
61461 +
61462 +#if !defined( __REISER4_H__ )
61463 +#define __REISER4_H__
61464 +
61465 +#include <asm/param.h>         /* for HZ */
61466 +#include <linux/errno.h>
61467 +#include <linux/types.h>
61468 +#include <linux/fs.h>
61469 +#include <linux/hardirq.h>
61470 +#include <linux/sched.h>
61471 +
61472 +/*
61473 + * reiser4 compilation options.
61474 + */
61475 +
61476 +#if defined(CONFIG_REISER4_DEBUG)
61477 +/* turn on assertion checks */
61478 +#define REISER4_DEBUG (1)
61479 +#else
61480 +#define REISER4_DEBUG (0)
61481 +#endif
61482 +
61483 +#define REISER4_SHA256 (0)
61484 +
61485 +/*
61486 + * Turn on large keys mode. In this mode (which is default), reiser4 key has 4
61487 + * 8-byte components. In the old "small key" mode, it's 3 8-byte
61488 + * components. An additional component, referred to as "ordering", is used to
61489 + * order the items a given object is composed of. As such, ordering is
61490 + * placed between locality and objectid. For directory item ordering contains
61491 + * initial prefix of the file name this item is for. This sorts all directory
61492 + * items within given directory lexicographically (but see
61493 + * fibration.[ch]). For file body and stat-data, ordering contains initial
61494 + * prefix of the name file was initially created with. In the common case
61495 + * (files with single name) this allows to order file bodies and stat-datas in
61496 + * the same order as their respective directory entries, thus speeding up
61497 + * readdir.
61498 + *
61499 + * Note, that kernel can only mount file system with the same key size as one
61500 + * it is compiled for, so flipping this option may render your data
61501 + * inaccessible.
61502 + */
61503 +#define REISER4_LARGE_KEY (1)
61504 +/*#define REISER4_LARGE_KEY (0)*/
61505 +
61506 +/*#define GUESS_EXISTS 1*/
61507 +
61508 +/*
61509 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
61510 + * option
61511 + */
61512 +
61513 +extern const char *REISER4_SUPER_MAGIC_STRING;
61514 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
61515 +                                        * beginning of device */
61516 +
61517 +/* here go tunable parameters that are not worth special entry in kernel
61518 +   configuration */
61519 +
61520 +/* default number of slots in coord-by-key caches */
61521 +#define CBK_CACHE_SLOTS    (16)
61522 +/* how many elementary tree operation to carry on the next level */
61523 +#define CARRIES_POOL_SIZE        (5)
61524 +/* size of pool of preallocated nodes for carry process. */
61525 +#define NODES_LOCKED_POOL_SIZE   (5)
61526 +
61527 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61528 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
61529 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
61530 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
61531 +
61532 +/* we are supporting reservation of disk space on uid basis */
61533 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
61534 +/* we are supporting reservation of disk space for groups */
61535 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
61536 +/* we are supporting reservation of disk space for root */
61537 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
61538 +/* we use rapid flush mode, see flush.c for comments.  */
61539 +#define REISER4_USE_RAPID_FLUSH (1)
61540 +
61541 +/*
61542 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
61543 + */
61544 +#define REISER4_USE_ENTD (1)
61545 +
61546 +/* key allocation is Plan-A */
61547 +#define REISER4_PLANA_KEY_ALLOCATION (1)
61548 +/* key allocation follows good old 3.x scheme */
61549 +#define REISER4_3_5_KEY_ALLOCATION (0)
61550 +
61551 +/* size of hash-table for znodes */
61552 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
61553 +
61554 +/* number of buckets in lnode hash-table */
61555 +#define LNODE_HTABLE_BUCKETS (1024)
61556 +
61557 +/* some ridiculously high maximal limit on height of znode tree. This
61558 +    is used in declaration of various per level arrays and
61559 +    to allocate statistics gathering array for per-level stats. */
61560 +#define REISER4_MAX_ZTREE_HEIGHT     (8)
61561 +
61562 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
61563 +
61564 +/* If array contains less than REISER4_SEQ_SEARCH_BREAK elements then,
61565 +   sequential search is on average faster than binary. This is because
61566 +   of better optimization and because sequential search is more CPU
61567 +   cache friendly. This number (25) was found by experiments on dual AMD
61568 +   Athlon(tm), 1400MHz.
61569 +
61570 +   NOTE: testing in kernel has shown that binary search is more effective than
61571 +   implied by results of the user level benchmarking. Probably because in the
61572 +   node keys are separated by other data. So value was adjusted after few
61573 +   tests. More thorough tuning is needed.
61574 +*/
61575 +#define REISER4_SEQ_SEARCH_BREAK      (3)
61576 +
61577 +/* don't allow tree to be lower than this */
61578 +#define REISER4_MIN_TREE_HEIGHT       (TWIG_LEVEL)
61579 +
61580 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
61581 + * available memory. */
61582 +/* Default value of maximal atom size. Can be overwritten by
61583 +   tmgr.atom_max_size mount option. By default infinity. */
61584 +#define REISER4_ATOM_MAX_SIZE         ((unsigned)(~0))
61585 +
61586 +/* Default value of maximal atom age (in jiffies). After reaching this age
61587 +   atom will be forced to commit, either synchronously or asynchronously. Can
61588 +   be overwritten by tmgr.atom_max_age mount option. */
61589 +#define REISER4_ATOM_MAX_AGE          (600 * HZ)
61590 +
61591 +/* sleeping period for ktxnmrgd */
61592 +#define REISER4_TXNMGR_TIMEOUT  (5 * HZ)
61593 +
61594 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds (integer division: 0 jiffies if HZ < 334). */
61595 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
61596 +
61597 +/* start complaining after that many restarts in coord_by_key().
61598 +
61599 +   This either means incredibly heavy contention for this part of a tree, or
61600 +   some corruption or bug.
61601 +*/
61602 +#define REISER4_CBK_ITERATIONS_LIMIT  (100)
61603 +
61604 +/* return -EIO after that many iterations in coord_by_key().
61605 +
61606 +   I have witnessed more than 800 iterations (in 30 thread test) before cbk
61607 +   finished. --nikita
61608 +*/
61609 +#define REISER4_MAX_CBK_ITERATIONS    500000
61610 +
61611 +/* put a per-inode limit on maximal number of directory entries with identical
61612 +   keys in hashed directory.
61613 +
61614 +   Disable this until inheritance interfaces stabilize: we need some way to
61615 +   set per directory limit.
61616 +*/
61617 +#define REISER4_USE_COLLISION_LIMIT    (0)
61618 +
61619 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
61620 +   will force them to be relocated. */
61621 +#define FLUSH_RELOCATE_THRESHOLD 64
61622 +/* If flush can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
61623 +   from the preceder it will relocate to that position. */
61624 +#define FLUSH_RELOCATE_DISTANCE  64
61625 +
61626 +/* If we have written this much or more blocks before encountering busy jnode
61627 +   in flush list - abort flushing hoping that next time we get called
61628 +   this jnode will be clean already, and we will save some seeks. */
61629 +#define FLUSH_WRITTEN_THRESHOLD 50
61630 +
61631 +/* The maximum number of nodes to scan left on a level during flush. */
61632 +#define FLUSH_SCAN_MAXNODES 10000
61633 +
61634 +/* per-atom limit of flushers */
61635 +#define ATOM_MAX_FLUSHERS (1)
61636 +
61637 +/* default tracing buffer size */
61638 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
61639 +
61640 +/* what size units of IO we would like cp, etc., to use, in writing to
61641 +   reiser4. In bytes.
61642 +
61643 +   Can be overwritten by optimal_io_size mount option.
61644 +*/
61645 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
61646 +
61647 +/* see comments in inode.c:oid_to_uino() */
61648 +#define REISER4_UINO_SHIFT (1 << 30)
61649 +
61650 +/* Mark function argument as unused to avoid compiler warnings. */
61651 +#define UNUSED_ARG __attribute__((unused))
61652 +
61653 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
61654 +#define NONNULL __attribute__((nonnull))
61655 +#else
61656 +#define NONNULL
61657 +#endif
61658 +
61659 +/* master super block offset in bytes.*/
61660 +#define REISER4_MASTER_OFFSET 65536
61661 +
61662 +/* size of VFS block */
61663 +#define VFS_BLKSIZE 512
61664 +/* number of bits in size of VFS block (512==2^9) */
61665 +#define VFS_BLKSIZE_BITS 9
61666 +
61667 +#define REISER4_I reiser4_inode_data
61668 +
61669 +/* implication */
61670 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
61671 +/* logical equivalence */
61672 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
61673 +
61674 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
61675 +
61676 +#define NOT_YET                       (0)
61677 +
61678 +/** Reiser4 specific error codes **/
61679 +
61680 +#define REISER4_ERROR_CODE_BASE 10000
61681 +
61682 +/* Neighbor is not available (side neighbor or parent) */
61683 +#define E_NO_NEIGHBOR  (REISER4_ERROR_CODE_BASE)
61684 +
61685 +/* Node was not found in cache */
61686 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
61687 +
61688 +/* node has no free space enough for completion of balancing operation */
61689 +#define E_NODE_FULL    (REISER4_ERROR_CODE_BASE + 2)
61690 +
61691 +/* repeat operation */
61692 +#define E_REPEAT       (REISER4_ERROR_CODE_BASE + 3)
61693 +
61694 +/* deadlock happens */
61695 +#define E_DEADLOCK     (REISER4_ERROR_CODE_BASE + 4)
61696 +
61697 +/* operation cannot be performed, because it would block and non-blocking mode
61698 + * was requested. */
61699 +#define E_BLOCK        (REISER4_ERROR_CODE_BASE + 5)
61700 +
61701 +/* wait some event (depends on context), then repeat */
61702 +#define E_WAIT         (REISER4_ERROR_CODE_BASE + 6)
61703 +
61704 +#endif                         /* __REISER4_H__ */
61705 +
61706 +
61707 +/* Make Linus happy.
61708 +   Local variables:
61709 +   c-indentation-style: "K&R"
61710 +   mode-name: "LC"
61711 +   c-basic-offset: 8
61712 +   tab-width: 8
61713 +   fill-column: 120
61714 +   End:
61715 +*/
61716 diff -urN linux-2.6.27.orig/fs/reiser4/safe_link.c linux-2.6.27/fs/reiser4/safe_link.c
61717 --- linux-2.6.27.orig/fs/reiser4/safe_link.c    1970-01-01 03:00:00.000000000 +0300
61718 +++ linux-2.6.27/fs/reiser4/safe_link.c 2008-10-12 18:20:01.000000000 +0400
61719 @@ -0,0 +1,352 @@
61720 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
61721 + * reiser4/README */
61722 +
61723 +/* Safe-links. */
61724 +
61725 +/*
61726 + * Safe-links are used to maintain file system consistency during operations
61727 + * that spawn multiple transactions. For example:
61728 + *
61729 + *     1. Unlink. UNIX supports "open-but-unlinked" files, that is files
61730 + *     without user-visible names in the file system, but still opened by some
61731 + *     active process. What happens here is that unlink proper (i.e., removal
61732 + *     of the last file name) and file deletion (truncate of file body to zero
61733 + *     and deletion of stat-data, that happens when last file descriptor is
61734 + *     closed), may belong to different transactions T1 and T2. If a crash
61735 + *     happens after T1 commit, but before T2 commit, on-disk file system has
61736 + *     a file without name, that is, disk space leak.
61737 + *
61738 + *     2. Truncate. Truncate of large file may spawn multiple transactions. If
61739 + *     system crashes while truncate was in-progress, file is left partially
61740 + *     truncated, which violates "atomicity guarantees" of reiser4, viz. that
61741 + *     every system call is atomic.
61742 + *
61743 + * Safe-links address both above cases. Basically, safe-link is a way to post
61744 + * some operation to be executed during commit of some other transaction than
61745 + * current one. (Another way to look at the safe-link is to interpret it as a
61746 + * logical logging.)
61747 + *
61748 + * Specifically, at the beginning of unlink safe-link is inserted in the
61749 + * tree. This safe-link is normally removed by file deletion code (during
61750 + * transaction T2 in the above terms). Truncate also inserts safe-link that is
61751 + * normally removed when truncate operation is finished.
61752 + *
61753 + * This means, that in the case of "clean umount" there are no safe-links in
61754 + * the tree. If safe-links are observed during mount, it means that (a) system
61755 + * was terminated abnormally, and (b) safe-links correspond to the "pending"
61756 + * (i.e., not finished) operations that were in-progress during system
61757 + * termination. Each safe-link records enough information to complete
61758 + * corresponding operation, and mount simply "replays" them (hence, the
61759 + * analogy with the logical logging).
61760 + *
61761 + * Safe-links are implemented as blackbox items (see
61762 + * plugin/item/blackbox.[ch]).
61763 + *
61764 + * For the reference: ext3 also has similar mechanism, it's called "an orphan
61765 + * list" there.
61766 + */
61767 +
61768 +#include "safe_link.h"
61769 +#include "debug.h"
61770 +#include "inode.h"
61771 +
61772 +#include "plugin/item/blackbox.h"
61773 +
61774 +#include <linux/fs.h>
61775 +
61776 +/*
61777 + * On-disk format of safe-link.
61778 + */
61779 +typedef struct safelink {
61780 +       reiser4_key sdkey;      /* key of stat-data for the file safe-link is
61781 +                                * for */
61782 +       d64 size;               /* size to which file should be truncated */
61783 +} safelink_t;
61784 +
61785 +/*
61786 + * locality where safe-link items are stored. Next to the objectid of root
61787 + * directory.
61788 + */
61789 +static oid_t safe_link_locality(reiser4_tree * tree)
61790 +{
61791 +       return get_key_objectid(get_super_private(tree->super)->df_plug->
61792 +                               root_dir_key(tree->super)) + 1;
61793 +}
61794 +
61795 +/*
61796 +  Construct a key for the safe-link. Key has the following format:
61797 +
61798 +|        60     | 4 |        64        | 4 |      60       |         64       |
61799 ++---------------+---+------------------+---+---------------+------------------+
61800 +|   locality    | 0 |        0         | 0 |   objectid    |     link type    |
61801 ++---------------+---+------------------+---+---------------+------------------+
61802 +|                   |                  |                   |                  |
61803 +|     8 bytes       |     8 bytes      |      8 bytes      |      8 bytes     |
61804 +
61805 +   This is in large keys format. In small keys format second 8 byte chunk is
61806 +   out. Locality is a constant returned by safe_link_locality(). objectid is
61807 +   an oid of a file on which operation protected by this safe-link is
61808 +   performed. link-type is used to distinguish safe-links for different
61809 +   operations.
61810 +
61811 + */
61812 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
61813 +                                  reiser4_safe_link_t link, reiser4_key * key)
61814 +{
61815 +       reiser4_key_init(key);
61816 +       set_key_locality(key, safe_link_locality(tree));
61817 +       set_key_objectid(key, oid);
61818 +       set_key_offset(key, link);
61819 +       return key;
61820 +}
61821 +
61822 +/*
61823 + * how much disk space is necessary to insert and later remove (in the
61824 + * error-handling path) a safe-link. Sum of the estimates listed below.
61825 + */
61826 +static __u64 safe_link_tograb(reiser4_tree * tree)
61827 +{
61828 +       return
61829 +           /* insert safe link */
61830 +           estimate_one_insert_item(tree) +
61831 +           /* remove safe link */
61832 +           estimate_one_item_removal(tree) +
61833 +           /* drill to the leaf level during insertion */
61834 +           1 + estimate_one_insert_item(tree) +
61835 +           /*
61836 +            * possible update of existing safe-link. Actually, if
61837 +            * safe-link existed already (we failed to remove it), then no
61838 +            * insertion is necessary, so this term is already "covered",
61839 +            * but for simplicity let's leave it.
61840 +            */
61841 +           1;
61842 +}
61843 +
61844 +/*
61845 + * grab enough disk space to insert and remove (in the error-handling path)
61846 + * safe-link.
61847 + */
61848 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
61849 +{
61850 +       int result;
61851 +
61852 +       grab_space_enable();
61853 +       /* The sbinfo->delete_mutex can be taken here.
61854 +        * safe_link_release() should be called before leaving reiser4
61855 +        * context. */
61856 +       result =
61857 +           reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
61858 +       grab_space_enable();
61859 +       return result;
61860 +}
61861 +
61862 +/*
61863 + * release unused disk space reserved by safe_link_grab().
61864 + */
61865 +void safe_link_release(reiser4_tree * tree)
61866 +{
61867 +       reiser4_release_reserved(tree->super);
61868 +}
61869 +
61870 +/*
61871 + * insert into tree safe-link for operation @link on inode @inode.
61872 + */
61873 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
61874 +{
61875 +       reiser4_key key;
61876 +       safelink_t sl;
61877 +       int length;
61878 +       int result;
61879 +       reiser4_tree *tree;
61880 +
61881 +       build_sd_key(inode, &sl.sdkey);
61882 +       length = sizeof sl.sdkey;
61883 +
61884 +       if (link == SAFE_TRUNCATE) {
61885 +               /*
61886 +                * for truncate we have to store final file length also,
61887 +                * expand item.
61888 +                */
61889 +               length += sizeof(sl.size);
61890 +               put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
61891 +       }
61892 +       tree = reiser4_tree_by_inode(inode);
61893 +       build_link_key(tree, get_inode_oid(inode), link, &key);
61894 +
61895 +       result = store_black_box(tree, &key, &sl, length);
61896 +       if (result == -EEXIST)
61897 +               result = update_black_box(tree, &key, &sl, length);
61898 +       return result;
61899 +}
61900 +
61901 +/*
61902 + * remove from @tree the safe-link for operation @link on the object whose
61903 + * oid is @oid. Returns whatever kill_black_box() returns.
61904 + */
61905 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
61906 +{
61907 +       reiser4_key key;
61908 +
61909 +       return kill_black_box(tree, build_link_key(tree, oid, link, &key));
61910 +}
61911 +
61912 +/*
61913 + * in-memory structure to keep information extracted from safe-link. This is
61914 + * used to iterate over all safe-links.
61915 + */
61916 +struct safe_link_context {
61917 +       reiser4_tree *tree;     /* internal tree */
61918 +       reiser4_key key;        /* safe-link key */
61919 +       reiser4_key sdkey;      /* key of object stat-data */
61920 +       reiser4_safe_link_t link;       /* safe-link type */
61921 +       oid_t oid;              /* object oid */
61922 +       __u64 size;             /* final size for truncate */
61923 +};
61924 +
61925 +/*
61926 + * start iterating over all safe-links.
61927 + */
61928 +static void safe_link_iter_begin(reiser4_tree * tree,
61929 +                                struct safe_link_context * ctx)
61930 +{
61931 +       ctx->tree = tree;
61932 +       reiser4_key_init(&ctx->key);
61933 +       set_key_locality(&ctx->key, safe_link_locality(tree));
61934 +       set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
61935 +       set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
61936 +}
61937 +
61938 +/*
61939 + * return next safe-link.
61940 + */
61941 +static int safe_link_iter_next(struct safe_link_context * ctx)
61942 +{
61943 +       int result;
61944 +       safelink_t sl;
61945 +
61946 +       result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
61947 +       if (result == 0) {
61948 +               ctx->oid = get_key_objectid(&ctx->key);
61949 +               ctx->link = get_key_offset(&ctx->key);
61950 +               ctx->sdkey = sl.sdkey;
61951 +               if (ctx->link == SAFE_TRUNCATE)
61952 +                       ctx->size = le64_to_cpu(get_unaligned(&sl.size));
61953 +       }
61954 +       return result;
61955 +}
61956 +
61957 +/*
61958 + * true once the locality of @ctx->key is no longer the safe-link locality.
61959 + */
61960 +static int safe_link_iter_finished(struct safe_link_context * ctx)
61961 +{
61962 +       return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
61963 +}
61964 +
61965 +/*
61966 + * finish safe-link iteration.
61967 + */
61968 +static void safe_link_iter_end(struct safe_link_context * ctx)
61969 +{
61970 +       /* nothing special */
61971 +}
61972 +
61973 +/* process single safe-link @link for the object with stat-data key @sdkey
61974 + * and oid @oid; @size is handed to ->safelink(). Returns 0 or an error code.
61975 + */
61976 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
61977 +                           reiser4_key * sdkey, oid_t oid, __u64 size)
61978 +{
61979 +       struct inode *inode;
61980 +       int result;
61981 +
61982 +       /*
61983 +        * obtain object inode by reiser4_iget(), then call object plugin
61984 +        * ->safelink() method to do actual work, then delete safe-link on
61985 +        * success.
61986 +        */
61987 +       inode = reiser4_iget(super, sdkey, 1);
61988 +       if (!IS_ERR(inode)) {
61989 +               file_plugin *fplug;
61990 +
61991 +               fplug = inode_file_plugin(inode);
61992 +               assert("nikita-3428", fplug != NULL);
61993 +               assert("", oid == get_inode_oid(inode));
61994 +               if (fplug->safelink != NULL) {
61995 +                       /* reiser4_txn_restart_current is not necessary because
61996 +                        * mounting is single-threaded. However, without it
61997 +                        * deadlock detection code will complain (see
61998 +                        * nikita-3361). */
61999 +                       reiser4_txn_restart_current();
62000 +                       result = fplug->safelink(inode, link, size);
62001 +               } else {
62002 +                       warning("nikita-3430",
62003 +                               "Cannot handle safelink for %llu",
62004 +                               (unsigned long long)oid);
62005 +                       reiser4_print_key("key", sdkey);
62006 +                       result = 0;
62007 +               }
62008 +               if (result != 0) {
62009 +                       warning("nikita-3431",
62010 +                               "Error processing safelink for %llu: %i",
62011 +                               (unsigned long long)oid, result);
62012 +               }
62013 +               reiser4_iget_complete(inode);
62014 +               iput(inode);
62015 +               if (result == 0) {
62016 +                       result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
62017 +                       if (result == 0)
62018 +                               result =
62019 +                                   safe_link_del(reiser4_get_tree(super), oid, link);
62020 +                       safe_link_release(reiser4_get_tree(super));
62021 +                       /*
62022 +                        * restart transaction: if there was large number of
62023 +                        * safe-links, their processing may fail to fit into
62024 +                        * single transaction.
62025 +                        */
62026 +                       if (result == 0)
62027 +                               reiser4_txn_restart_current();
62028 +               }
62029 +       } else
62030 +               result = PTR_ERR(inode);
62031 +       return result;
62032 +}
62033 +
62034 +/*
62035 + * iterate over all safe-links in the file-system processing them one by one.
62036 + */
62037 +int process_safelinks(struct super_block *super)
62038 +{
62039 +       struct safe_link_context ctx;
62040 +       int result;
62041 +
62042 +       if (rofs_super(super))
62043 +               /* do nothing on the read-only file system */
62044 +               return 0;
62045 +       safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
62046 +       result = 0;
62047 +       do {
62048 +               result = safe_link_iter_next(&ctx);
62049 +               if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
62050 +                       result = 0;
62051 +                       break;
62052 +               }
62053 +               if (result == 0)
62054 +                       result = process_safelink(super, ctx.link,
62055 +                                                 &ctx.sdkey, ctx.oid,
62056 +                                                 ctx.size);
62057 +       } while (result == 0);
62058 +       safe_link_iter_end(&ctx);
62059 +       return result;
62060 +}
62061 +
62062 +/* Make Linus happy.
62063 +   Local variables:
62064 +   c-indentation-style: "K&R"
62065 +   mode-name: "LC"
62066 +   c-basic-offset: 8
62067 +   tab-width: 8
62068 +   fill-column: 120
62069 +   scroll-step: 1
62070 +   End:
62071 +*/
62072 diff -urN linux-2.6.27.orig/fs/reiser4/safe_link.h linux-2.6.27/fs/reiser4/safe_link.h
62073 --- linux-2.6.27.orig/fs/reiser4/safe_link.h    1970-01-01 03:00:00.000000000 +0300
62074 +++ linux-2.6.27/fs/reiser4/safe_link.h 2008-10-12 18:20:01.000000000 +0400
62075 @@ -0,0 +1,29 @@
62076 +/* Copyright 2003 by Hans Reiser, licensing governed by
62077 + * reiser4/README */
62078 +
62079 +/* Safe-links. See safe_link.c for details. */
62080 +
62081 +#if !defined( __FS_SAFE_LINK_H__ )
62082 +#define __FS_SAFE_LINK_H__
62083 +
62084 +#include "tree.h"
62085 +
62086 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
62087 +void safe_link_release(reiser4_tree * tree);
62088 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
62089 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
62090 +
62091 +int process_safelinks(struct super_block *super);
62092 +
62093 +/* __FS_SAFE_LINK_H__ */
62094 +#endif
62095 +
62096 +/* Make Linus happy.
62097 +   Local variables:
62098 +   c-indentation-style: "K&R"
62099 +   mode-name: "LC"
62100 +   c-basic-offset: 8
62101 +   tab-width: 8
62102 +   fill-column: 120
62103 +   End:
62104 +*/
62105 diff -urN linux-2.6.27.orig/fs/reiser4/seal.c linux-2.6.27/fs/reiser4/seal.c
62106 --- linux-2.6.27.orig/fs/reiser4/seal.c 1970-01-01 03:00:00.000000000 +0300
62107 +++ linux-2.6.27/fs/reiser4/seal.c      2008-10-12 18:20:01.000000000 +0400
62108 @@ -0,0 +1,218 @@
62109 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62110 +/* Seals implementation. */
62111 +/* Seals are "weak" tree pointers. They are analogous to tree coords in
62112 +   allowing to bypass tree traversal. But normal usage of coords implies that
62113 +   node pointed to by coord is locked, whereas seals don't keep a lock (or
62114 +   even a reference) to znode. Instead, each znode contains a version number,
62115 +   increased on each znode modification. This version number is copied into a
62116 +   seal when seal is created. Later, one can "validate" seal by calling
62117 +   reiser4_seal_validate(). If znode is in cache and its version number is
62118 +   still the same, seal is "pristine" and coord associated with it can be
62119 +   re-used immediately.
62120 +
62121 +   If, on the other hand, znode is out of cache, or it is obviously different
62122 +   one from the znode seal was initially attached to (for example, it is on
62123 +   the different level, or is being removed from the tree), seal is
62124 +   irreparably invalid ("burned") and tree traversal has to be repeated.
62125 +
62126 +   Otherwise, there is some hope, that while znode was modified (and seal was
62127 +   "broken" as a result), key attached to the seal is still in the node. This
62128 +   is checked by first comparing this key with delimiting keys of node and, if
62129 +   key is ok, doing intra-node lookup.
62130 +
62131 +   Znode version is maintained in the following way:
62132 +
62133 +   there is reiser4_tree.znode_epoch counter. Whenever new znode is created,
62134 +   znode_epoch is incremented and its new value is stored in ->version field
62135 +   of new znode. Whenever znode is dirtied (which means it was probably
62136 +   modified), znode_epoch is also incremented and its new value is stored in
62137 +   znode->version. This is done so, because just incrementing znode->version
62138 +   on each update is not enough: it may so happen, that znode get deleted, new
62139 +   znode is allocated for the same disk block and gets the same version
62140 +   counter, tricking seal code into false positive.
62141 +*/
62142 +
62143 +#include "forward.h"
62144 +#include "debug.h"
62145 +#include "key.h"
62146 +#include "coord.h"
62147 +#include "seal.h"
62148 +#include "plugin/item/item.h"
62149 +#include "plugin/node/node.h"
62150 +#include "jnode.h"
62151 +#include "znode.h"
62152 +#include "super.h"
62153 +
62154 +static znode *seal_node(const seal_t * seal);
62155 +static int seal_matches(const seal_t * seal, znode * node);
62156 +
62157 +/* initialise seal. This can be called several times on the same seal. @coord
62158 +   and @key can be NULL.  */
62159 +void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
62160 +                      const coord_t * coord /* coord @seal will be
62161 +                                             * attached to */ ,
62162 +                      const reiser4_key * key UNUSED_ARG /* key @seal will be
62163 +                                                          * attached to */ )
62164 +{
62165 +       assert("nikita-1886", seal != NULL);
62166 +       memset(seal, 0, sizeof *seal);
62167 +       if (coord != NULL) {
62168 +               znode *node;
62169 +
62170 +               node = coord->node;
62171 +               assert("nikita-1987", node != NULL);
62172 +               spin_lock_znode(node);
62173 +               seal->version = node->version;
62174 +               assert("nikita-1988", seal->version != 0);
62175 +               seal->block = *znode_get_block(node);
62176 +#if REISER4_DEBUG
62177 +               seal->coord1 = *coord;
62178 +               if (key != NULL)
62179 +                       seal->key = *key;
62180 +#endif
62181 +               spin_unlock_znode(node);
62182 +       }
62183 +}
62184 +
62185 +/* finish with seal */
62186 +void reiser4_seal_done(seal_t * seal /* seal to clear */ )
62187 +{
62188 +       assert("nikita-1887", seal != NULL);
62189 +       seal->version = 0;
62190 +}
62191 +
62192 +/* true if seal was initialised */
62193 +int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
62194 +{
62195 +       assert("nikita-1890", seal != NULL);
62196 +       return seal->version != 0;
62197 +}
62198 +
62199 +#if REISER4_DEBUG
62200 +/* helper function for reiser4_seal_validate(). It checks that item at @coord
62201 + * has expected key. This is to detect cases where node was modified but wasn't
62202 + * marked dirty. */
62203 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
62204 +                                  const reiser4_key * k /* expected key */ )
62205 +{
62206 +       reiser4_key ukey;
62207 +
62208 +       return (coord->between != AT_UNIT) ||
62209 +           /* FIXME-VS: we only can compare keys for items whose units
62210 +              represent exactly one key */
62211 +           ((coord_is_existing_unit(coord))
62212 +            && (item_is_extent(coord)
62213 +                || keyeq(k, unit_key_by_coord(coord, &ukey))))
62214 +           || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
62215 +               && keyge(k, unit_key_by_coord(coord, &ukey)));
62216 +}
62217 +#endif
62218 +
62219 +/* this is used by reiser4_seal_validate. It accepts return value of
62220 + * longterm_lock_znode and returns 1 if it can be interpreted as seal
62221 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
62222 + * reiser4_seal_validate returns -E_REPEAT and caller will redo the tree search.
62223 + * We cannot do this in longterm_lock_znode(), because sometimes we want to
62224 + * distinguish between -EINVAL and -E_REPEAT. */
62225 +static int should_repeat(int return_code)
62226 +{
62227 +       return return_code == -EINVAL;
62228 +}
62229 +
62230 +/* (re-)validate seal.
62231 +
62232 +   Checks whether seal is pristine, and tries to revalidate it if possible.
62233 +
62234 +   Returns 0, with sealed node locked through @lh, if seal is valid. If seal
62235 +   was burned, or broken irreparably, returns -E_REPEAT.
62236 +
62237 +   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if key we are
62238 +   looking for is in range of keys covered by the sealed node, but item wasn't
62239 +   found by node ->lookup() method. Alternative is to return -ENOENT in this
62240 +   case, but this would complicate callers' logic.
62241 +*/
62242 +int reiser4_seal_validate(seal_t * seal /* seal to validate */,
62243 +                         coord_t * coord /* coord to validate against */,
62244 +                         const reiser4_key * key /* key to validate against */,
62245 +                         lock_handle * lh /* resulting lock handle */,
62246 +                         znode_lock_mode mode /* lock mode */,
62247 +                         znode_lock_request request /* locking priority */)
62248 +{
62249 +       znode *node;
62250 +       int result;
62251 +
62252 +       assert("nikita-1889", seal != NULL);
62253 +       assert("nikita-1881", reiser4_seal_is_set(seal));
62254 +       assert("nikita-1882", key != NULL);
62255 +       assert("nikita-1883", coord != NULL);
62256 +       assert("nikita-1884", lh != NULL);
62257 +       assert("nikita-1885", keyeq(&seal->key, key));
62258 +       assert("nikita-1989", coords_equal(&seal->coord1, coord));
62259 +
62260 +       /* obtain znode by block number */
62261 +       node = seal_node(seal);
62262 +       if (node != NULL) {
62263 +               /* znode was in cache, lock it */
62264 +               result = longterm_lock_znode(lh, node, mode, request);
62265 +               zput(node);     /* drop reference obtained by seal_node() */
62266 +               if (result == 0) {
62267 +                       if (seal_matches(seal, node)) {
62268 +                               /* if seal version and znode version
62269 +                                  coincide */
62270 +                               ON_DEBUG(coord_update_v(coord));
62271 +                               assert("nikita-1990",
62272 +                                      node == seal->coord1.node);
62273 +                               assert("nikita-1898",
62274 +                                      WITH_DATA_RET(coord->node, 1,
62275 +                                                    check_seal_match(coord,
62276 +                                                                     key)));
62277 +                       } else
62278 +                               result = RETERR(-E_REPEAT);     /* version mismatch */
62279 +               }
62280 +               if (result != 0) {
62281 +                       if (should_repeat(result))      /* -EINVAL from locking */
62282 +                               result = RETERR(-E_REPEAT);
62283 +                       /* unlock node on failure */
62284 +                       done_lh(lh);
62285 +               }
62286 +       } else {
62287 +               /* znode wasn't in cache */
62288 +               result = RETERR(-E_REPEAT);
62289 +       }
62290 +       return result;
62291 +}
62292 +
62293 +/* helper functions */
62294 +
62295 +/* obtain reference to the znode @seal points to, if it is in cache */
62296 +static znode *seal_node(const seal_t * seal /* seal to query */ )
62297 +{
62298 +       assert("nikita-1891", seal != NULL);
62299 +       return zlook(current_tree, &seal->block);       /* lookup by block number */
62300 +}
62301 +
62302 +/* compare version recorded in @seal with current version of @node */
62303 +static int seal_matches(const seal_t * seal /* seal to check */ ,
62304 +                       znode * node /* node to check */ )
62305 +{
62306 +       int same_version;
62307 +
62308 +       assert("nikita-1991", seal != NULL);
62309 +       assert("nikita-1993", node != NULL);
62310 +
62311 +       spin_lock_znode(node);  /* ->version is read under znode spin lock */
62312 +       same_version = (node->version == seal->version);
62313 +       spin_unlock_znode(node);
62314 +       return same_version;
62315 +}
62316 +
62317 +/* Make Linus happy.
62318 +   Local variables:
62319 +   c-indentation-style: "K&R"
62320 +   mode-name: "LC"
62321 +   c-basic-offset: 8
62322 +   tab-width: 8
62323 +   fill-column: 120
62324 +   scroll-step: 1
62325 +   End:
62326 +*/
62327 diff -urN linux-2.6.27.orig/fs/reiser4/seal.h linux-2.6.27/fs/reiser4/seal.h
62328 --- linux-2.6.27.orig/fs/reiser4/seal.h 1970-01-01 03:00:00.000000000 +0300
62329 +++ linux-2.6.27/fs/reiser4/seal.h      2008-10-12 18:20:01.000000000 +0400
62330 @@ -0,0 +1,49 @@
62331 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62332 +
62333 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
62334 +
62335 +#ifndef __SEAL_H__
62336 +#define __SEAL_H__
62337 +
62338 +#include "forward.h"
62339 +#include "debug.h"
62340 +#include "dformat.h"
62341 +#include "key.h"
62342 +#include "coord.h"
62343 +
62344 +/* for __u?? types */
62345 +/*#include <linux/types.h>*/
62346 +
62347 +/* seal. See comment at the top of seal.c */
62348 +typedef struct seal_s {
62349 +       /* version of znode recorded at the time of seal creation */
62350 +       __u64 version;
62351 +       /* block number of znode attached to this seal */
62352 +       reiser4_block_nr block;
62353 +#if REISER4_DEBUG
62354 +       /* coord this seal is attached to. For debugging. */
62355 +       coord_t coord1;
62356 +       /* key this seal is attached to. For debugging. */
62357 +       reiser4_key key;
62358 +#endif
62359 +} seal_t;
62360 +
62361 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
62362 +extern void reiser4_seal_done(seal_t *);
62363 +extern int reiser4_seal_is_set(const seal_t *);
62364 +extern int reiser4_seal_validate(seal_t *, coord_t *,
62365 +                        const reiser4_key *, lock_handle *,
62366 +                        znode_lock_mode mode, znode_lock_request request);
62367 +
62368 +/* __SEAL_H__ */
62369 +#endif
62370 +
62371 +/* Make Linus happy.
62372 +   Local variables:
62373 +   c-indentation-style: "K&R"
62374 +   mode-name: "LC"
62375 +   c-basic-offset: 8
62376 +   tab-width: 8
62377 +   fill-column: 120
62378 +   End:
62379 +*/
62380 diff -urN linux-2.6.27.orig/fs/reiser4/search.c linux-2.6.27/fs/reiser4/search.c
62381 --- linux-2.6.27.orig/fs/reiser4/search.c       1970-01-01 03:00:00.000000000 +0300
62382 +++ linux-2.6.27/fs/reiser4/search.c    2008-10-12 18:20:01.000000000 +0400
62383 @@ -0,0 +1,1611 @@
62384 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
62385 + * reiser4/README */
62386 +
62387 +#include "forward.h"
62388 +#include "debug.h"
62389 +#include "dformat.h"
62390 +#include "key.h"
62391 +#include "coord.h"
62392 +#include "seal.h"
62393 +#include "plugin/item/item.h"
62394 +#include "plugin/node/node.h"
62395 +#include "plugin/plugin.h"
62396 +#include "jnode.h"
62397 +#include "znode.h"
62398 +#include "block_alloc.h"
62399 +#include "tree_walk.h"
62400 +#include "tree.h"
62401 +#include "reiser4.h"
62402 +#include "super.h"
62403 +#include "inode.h"
62404 +
62405 +#include <linux/slab.h>
62406 +
62407 +static const char *bias_name(lookup_bias bias);
62408 +
62409 +/* tree searching algorithm, intranode searching algorithms are in
62410 +   plugin/node/ */
62411 +
62412 +/* tree lookup cache
62413 + *
62414 + * The coord by key cache consists of small list of recently accessed nodes
62415 + * maintained according to the LRU discipline. Before doing real top-to-down
62416 + * tree traversal this cache is scanned for nodes that can contain key
62417 + * requested.
62418 + *
62419 + * The efficiency of coord cache depends heavily on locality of reference for
62420 + * tree accesses. Our user level simulations show reasonably good hit ratios
62421 + * for coord cache under most loads so far.
62422 + */
62423 +
62424 +/* Reset @slot: no cached node, LRU linkage is an empty list head */
62425 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
62426 +{
62427 +       assert("nikita-345", slot != NULL);
62428 +
62429 +       slot->node = NULL;
62430 +       INIT_LIST_HEAD(&slot->lru);
62431 +}
62432 +
62433 +/* Allocate and initialize the slots of coord cache @cache. ->nr_slots has to
62434 + * be set by caller. Returns 0, or RETERR(-ENOMEM) if allocation fails. */
62435 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
62436 +{
62437 +       int i;
62438 +
62439 +       assert("nikita-346", cache != NULL);
62440 +       /* kcalloc() checks nr_slots * sizeof(slot) for overflow */
62441 +       cache->slot = kcalloc(cache->nr_slots, sizeof(cbk_cache_slot),
62442 +                             reiser4_ctx_gfp_mask_get());
62443 +       if (cache->slot == NULL)
62444 +               return RETERR(-ENOMEM);
62445 +
62446 +       INIT_LIST_HEAD(&cache->lru);
62447 +       for (i = 0; i < cache->nr_slots; ++i) {
62448 +               cbk_cache_init_slot(cache->slot + i);
62449 +               list_add_tail(&((cache->slot + i)->lru), &cache->lru);
62450 +       }
62451 +       rwlock_init(&cache->guard);
62452 +       return 0;
62453 +}
62454 +
62455 +/* release memory allocated for slots of @cache, if any */
62456 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
62457 +{
62458 +       assert("nikita-2493", cache != NULL);
62459 +       if (cache->slot == NULL)
62460 +               return;
62461 +       kfree(cache->slot);
62462 +       cache->slot = NULL;
62463 +}
62464 +
62465 +/* iterate @slot over all slots of cbk @cache, following its LRU list */
62466 +#define for_all_slots(cache, slot)                                             \
62467 +       for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru);       \
62468 +            &(cache)->lru != &(slot)->lru;                                     \
62469 +            (slot) = list_entry((slot)->lru.next, cbk_cache_slot, lru))
62470 +
62471 +#if REISER4_DEBUG
62472 +/* check [cbk-cache-invariant] on @cache; returns 1 if it holds, 0 if not */
62473 +static int cbk_cache_invariant(const cbk_cache *cache)
62474 +{
62475 +       cbk_cache_slot *slot;
62476 +       int result;
62477 +       int unused;
62478 +
62479 +       /* validate @cache before it is dereferenced for the first time */
62480 +       assert("nikita-2469", cache != NULL);
62481 +       if (cache->nr_slots == 0)
62482 +               return 1;
62483 +       unused = 0;
62484 +       result = 1;
62485 +       read_lock(&((cbk_cache *)cache)->guard);
62486 +       for_all_slots(cache, slot) {
62487 +               /* in LRU first go all `used' slots followed by `unused' */
62488 +               if (unused && (slot->node != NULL))
62489 +                       result = 0;
62490 +               if (slot->node == NULL)
62491 +                       unused = 1;
62492 +               else {
62493 +                       cbk_cache_slot *scan;
62494 +
62495 +                       /* all cached nodes are different */
62496 +                       scan = slot;
62497 +                       while (result) {
62498 +                               scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
62499 +                               if (&cache->lru == &scan->lru)
62500 +                                       break;
62501 +                               if (slot->node == scan->node)
62502 +                                       result = 0;
62503 +                       }
62504 +               }
62505 +               if (!result)
62506 +                       break;
62507 +       }
62508 +       read_unlock(&((cbk_cache *)cache)->guard);
62509 +       return result;
62510 +}
62511 +
62512 +#endif
62513 +
62514 +/* Drop @node from the coord cache of @tree, if it is cached there */
62515 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
62516 +                         reiser4_tree * tree /* tree to remove node from */ )
62517 +{
62518 +       cbk_cache *cbk;
62519 +       int idx;
62520 +
62521 +       assert("nikita-350", node != NULL);
62522 +       assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
62523 +
62524 +       cbk = &tree->cbk_cache;
62525 +       assert("nikita-2470", cbk_cache_invariant(cbk));
62526 +       write_lock(&(cbk->guard));
62527 +       for (idx = 0; idx < cbk->nr_slots; ++idx) {
62528 +               cbk_cache_slot *cur = cbk->slot + idx;
62529 +
62530 +               if (cur->node != node)
62531 +                       continue;
62532 +               list_move_tail(&cur->lru, &cbk->lru);
62533 +               cur->node = NULL;
62534 +               break;
62535 +       }
62536 +       write_unlock(&(cbk->guard));
62537 +       assert("nikita-2471", cbk_cache_invariant(cbk));
62538 +}
62539 +
62540 +/* Make @node the most recently used entry of the cbk-cache of its tree. If
62541 +   @node is already cached, its slot is just moved to the head of LRU list. */
62542 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
62543 +{
62544 +       cbk_cache *cbk;
62545 +       cbk_cache_slot *target;
62546 +       int idx;
62547 +
62548 +       assert("nikita-352", node != NULL);
62549 +
62550 +       cbk = &znode_get_tree(node)->cbk_cache;
62551 +       assert("nikita-2472", cbk_cache_invariant(cbk));
62552 +
62553 +       if (cbk->nr_slots == 0)
62554 +               return;
62555 +
62556 +       write_lock(&(cbk->guard));
62557 +       /* by default take slot at the tail of LRU list */
62558 +       target = list_entry(cbk->lru.prev, cbk_cache_slot, lru);
62559 +       for (idx = 0; idx < cbk->nr_slots; ++idx) {
62560 +               /* @node is already in the cache: use its slot instead */
62561 +               if (cbk->slot[idx].node == node) {
62562 +                       target = cbk->slot + idx;
62563 +                       break;
62564 +               }
62565 +       }
62566 +       target->node = (znode *) node;
62567 +       list_move(&target->lru, &cbk->lru);
62568 +       write_unlock(&(cbk->guard));
62569 +       assert("nikita-2473", cbk_cache_invariant(cbk));
62570 +}
62571 +}
62572 +
62573 +static int setup_delimiting_keys(cbk_handle * h);
62574 +static lookup_result coord_by_handle(cbk_handle * handle);
62575 +static lookup_result traverse_tree(cbk_handle * h);
62576 +static int cbk_cache_search(cbk_handle * h);
62577 +
62578 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
62579 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
62580 +
62581 +/* helper functions */
62582 +
62583 +static void update_stale_dk(reiser4_tree * tree, znode * node);
62584 +
62585 +/* release parent node during traversal */
62586 +static void put_parent(cbk_handle * h);
62587 +/* check consistency of fields */
62588 +static int sanity_check(cbk_handle * h);
62589 +/* release resources in handle */
62590 +static void hput(cbk_handle * h);
62591 +
62592 +static level_lookup_result search_to_left(cbk_handle * h);
62593 +
62594 +/* Zero @handle and fill it with the arguments of a tree lookup; @handle is
62595 + * returned. Fields not assigned here are left zeroed. */
62596 +static cbk_handle *cbk_pack(cbk_handle * handle,
62597 +                           reiser4_tree * tree,
62598 +                           const reiser4_key * key,
62599 +                           coord_t * coord,
62600 +                           lock_handle * active_lh,
62601 +                           lock_handle * parent_lh,
62602 +                           znode_lock_mode lock_mode,
62603 +                           lookup_bias bias,
62604 +                           tree_level lock_level,
62605 +                           tree_level stop_level,
62606 +                           __u32 flags, ra_info_t * info)
62607 +{
62608 +       memset(handle, 0, sizeof(*handle));
62609 +
62610 +       handle->tree = tree;
62611 +       handle->key = key;
62612 +       handle->coord = coord;
62613 +       handle->bias = bias;
62614 +       handle->lock_mode = lock_mode;
62615 +       handle->lock_level = lock_level;
62616 +       handle->stop_level = stop_level;
62617 +       handle->active_lh = active_lh;
62618 +       handle->parent_lh = parent_lh;
62619 +
62620 +       /* these two flags are always set. See comment in tree.h:cbk_flags */
62621 +       handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
62622 +       handle->ra_info = info;
62623 +       return handle;
62624 +}
62625 +
62626 +/* main tree lookup procedure
62627 +
62628 +   Check coord cache. If key we are looking for is not found there, call
62629 +   traverse_tree() to do real tree traversal.
62630 +
62631 +   As we have extents on the twig level, @lock_level and @stop_level can
62632 +   be different from LEAF_LEVEL and each other.
62633 +
62634 +   Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
62635 +   long term locks) while calling this.
62636 +*/
62637 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
62638 +                                                * in. Usually this tree is
62639 +                                                * part of file-system
62640 +                                                * super-block */ ,
62641 +                          const reiser4_key * key /* key to look for */ ,
62642 +                          coord_t * coord      /* where to store found
62643 +                                                * position in a tree. Fields
62644 +                                                * in "coord" are only valid if
62645 +                                                * coord_by_key() returned
62646 +                                                * "CBK_COORD_FOUND" */ ,
62647 +                          lock_handle * lh,    /* resulting lock handle */
62648 +                          znode_lock_mode lock_mode    /* type of lock we
62649 +                                                        * want on node. Pass
62650 +                                                        * ZNODE_READ_LOCK here
62651 +                                                        * if you only want to
62652 +                                                        * read item found and
62653 +                                                        * ZNODE_WRITE_LOCK if
62654 +                                                        * you want to modify
62655 +                                                        * it */ ,
62656 +                          lookup_bias bias     /* what to return if coord
62657 +                                                * with exactly the @key is
62658 +                                                * not in the tree */ ,
62659 +                          tree_level lock_level        /* tree level where to start
62660 +                                                        * taking @lock_mode type
62661 +                                                        * of locks */ ,
62662 +                          tree_level stop_level        /* tree level to stop. Pass
62663 +                                                        * LEAF_LEVEL or TWIG_LEVEL
62664 +                                                        * here. Item being looked
62665 +                                                        * for has to be between
62666 +                                                        * @lock_level and
62667 +                                                        * @stop_level, inclusive */ ,
62668 +                          __u32 flags /* search flags */ ,
62669 +                          ra_info_t *
62670 +                          info
62671 +                          /* information about desired tree traversal readahead */
62672 +                          )
62673 +{
62674 +       cbk_handle handle;
62675 +       lock_handle parent_lh;
62676 +       lookup_result result;
62677 +
62678 +       init_lh(lh);
62679 +       init_lh(&parent_lh);
62680 +
62681 +       assert("nikita-3023", reiser4_schedulable());
62682 +
62683 +       assert("nikita-353", tree != NULL);
62684 +       assert("nikita-354", key != NULL);
62685 +       assert("nikita-355", coord != NULL);
62686 +       assert("nikita-356", (bias == FIND_EXACT)
62687 +              || (bias == FIND_MAX_NOT_MORE_THAN));
62688 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
62689 +       /* no locks can be held during tree traversal */
62690 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62691 +
62692 +       cbk_pack(&handle,
62693 +                tree,
62694 +                key,
62695 +                coord,
62696 +                lh,
62697 +                &parent_lh,
62698 +                lock_mode, bias, lock_level, stop_level, flags, info);
62699 +
62700 +       result = coord_by_handle(&handle);
62701 +       assert("nikita-3247",
62702 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
62703 +       return result;
62704 +}
62705 +
62706 +/* Same as coord_by_key(), except that @object is recorded in the search
62707 + * handle, so that traversal can start from vroot of @object rather than from
62708 + * tree root. When @object is NULL, current_tree is searched. */
62709 +lookup_result reiser4_object_lookup(struct inode * object,
62710 +                                   const reiser4_key * key,
62711 +                                   coord_t * coord,
62712 +                                   lock_handle * lh,
62713 +                                   znode_lock_mode lock_mode,
62714 +                                   lookup_bias bias,
62715 +                                   tree_level lock_level,
62716 +                                   tree_level stop_level, __u32 flags,
62717 +                                   ra_info_t * info)
62718 +{
62719 +       cbk_handle handle;
62720 +       lock_handle parent_lh;
62721 +       lookup_result result;
62722 +       reiser4_tree *tree;
62723 +
62724 +       init_lh(lh);
62725 +       init_lh(&parent_lh);
62726 +
62727 +       assert("nikita-3023", reiser4_schedulable());
62728 +       assert("nikita-354", key != NULL);
62729 +       assert("nikita-355", coord != NULL);
62730 +       assert("nikita-356", (bias == FIND_EXACT)
62731 +              || (bias == FIND_MAX_NOT_MORE_THAN));
62732 +       assert("nikita-357", stop_level >= LEAF_LEVEL);
62733 +       /* no locks can be held during tree search by key */
62734 +       assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
62735 +
62736 +       if (object != NULL)
62737 +               tree = reiser4_tree_by_inode(object);
62738 +       else
62739 +               tree = current_tree;
62740 +       cbk_pack(&handle, tree, key, coord, lh, &parent_lh,
62741 +                lock_mode, bias, lock_level, stop_level, flags, info);
62742 +       handle.object = object;
62743 +
62744 +       result = coord_by_handle(&handle);
62745 +       assert("nikita-3247",
62746 +              ergo(!IS_CBKERR(result), coord->node == lh->node));
62747 +       return result;
62748 +}
62749 +
62750 +/* Perform the lookup described by @handle. Common part of coord_by_key() and
62751 +   reiser4_object_lookup(). */
62752 +static lookup_result coord_by_handle(cbk_handle * handle)
62753 +{
62754 +       /*
62755 +        * cbk_cache is a look-aside cache of recent lookups for our tree. If
62756 +        * cbk_cache_search() does not return 0, fall back to real traversal
62757 +        * by traverse_tree().
62758 +        */
62759 +       if (cbk_cache_search(handle) != 0)
62760 +               return traverse_tree(handle);
62761 +       /* cache search returned 0: result is taken from the handle */
62762 +       return handle->result;
62763 +}
62764 +
62765 +/* Execute actor for each item (or unit, depending on @through_units_p),
62766 +   starting from @coord, right-ward, until either:
62767 +
62768 +   - end of the tree is reached
62769 +   - unformatted node is met
62770 +   - error occurred
62771 +   - @actor returns 0 or less
62772 +
62773 +   Error code, or last actor return value is returned.
62774 +
62775 +   This is used by plugin/dir/hashe_dir.c:reiser4_find_entry() to move through
62776 +   sequence of entries with identical keys and the like.
62777 +*/
62778 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
62779 +                        coord_t * coord /* coord to start from */ ,
62780 +                        lock_handle * lh /* lock handle to start with and to
62781 +                                          * update along the way */ ,
62782 +                        tree_iterate_actor_t actor /* function to call on each
62783 +                                                    * item/unit */ ,
62784 +                        void *arg /* argument to pass to @actor */ ,
62785 +                        znode_lock_mode mode /* lock mode on scanned nodes */ ,
62786 +                        int through_units_p /* call @actor on each item or on
62787 +                                             * each unit */ )
62788 +{
62789 +       int result;
62790 +
62791 +       assert("nikita-1143", tree != NULL);
62792 +       assert("nikita-1145", coord != NULL);
62793 +       assert("nikita-1146", lh != NULL);
62794 +       assert("nikita-1147", actor != NULL);
62795 +
62796 +       result = zload(coord->node);
62797 +       coord_clear_iplug(coord);
62798 +       if (result != 0)
62799 +               return result;
62800 +       if (!coord_is_existing_unit(coord)) {
62801 +               zrelse(coord->node);
62802 +               return -ENOENT; /* no unit to start from */
62803 +       }
62804 +       while ((result = actor(tree, coord, lh, arg)) > 0) {
62805 +               /* move further  */
62806 +               if ((through_units_p && coord_next_unit(coord)) ||
62807 +                   (!through_units_p && coord_next_item(coord))) {
62808 +                       do {
62809 +                               lock_handle couple;
62810 +
62811 +                               /* move to the next node  */
62812 +                               init_lh(&couple);
62813 +                               result =
62814 +                                   reiser4_get_right_neighbor(&couple,
62815 +                                                              coord->node,
62816 +                                                              (int)mode,
62817 +                                                              GN_CAN_USE_UPPER_LEVELS);
62818 +                               zrelse(coord->node);    /* done with old node */
62819 +                               if (result == 0) {
62820 +
62821 +                                       result = zload(couple.node);
62822 +                                       if (result != 0) {
62823 +                                               done_lh(&couple);
62824 +                                               return result;
62825 +                                       }
62826 +
62827 +                                       coord_init_first_unit(coord,
62828 +                                                             couple.node);
62829 +                                       done_lh(lh);
62830 +                                       move_lh(lh, &couple);   /* @lh now holds new node */
62831 +                               } else
62832 +                                       return result;
62833 +                       } while (node_is_empty(coord->node));   /* skip empty nodes */
62834 +               }
62835 +
62836 +               assert("nikita-1149", coord_is_existing_unit(coord));
62837 +       }
62838 +       zrelse(coord->node);
62839 +       return result;
62840 +}
62841 +
62842 +/* Take a long-term lock of kind @mode on the uber znode of @tree, with
62843 + * priority @pri, through lock handle @lh. Return value of
62844 + * longterm_lock_znode() is passed to the caller unchanged. */
62845 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
62846 +                  znode_lock_request pri, lock_handle * lh)
62847 +{
62848 +       /* uber znode is kept in tree->uber */
62849 +       return longterm_lock_znode(lh, tree->uber, mode, pri);
62850 +}
62851 +
62852 +/* true if @key is strictly within @node: @key is less than ->rd_key of @node
62853 +   and either not less than ->ld_key (@isunique is set), or greater than
62854 +   ->ld_key (@isunique is not set). In the latter case we are looking for
62855 +   possibly non-unique key and its item may be at the edge of @node, or maybe
62856 +   in the neighbor. */
62857 +static int znode_contains_key_strict(znode * node      /* node to check key
62858 +                                                        * against */ ,
62859 +                                    const reiser4_key *
62860 +                                    key /* key to check */ ,
62861 +                                    int isunique)
62862 +{
62863 +       int left_cmp;
62864 +
62865 +       assert("nikita-1760", node != NULL);
62866 +       assert("nikita-1722", key != NULL);
62867 +
62868 +       /* @key is at or beyond ->rd_key: not within @node */
62869 +       if (keyge(key, &node->rd_key))
62870 +               return 0;
62871 +
62872 +       left_cmp = keycmp(&node->ld_key, key);
62873 +       if (!isunique)
62874 +               return left_cmp == LESS_THAN;
62875 +       return left_cmp != GREATER_THAN;
62876 +}
62878 +
62879 +/*
62880 + * Virtual Root (vroot) code.
62881 + *
62882 + *     For given file system object (e.g., regular file or directory) let's
62883 + *     define its "virtual root" as lowest in the tree (that is, farthest
62884 + *     from the tree root) node such that all body items of said object are
62885 + *     located in a tree rooted at this node.
62886 + *
62887 + *     Once vroot of object is found all tree lookups for items within body of
62888 + *     this object ("object lookups") can be started from its vroot rather
62889 + *     than from real root. This has following advantages:
62890 + *
62891 + *         1. amount of nodes traversed during lookup (and, hence, amount of
62892 + *         key comparisons made) decreases, and
62893 + *
62894 + *         2. contention on tree root is decreased. This latter was actually
62895 + *         motivating reason behind vroot, because spin lock of root node,
62896 + *         which is taken when acquiring long-term lock on root node is the
62897 + *         hottest lock in the reiser4.
62898 + *
62899 + * How to find vroot.
62900 + *
62901 + *     When vroot of object F is not yet determined, all object lookups start
62902 + *     from the root of the tree. At each tree level during traversal we have
62903 + *     a node N such that a key we are looking for (which is the key inside
62904 + *     object's body) is located within N. In function handle_vroot() called
62905 + *     from cbk_level_lookup() we check whether N is possible vroot for
62906 + *     F. Check is trivial---if neither leftmost nor rightmost item of N
62907 + *     belongs to F (and we already have helpful ->owns_item() method of
62908 + *     object plugin for this), then N is possible vroot of F. This, of
62909 + *     course, relies on the assumption that each object occupies contiguous
62910 + *     range of keys in the tree.
62911 + *
62912 + *     Thus, traversing tree downward and checking each node as we go, we can
62913 + *     find lowest such node, which, by definition, is vroot.
62914 + *
62915 + * How to track vroot.
62916 + *
62917 + *     Nohow. If actual vroot changes, next object lookup will just restart
62918 + *     from the actual tree root, refreshing object's vroot along the way.
62919 + *
62920 + */
62921 +
62922 +/*
62923 + * Record @node as vroot of @object if it is a possible vroot, that is, if
62924 + * neither its leftmost nor its rightmost item belongs to @object.
62925 + */
62926 +static void handle_vroot(struct inode *object, znode * node)
62927 +{
62928 +       file_plugin *plug;
62929 +       coord_t edge;
62930 +
62931 +       plug = inode_file_plugin(object);
62932 +       assert("nikita-3353", plug != NULL);
62933 +       assert("nikita-3354", plug->owns_item != NULL);
62934 +
62935 +       /* empty node has no edge items to inspect */
62936 +       if (unlikely(node_is_empty(node)))
62937 +               return;
62938 +
62939 +       /*
62940 +        * if leftmost item of @node belongs to @object, some items of
62941 +        * @object are probably in the sub-tree rooted at the left neighbor
62942 +        * of @node, so we cannot be sure that @node is vroot. Mutatis
62943 +        * mutandis for the rightmost item.
62944 +        */
62945 +       coord_init_first_unit(&edge, node);
62946 +       if (!plug->owns_item(object, &edge)) {
62947 +               coord_init_last_unit(&edge, node);
62948 +               if (!plug->owns_item(object, &edge))
62949 +                       inode_set_vroot(object, node);
62950 +       }
62951 +}
62952 +
62953 +/*
62954 + * helper function used by traverse_tree() to start tree traversal not from
62955 + * the tree root, but from @h->object's vroot, if possible.
62956 + */
62957 +static int prepare_object_lookup(cbk_handle * h)
62958 +{
62959 +       znode *vroot;
62960 +       int result;
62961 +
62962 +       vroot = inode_get_vroot(h->object);
62963 +       if (vroot == NULL) {
62964 +               /*
62965 +                * object doesn't have known vroot, start from real tree root.
62966 +                */
62967 +               return LOOKUP_CONT;
62968 +       }
62969 +
62970 +       h->level = znode_get_level(vroot);
62971 +       /* take a long-term lock on vroot */
62972 +       h->result = longterm_lock_znode(h->active_lh, vroot,
62973 +                                       cbk_lock_mode(h->level, h),
62974 +                                       ZNODE_LOCK_LOPRI);
62975 +       result = LOOKUP_REST;   /* unless lookup in vroot below says otherwise */
62976 +       if (h->result == 0) {
62977 +               int isunique;
62978 +               int inside;
62979 +
62980 +               isunique = h->flags & CBK_UNIQUE;
62981 +               /* check that key is inside vroot */
62982 +               read_lock_dk(h->tree);
62983 +               inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
62984 +                         !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
62985 +               read_unlock_dk(h->tree);
62986 +               if (inside) {
62987 +                       h->result = zload(vroot);
62988 +                       if (h->result == 0) {
62989 +                               /* search for key in vroot. */
62990 +                               result = cbk_node_lookup(h);
62991 +                               zrelse(vroot);  /*h->active_lh->node); */
62992 +                               if (h->active_lh->node != vroot) {
62993 +                                       result = LOOKUP_REST;   /* active lock is not on vroot */
62994 +                               } else if (result == LOOKUP_CONT) {
62995 +                                       move_lh(h->parent_lh, h->active_lh);
62996 +                                       h->flags &= ~CBK_DKSET;
62997 +                               }
62998 +                       }
62999 +               }
63000 +       }
63001 +
63002 +       zput(vroot);    /* presumably pairs with reference from inode_get_vroot() */
63003 +
63004 +       if (IS_CBKERR(h->result) || result == LOOKUP_REST)
63005 +               hput(h);
63006 +       return result;
63007 +}
63008 +
63009 +/* main function that handles common parts of tree traversal: starting
63010 +    (fake znode handling), restarts, error handling, completion */
63011 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
63012 +{
63013 +       int done;
63014 +       int iterations;
63015 +       int vroot_used;
63016 +
63017 +       assert("nikita-365", h != NULL);
63018 +       assert("nikita-366", h->tree != NULL);
63019 +       assert("nikita-367", h->key != NULL);
63020 +       assert("nikita-368", h->coord != NULL);
63021 +       assert("nikita-369", (h->bias == FIND_EXACT)
63022 +              || (h->bias == FIND_MAX_NOT_MORE_THAN));
63023 +       assert("nikita-370", h->stop_level >= LEAF_LEVEL);
63024 +       assert("nikita-2949", !(h->flags & CBK_DKSET));
63025 +       assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
63026 +
63027 +       done = 0;
63028 +       iterations = 0;
63029 +       vroot_used = 0;
63030 +
63031 +       /* loop for restarts */
63032 +      restart:
63033 +
63034 +       assert("nikita-3024", reiser4_schedulable());
63035 +
63036 +       h->result = CBK_COORD_FOUND;
63037 +       /* connect_znode() needs it */
63038 +       h->ld_key = *reiser4_min_key();
63039 +       h->rd_key = *reiser4_max_key();
63040 +       h->flags |= CBK_DKSET;
63041 +       h->error = NULL;
63042 +
63043 +       if (!vroot_used && h->object != NULL) {
63044 +               vroot_used = 1; /* vroot is tried at most once per call */
63045 +               done = prepare_object_lookup(h);
63046 +               if (done == LOOKUP_REST) {
63047 +                       goto restart;
63048 +               } else if (done == LOOKUP_DONE)
63049 +                       return h->result;
63050 +       }
63051 +       if (h->parent_lh->node == NULL) {
63052 +               done =
63053 +                   get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
63054 +                                  h->parent_lh);
63055 +
63056 +               assert("nikita-1637", done != -E_DEADLOCK);
63057 +
63058 +               h->block = h->tree->root_block;
63059 +               h->level = h->tree->height;
63060 +               h->coord->node = h->parent_lh->node;
63061 +
63062 +               if (done != 0)
63063 +                       return done;
63064 +       }
63065 +
63066 +       /* loop descending a tree */
63067 +       while (!done) {
63068 +
63069 +               if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
63070 +                            IS_POW(iterations))) {
63071 +                       warning("nikita-1481", "Too many iterations: %i",
63072 +                               iterations);
63073 +                       reiser4_print_key("key", h->key);
63074 +                       ++iterations;
63075 +               } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
63076 +                       h->error =
63077 +                           "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
63078 +                       h->result = RETERR(-EIO);
63079 +                       break;
63080 +               }
63081 +               switch (cbk_level_lookup(h)) {
63082 +               case LOOKUP_CONT:
63083 +                       move_lh(h->parent_lh, h->active_lh);
63084 +                       continue;
63085 +               default:
63086 +                       wrong_return_value("nikita-372", "cbk_level");  /* falls through to LOOKUP_DONE */
63087 +               case LOOKUP_DONE:
63088 +                       done = 1;
63089 +                       break;
63090 +               case LOOKUP_REST:
63091 +                       hput(h);
63092 +                       /* deadlock avoidance is normal case, so it is not counted. */
63093 +                       if (h->result != -E_DEADLOCK)
63094 +                               ++iterations;
63095 +                       reiser4_preempt_point();
63096 +                       goto restart;
63097 +               }
63098 +       }
63099 +       /* that's all. The rest is error handling */
63100 +       if (unlikely(h->error != NULL)) {
63101 +               warning("nikita-373", "%s: level: %i, "
63102 +                       "lock_level: %i, stop_level: %i "
63103 +                       "lock_mode: %s, bias: %s",
63104 +                       h->error, h->level, h->lock_level, h->stop_level,
63105 +                       lock_mode_name(h->lock_mode), bias_name(h->bias));
63106 +               reiser4_print_address("block", &h->block);
63107 +               reiser4_print_key("key", h->key);
63108 +               print_coord_content("coord", h->coord);
63109 +       }
63110 +       /* `unlikely' error case */
63111 +       if (unlikely(IS_CBKERR(h->result))) {
63112 +               /* failure. do cleanup */
63113 +               hput(h);
63114 +       } else {
63115 +               assert("nikita-1605", WITH_DATA_RET
63116 +                      (h->coord->node, 1,
63117 +                       ergo((h->result == CBK_COORD_FOUND) &&
63118 +                            (h->bias == FIND_EXACT) &&
63119 +                            (!node_is_empty(h->coord->node)),
63120 +                            coord_is_existing_item(h->coord))));
63121 +       }
63122 +       return h->result;
63123 +}
63124 +
63125 +/* find delimiting keys of child
63126 +
63127 +   Determine left and right delimiting keys for child pointed to by
63128 +   @parent_coord. Caller must hold the tree's dk_lock (asserted below).
63129 +
63130 +*/
63131 +static void find_child_delimiting_keys(znode * parent  /* parent znode, passed
63132 +                                                        * locked */ ,
63133 +                                      const coord_t * parent_coord     /* coord where
63134 +                                                                        * pointer to
63135 +                                                                        * child is
63136 +                                                                        * stored */ ,
63137 +                                      reiser4_key * ld /* where to store left
63138 +                                                        * delimiting key */ ,
63139 +                                      reiser4_key * rd /* where to store right
63140 +                                                        * delimiting key */ )
63141 +{
63142 +       coord_t neighbor;
63143 +
63144 +       assert("nikita-1484", parent != NULL);
63145 +       assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
63146 +
63147 +       coord_dup(&neighbor, parent_coord);
63148 +
63149 +       if (neighbor.between == AT_UNIT)
63150 +               /* imitate item ->lookup() behavior. */
63151 +               neighbor.between = AFTER_UNIT;
63152 +
63153 +       if (coord_set_to_left(&neighbor) == 0)
63154 +               unit_key_by_coord(&neighbor, ld);
63155 +       else {
63156 +               assert("nikita-14851", 0);      /* this branch is not expected */
63157 +               *ld = *znode_get_ld_key(parent);
63158 +       }
63159 +
63160 +       coord_dup(&neighbor, parent_coord);
63161 +       if (neighbor.between == AT_UNIT)
63162 +               neighbor.between = AFTER_UNIT;
63163 +       if (coord_set_to_right(&neighbor) == 0)
63164 +               unit_key_by_coord(&neighbor, rd);
63165 +       else
63166 +               *rd = *znode_get_rd_key(parent);
63167 +}
63168 +
63169 +/*
63170 + * setup delimiting keys for a child
63171 + * (returns 1 if @child had no JNODE_DKSET at the unlocked check, 0 otherwise)
63172 + * @parent parent node
63173 + *
63174 + * @coord location in @parent where pointer to @child is
63175 + *
63176 + * @child child node
63177 + */
63178 +int
63179 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
63180 +{
63181 +       reiser4_tree *tree;
63182 +
63183 +       assert("nikita-2952",
63184 +              znode_get_level(parent) == znode_get_level(coord->node));
63185 +
63186 +       /* fast check without taking dk lock. This is safe, because
63187 +        * JNODE_DKSET is never cleared once set. */
63188 +       if (!ZF_ISSET(child, JNODE_DKSET)) {
63189 +               tree = znode_get_tree(parent);
63190 +               write_lock_dk(tree);
63191 +               if (likely(!ZF_ISSET(child, JNODE_DKSET))) {    /* recheck under lock */
63192 +                       find_child_delimiting_keys(parent, coord,
63193 +                                                  &child->ld_key,
63194 +                                                  &child->rd_key);
63195 +                       ON_DEBUG(child->ld_key_version =
63196 +                                atomic_inc_return(&delim_key_version);
63197 +                                child->rd_key_version =
63198 +                                atomic_inc_return(&delim_key_version););
63199 +                       ZF_SET(child, JNODE_DKSET);
63200 +               }
63201 +               write_unlock_dk(tree);
63202 +               return 1;
63203 +       }
63204 +       return 0;
63205 +}
63206 +
63207 +/* Perform tree lookup at one level. This is called from traverse_tree()
63208 +   function that drives lookup through tree and calls cbk_node_lookup() to
63209 +   perform lookup within one node.
63210 +
63211 +   See comments in the code.
63212 +*/
63213 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
63214 +{
63215 +       int ret;
63216 +       int setdk;
63217 +       int ldkeyset = 0;
63218 +       reiser4_key ldkey;
63219 +       reiser4_key key;
63220 +       znode *active;
63221 +
63222 +       assert("nikita-3025", reiser4_schedulable());
63223 +
63224 +       /* acquire reference to @active node */
63225 +       active =
63226 +           zget(h->tree, &h->block, h->parent_lh->node, h->level,
63227 +                reiser4_ctx_gfp_mask_get());
63228 +
63229 +       if (IS_ERR(active)) {
63230 +               h->result = PTR_ERR(active);
63231 +               return LOOKUP_DONE;
63232 +       }
63233 +
63234 +       /* lock @active */
63235 +       h->result = longterm_lock_znode(h->active_lh,
63236 +                                       active,
63237 +                                       cbk_lock_mode(h->level, h),
63238 +                                       ZNODE_LOCK_LOPRI);
63239 +       /* longterm_lock_znode() acquires additional reference to znode (which
63240 +          will be later released by longterm_unlock_znode()). Release
63241 +          reference acquired by zget().
63242 +        */
63243 +       zput(active);
63244 +       if (unlikely(h->result != 0))
63245 +               goto fail_or_restart;
63246 +
63247 +       setdk = 0;
63248 +       /* if @active is accessed for the first time, setup delimiting keys on
63249 +          it. Delimiting keys are taken from the parent node. See
63250 +          setup_delimiting_keys() for details.
63251 +        */
63252 +       if (h->flags & CBK_DKSET) {
63253 +               setdk = setup_delimiting_keys(h);
63254 +               h->flags &= ~CBK_DKSET;
63255 +       } else {
63256 +               znode *parent;
63257 +
63258 +               parent = h->parent_lh->node;
63259 +               h->result = zload(parent);
63260 +               if (unlikely(h->result != 0))
63261 +                       goto fail_or_restart;
63262 +
63263 +               if (!ZF_ISSET(active, JNODE_DKSET))
63264 +                       setdk = set_child_delimiting_keys(parent,
63265 +                                                         h->coord, active);
63266 +               else {
63267 +                       read_lock_dk(h->tree);
63268 +                       find_child_delimiting_keys(parent, h->coord, &ldkey,
63269 +                                                  &key);
63270 +                       read_unlock_dk(h->tree);
63271 +                       ldkeyset = 1;   /* enables the "vs-3533" check below */
63272 +               }
63273 +               zrelse(parent);
63274 +       }
63275 +
63276 +       /* this is ugly kludge. Reminder: this is necessary, because
63277 +          ->lookup() method returns coord with ->between field probably set
63278 +          to something different from AT_UNIT.
63279 +        */
63280 +       h->coord->between = AT_UNIT;
63281 +
63282 +       if (znode_just_created(active) && (h->coord->node != NULL)) {
63283 +               write_lock_tree(h->tree);
63284 +               /* if we are going to load znode right now, setup
63285 +                  ->in_parent: coord where pointer to this node is stored in
63286 +                  parent.
63287 +                */
63288 +               coord_to_parent_coord(h->coord, &active->in_parent);
63289 +               write_unlock_tree(h->tree);
63290 +       }
63291 +
63292 +       /* check connectedness without holding tree lock---false negatives
63293 +        * will be re-checked by connect_znode(), and false positives are
63294 +        * impossible---@active cannot suddenly turn into unconnected
63295 +        * state. */
63296 +       if (!znode_is_connected(active)) {
63297 +               h->result = connect_znode(h->coord, active);
63298 +               if (unlikely(h->result != 0)) {
63299 +                       put_parent(h);
63300 +                       goto fail_or_restart;
63301 +               }
63302 +       }
63303 +
63304 +       jload_prefetch(ZJNODE(active));
63305 +
63306 +       if (setdk)
63307 +               update_stale_dk(h->tree, active);
63308 +
63309 +       /* put_parent() cannot be called earlier, because connect_znode()
63310 +          assumes parent node is referenced; */
63311 +       put_parent(h);
63312 +
63313 +       if ((!znode_contains_key_lock(active, h->key) &&
63314 +            (h->flags & CBK_TRUST_DK))
63315 +           || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
63316 +               /* 1. key was moved out of this node while this thread was
63317 +                  waiting for the lock. Restart. More elaborate solution is
63318 +                  to determine where key moved (to the left, or to the right)
63319 +                  and try to follow it through sibling pointers.
63320 +
63321 +                  2. or, node itself is going to be removed from the
63322 +                  tree. Release lock and restart.
63323 +                */
63324 +               h->result = -E_REPEAT;
63325 +       }
63326 +       if (h->result == -E_REPEAT)
63327 +               return LOOKUP_REST;
63328 +
63329 +       h->result = zload_ra(active, h->ra_info);
63330 +       if (h->result) {
63331 +               return LOOKUP_DONE;
63332 +       }
63333 +
63334 +       /* sanity checks */
63335 +       if (sanity_check(h)) {
63336 +               zrelse(active);
63337 +               return LOOKUP_DONE;
63338 +       }
63339 +
63340 +       /* check that key of leftmost item in the @active is the same as in
63341 +        * its parent */
63342 +       if (ldkeyset && !node_is_empty(active) &&
63343 +           !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
63344 +               warning("vs-3533", "Keys are inconsistent. Fsck?");
63345 +               reiser4_print_key("inparent", &ldkey);
63346 +               reiser4_print_key("inchild", &key);
63347 +               h->result = RETERR(-EIO);
63348 +               zrelse(active);
63349 +               return LOOKUP_DONE;
63350 +       }
63351 +
63352 +       if (h->object != NULL)
63353 +               handle_vroot(h->object, active);
63354 +
63355 +       ret = cbk_node_lookup(h);
63356 +
63357 +       /* h->active_lh->node might change, but active is yet to be zrelsed */
63358 +       zrelse(active);
63359 +
63360 +       return ret;
63361 +
63362 +      fail_or_restart:
63363 +       if (h->result == -E_DEADLOCK)
63364 +               return LOOKUP_REST;
63365 +       return LOOKUP_DONE;
63366 +}
63367 +
63368 +#if REISER4_DEBUG
63369 +/* check left and right delimiting keys of a znode (REISER4_DEBUG builds only) */
63370 +void check_dkeys(znode * node)
63371 +{
63372 +       znode *left;
63373 +       znode *right;
63374 +
63375 +       read_lock_tree(current_tree);
63376 +       read_lock_dk(current_tree);
63377 +
63378 +       assert("vs-1710", znode_is_any_locked(node));
63379 +       assert("vs-1197",
63380 +              !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
63381 +
63382 +       left = node->left;
63383 +       right = node->right;
63384 +
63385 +       if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63386 +           && left != NULL && ZF_ISSET(left, JNODE_DKSET))
63387 +               /* check left neighbor. Note that left neighbor is not locked,
63388 +                  so it might get wrong delimiting keys therefore */
63389 +               assert("vs-1198",
63390 +                      (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
63391 +                       || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
63392 +
63393 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
63394 +           && right != NULL && ZF_ISSET(right, JNODE_DKSET))
63395 +               /* check right neighbor. Note that right neighbor is not
63396 +                  locked, so it might get wrong delimiting keys therefore  */
63397 +               assert("vs-1199",
63398 +                      (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
63399 +                       || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
63400 +
63401 +       read_unlock_dk(current_tree);
63402 +       read_unlock_tree(current_tree);
63403 +}
63404 +#endif
63405 +
63406 +/* true if @key is left delimiting key of @node; compared under dk read lock */
63407 +static int key_is_ld(znode * node, const reiser4_key * key)
63408 +{
63409 +       int ld;
63410 +
63411 +       assert("nikita-1716", node != NULL);
63412 +       assert("nikita-1758", key != NULL);
63413 +
63414 +       read_lock_dk(znode_get_tree(node));
63415 +       assert("nikita-1759", znode_contains_key(node, key));
63416 +       ld = keyeq(znode_get_ld_key(node), key);
63417 +       read_unlock_dk(znode_get_tree(node));
63418 +       return ld;
63419 +}
63420 +
63421 +/* Process one node during tree traversal.
63422 +
63423 +   Called by cbk_level_lookup(), prepare_object_lookup(), cbk_cache_scan_slots(). */
63424 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
63425 +{
63426 +       /* node plugin of @active */
63427 +       node_plugin *nplug;
63428 +       /* item plugin of item that was found */
63429 +       item_plugin *iplug;
63430 +       /* search bias */
63431 +       lookup_bias node_bias;
63432 +       /* node we are operating upon */
63433 +       znode *active;
63434 +       /* tree we are searching in */
63435 +       reiser4_tree *tree;
63436 +       /* result */
63437 +       int result;
63438 +
63439 +       assert("nikita-379", h != NULL);
63440 +
63441 +       active = h->active_lh->node;
63442 +       tree = h->tree; /* NOTE(review): @tree is not read below */
63443 +
63444 +       nplug = active->nplug;
63445 +       assert("nikita-380", nplug != NULL);
63446 +
63447 +       ON_DEBUG(check_dkeys(active));
63448 +
63449 +       /* return item from "active" node with maximal key not greater than
63450 +          "key"  */
63451 +       node_bias = h->bias;
63452 +       result = nplug->lookup(active, h->key, node_bias, h->coord);
63453 +       if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
63454 +               /* error occurred */
63455 +               h->result = result;
63456 +               return LOOKUP_DONE;
63457 +       }
63458 +       if (h->level == h->stop_level) {
63459 +               /* welcome to the stop level */
63460 +               assert("nikita-381", h->coord->node == active);
63461 +               if (result == NS_FOUND) {
63462 +                       /* success of tree lookup */
63463 +                       if (!(h->flags & CBK_UNIQUE)
63464 +                           && key_is_ld(active, h->key)) {
63465 +                               return search_to_left(h);
63466 +                       } else
63467 +                               h->result = CBK_COORD_FOUND;
63468 +               } else {
63469 +                       h->result = CBK_COORD_NOTFOUND;
63470 +               }
63471 +               if (!(h->flags & CBK_IN_CACHE)) /* skipped during cbk cache scan */
63472 +                       cbk_cache_add(active);
63473 +               return LOOKUP_DONE;
63474 +       }
63475 +
63476 +       if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
63477 +               h->error = "not found on internal node";
63478 +               h->result = result;
63479 +               return LOOKUP_DONE;
63480 +       }
63481 +
63482 +       assert("vs-361", h->level > h->stop_level);
63483 +
63484 +       if (handle_eottl(h, &result)) {
63485 +               assert("vs-1674", (result == LOOKUP_DONE ||
63486 +                                  result == LOOKUP_REST));
63487 +               return result;
63488 +       }
63489 +
63490 +       /* go down to next level */
63491 +       check_me("vs-12", zload(h->coord->node) == 0);
63492 +       assert("nikita-2116", item_is_internal(h->coord));
63493 +       iplug = item_plugin_by_coord(h->coord);
63494 +       iplug->s.internal.down_link(h->coord, h->key, &h->block);
63495 +       zrelse(h->coord->node);
63496 +       --h->level;
63497 +       return LOOKUP_CONT;     /* continue */
63498 +}
63499 +
63500 +/* scan cbk_cache slots looking for a match for @h; 0 on success, else error */
63501 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
63502 +{
63503 +       level_lookup_result llr;
63504 +       znode *node;
63505 +       reiser4_tree *tree;
63506 +       cbk_cache_slot *slot;
63507 +       cbk_cache *cache;
63508 +       tree_level level;
63509 +       int isunique;
63510 +       const reiser4_key *key;
63511 +       int result;
63512 +
63513 +       assert("nikita-1317", h != NULL);
63514 +       assert("nikita-1315", h->tree != NULL);
63515 +       assert("nikita-1316", h->key != NULL);
63516 +
63517 +       tree = h->tree;
63518 +       cache = &tree->cbk_cache;
63519 +       if (cache->nr_slots == 0)
63520 +               /* size of cbk cache was set to 0 by mount time option. */
63521 +               return RETERR(-ENOENT);
63522 +
63523 +       assert("nikita-2474", cbk_cache_invariant(cache));
63524 +       node = NULL;            /* to keep gcc happy */
63525 +       level = h->level;
63526 +       key = h->key;
63527 +       isunique = h->flags & CBK_UNIQUE;
63528 +       result = RETERR(-ENOENT);
63529 +
63530 +       /*
63531 +        * this is time-critical function and dragons had, hence, been settled
63532 +        * here.
63533 +        *
63534 +        * Loop below scans cbk cache slots trying to find matching node with
63535 +        * suitable range of delimiting keys and located at the h->level.
63536 +        *
63537 +        * Scan is done under cbk cache spin lock that protects slot->node
63538 +        * pointers. If suitable node is found we want to pin it in
63539 +        * memory. But slot->node can point to the node with x_count 0
63540 +        * (unreferenced). Such node can be recycled at any moment, or can
63541 +        * already be in the process of being recycled (within jput()).
63542 +        *
63543 +        * As we found node in the cbk cache, it means that jput() hasn't yet
63544 +        * called cbk_cache_invalidate().
63545 +        *
63546 +        * We acquire reference to the node without holding tree lock, and
63547 +        * later, check node's RIP bit. This avoids races with jput().
63548 +        */
63549 +
63550 +       rcu_read_lock();
63551 +       read_lock(&((cbk_cache *)cache)->guard);
63552 +
63553 +       slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
63554 +       slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
63555 +       BUG_ON(&slot->lru != &cache->lru);/*????*/
63556 +       while (1) {
63557 +
63558 +               slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
63559 +
63560 +               if (&cache->lru != &slot->lru)
63561 +                       node = slot->node;
63562 +               else
63563 +                       node = NULL;    /* wrapped around to the list head */
63564 +
63565 +               if (unlikely(node == NULL))
63566 +                       break;
63567 +
63568 +               /*
63569 +                * this is (hopefully) the only place in the code where we are
63570 +                * working with delimiting keys without holding dk lock. This
63571 +                * is fine here, because this is only "guess" anyway---keys
63572 +                * are rechecked under dk lock below.
63573 +                */
63574 +               if (znode_get_level(node) == level &&
63575 +                   /* reiser4_min_key < key < reiser4_max_key */
63576 +                   znode_contains_key_strict(node, key, isunique)) {
63577 +                       zref(node);
63578 +                       result = 0;
63579 +                       spin_lock_prefetch(&tree->tree_lock);
63580 +                       break;
63581 +               }
63582 +       }
63583 +       read_unlock(&((cbk_cache *)cache)->guard);
63584 +
63585 +       assert("nikita-2475", cbk_cache_invariant(cache));
63586 +
63587 +       if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
63588 +               result = -ENOENT;
63589 +
63590 +       rcu_read_unlock();
63591 +
63592 +       if (result != 0) {
63593 +               h->result = CBK_COORD_NOTFOUND;
63594 +               return RETERR(-ENOENT);
63595 +       }
63596 +
63597 +       result =
63598 +           longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
63599 +                               ZNODE_LOCK_LOPRI);
63600 +       zput(node);     /* presumably drops the reference taken by zref() above */
63601 +       if (result != 0)
63602 +               return result;
63603 +       result = zload(node);
63604 +       if (result != 0)
63605 +               return result;  /* cbk_cache_search() calls done_lh() on failure */
63606 +
63607 +       /* recheck keys */
63608 +       read_lock_dk(tree);
63609 +       result = (znode_contains_key_strict(node, key, isunique) &&
63610 +               !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
63611 +       read_unlock_dk(tree);
63612 +       if (result) {
63613 +               /* do lookup inside node */
63614 +               llr = cbk_node_lookup(h);
63615 +               /* if cbk_node_lookup() wandered to another node (due to eottl
63616 +                  or non-unique keys), adjust @node */
63617 +               /*node = h->active_lh->node; */
63618 +
63619 +               if (llr != LOOKUP_DONE) {
63620 +                       /* restart or continue on the next level */
63621 +                       result = RETERR(-ENOENT);
63622 +               } else if (IS_CBKERR(h->result))
63623 +                       /* io or oom */
63624 +                       result = RETERR(-ENOENT);
63625 +               else {
63626 +                       /* good. Either item found or definitely not found. */
63627 +                       result = 0;
63628 +
63629 +                       write_lock(&(cache->guard));
63630 +                       if (slot->node == h->active_lh->node /*node */ ) {
63631 +                               /* if this node is still in cbk cache---move
63632 +                                  its slot to the head of the LRU list. */
63633 +                               list_move(&slot->lru, &cache->lru);
63634 +                       }
63635 +                       write_unlock(&(cache->guard));
63636 +               }
63637 +       } else {
63638 +               /* race. While this thread was waiting for the lock, node was
63639 +                  rebalanced and item we are looking for, shifted out of it
63640 +                  (if it ever was here).
63641 +
63642 +                  Continuing scanning is almost hopeless: node key range was
63643 +                  moved to, is almost certainly at the beginning of the LRU
63644 +                  list at this time, because it's hot, but restarting
63645 +                  scanning from the very beginning is complex. Just return,
63646 +                  so that cbk() will be performed. This is not that
63647 +                  important, because such races should be rare. Are they?
63648 +                */
63649 +               result = RETERR(-ENOENT);       /* -ERAUGHT */
63650 +       }
63651 +       zrelse(node);
63652 +       assert("nikita-2476", cbk_cache_invariant(cache));
63653 +       return result;
63654 +}
63655 +
63656 +/* look for item with given key in the coord cache
63657 +
63658 +   This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
63659 +   which is a small LRU list of znodes accessed lately. For each znode in
63660 +   this list, it checks whether key we are looking for fits into key
63661 +   range covered by this node. If so, and in addition, node lies at allowed
63662 +   level (this is to handle extents on a twig level), node is locked, and
63663 +   lookup inside it is performed.
63664 +
63665 +   we need a measurement of the cost of this cache search compared to the cost
63666 +   of coord_by_key.
63667 +
63668 +*/
63669 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
63670 +{
63671 +       int result = 0;
63672 +       tree_level level;
63673 +
63674 +       /* add CBK_IN_CACHE to the handle flags. This means that
63675 +        * cbk_node_lookup() assumes that cbk_cache is scanned and would not
63676 +        * add found node to the cache. */
63677 +       h->flags |= CBK_IN_CACHE;
63678 +       for (level = h->stop_level; level <= h->lock_level; ++level) {
63679 +               h->level = level;
63680 +               result = cbk_cache_scan_slots(h);
63681 +               if (result != 0) {
63682 +                       done_lh(h->active_lh);
63683 +                       done_lh(h->parent_lh);
63684 +               } else {
63685 +                       assert("nikita-1319", !IS_CBKERR(h->result));
63686 +                       break;
63687 +               }
63688 +       }
63689 +       h->flags &= ~CBK_IN_CACHE;
63690 +       return result;
63691 +}
63692 +
63693 +/* type of lock we want to obtain during tree traversal. On levels up to
63694 +    h->lock_level we want type of lock user asked for, on upper levels: read lock. */
63695 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
63696 +{
63697 +       assert("nikita-382", h != NULL);
63698 +
63699 +       return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
63700 +}
63701 +
63702 +/* update outdated right delimiting key of @node from ld key of its right neighbor */
63703 +static void stale_dk(reiser4_tree * tree, znode * node)
63704 +{
63705 +       znode *right;
63706 +
63707 +       read_lock_tree(tree);
63708 +       write_lock_dk(tree);
63709 +       right = node->right;
63710 +
63711 +       if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63712 +           right && ZF_ISSET(right, JNODE_DKSET) &&
63713 +           !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
63714 +               znode_set_rd_key(node, znode_get_ld_key(right));
63715 +
63716 +       write_unlock_dk(tree);
63717 +       read_unlock_tree(tree);
63718 +}
63719 +
63720 +/* check for possibly outdated delimiting keys, and update them if
63721 + * necessary. */
63722 +static void update_stale_dk(reiser4_tree * tree, znode * node)
63723 +{
63724 +       znode *right;
63725 +       reiser4_key rd;
63726 +
63727 +       read_lock_tree(tree);
63728 +       read_lock_dk(tree);
63729 +       rd = *znode_get_rd_key(node);
63730 +       right = node->right;
63731 +       if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
63732 +                    right && ZF_ISSET(right, JNODE_DKSET) &&
63733 +                    !keyeq(&rd, znode_get_ld_key(right)))) {
63734 +               assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
63735 +               read_unlock_dk(tree);
63736 +               read_unlock_tree(tree);
63737 +               stale_dk(tree, node);
63738 +               return;
63739 +       }
63740 +       read_unlock_dk(tree);
63741 +       read_unlock_tree(tree);
63742 +}
63743 +
63744 +/*
63745 + * handle searches for a non-unique key.
63746 + *
63747 + * Suppose that we are looking for an item with possibly non-unique key 100.
63748 + *
63749 + * Root node contains two pointers: one to a node with left delimiting key 0,
63750 + * and another to a node with left delimiting key 100. Item we are interested
63751 + * in may well happen to be in the sub-tree rooted at the first pointer.
63752 + *
63753 + * To handle this, search_to_left() is called when search reaches stop level.
63754 + * This function checks whether it is _possible_ that item we are looking for
63755 + * is in the left neighbor (this can be done by comparing delimiting keys) and
63756 + * if so, tries to lock left neighbor (this is low priority lock, so it can
63757 + * deadlock, tree traversal is just restarted if it did) and then checks
63758 + * whether left neighbor actually contains items with our key.
63759 + *
63760 + * Note that this is done on the stop level only. It is possible to try such
63761 + * left-check on each level, but as duplicate keys are supposed to be rare
63762 + * (very unlikely that more than one node is completely filled with items with
63763 + * duplicate keys), it is cheaper to scan to the left on the stop level once.
63764 + *
63765 + */
63766 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
63767 +{
63768 +       level_lookup_result result;
63769 +       coord_t *coord;
63770 +       znode *node;
63771 +       znode *neighbor;
63772 +
63773 +       lock_handle lh;
63774 +
63775 +       assert("nikita-1761", h != NULL);
63776 +       assert("nikita-1762", h->level == h->stop_level);
63777 +
63778 +       init_lh(&lh);
63779 +       coord = h->coord;
63780 +       node = h->active_lh->node;
63781 +       assert("nikita-1763", coord_is_leftmost_unit(coord));
63782 +
63783 +       h->result =
63784 +           reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
63785 +                                     GN_CAN_USE_UPPER_LEVELS);
63786 +       neighbor = NULL;
63787 +       switch (h->result) {
63788 +       case -E_DEADLOCK:
63789 +               result = LOOKUP_REST;
63790 +               break;
63791 +       case 0:{
63792 +                       node_plugin *nplug;
63793 +                       coord_t crd;
63794 +                       lookup_bias bias;
63795 +
63796 +                       neighbor = lh.node;
63797 +                       h->result = zload(neighbor);
63798 +                       if (h->result != 0) {
63799 +                               result = LOOKUP_DONE;
63800 +                               break;
63801 +                       }
63802 +
63803 +                       nplug = neighbor->nplug;
63804 +
63805 +                       coord_init_zero(&crd);
63806 +                       bias = h->bias;
63807 +                       h->bias = FIND_EXACT;
63808 +                       h->result =
63809 +                           nplug->lookup(neighbor, h->key, h->bias, &crd);
63810 +                       h->bias = bias;
63811 +
63812 +                       if (h->result == NS_NOT_FOUND) {
63813 +       case -E_NO_NEIGHBOR:
63814 +                               h->result = CBK_COORD_FOUND;
63815 +                               if (!(h->flags & CBK_IN_CACHE))
63816 +                                       cbk_cache_add(node);
63817 +       default:                /* some other error */
63818 +                               result = LOOKUP_DONE;
63819 +                       } else if (h->result == NS_FOUND) {
63820 +                               read_lock_dk(znode_get_tree(neighbor));
63821 +                               h->rd_key = *znode_get_ld_key(node);
63822 +                               leftmost_key_in_node(neighbor, &h->ld_key);
63823 +                               read_unlock_dk(znode_get_tree(neighbor));
63824 +                               h->flags |= CBK_DKSET;
63825 +
63826 +                               h->block = *znode_get_block(neighbor);
63827 +                               /* clear coord -> node so that cbk_level_lookup()
63828 +                                  wouldn't overwrite parent hint in neighbor.
63829 +
63830 +                                  Parent hint was set up by
63831 +                                  reiser4_get_left_neighbor()
63832 +                                */
63833 +                               /* FIXME: why do we have to spinlock here? */
63834 +                               write_lock_tree(znode_get_tree(neighbor));
63835 +                               h->coord->node = NULL;
63836 +                               write_unlock_tree(znode_get_tree(neighbor));
63837 +                               result = LOOKUP_CONT;
63838 +                       } else {
63839 +                               result = LOOKUP_DONE;
63840 +                       }
63841 +                       if (neighbor != NULL)
63842 +                               zrelse(neighbor);
63843 +               }
63844 +       }
63845 +       done_lh(&lh);
63846 +       return result;
63847 +}
63848 +
63849 +/* debugging aid: return symbolic name of search bias */
63850 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
63851 +{
63852 +       if (bias == FIND_EXACT)
63853 +               return "exact";
63854 +       else if (bias == FIND_MAX_NOT_MORE_THAN)
63855 +               return "left-slant";
63856 +/*     else if( bias == RIGHT_SLANT_BIAS ) */
63857 +/*             return "right-bias"; */
63858 +       else {
63859 +               static char buf[30];
63860 +
63861 +               sprintf(buf, "unknown: %i", bias);
63862 +               return buf;
63863 +       }
63864 +}
63865 +
63866 +#if REISER4_DEBUG
63867 +/* debugging aid: print human readable information about @p (NULL @p or NULL @p->node is tolerated) */
63868 +void print_coord_content(const char *prefix /* prefix to print */ ,
63869 +                        coord_t * p /* coord to print */ )
63870 +{
63871 +       reiser4_key key;
63872 +
63873 +       if (p == NULL) {
63874 +               printk("%s: null\n", prefix);
63875 +               return;
63876 +       }
63877 +       if ((p->node != NULL) && znode_is_loaded(p->node)
63878 +           && coord_is_existing_item(p))
63879 +               printk("%s: data: %p, length: %i\n", prefix,
63880 +                      item_body_by_coord(p), item_length_by_coord(p));
63881 +       if ((p->node != NULL) && znode_is_loaded(p->node)) {    /* same NULL guard as above */
63882 +               item_key_by_coord(p, &key);
63883 +               reiser4_print_key(prefix, &key);
63884 +       }
63885 +}
63886 +
63887 +/* debugging aid: print human readable information about @block */
63888 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
63889 +                  const reiser4_block_nr * block /* block number to print */ )
63890 +{
63891 +       printk("%s: %s\n", prefix, sprint_address(block));
63892 +}
63893 +#endif
63894 +
63895 +/* return string containing human readable representation of @block */
63896 +char *sprint_address(const reiser4_block_nr *
63897 +                    block /* block number to print */ )
63898 +{
63899 +       static char address[30];
63900 +
63901 +       if (block == NULL)
63902 +               sprintf(address, "null");
63903 +       else if (reiser4_blocknr_is_fake(block))
63904 +               sprintf(address, "%llx", (unsigned long long)(*block));
63905 +       else
63906 +               sprintf(address, "%llu", (unsigned long long)(*block));
63907 +       return address;
63908 +}
63909 +
63910 +/* release parent node during traversal */
63911 +static void put_parent(cbk_handle * h /* search handle */ )
63912 +{
63913 +       assert("nikita-383", h != NULL);
63914 +       if (h->parent_lh->node != NULL) {
63915 +               longterm_unlock_znode(h->parent_lh);
63916 +       }
63917 +}
63918 +
63919 +/* helper function used by coord_by_key(): release reference to parent znode
63920 +   stored in handle before processing its child. */
63921 +static void hput(cbk_handle * h /* search handle */ )
63922 +{
63923 +       assert("nikita-385", h != NULL);
63924 +       done_lh(h->parent_lh);
63925 +       done_lh(h->active_lh);
63926 +}
63927 +
63928 +/* Helper function used by cbk(): update delimiting keys of child node (stored
63929 +   in h->active_lh->node) using key taken from parent on the parent level. */
63930 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
63931 +{
63932 +       znode *active;
63933 +       reiser4_tree *tree;
63934 +
63935 +       assert("nikita-1088", h != NULL);
63936 +
63937 +       active = h->active_lh->node;
63938 +
63939 +       /* fast check without taking dk lock. This is safe, because
63940 +        * JNODE_DKSET is never cleared once set. */
63941 +       if (!ZF_ISSET(active, JNODE_DKSET)) {
63942 +               tree = znode_get_tree(active);
63943 +               write_lock_dk(tree);
63944 +               if (!ZF_ISSET(active, JNODE_DKSET)) {
63945 +                       znode_set_ld_key(active, &h->ld_key);
63946 +                       znode_set_rd_key(active, &h->rd_key);
63947 +                       ZF_SET(active, JNODE_DKSET);
63948 +               }
63949 +               write_unlock_dk(tree);
63950 +               return 1;
63951 +       }
63952 +       return 0;
63953 +}
63954 +
63955 +/* true if @block makes sense for the @tree. Used to detect corrupted node
63956 + * pointers */
63957 +static int
63958 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
63959 +                   reiser4_tree * tree /* tree to check against */ )
63960 +{
63961 +       assert("nikita-757", block != NULL);
63962 +       assert("nikita-758", tree != NULL);
63963 +
63964 +       /* check to see if it exceeds the size of the device. */
63965 +       return reiser4_blocknr_is_sane_for(tree->super, block);
63966 +}
63967 +
63968 +/* check consistency of fields */
63969 +static int sanity_check(cbk_handle * h /* search handle */ )
63970 +{
63971 +       assert("nikita-384", h != NULL);
63972 +
63973 +       if (h->level < h->stop_level) {
63974 +               h->error = "Buried under leaves";
63975 +               h->result = RETERR(-EIO);
63976 +               return LOOKUP_DONE;
63977 +       } else if (!block_nr_is_correct(&h->block, h->tree)) {
63978 +               h->error = "bad block number";
63979 +               h->result = RETERR(-EIO);
63980 +               return LOOKUP_DONE;
63981 +       } else
63982 +               return 0;
63983 +}
63984 +
63985 +/* Make Linus happy.
63986 +   Local variables:
63987 +   c-indentation-style: "K&R"
63988 +   mode-name: "LC"
63989 +   c-basic-offset: 8
63990 +   tab-width: 8
63991 +   fill-column: 120
63992 +   scroll-step: 1
63993 +   End:
63994 +*/
63995 diff -urN linux-2.6.27.orig/fs/reiser4/status_flags.c linux-2.6.27/fs/reiser4/status_flags.c
63996 --- linux-2.6.27.orig/fs/reiser4/status_flags.c 1970-01-01 03:00:00.000000000 +0300
63997 +++ linux-2.6.27/fs/reiser4/status_flags.c      2008-10-12 18:20:01.000000000 +0400
63998 @@ -0,0 +1,170 @@
63999 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64000 + * reiser4/README */
64001 +
64002 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
64003 +
64004 +#include <linux/bio.h>
64005 +#include <linux/highmem.h>
64006 +#include <linux/fs.h>
64007 +#include <linux/blkdev.h>
64008 +#include "debug.h"
64009 +#include "dformat.h"
64010 +#include "status_flags.h"
64011 +#include "super.h"
64012 +
64013 +/* This is our end I/O handler that marks page uptodate if IO was successful. It also
64014 +   unconditionally unlocks the page, so we can see that io was done.
64015 +   We do not free bio, because we hope to reuse that. */
64016 +static void reiser4_status_endio(struct bio *bio, int err)
64017 +{
64018 +       if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
64019 +               SetPageUptodate(bio->bi_io_vec->bv_page);
64020 +       } else {
64021 +               ClearPageUptodate(bio->bi_io_vec->bv_page);
64022 +               SetPageError(bio->bi_io_vec->bv_page);
64023 +       }
64024 +       unlock_page(bio->bi_io_vec->bv_page);
64025 +}
64026 +
64027 +/* Initialise status code. Expected to be called from the disk format code; @block is where
64028 +   the status block lives. Returns 0, -ENOMEM, -EIO (read failed) or -EINVAL (wrong magic). */
64029 +int reiser4_status_init(reiser4_block_nr block)
64030 +{
64031 +       struct super_block *sb = reiser4_get_current_sb();
64032 +       struct reiser4_status *statuspage;
64033 +       struct bio *bio;
64034 +       struct page *page;
64035 +
64036 +       get_super_private(sb)->status_page = NULL;
64037 +       get_super_private(sb)->status_bio = NULL;
64038 +
64039 +       page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
64040 +       if (!page)
64041 +               return -ENOMEM;
64042 +
64043 +       bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
64044 +       if (bio != NULL) {
64045 +               bio->bi_sector = block * (sb->s_blocksize >> 9);
64046 +               bio->bi_bdev = sb->s_bdev;
64047 +               bio->bi_io_vec[0].bv_page = page;
64048 +               bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64049 +               bio->bi_io_vec[0].bv_offset = 0;
64050 +               bio->bi_vcnt = 1;
64051 +               bio->bi_size = sb->s_blocksize;
64052 +               bio->bi_end_io = reiser4_status_endio;
64053 +       } else {
64054 +               __free_pages(page, 0);
64055 +               return -ENOMEM;
64056 +       }
64057 +       lock_page(page);
64058 +       submit_bio(READ, bio);
64059 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64060 +       wait_on_page_locked(page);
64061 +       if (!PageUptodate(page)) {
64062 +               warning("green-2007",
64063 +                       "I/O error while tried to read status page\n");
64064 +               __free_pages(page, 0);  /* I/O is over: do not leak page and bio */
64065 +               bio_put(bio);
64066 +               return -EIO;
64067 +       }
64068 +       statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
64069 +       if (memcmp(statuspage->magic, REISER4_STATUS_MAGIC,
64070 +                  sizeof(REISER4_STATUS_MAGIC))) {
64071 +               /* Magic does not match. */
64072 +               kunmap_atomic((char *)statuspage, KM_USER0);
64073 +               warning("green-2008", "Wrong magic in status block\n");
64074 +               __free_pages(page, 0);
64075 +               bio_put(bio);
64076 +               return -EINVAL;
64077 +       }
64078 +       kunmap_atomic((char *)statuspage, KM_USER0);
64079 +
64080 +       get_super_private(sb)->status_page = page;
64081 +       get_super_private(sb)->status_bio = bio;
64082 +       return 0;
64083 +}
64084 +
64085 +/* Query the status of fs. Returns a REISER4_STATUS_MOUNT_* code telling whether
64086 +   the FS can be safely mounted. Also, if the "status" and "extended" parameters
64087 +   are given, it will fill the actual parts of status from disk there. */
64088 +int reiser4_status_query(u64 * status, u64 * extended)
64089 +{
64090 +       struct super_block *sb = reiser4_get_current_sb();
64091 +       struct reiser4_status *statuspage;
64092 +       int retval;
64093 +
64094 +       if (!get_super_private(sb)->status_page) {      // No status page?
64095 +               return REISER4_STATUS_MOUNT_UNKNOWN;
64096 +       }
64097 +       statuspage = (struct reiser4_status *)
64098 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64099 +       switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) {        // FIXME: this cast is a hack for 32 bit arches to work.
64100 +       case REISER4_STATUS_OK:
64101 +               retval = REISER4_STATUS_MOUNT_OK;
64102 +               break;
64103 +       case REISER4_STATUS_CORRUPTED:
64104 +               retval = REISER4_STATUS_MOUNT_WARN;
64105 +               break;
64106 +       case REISER4_STATUS_DAMAGED:
64107 +       case REISER4_STATUS_DESTROYED:
64108 +       case REISER4_STATUS_IOERROR:
64109 +               retval = REISER4_STATUS_MOUNT_RO;
64110 +               break;
64111 +       default:
64112 +               retval = REISER4_STATUS_MOUNT_UNKNOWN;
64113 +               break;
64114 +       }
64115 +
64116 +       if (status)
64117 +               *status = le64_to_cpu(get_unaligned(&statuspage->status));
64118 +       if (extended)
64119 +               *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
64120 +
64121 +       kunmap_atomic((char *)statuspage, KM_USER0);
64122 +       return retval;
64123 +}
64124 +
64125 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
64126 +   It fills the status structure and tries to push it to disk. */
64127 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
64128 +{
64129 +       struct super_block *sb = reiser4_get_current_sb();
64130 +       struct reiser4_status *statuspage;
64131 +       struct bio *bio = get_super_private(sb)->status_bio;
64132 +
64133 +       if (!get_super_private(sb)->status_page) {      // No status page?
64134 +               return -1;
64135 +       }
64136 +       statuspage = (struct reiser4_status *)
64137 +           kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
64138 +
64139 +       put_unaligned(cpu_to_le64(status), &statuspage->status);
64140 +       put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
64141 +       strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
64142 +
64143 +       kunmap_atomic((char *)statuspage, KM_USER0);
64144 +       bio->bi_bdev = sb->s_bdev;
64145 +       bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
64146 +       bio->bi_io_vec[0].bv_len = sb->s_blocksize;
64147 +       bio->bi_io_vec[0].bv_offset = 0;
64148 +       bio->bi_vcnt = 1;
64149 +       bio->bi_size = sb->s_blocksize;
64150 +       bio->bi_end_io = reiser4_status_endio;
64151 +       lock_page(get_super_private(sb)->status_page);  // Safe as nobody should touch our page.
64152 +       /* We can block now, but we have no other choice anyway */
64153 +       submit_bio(WRITE, bio);
64154 +       blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
64155 +       return 0;               // We do not wait for io to finish.
64156 +}
64157 +
64158 +/* Frees the page with status and bio structure. Should be called by disk format at umount time */
64159 +int reiser4_status_finish(void)
64160 +{
64161 +       struct super_block *sb = reiser4_get_current_sb();
64162 +
64163 +       __free_pages(get_super_private(sb)->status_page, 0);
64164 +       get_super_private(sb)->status_page = NULL;
64165 +       bio_put(get_super_private(sb)->status_bio);
64166 +       get_super_private(sb)->status_bio = NULL;
64167 +       return 0;
64168 +}
64169 diff -urN linux-2.6.27.orig/fs/reiser4/status_flags.h linux-2.6.27/fs/reiser4/status_flags.h
64170 --- linux-2.6.27.orig/fs/reiser4/status_flags.h 1970-01-01 03:00:00.000000000 +0300
64171 +++ linux-2.6.27/fs/reiser4/status_flags.h      2008-10-12 18:20:01.000000000 +0400
64172 @@ -0,0 +1,43 @@
64173 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64174 + * reiser4/README */
64175 +
64176 +/* Here we declare structures and flags that store reiser4 status on disk.
64177 +   The status that helps us to find out if the filesystem is valid or if it
64178 +   contains some critical, or not so critical errors */
64179 +
64180 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
64181 +#define __REISER4_STATUS_FLAGS_H__
64182 +
64183 +#include "dformat.h"
64184 +/* These are major status flags */
64185 +#define REISER4_STATUS_OK 0
64186 +#define REISER4_STATUS_CORRUPTED 0x1
64187 +#define REISER4_STATUS_DAMAGED 0x2
64188 +#define REISER4_STATUS_DESTROYED 0x4
64189 +#define REISER4_STATUS_IOERROR 0x8
64190 +
64191 +/* Return values for reiser4_status_query() */
64192 +#define REISER4_STATUS_MOUNT_OK 0
64193 +#define REISER4_STATUS_MOUNT_WARN 1
64194 +#define REISER4_STATUS_MOUNT_RO 2
64195 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
64196 +
64197 +#define REISER4_TEXTERROR_LEN 256
64198 +
64199 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
64200 +/* We probably need to keep its size under sector size which is 512 bytes */
64201 +struct reiser4_status {
64202 +       char magic[16];
64203 +       d64 status;             /* Current FS state */
64204 +       d64 extended_status;    /* Any additional info that might make sense in addition to "status". E.g.
64205 +                                  last sector where io error happened if status is "io error encountered" */
64206 +       d64 stacktrace[10];     /* Last ten function calls made (addresses) */
64207 +       char texterror[REISER4_TEXTERROR_LEN];  /* Any error message if appropriate, otherwise filled with zeroes */
64208 +};
64209 +
64210 +int reiser4_status_init(reiser4_block_nr block);
64211 +int reiser4_status_query(u64 * status, u64 * extended);
64212 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
64213 +int reiser4_status_finish(void);
64214 +
64215 +#endif
64216 diff -urN linux-2.6.27.orig/fs/reiser4/super.c linux-2.6.27/fs/reiser4/super.c
64217 --- linux-2.6.27.orig/fs/reiser4/super.c        1970-01-01 03:00:00.000000000 +0300
64218 +++ linux-2.6.27/fs/reiser4/super.c     2008-10-12 18:20:01.000000000 +0400
64219 @@ -0,0 +1,316 @@
64220 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64221 + * reiser4/README */
64222 +
64223 +/* Super-block manipulations. */
64224 +
64225 +#include "debug.h"
64226 +#include "dformat.h"
64227 +#include "key.h"
64228 +#include "plugin/security/perm.h"
64229 +#include "plugin/space/space_allocator.h"
64230 +#include "plugin/plugin.h"
64231 +#include "tree.h"
64232 +#include "vfs_ops.h"
64233 +#include "super.h"
64234 +#include "reiser4.h"
64235 +
64236 +#include <linux/types.h>       /* for __u??  */
64237 +#include <linux/fs.h>          /* for struct super_block  */
64238 +
64239 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
64240 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
64241 +static __u64 reserved_for_root(const struct super_block *super);
64242 +
64243 +/* Return reiser4-specific part of super block */
64244 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super     /* super block
64245 +                                                                                        * queried */ )
64246 +{
64247 +       return (reiser4_super_info_data *) super->s_fs_info;
64248 +}
64249 +
64250 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
64251 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
64252 +{
64253 +       assert("nikita-448", super != NULL);
64254 +       assert("nikita-449", is_reiser4_super(super));
64255 +       return (long)REISER4_SUPER_MAGIC;
64256 +}
64257 +
64258 +/* functions to read/modify fields of reiser4_super_info_data */
64259 +
64260 +/* get number of blocks in file system */
64261 +__u64 reiser4_block_count(const struct super_block *super      /* super block
64262 +                                                                  queried */ )
64263 +{
64264 +       assert("vs-494", super != NULL);
64265 +       assert("vs-495", is_reiser4_super(super));
64266 +       return get_super_private(super)->block_count;
64267 +}
64268 +
64269 +#if REISER4_DEBUG
64270 +/*
64271 + * number of blocks in the current file system
64272 + */
64273 +__u64 reiser4_current_block_count(void)
64274 +{
64275 +       return get_current_super_private()->block_count;
64276 +}
64277 +#endif  /*  REISER4_DEBUG  */
64278 +
64279 +/* set number of blocks in filesystem */
64280 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
64281 +{
64282 +       assert("vs-501", super != NULL);
64283 +       assert("vs-502", is_reiser4_super(super));
64284 +       get_super_private(super)->block_count = nr;
64285 +       /*
64286 +        * For the proper calculation of the reserved space counter (5% of device
64287 +        * block counter) we need a 64 bit division which is missing in Linux
64288 +        * on i386 platform. Because we do not need a precise calculation here
64289 +        * we can replace a div64 operation by this combination of
64290 +        * multiplication and shift: 51. / (2^10) == .0498 .
64291 +        * FIXME: this is a bug. It comes up only for very small filesystems
64292 +        * which probably are never used. Nevertheless, it is a bug. Number of
64293 +        * reserved blocks must be not less than maximal number of blocks which
64294 +        * get grabbed with BA_RESERVED.
64295 +        */
64296 +       get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
64297 +}
64298 +
64299 +/* amount of blocks used (allocated for data) in file system */
64300 +__u64 reiser4_data_blocks(const struct super_block *super      /* super block
64301 +                                                                  queried */ )
64302 +{
64303 +       assert("nikita-452", super != NULL);
64304 +       assert("nikita-453", is_reiser4_super(super));
64305 +       return get_super_private(super)->blocks_used;
64306 +}
64307 +
64308 +/* set number of blocks used in filesystem */
64309 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
64310 +{
64311 +       assert("vs-503", super != NULL);
64312 +       assert("vs-504", is_reiser4_super(super));
64313 +       get_super_private(super)->blocks_used = nr;
64314 +}
64315 +
64316 +/* amount of free blocks in file system */
64317 +__u64 reiser4_free_blocks(const struct super_block *super      /* super block
64318 +                                                                  queried */ )
64319 +{
64320 +       assert("nikita-454", super != NULL);
64321 +       assert("nikita-455", is_reiser4_super(super));
64322 +       return get_super_private(super)->blocks_free;
64323 +}
64324 +
64325 +/* set number of blocks free in filesystem */
64326 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
64327 +{
64328 +       assert("vs-505", super != NULL);
64329 +       assert("vs-506", is_reiser4_super(super));
64330 +       get_super_private(super)->blocks_free = nr;
64331 +}
64332 +
64333 +/* get mkfs unique identifier */
64334 +__u32 reiser4_mkfs_id(const struct super_block *super  /* super block
64335 +                                                          queried */ )
64336 +{
64337 +       assert("vpf-221", super != NULL);
64338 +       assert("vpf-222", is_reiser4_super(super));
64339 +       return get_super_private(super)->mkfs_id;
64340 +}
64341 +
64342 +/* amount of committed free blocks in file system */
64343 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
64344 +{
64345 +       assert("vs-497", super != NULL);
64346 +       assert("vs-498", is_reiser4_super(super));
64347 +       return get_super_private(super)->blocks_free_committed;
64348 +}
64349 +
64350 +/* amount of blocks in the file system reserved for @uid and @gid */
64351 +long reiser4_reserved_blocks(const struct super_block *super   /* super block
64352 +                                                                  queried */ ,
64353 +                            uid_t uid /* user id */ ,
64354 +                            gid_t gid /* group id */ )
64355 +{
64356 +       long reserved;
64357 +
64358 +       assert("nikita-456", super != NULL);
64359 +       assert("nikita-457", is_reiser4_super(super));
64360 +
64361 +       reserved = 0;
64362 +       if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
64363 +               reserved += reserved_for_gid(super, gid);
64364 +       if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
64365 +               reserved += reserved_for_uid(super, uid);
64366 +       if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
64367 +               reserved += reserved_for_root(super);
64368 +       return reserved;
64369 +}
64370 +
64371 +/* get/set value of/to grabbed blocks counter */
64372 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
64373 +{
64374 +       assert("zam-512", super != NULL);
64375 +       assert("zam-513", is_reiser4_super(super));
64376 +
64377 +       return get_super_private(super)->blocks_grabbed;
64378 +}
64379 +
64380 +__u64 reiser4_flush_reserved(const struct super_block * super)
64381 +{
64382 +       assert("vpf-285", super != NULL);
64383 +       assert("vpf-286", is_reiser4_super(super));
64384 +
64385 +       return get_super_private(super)->blocks_flush_reserved;
64386 +}
64387 +
64388 +/* get/set value of/to counter of fake allocated formatted blocks */
64389 +__u64 reiser4_fake_allocated(const struct super_block * super)
64390 +{
64391 +       assert("zam-516", super != NULL);
64392 +       assert("zam-517", is_reiser4_super(super));
64393 +
64394 +       return get_super_private(super)->blocks_fake_allocated;
64395 +}
64396 +
64397 +/* get/set value of/to counter of fake allocated unformatted blocks */
64398 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
64399 +{
64400 +       assert("zam-516", super != NULL);
64401 +       assert("zam-517", is_reiser4_super(super));
64402 +
64403 +       return get_super_private(super)->blocks_fake_allocated_unformatted;
64404 +}
64405 +
64406 +/* get/set value of/to counter of clustered blocks */
64407 +__u64 reiser4_clustered_blocks(const struct super_block * super)
64408 +{
64409 +       assert("edward-601", super != NULL);
64410 +       assert("edward-602", is_reiser4_super(super));
64411 +
64412 +       return get_super_private(super)->blocks_clustered;
64413 +}
64414 +
64415 +/* space allocator used by this file system */
64416 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
64417 +                                                     *super)
64418 +{
64419 +       assert("nikita-1965", super != NULL);
64420 +       assert("nikita-1966", is_reiser4_super(super));
64421 +       return &get_super_private(super)->space_allocator;
64422 +}
64423 +
64424 +/* return fake inode used to bind formatted nodes in the page cache */
64425 +struct inode *reiser4_get_super_fake(const struct super_block *super   /* super block
64426 +                                                                  queried */ )
64427 +{
64428 +       assert("nikita-1757", super != NULL);
64429 +       return get_super_private(super)->fake;
64430 +}
64431 +
64432 +/* return ->cc: fake inode binding copied on capture nodes in the page cache */
64433 +struct inode *reiser4_get_cc_fake(const struct super_block *super      /* super block
64434 +                                                                  queried */ )
64435 +{
64436 +       assert("nikita-1757", super != NULL);
64437 +       return get_super_private(super)->cc;
64438 +}
64439 +
64440 +/* return fake inode used to bind bitmaps and journal heads */
64441 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
64442 +{
64443 +       assert("nikita-17571", super != NULL);
64444 +       return get_super_private(super)->bitmap;
64445 +}
64446 +
64447 +/* return the internal tree embedded in reiser4 private part of @super */
64448 +reiser4_tree *reiser4_get_tree(const struct super_block * super        /* super block
64449 +                                                        * queried */ )
64450 +{
64451 +       assert("nikita-460", super != NULL);
64452 +       assert("nikita-461", is_reiser4_super(super));
64453 +       return &get_super_private(super)->tree;
64454 +}
64455 +
64456 +/* Check that @super is (looks like) reiser4 super block, i.e. that its ->s_op
64457 +   points into its own private data. This is mainly for use in assertions. */
64458 +int is_reiser4_super(const struct super_block *super   /* super block
64459 +                                                        * queried */ )
64460 +{
64461 +       return
64462 +           super != NULL &&
64463 +           get_super_private(super) != NULL &&
64464 +           super->s_op == &(get_super_private(super)->ops.super);
64465 +}
64466 +
64467 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
64468 +{ /* test bit number @f in ->fs_flags of reiser4 private part of @super */
64469 +       return test_bit((int)f, &get_super_private(super)->fs_flags);
64470 +}
64471 +
64472 +/* amount of blocks reserved for given group in file system (always 0 now) */
64473 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG       /* super
64474 +                                                                                * block
64475 +                                                                                * queried */ ,
64476 +                             gid_t gid UNUSED_ARG /* group id */ )
64477 +{
64478 +       return 0;
64479 +}
64480 +
64481 +/* amount of blocks reserved for given user in file system (always 0 now) */
64482 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG       /* super
64483 +                                                                                  block
64484 +                                                                                  queried */ ,
64485 +                             uid_t uid UNUSED_ARG /* user id */ )
64486 +{
64487 +       return 0;
64488 +}
64489 +
64490 +/* amount of blocks reserved for super user in file system (always 0 now) */
64491 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG      /* super
64492 +                                                                                  block
64493 +                                                                                  queried */ )
64494 +{
64495 +       return 0;
64496 +}
64497 +
64498 +/*
64499 + * true if @blk is a fake block number or is less than ->block_count of @super.
64500 + */
64501 +int
64502 +reiser4_blocknr_is_sane_for(const struct super_block *super,
64503 +                           const reiser4_block_nr * blk)
64504 +{
64505 +       reiser4_super_info_data *sbinfo;
64506 +
64507 +       assert("nikita-2957", super != NULL);
64508 +       assert("nikita-2958", blk != NULL);
64509 +
64510 +       if (reiser4_blocknr_is_fake(blk))
64511 +               return 1;
64512 +
64513 +       sbinfo = get_super_private(super);
64514 +       return *blk < sbinfo->block_count;
64515 +}
64516 +
64517 +#if REISER4_DEBUG
64518 +/*
64519 + * reiser4_blocknr_is_sane_for() applied to @blk and the current super block
64520 + */
64521 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
64522 +{
64523 +       return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
64524 +}
64525 +#endif  /*  REISER4_DEBUG  */
64526 +
64527 +/* Make Linus happy.
64528 +   Local variables:
64529 +   c-indentation-style: "K&R"
64530 +   mode-name: "LC"
64531 +   c-basic-offset: 8
64532 +   tab-width: 8
64533 +   fill-column: 120
64534 +   End:
64535 +*/
64536 diff -urN linux-2.6.27.orig/fs/reiser4/super.h linux-2.6.27/fs/reiser4/super.h
64537 --- linux-2.6.27.orig/fs/reiser4/super.h        1970-01-01 03:00:00.000000000 +0300
64538 +++ linux-2.6.27/fs/reiser4/super.h     2008-10-12 18:20:01.000000000 +0400
64539 @@ -0,0 +1,466 @@
64540 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
64541 + * reiser4/README */
64542 +
64543 +/* Super-block functions. See super.c for details. */
64544 +
64545 +#if !defined( __REISER4_SUPER_H__ )
64546 +#define __REISER4_SUPER_H__
64547 +
64548 +#include <linux/exportfs.h>
64549 +
64550 +#include "tree.h"
64551 +#include "entd.h"
64552 +#include "wander.h"
64553 +#include "fsdata.h"
64554 +#include "plugin/object.h"
64555 +#include "plugin/space/space_allocator.h"
64556 +
64557 +/*
64558 + * Parameters of the flush algorithm (kept in reiser4_super_info_data->flush).
64559 + */
64560 +struct flush_params {
64561 +       unsigned relocate_threshold;
64562 +       unsigned relocate_distance;
64563 +       unsigned written_threshold;
64564 +       unsigned scan_maxnodes;
64565 +};
64566 +
64567 +typedef enum { /* bit numbers in ->fs_flags, tested by reiser4_is_set() */
64568 +       /*
64569 +        * True if this file system doesn't support hard-links (multiple names)
64570 +        * for directories: this is default UNIX behavior.
64571 +        *
64572 +        * If hard-links on directories are not allowed, file system is Acyclic
64573 +        * Directed Graph (modulo dot, and dotdot, of course).
64574 +        *
64575 +        * This is used by reiser4_link().
64576 +        */
64577 +       REISER4_ADG = 0,
64578 +       /*
64579 +        * set if all nodes in internal tree have the same node layout plugin.
64580 +        * If so, znode_guess_plugin() will return tree->node_plugin instead
64581 +        * of guessing plugin by plugin id stored in the node.
64582 +        */
64583 +       REISER4_ONE_NODE_PLUGIN = 1,
64584 +       /* if set, bsd gid assignment is supported. */
64585 +       REISER4_BSD_GID = 2,
64586 +       /* [mac]_time are 32 bit in inode */
64587 +       REISER4_32_BIT_TIMES = 3,
64588 +       /* if set, don't load all bitmap blocks at mount time */
64589 +       REISER4_DONT_LOAD_BITMAP = 5,
64590 +       /* enforce atomicity during write(2) */
64591 +       REISER4_ATOMIC_WRITE = 6,
64592 +       /* don't use write barriers in the log writer code. */
64593 +       REISER4_NO_WRITE_BARRIER = 7
64594 +} reiser4_fs_flag;
64595 +
64596 +/*
64597 + * VFS related operation vectors (kept in reiser4_super_info_data->ops).
64598 + */
64599 +struct object_ops {
64600 +       struct super_operations super;
64601 +       struct dentry_operations dentry;
64602 +       struct export_operations export;
64603 +};
64604 +
64605 +/* reiser4-specific part of super block
64606 +
64607 +   Locking
64608 +
64609 +   Fields immutable after mount:
64610 +
64611 +    ->oid*
64612 +    ->space*
64613 +    ->default_[ug]id
64614 +    ->mkfs_id
64615 +    ->trace_flags
64616 +    ->debug_flags
64617 +    ->fs_flags
64618 +    ->df_plug
64619 +    ->optimal_io_size
64620 +    ->plug
64621 +    ->flush
64622 +    ->u (bad name)
64623 +    ->txnmgr
64624 +    ->ra_params
64625 +    ->fsuid
64626 +    ->journal_header
64627 +    ->journal_footer
64628 +
64629 +   Fields protected by ->lnode_guard
64630 +
64631 +    ->lnode_htable
64632 +
64633 +   Fields protected by per-super block spin lock
64634 +
64635 +    ->block_count
64636 +    ->blocks_used
64637 +    ->blocks_free
64638 +    ->blocks_free_committed
64639 +    ->blocks_grabbed
64640 +    ->blocks_fake_allocated_unformatted
64641 +    ->blocks_fake_allocated
64642 +    ->blocks_flush_reserved
64643 +    ->eflushed
64644 +    ->blocknr_hint_default
64645 +
64646 +   After journal replaying during mount,
64647 +
64648 +    ->last_committed_tx
64649 +
64650 +   is protected by ->tmgr.commit_mutex
64651 +
64652 +   Invariants involving this data-type:
64653 +
64654 +      [sb-block-counts]
64655 +      [sb-grabbed]
64656 +      [sb-fake-allocated]
64657 +*/
64658 +struct reiser4_super_info_data {
64659 +       /*
64660 +        * guard spinlock which protects reiser4 super block fields (currently
64661 +        * blocks_free, blocks_free_committed)
64662 +        */
64663 +       spinlock_t guard;
64664 +
64665 +       /* next oid that will be returned by oid_allocate() */
64666 +       oid_t next_to_use;
64667 +       /* total number of used oids */
64668 +       oid_t oids_in_use;
64669 +
64670 +       /* space manager plugin */
64671 +       reiser4_space_allocator space_allocator;
64672 +
64673 +       /* reiser4 internal tree */
64674 +       reiser4_tree tree;
64675 +
64676 +       /*
64677 +        * default user id used for light-weight files without their own
64678 +        * stat-data.
64679 +        */
64680 +       uid_t default_uid;
64681 +
64682 +       /*
64683 +        * default group id used for light-weight files without their own
64684 +        * stat-data.
64685 +        */
64686 +       gid_t default_gid;
64687 +
64688 +       /* mkfs identifier generated at mkfs time. */
64689 +       __u32 mkfs_id;
64690 +       /* amount of blocks in a file system */
64691 +       __u64 block_count;
64692 +
64693 +       /* inviolable reserve */
64694 +       __u64 blocks_reserved;
64695 +
64696 +       /* amount of blocks used by file system data and meta-data. */
64697 +       __u64 blocks_used;
64698 +
64699 +       /*
64700 +        * amount of free blocks. This is "working" free blocks counter. It is
64701 +        * like "working" bitmap, please see block_alloc.c for description.
64702 +        */
64703 +       __u64 blocks_free;
64704 +
64705 +       /*
64706 +        * free block count for fs committed state. This is "commit" version of
64707 +        * free block counter.
64708 +        */
64709 +       __u64 blocks_free_committed;
64710 +
64711 +       /*
64712 +        * number of blocks reserved for further allocation, for all
64713 +        * threads.
64714 +        */
64715 +       __u64 blocks_grabbed;
64716 +
64717 +       /* number of fake allocated unformatted blocks in tree. */
64718 +       __u64 blocks_fake_allocated_unformatted;
64719 +
64720 +       /* number of fake allocated formatted blocks in tree. */
64721 +       __u64 blocks_fake_allocated;
64722 +
64723 +       /* number of blocks reserved for flush operations. */
64724 +       __u64 blocks_flush_reserved;
64725 +
64726 +       /* number of blocks reserved for cluster operations. */
64727 +       __u64 blocks_clustered;
64728 +
64729 +       /* unique file-system identifier */
64730 +       __u32 fsuid;
64731 +
64732 +       /* On-disk format version. If it is not equal to the disk_format
64733 +          plugin version, some format updates (e.g. enlarging plugin
64734 +          set, etc) may take place on mount. */
64735 +       int version;
64736 +
64737 +       /* file-system wide flags. See reiser4_fs_flag enum */
64738 +       unsigned long fs_flags;
64739 +
64740 +       /* transaction manager */
64741 +       txn_mgr tmgr;
64742 +
64743 +       /* ent thread */
64744 +       entd_context entd;
64745 +
64746 +       /* fake inode used to bind formatted nodes */
64747 +       struct inode *fake;
64748 +       /* inode used to bind bitmaps (and journal heads) */
64749 +       struct inode *bitmap;
64750 +       /* inode used to bind copied on capture nodes */
64751 +       struct inode *cc;
64752 +
64753 +       /* disk layout plugin */
64754 +       disk_format_plugin *df_plug;
64755 +
64756 +       /* disk layout specific part of reiser4 super info data */
64757 +       union {
64758 +               format40_super_info format40;
64759 +       } u;
64760 +
64761 +       /* value we return in st_blksize on stat(2) */
64762 +       unsigned long optimal_io_size;
64763 +
64764 +       /* parameters for the flush algorithm */
64765 +       struct flush_params flush;
64766 +
64767 +       /* pointers to jnodes for journal header and footer */
64768 +       jnode *journal_header;
64769 +       jnode *journal_footer;
64770 +
64771 +       journal_location jloc;
64772 +
64773 +       /* head block number of last committed transaction */
64774 +       __u64 last_committed_tx;
64775 +
64776 +       /*
64777 +        * we remember last written location for using as a hint for new block
64778 +        * allocation
64779 +        */
64780 +       __u64 blocknr_hint_default;
64781 +
64782 +       /* committed number of files (oid allocator state variable ) */
64783 +       __u64 nr_files_committed;
64784 +
64785 +       struct formatted_ra_params ra_params;
64786 +
64787 +       /*
64788 +        * A mutex for serializing cut tree operation if out-of-free-space:
64789 +        * only one cut_tree thread is allowed to grab space from reserved
64790 +        * area (it is 5% of disk space)
64791 +        */
64792 +       struct mutex delete_mutex;
64793 +       /* task owning ->delete_mutex */
64794 +       struct task_struct *delete_mutex_owner;
64795 +
64796 +       /* Diskmap's blocknumber */
64797 +       __u64 diskmap_block;
64798 +
64799 +       /* What to do in case of error */
64800 +       int onerror;
64801 +
64802 +       /* operations for objects on this file system */
64803 +       struct object_ops ops;
64804 +
64805 +       /*
64806 +        * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
64807 +        * more details
64808 +        */
64809 +       struct d_cursor_info d_info;
64810 +
64811 +#ifdef CONFIG_REISER4_BADBLOCKS
64812 +       /* Alternative master superblock offset (in bytes) */
64813 +       unsigned long altsuper;
64814 +#endif
64815 +       struct repacker *repacker;
64816 +       struct page *status_page;
64817 +       struct bio *status_bio;
64818 +
64819 +#if REISER4_DEBUG
64820 +       /*
64821 +        * minimum used blocks value (includes super blocks, bitmap blocks and
64822 +        * other fs reserved areas), depends on fs format and fs size.
64823 +        */
64824 +       __u64 min_blocks_used;
64825 +
64826 +       /*
64827 +        * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
64828 +        * are kept on a list anchored at sbinfo->all_jnodes. This list is
64829 +        * protected by sbinfo->all_guard spin lock. This lock should be taken
64830 +        * with _irq modifier, because it is also modified from interrupt
64831 +        * contexts (by RCU).
64832 +        */
64833 +       spinlock_t all_guard;
64834 +       /* list of all jnodes */
64835 +       struct list_head all_jnodes;
64836 +#endif
64837 +       struct dentry *debugfs_root;
64838 +};
64839 +
64840 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
64841 +                                                         super_block *super);
64842 +
64843 +/* Return reiser4-specific part of super block (stored in ->s_fs_info) */
64844 +static inline reiser4_super_info_data *get_super_private(const struct
64845 +                                                        super_block *super)
64846 +{
64847 +       assert("nikita-447", super != NULL);
64848 +
64849 +       return (reiser4_super_info_data *) super->s_fs_info;
64850 +}
64851 +
64852 +/* get ent thread context (->entd of reiser4 private part) for the @super */
64853 +static inline entd_context *get_entd_context(struct super_block *super)
64854 +{
64855 +       return &get_super_private(super)->entd;
64856 +}
64857 +
64858 +/* "Current" super-block: main super block used during current system
64859 +   call. It is taken from ->super of the current reiser4_context. */
64860 +static inline struct super_block *reiser4_get_current_sb(void)
64861 +{
64862 +       return get_current_context()->super;
64863 +}
64864 +
64865 +/* Reiser4-specific part of "current" super-block: main super block used
64866 +   during current system call. This is get_super_private() applied to
64867 +   reiser4_get_current_sb(). */
64868 +static inline reiser4_super_info_data *get_current_super_private(void)
64869 +{
64870 +       return get_super_private(reiser4_get_current_sb());
64871 +}
64872 +
64873 +static inline struct formatted_ra_params *get_current_super_ra_params(void)
64874 +{ /* ->ra_params of reiser4 private part of "current" super block */
64875 +       return &(get_current_super_private()->ra_params);
64876 +}
64877 +
64878 +/*
64879 + * non-zero, if file system on @super is read-only (MS_RDONLY in ->s_flags)
64880 + */
64881 +static inline int rofs_super(struct super_block *super)
64882 +{
64883 +       return super->s_flags & MS_RDONLY;
64884 +}
64885 +
64886 +/*
64887 + * true, if @tree represents read-only file system (checks tree->super)
64888 + */
64889 +static inline int rofs_tree(reiser4_tree * tree)
64890 +{
64891 +       return rofs_super(tree->super);
64892 +}
64893 +
64894 +/*
64895 + * true, if file system @inode lives on (inode->i_sb) is read-only
64896 + */
64897 +static inline int rofs_inode(struct inode *inode)
64898 +{
64899 +       return rofs_super(inode->i_sb);
64900 +}
64901 +
64902 +/*
64903 + * true, if file system of the tree @node belongs to is read-only
64904 + */
64905 +static inline int rofs_jnode(jnode * node)
64906 +{
64907 +       return rofs_tree(jnode_get_tree(node));
64908 +}
64909 +
64910 +extern __u64 reiser4_current_block_count(void);
64911 +
64912 +extern void build_object_ops(struct super_block *super, struct object_ops * ops);
64913 +
64914 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
64915 +
64916 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
64917 +{ /* take ->guard spin lock of @sbinfo */
64918 +       spin_lock(&(sbinfo->guard));
64919 +}
64920 +
64921 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
64922 +{ /* release ->guard spin lock of @sbinfo, asserting that it is held */
64923 +       assert_spin_locked(&(sbinfo->guard));
64924 +       spin_unlock(&(sbinfo->guard));
64925 +}
64926 +
64927 +extern __u64 reiser4_flush_reserved(const struct super_block *);
64928 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
64929 +extern long reiser4_statfs_type(const struct super_block *super);
64930 +extern __u64 reiser4_block_count(const struct super_block *super);
64931 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
64932 +extern __u64 reiser4_data_blocks(const struct super_block *super);
64933 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
64934 +extern __u64 reiser4_free_blocks(const struct super_block *super);
64935 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
64936 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
64937 +
64938 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
64939 +
64940 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
64941 +extern __u64 reiser4_fake_allocated(const struct super_block *);
64942 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
64943 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
64944 +
64945 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
64946 +                                   gid_t gid);
64947 +
64948 +extern reiser4_space_allocator *
64949 +reiser4_get_space_allocator(const struct super_block *super);
64950 +extern reiser4_oid_allocator *
64951 +reiser4_get_oid_allocator(const struct super_block *super);
64952 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
64953 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
64954 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
64955 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
64956 +extern int is_reiser4_super(const struct super_block *super);
64957 +
64958 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
64959 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
64960 +                                      const reiser4_block_nr * blk);
64961 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
64962 +extern int reiser4_done_super(struct super_block *s);
64963 +
64964 +/* steps of fill super */
64965 +extern int reiser4_init_fs_info(struct super_block *);
64966 +extern void reiser4_done_fs_info(struct super_block *);
64967 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
64968 +extern int reiser4_init_read_super(struct super_block *, int silent);
64969 +extern int reiser4_init_root_inode(struct super_block *);
64970 +extern reiser4_plugin *get_default_plugin(pset_member memb);
64971 +
64972 +/* Maximal possible object id. */
64973 +#define  ABSOLUTE_MAX_OID ((oid_t)~0)
64974 +
64975 +#define OIDS_RESERVED  ( 1 << 16 )
64976 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
64977 +oid_t oid_allocate(struct super_block *);
64978 +int oid_release(struct super_block *, oid_t);
64979 +oid_t oid_next(const struct super_block *);
64980 +void oid_count_allocated(void);
64981 +void oid_count_released(void);
64982 +long oids_used(const struct super_block *);
64983 +
64984 +#if REISER4_DEBUG
64985 +void print_fs_info(const char *prefix, const struct super_block *);
64986 +#endif
64987 +
64988 +extern void destroy_reiser4_cache(struct kmem_cache **);
64989 +
64990 +extern struct super_operations reiser4_super_operations;
64991 +extern struct export_operations reiser4_export_operations;
64992 +extern struct dentry_operations reiser4_dentry_operations;
64993 +
64994 +/* __REISER4_SUPER_H__ */
64995 +#endif
64996 +
64997 +/*
64998 + * Local variables:
64999 + * c-indentation-style: "K&R"
65000 + * mode-name: "LC"
65001 + * c-basic-offset: 8
65002 + * tab-width: 8
65003 + * fill-column: 120
65004 + * End:
65005 + */
65006 diff -urN linux-2.6.27.orig/fs/reiser4/super_ops.c linux-2.6.27/fs/reiser4/super_ops.c
65007 --- linux-2.6.27.orig/fs/reiser4/super_ops.c    1970-01-01 03:00:00.000000000 +0300
65008 +++ linux-2.6.27/fs/reiser4/super_ops.c 2008-10-12 18:20:01.000000000 +0400
65009 @@ -0,0 +1,724 @@
65010 +/* Copyright 2005 by Hans Reiser, licensing governed by
65011 + * reiser4/README */
65012 +
65013 +#include "inode.h"
65014 +#include "page_cache.h"
65015 +#include "ktxnmgrd.h"
65016 +#include "flush.h"
65017 +#include "safe_link.h"
65018 +
65019 +#include <linux/vfs.h>
65020 +#include <linux/writeback.h>
65021 +#include <linux/mount.h>
65022 +#include <linux/seq_file.h>
65023 +#include <linux/debugfs.h>
65024 +
65025 +/* slab cache for inodes */
65026 +static struct kmem_cache *inode_cache;
65027 +
65028 +static struct dentry *reiser4_debugfs_root = NULL;
65029 +
65030 +/**
65031 + * init_once - constructor for reiser4 inodes
65032 + * @obj: object of the inode cache (struct reiser4_inode_object) to be
65033 + *       initialized
65034 + *
65035 + * Initialization function to be called when new page is allocated by reiser4
65036 + * inode cache. It is set on inode cache creation.
65037 + */
65038 +static void init_once(void *obj)
65039 +{
65040 +       struct reiser4_inode_object *info;
65041 +
65042 +       info = obj;
65043 +
65044 +       /* initialize vfs inode */
65045 +       inode_init_once(&info->vfs_inode);
65046 +
65047 +       /*
65048 +        * initialize reiser4 specific part of inode.
65049 +        * NOTE-NIKITA add here initializations for locks, list heads,
65050 +        * etc. that will be added to our private inode part.
65051 +        */
65052 +       INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
65053 +       init_rwsem(&info->p.conv_sem);
65054 +       /* init semaphore which is used during inode loading */
65055 +       loading_init_once(&info->p);
65056 +       INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
65057 +                       GFP_ATOMIC);
65058 +#if REISER4_DEBUG
65059 +       info->p.nr_jnodes = 0;
65060 +#endif
65061 +}
65062 +
65063 +/**
65064 + * init_inodes - create inode cache
65065 + *
65066 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
65067 + */
65068 +static int init_inodes(void)
65069 +{
65070 +       inode_cache = kmem_cache_create("reiser4_inode",
65071 +                                       sizeof(struct reiser4_inode_object),
65072 +                                       0,
65073 +                                       SLAB_HWCACHE_ALIGN |
65074 +                                       SLAB_RECLAIM_ACCOUNT, init_once);
65075 +       if (inode_cache == NULL)
65076 +               return RETERR(-ENOMEM);
65077 +       return 0;
65078 +}
65079 +
65080 +/**
65081 + * done_inodes - delete inode cache
65082 + *
65083 + * Counterpart of init_inodes(): called on reiser4 module unloading or shutdown.
65084 + */
65085 +static void done_inodes(void)
65086 +{
65087 +       destroy_reiser4_cache(&inode_cache);
65088 +}
65089 +
65090 +/**
65091 + * reiser4_alloc_inode - alloc_inode of super operations
65092 + * @super: super block new inode is allocated for
65093 + *
65094 + * Allocates inode, initializes reiser4 part of it. Returns NULL on failure.
65095 + */
65096 +static struct inode *reiser4_alloc_inode(struct super_block *super)
65097 +{
65098 +       struct reiser4_inode_object *obj;
65099 +
65100 +       assert("nikita-1696", super != NULL);
65101 +       obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
65102 +       if (obj != NULL) {
65103 +               reiser4_inode *info;
65104 +
65105 +               info = &obj->p;
65106 +
65107 +               info->pset = plugin_set_get_empty();
65108 +               info->hset = plugin_set_get_empty();
65109 +               info->extmask = 0;
65110 +               info->locality_id = 0ull;
65111 +               info->plugin_mask = 0;
65112 +               info->heir_mask = 0;
65113 +#if !REISER4_INO_IS_OID
65114 +               info->oid_hi = 0;
65115 +#endif
65116 +               reiser4_seal_init(&info->sd_seal, NULL, NULL);
65117 +               coord_init_invalid(&info->sd_coord, NULL);
65118 +               info->flags = 0;
65119 +               spin_lock_init(&info->guard);
65120 +               /* this deals with info's loading semaphore */
65121 +               loading_alloc(info);
65122 +               info->vroot = UBER_TREE_ADDR;
65123 +               return &obj->vfs_inode;
65124 +       } else
65125 +               return NULL;
65126 +}
65127 +
65128 +/**
65129 + * reiser4_destroy_inode - destroy_inode of super operations
65130 + * @inode: inode being destroyed
65131 + *
65132 + * Calls file plugin's destroy_inode, puts plugin sets and frees inode memory.
65133 + */
65134 +static void reiser4_destroy_inode(struct inode *inode)
65135 +{
65136 +       reiser4_inode *info;
65137 +
65138 +       info = reiser4_inode_data(inode);
65139 +
65140 +       assert("vs-1220", inode_has_no_jnodes(info));
65141 +
65142 +       if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
65143 +               file_plugin *fplug = inode_file_plugin(inode);
65144 +               if (fplug->destroy_inode != NULL)
65145 +                       fplug->destroy_inode(inode);
65146 +       }
65147 +       reiser4_dispose_cursors(inode);
65148 +       if (info->pset)
65149 +               plugin_set_put(info->pset);
65150 +       if (info->hset)
65151 +               plugin_set_put(info->hset);
65152 +
65153 +       /*
65154 +        * cannot add similar assertion about ->i_list as prune_icache return
65155 +        * inode into slab with dangling ->list.{next,prev}. This is safe,
65156 +        * because they are re-initialized in the new_inode().
65157 +        */
65158 +       assert("nikita-2895", list_empty(&inode->i_dentry));
65159 +       assert("nikita-2896", hlist_unhashed(&inode->i_hash));
65160 +       assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
65161 +
65162 +       /* this deals with info's loading semaphore */
65163 +       loading_destroy(info);
65164 +
65165 +       kmem_cache_free(inode_cache,
65166 +                       container_of(info, struct reiser4_inode_object, p));
65167 +}
65168 +
65169 +/**
65170 + * reiser4_dirty_inode - dirty_inode of super operations
65171 + * @inode: inode being dirtied
65172 + *
65173 + * Updates stat data. Does nothing when called outside of reiser4 context.
65174 + */
65175 +static void reiser4_dirty_inode(struct inode *inode)
65176 +{
65177 +       int result;
65178 +
65179 +       if (!is_in_reiser4_context())
65180 +               return;
65181 +       assert("", !IS_RDONLY(inode));
65182 +       assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
65183 +                   get_current_context()->grabbed_blocks));
65184 +
65185 +       result = reiser4_update_sd(inode);
65186 +       if (result)
65187 +               warning("", "failed to dirty inode for %llu: %d",
65188 +                       get_inode_oid(inode), result);
65189 +}
65190 +
65191 +/**
65192 + * reiser4_delete_inode - delete_inode of super operations
65193 + * @inode: inode to delete
65194 + *
65195 + * If inode is loaded, calls file plugin's delete_object method; then truncates
65196 + * page cache and calls clear_inode. Nothing is done if context init fails.
65197 + */
65198 +static void reiser4_delete_inode(struct inode *inode)
65199 +{
65200 +       reiser4_context *ctx;
65201 +       file_plugin *fplug;
65202 +
65203 +       ctx = reiser4_init_context(inode->i_sb);
65204 +       if (IS_ERR(ctx)) {
65205 +               warning("vs-15", "failed to init context");
65206 +               return;
65207 +       }
65208 +
65209 +       if (is_inode_loaded(inode)) {
65210 +               fplug = inode_file_plugin(inode);
65211 +               if (fplug != NULL && fplug->delete_object != NULL)
65212 +                       fplug->delete_object(inode);
65213 +       }
65214 +
65215 +       truncate_inode_pages(&inode->i_data, 0);
65216 +       inode->i_blocks = 0;
65217 +       clear_inode(inode);
65218 +       reiser4_exit_context(ctx);
65219 +}
65220 +
65221 +/**
65222 + * reiser4_put_super - put_super of super operations
65223 + * @super: super block to free
65224 + *
65225 + * Stops daemons, releases resources, umounts in short.
65226 + */
65227 +static void reiser4_put_super(struct super_block *super)
65228 +{
65229 +       reiser4_super_info_data *sbinfo;
65230 +       reiser4_context *ctx;
65231 +
65232 +       sbinfo = get_super_private(super);
65233 +       assert("vs-1699", sbinfo);
65234 +
65235 +       debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
65236 +       debugfs_remove(sbinfo->tmgr.debugfs_id_count);
65237 +       debugfs_remove(sbinfo->debugfs_root);
65238 +
65239 +       ctx = reiser4_init_context(super);
65240 +       if (IS_ERR(ctx)) {
65241 +               warning("vs-17", "failed to init context");
65242 +               return;
65243 +       }
65244 +
65245 +       /* have disk format plugin to free its resources */
65246 +       if (get_super_private(super)->df_plug->release)
65247 +               get_super_private(super)->df_plug->release(super);
65248 +
65249 +       reiser4_done_formatted_fake(super);
65250 +
65251 +       /* stop daemons: ktxnmgr and entd */
65252 +       reiser4_done_entd(super);
65253 +       reiser4_done_ktxnmgrd(super);
65254 +       reiser4_done_txnmgr(&sbinfo->tmgr);
65255 +
65256 +       reiser4_done_fs_info(super);
65257 +       reiser4_exit_context(ctx);
65258 +}
65259 +
65260 +/**
65261 + * reiser4_write_super - write_super of super operations
65262 + * @super: super block to write
65263 + *
65264 + * Captures znode associated with super block, commits all transactions.
65265 + */
65266 +static void reiser4_write_super(struct super_block *super)
65267 +{
65268 +       int ret;
65269 +       reiser4_context *ctx;
65270 +
65271 +       assert("vs-1700", !rofs_super(super));
65272 +
65273 +       ctx = reiser4_init_context(super);
65274 +       if (IS_ERR(ctx)) {
65275 +               warning("vs-16", "failed to init context");
65276 +               return;
65277 +       }
65278 +
65279 +       ret = reiser4_capture_super_block(super);
65280 +       if (ret != 0)
65281 +               warning("vs-1701",
65282 +                       "reiser4_capture_super_block failed in write_super: %d",
65283 +                       ret);
65284 +       ret = txnmgr_force_commit_all(super, 0);
65285 +       if (ret != 0)
65286 +               warning("jmacd-77113",
65287 +                       "txn_force failed in write_super: %d", ret);
65288 +
65289 +       super->s_dirt = 0;
65290 +
65291 +       reiser4_exit_context(ctx);
65292 +}
65293 +
65294 +/**
65295 + * reiser4_statfs - statfs of super operations
65296 + * @dentry: dentry whose super block (->d_sb) is queried
65297 + * @statfs: buffer to fill with statistics
65298 + *
65299 + * Fills @statfs; returns 0, or PTR_ERR() of the failed context init.
65300 + */
65301 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
65302 +{
65303 +       sector_t total;
65304 +       sector_t reserved;
65305 +       sector_t free;
65306 +       sector_t forroot;
65307 +       sector_t deleted;
65308 +       reiser4_context *ctx;
65309 +       struct super_block *super = dentry->d_sb;
65310 +
65311 +       assert("nikita-408", super != NULL);
65312 +       assert("nikita-409", statfs != NULL);
65313 +
65314 +       ctx = reiser4_init_context(super);
65315 +       if (IS_ERR(ctx))
65316 +               return PTR_ERR(ctx);
65317 +
65318 +       statfs->f_type = reiser4_statfs_type(super);
65319 +       statfs->f_bsize = super->s_blocksize;
65320 +
65321 +       /*
65322 +        * 5% of total block space is reserved. This is needed for flush and
65323 +        * for truncates (so that we are able to perform truncate/unlink even
65324 +        * on the otherwise completely full file system). If this reservation
65325 +        * is hidden from statfs(2), users will mistakenly guess that they
65326 +        * have enough free space to complete some operation, which is
65327 +        * frustrating.
65328 +        *
65329 +        * Another possible solution is to subtract ->blocks_reserved from
65330 +        * ->f_bfree, but changing available space seems less intrusive than
65331 +        * letting user to see 5% of disk space to be used directly after
65332 +        * mkfs.
65333 +        */
65334 +       total = reiser4_block_count(super);
65335 +       reserved = get_super_private(super)->blocks_reserved;
65336 +       deleted = txnmgr_count_deleted_blocks();
65337 +       free = reiser4_free_blocks(super) + deleted;
65338 +       forroot = reiser4_reserved_blocks(super, 0, 0);
65339 +
65340 +       /*
65341 +        * These counters may be in inconsistent state because we take the
65342 +        * values without keeping any global spinlock.  Here we do a sanity
65343 +        * check that free block counter does not exceed the number of all
65344 +        * blocks.
65345 +        */
65346 +       if (free > total)
65347 +               free = total;
65348 +       statfs->f_blocks = total - reserved;
65349 +       /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
65350 +       if (free > reserved)
65351 +               free -= reserved;
65352 +       else
65353 +               free = 0;
65354 +       statfs->f_bfree = free;
65355 +
65356 +       if (free > forroot)
65357 +               free -= forroot;
65358 +       else
65359 +               free = 0;
65360 +       statfs->f_bavail = free;
65361 +
65362 +       statfs->f_files = 0;
65363 +       statfs->f_ffree = 0;
65364 +
65365 +       /* maximal acceptable name length depends on directory plugin. */
65366 +       assert("nikita-3351", super->s_root->d_inode != NULL);
65367 +       statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
65368 +       reiser4_exit_context(ctx);
65369 +       return 0;
65370 +}
65371 +
65372 +/**
65373 + * reiser4_clear_inode - clear_inode of super operation
65374 + * @inode: inode about to destroy
65375 + *
65376 + * Debug-only sanity check: inode being destroyed should have no jnodes left.
65377 + */
65378 +static void reiser4_clear_inode(struct inode *inode)
65379 +{
65380 +#if REISER4_DEBUG
65381 +       reiser4_inode *r4_inode;
65382 +
65383 +       r4_inode = reiser4_inode_data(inode);
65384 +       if (!inode_has_no_jnodes(r4_inode))
65385 +               warning("vs-1732", "reiser4 inode has %ld jnodes\n",
65386 +                       r4_inode->nr_jnodes);
65387 +#endif
65388 +}
65389 +
65390 +/**
65391 + * reiser4_sync_inodes - sync_inodes of super operations
65392 + * @super: super block whose dirty inodes are to be written
65393 + * @wbc: writeback control; ->nr_to_write is restored before reiser4_writeout()
65394 + *
65395 + * This method is called by background and non-background writeback. Reiser4's
65396 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
65397 + * each of dirty inodes. Reiser4_writepages handles pages dirtied via shared
65398 + * mapping - dirty pages get into atoms. Writeout is called to flush some
65399 + * atoms.
65400 + */
65401 +static void reiser4_sync_inodes(struct super_block *super,
65402 +                               struct writeback_control *wbc)
65403 +{
65404 +       reiser4_context *ctx;
65405 +       long to_write;
65406 +
65407 +       if (wbc->for_kupdate)
65408 +               /* reiser4 has its own means of periodical write-out */
65409 +               return;
65410 +
65411 +       to_write = wbc->nr_to_write;
65412 +       assert("vs-49", wbc->older_than_this == NULL);
65413 +
65414 +       ctx = reiser4_init_context(super);
65415 +       if (IS_ERR(ctx)) {
65416 +               warning("vs-13", "failed to init context");
65417 +               return;
65418 +       }
65419 +
65420 +       /*
65421 +        * call reiser4_writepages for each of dirty inodes to turn dirty pages
65422 +        * into transactions if they were not yet.
65423 +        */
65424 +       generic_sync_sb_inodes(super, wbc);
65425 +
65426 +       /* flush goes here */
65427 +       wbc->nr_to_write = to_write;
65428 +       reiser4_writeout(super, wbc);
65429 +
65430 +       /* avoid recursive calls to ->sync_inodes */
65431 +       context_set_commit_async(ctx);
65432 +       reiser4_exit_context(ctx);
65433 +}
65434 +
65435 +/**
65436 + * reiser4_show_options - show_options of super operations
65437 + * @m: file where to write information
65438 + * @mnt: mount structure
65439 + *
65440 + * Makes reiser4 mount options visible in /proc/mounts. Always returns 0.
65441 + */
65442 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
65443 +{
65444 +       struct super_block *super;
65445 +       reiser4_super_info_data *sbinfo;
65446 +
65447 +       super = mnt->mnt_sb;
65448 +       sbinfo = get_super_private(super);
65449 +
65450 +       seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
65451 +       seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
65452 +       seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
65453 +       seq_printf(m, ",atom_max_flushers=0x%x",
65454 +                  sbinfo->tmgr.atom_max_flushers);
65455 +       seq_printf(m, ",cbk_cache_slots=0x%x",
65456 +                  sbinfo->tree.cbk_cache.nr_slots);
65457 +
65458 +       return 0;
65459 +}
65460 +
65461 +struct super_operations reiser4_super_operations = { /* reiser4 sb methods */
65462 +       .alloc_inode = reiser4_alloc_inode,
65463 +       .destroy_inode = reiser4_destroy_inode,
65464 +       .dirty_inode = reiser4_dirty_inode,
65465 +       .delete_inode = reiser4_delete_inode,
65466 +       .put_super = reiser4_put_super,
65467 +       .write_super = reiser4_write_super,
65468 +       .statfs = reiser4_statfs,
65469 +       .clear_inode = reiser4_clear_inode,
65470 +       .sync_inodes = reiser4_sync_inodes,
65471 +       .show_options = reiser4_show_options
65472 +};
65473 +
65474 +/**
65475 + * fill_super - initialize super block on mount
65476 + * @super: super block to fill
65477 + * @data: reiser4 specific mount option
65478 + * @silent: passed through to reiser4_init_read_super()
65479 + *
65480 + * Callback given to get_sb_bdev() by reiser4_get_sb(). Mounts filesystem.
65481 + */
65482 +static int fill_super(struct super_block *super, void *data, int silent)
65483 +{
65484 +       reiser4_context ctx;
65485 +       int result;
65486 +       reiser4_super_info_data *sbinfo;
65487 +
65488 +       assert("zam-989", super != NULL);
65489 +
65490 +       super->s_op = NULL;
65491 +       init_stack_context(&ctx, super);
65492 +
65493 +       /* allocate reiser4 specific super block */
65494 +       if ((result = reiser4_init_fs_info(super)) != 0)
65495 +               goto failed_init_sinfo;
65496 +
65497 +       sbinfo = get_super_private(super);
65498 +       /* initialize various reiser4 parameters, parse mount options */
65499 +       if ((result = reiser4_init_super_data(super, data)) != 0)
65500 +               goto failed_init_super_data;
65501 +
65502 +       /* read reiser4 master super block, initialize disk format plugin */
65503 +       if ((result = reiser4_init_read_super(super, silent)) != 0)
65504 +               goto failed_init_read_super;
65505 +
65506 +       /* initialize transaction manager */
65507 +       reiser4_init_txnmgr(&sbinfo->tmgr);
65508 +
65509 +       /* initialize ktxnmgrd context and start kernel thread ktxnmgrd */
65510 +       if ((result = reiser4_init_ktxnmgrd(super)) != 0)
65511 +               goto failed_init_ktxnmgrd;
65512 +
65513 +       /* initialize entd context and start kernel thread entd */
65514 +       if ((result = reiser4_init_entd(super)) != 0)
65515 +               goto failed_init_entd;
65516 +
65517 +       /* initialize address spaces for formatted nodes and bitmaps */
65518 +       if ((result = reiser4_init_formatted_fake(super)) != 0)
65519 +               goto failed_init_formatted_fake;
65520 +
65521 +       /* initialize disk format plugin */
65522 +       if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
65523 +               goto failed_init_disk_format;
65524 +
65525 +       /*
65526 +        * There are some 'committed' versions of reiser4 super block counters,
65527 +        * which correspond to reiser4 on-disk state. These counters are
65528 +        * initialized here
65529 +        */
65530 +       sbinfo->blocks_free_committed = sbinfo->blocks_free;
65531 +       sbinfo->nr_files_committed = oids_used(super);
65532 +
65533 +       /* get inode of root directory */
65534 +       if ((result = reiser4_init_root_inode(super)) != 0)
65535 +               goto failed_init_root_inode;
65536 +
65537 +       if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
65538 +               goto failed_update_format_version;
65539 +
65540 +       process_safelinks(super);
65541 +       reiser4_exit_context(&ctx);
65542 +
65543 +       sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
65544 +                                                 reiser4_debugfs_root);
65545 +       if (sbinfo->debugfs_root) {
65546 +               sbinfo->tmgr.debugfs_atom_count =
65547 +                       debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
65548 +                                          sbinfo->debugfs_root,
65549 +                                          &sbinfo->tmgr.atom_count);
65550 +               sbinfo->tmgr.debugfs_id_count =
65551 +                       debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
65552 +                                          sbinfo->debugfs_root,
65553 +                                          &sbinfo->tmgr.id_count);
65554 +       }
65555 +       return 0;
65556 +
65557 + failed_update_format_version:
65558 + failed_init_root_inode:
65559 +       if (sbinfo->df_plug->release)
65560 +               sbinfo->df_plug->release(super);
65561 + failed_init_disk_format:
65562 +       reiser4_done_formatted_fake(super);
65563 + failed_init_formatted_fake:
65564 +       reiser4_done_entd(super);
65565 + failed_init_entd:
65566 +       reiser4_done_ktxnmgrd(super);
65567 + failed_init_ktxnmgrd:
65568 +       reiser4_done_txnmgr(&sbinfo->tmgr);
65569 + failed_init_read_super:
65570 + failed_init_super_data:
65571 +       reiser4_done_fs_info(super);
65572 + failed_init_sinfo:
65573 +       reiser4_exit_context(&ctx);
65574 +       return result;
65575 +}
65576 +
65577 +/**
65578 + * reiser4_get_sb - get_sb of file_system_type operations
65579 + * @fs_type: file system type, passed through to get_sb_bdev()
65580 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
65581 + * @dev_name: block device file name
65582 + * @data: specific mount options
65583 + *
65584 + * Reiser4 mount entry: calls get_sb_bdev() with fill_super() as callback.
65585 + */
65586 +static int reiser4_get_sb(struct file_system_type *fs_type, int flags,
65587 +                       const char *dev_name, void *data, struct vfsmount *mnt)
65588 +{
65589 +       return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
65590 +}
65591 +
65592 +/* reiser4 file system type: needs a block device (FS_REQUIRES_DEV) */
65593 +static struct file_system_type reiser4_fs_type = {
65594 +       .owner = THIS_MODULE,
65595 +       .name = "reiser4",
65596 +       .fs_flags = FS_REQUIRES_DEV,
65597 +       .get_sb = reiser4_get_sb,
65598 +       .kill_sb = kill_block_super,
65599 +       .next = NULL
65600 +};
65601 +
65602 +void destroy_reiser4_cache(struct kmem_cache **cachep)
65603 +{
65604 +       BUG_ON(*cachep == NULL); /* destroying a cache that is not set is a bug */
65605 +       kmem_cache_destroy(*cachep);
65606 +       *cachep = NULL; /* so that a repeated destroy trips the BUG_ON above */
65607 +}
65608 +
65609 +/**
65610 + * init_reiser4 - reiser4 initialization entry point
65611 + *
65612 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
65613 + * at kernel init or module load. On error unwinds in reverse order.
65614 + */
65615 +static int __init init_reiser4(void)
65616 +{
65617 +       int result;
65618 +
65619 +       printk(KERN_INFO
65620 +              "Loading Reiser4. "
65621 +              "See www.namesys.com for a description of Reiser4.\n");
65622 +
65623 +       /* initialize slab cache of inodes */
65624 +       if ((result = init_inodes()) != 0)
65625 +               goto failed_inode_cache;
65626 +
65627 +       /* initialize cache of znodes */
65628 +       if ((result = init_znodes()) != 0)
65629 +               goto failed_init_znodes;
65630 +
65631 +       /* initialize all plugins */
65632 +       if ((result = init_plugins()) != 0)
65633 +               goto failed_init_plugins;
65634 +
65635 +       /* initialize cache of plugin_set-s and plugin_set's hash table */
65636 +       if ((result = init_plugin_set()) != 0)
65637 +               goto failed_init_plugin_set;
65638 +
65639 +       /* initialize caches of txn_atom-s and txn_handle-s */
65640 +       if ((result = init_txnmgr_static()) != 0)
65641 +               goto failed_init_txnmgr_static;
65642 +
65643 +       /* initialize cache of jnodes */
65644 +       if ((result = init_jnodes()) != 0)
65645 +               goto failed_init_jnodes;
65646 +
65647 +       /* initialize cache of flush queues */
65648 +       if ((result = reiser4_init_fqs()) != 0)
65649 +               goto failed_init_fqs;
65650 +
65651 +       /* initialize cache of structures attached to dentry->d_fsdata */
65652 +       if ((result = reiser4_init_dentry_fsdata()) != 0)
65653 +               goto failed_init_dentry_fsdata;
65654 +
65655 +       /* initialize cache of structures attached to file->private_data */
65656 +       if ((result = reiser4_init_file_fsdata()) != 0)
65657 +               goto failed_init_file_fsdata;
65658 +
65659 +       /*
65660 +        * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
65661 +        * more details
65662 +        */
65663 +       if ((result = reiser4_init_d_cursor()) != 0)
65664 +               goto failed_init_d_cursor;
65665 +
65666 +       if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
65667 +               reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
65668 +               return 0;
65669 +       }
65670 +
65671 +       reiser4_done_d_cursor();
65672 + failed_init_d_cursor:
65673 +       reiser4_done_file_fsdata();
65674 + failed_init_file_fsdata:
65675 +       reiser4_done_dentry_fsdata();
65676 + failed_init_dentry_fsdata:
65677 +       reiser4_done_fqs();
65678 + failed_init_fqs:
65679 +       done_jnodes();
65680 + failed_init_jnodes:
65681 +       done_txnmgr_static();
65682 + failed_init_txnmgr_static:
65683 +       done_plugin_set();
65684 + failed_init_plugin_set:
65685 + failed_init_plugins: /* nothing is called here to undo init_plugins() */
65686 +       done_znodes();
65687 + failed_init_znodes:
65688 +       done_inodes();
65689 + failed_inode_cache:
65690 +       return result;
65691 +}
65692 +
65693 +/**
65694 + * done_reiser4 - reiser4 exit entry point
65695 + *
65696 + * Unregister reiser4 filesystem type, deletes caches. It is called on shutdown
65697 + * or at module unload. BUG()s if unregister_filesystem() fails.
65698 + */
65699 +static void __exit done_reiser4(void)
65700 +{
65701 +       int result;
65702 +
65703 +       debugfs_remove(reiser4_debugfs_root);
65704 +       result = unregister_filesystem(&reiser4_fs_type);
65705 +       BUG_ON(result != 0);
65706 +       reiser4_done_d_cursor();
65707 +       reiser4_done_file_fsdata();
65708 +       reiser4_done_dentry_fsdata();
65709 +       reiser4_done_fqs();
65710 +       done_jnodes();
65711 +       done_txnmgr_static();
65712 +       done_plugin_set();
65713 +       done_znodes();
65714 +       destroy_reiser4_cache(&inode_cache); /* cf. done_inodes() in init_reiser4() */
65715 +}
65716 +
65717 +module_init(init_reiser4);
65718 +module_exit(done_reiser4);
65719 +
65720 +MODULE_DESCRIPTION("Reiser4 filesystem");
65721 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
65722 +
65723 +MODULE_LICENSE("GPL");
65724 +
65725 +/*
65726 + * Local variables:
65727 + * c-indentation-style: "K&R"
65728 + * mode-name: "LC"
65729 + * c-basic-offset: 8
65730 + * tab-width: 8
65731 + * fill-column: 79
65732 + * End:
65733 + */
65734 diff -urN linux-2.6.27.orig/fs/reiser4/tap.c linux-2.6.27/fs/reiser4/tap.c
65735 --- linux-2.6.27.orig/fs/reiser4/tap.c  1970-01-01 03:00:00.000000000 +0300
65736 +++ linux-2.6.27/fs/reiser4/tap.c       2008-10-12 18:20:01.000000000 +0400
65737 @@ -0,0 +1,377 @@
65738 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65739 + * reiser4/README */
65740 +
65741 +/*
65742 +   Tree Access Pointer (tap).
65743 +
65744 +   A tap is a data structure combining a coord and a lock handle (mostly). It
65745 +   is useful when one has to scan tree nodes (for example, in readdir or flush),
65746 +   because tap functions allow moving a tap in either direction, transparently
65747 +   crossing unit/item/node borders.
65748 +
65749 +   A tap doesn't provide automatic synchronization of its fields, as it is
65750 +   supposed to be a per-thread object.
65751 +*/
65752 +
65753 +#include "forward.h"
65754 +#include "debug.h"
65755 +#include "coord.h"
65756 +#include "tree.h"
65757 +#include "context.h"
65758 +#include "tap.h"
65759 +#include "znode.h"
65760 +#include "tree_walk.h"
65761 +
65762 +#if REISER4_DEBUG
65763 +static int tap_invariant(const tap_t * tap);
65764 +static void tap_check(const tap_t * tap);
65765 +#else
65766 +#define tap_check(tap) noop
65767 +#endif
65768 +
65769 +/** load node @tap points to on first use; bumps tap->loaded. 0 on success */
65770 +int reiser4_tap_load(tap_t * tap)
65771 +{
65772 +       tap_check(tap);
65773 +       if (tap->loaded == 0) {
65774 +               int result;
65775 +
65776 +               result = zload_ra(tap->coord->node, &tap->ra_info);
65777 +               if (result != 0)
65778 +                       return result;
65779 +               coord_clear_iplug(tap->coord);
65780 +       }
65781 +       ++tap->loaded;
65782 +       tap_check(tap);
65783 +       return 0;
65784 +}
65785 +
65786 +/** drop one load count; zrelse node at zero. Dual to reiser4_tap_load() */
65787 +void reiser4_tap_relse(tap_t * tap)
65788 +{
65789 +       tap_check(tap);
65790 +       if (tap->loaded > 0) {
65791 +               --tap->loaded;
65792 +               if (tap->loaded == 0) {
65793 +                       zrelse(tap->coord->node);
65794 +               }
65795 +       }
65796 +       tap_check(tap);
65797 +}
65798 +
65799 +/**
65800 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
65801 + * @mode. Tap starts not loaded and with self-linked (empty) linkage.
65802 + */
65803 +void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
65804 +                     znode_lock_mode mode)
65805 +{
65806 +       tap->coord = coord;
65807 +       tap->lh = lh;
65808 +       tap->mode = mode;
65809 +       tap->loaded = 0;
65810 +       INIT_LIST_HEAD(&tap->linkage);
65811 +       reiser4_init_ra_info(&tap->ra_info);
65812 +}
65813 +
65814 +/** add @tap to the list returned by reiser4_taps_list() */
65815 +void reiser4_tap_monitor(tap_t * tap)
65816 +{
65817 +       assert("nikita-2623", tap != NULL);
65818 +       tap_check(tap);
65819 +       list_add(&tap->linkage, reiser4_taps_list());
65820 +       tap_check(tap);
65821 +}
65822 +
65823 +/* duplicate @src into @dst. Lock handle is copied only if @src holds a node.
65824 + * @dst starts not loaded, with its own empty linkage. */
65825 +void reiser4_tap_copy(tap_t * dst, tap_t * src)
65826 +{
65827 +       assert("nikita-3193", src != NULL);
65828 +       assert("nikita-3194", dst != NULL);
65829 +
65830 +       *dst->coord = *src->coord;
65831 +       if (src->lh->node)
65832 +               copy_lh(dst->lh, src->lh);
65833 +       dst->mode = src->mode;
65834 +       dst->loaded = 0;
65835 +       INIT_LIST_HEAD(&dst->linkage);
65836 +       dst->ra_info = src->ra_info;
65837 +}
65838 +
65839 +/** finish with @tap: release node and lock, unlink, clear coord->node */
65840 +void reiser4_tap_done(tap_t * tap)
65841 +{
65842 +       assert("nikita-2565", tap != NULL);
65843 +       tap_check(tap);
65844 +       if (tap->loaded > 0)
65845 +               zrelse(tap->coord->node);
65846 +       done_lh(tap->lh);
65847 +       tap->loaded = 0;
65848 +       list_del_init(&tap->linkage);
65849 +       tap->coord->node = NULL;
65850 +}
65851 +
65852 +/**
65853 + * move @tap to the new node, locked with @target. Load @target, if @tap was
65854 + * already loaded. Returns 0, or zload_ra() error with @tap left untouched.
65855 + */
65856 +int reiser4_tap_move(tap_t * tap, lock_handle * target)
65857 +{
65858 +       int result = 0;
65859 +
65860 +       assert("nikita-2567", tap != NULL);
65861 +       assert("nikita-2568", target != NULL);
65862 +       assert("nikita-2570", target->node != NULL);
65863 +       assert("nikita-2569", tap->coord->node == tap->lh->node);
65864 +
65865 +       tap_check(tap);
65866 +       if (tap->loaded > 0)
65867 +               result = zload_ra(target->node, &tap->ra_info);
65868 +
65869 +       if (result == 0) {
65870 +               if (tap->loaded > 0)
65871 +                       zrelse(tap->coord->node);
65872 +               done_lh(tap->lh);
65873 +               copy_lh(tap->lh, target);
65874 +               tap->coord->node = target->node;
65875 +               coord_clear_iplug(tap->coord);
65876 +       }
65877 +       tap_check(tap);
65878 +       return result;
65879 +}
65880 +
65881 +/**
65882 + * move @tap to @target. If @tap is not already at @target, lock @target with
65883 + * tap->mode and move there; no-op returning 0 otherwise.
65884 + */
65885 +static int tap_to(tap_t * tap, znode * target)
65886 +{
65887 +       int result;
65888 +
65889 +       assert("nikita-2624", tap != NULL);
65890 +       assert("nikita-2625", target != NULL);
65891 +
65892 +       tap_check(tap);
65893 +       result = 0;
65894 +       if (tap->coord->node != target) {
65895 +               lock_handle here;
65896 +
65897 +               init_lh(&here);
65898 +               result = longterm_lock_znode(&here, target,
65899 +                                            tap->mode, ZNODE_LOCK_HIPRI);
65900 +               if (result == 0) {
65901 +                       result = reiser4_tap_move(tap, &here);
65902 +                       done_lh(&here);
65903 +               }
65904 +       }
65905 +       tap_check(tap);
65906 +       return result;
65907 +}
65908 +
65909 +/**
65910 + * move @tap to given @target, loading and locking @target->node if
65911 + * necessary. On success @target is coord_dup()'ed into tap->coord.
65912 + */
65913 +int tap_to_coord(tap_t * tap, coord_t * target)
65914 +{
65915 +       int result;
65916 +
65917 +       tap_check(tap);
65918 +       result = tap_to(tap, target->node);
65919 +       if (result == 0)
65920 +               coord_dup(tap->coord, target);
65921 +       tap_check(tap);
65922 +       return result;
65923 +}
65924 +
65925 +/** return the ->taps list head of the current reiser4 context */
65926 +struct list_head *reiser4_taps_list(void)
65927 +{
65928 +       return &get_current_context()->taps;
65929 +}
65930 +
65931 +/** helper for go_{next,prev}_*(): step @tap by one unit/item towards @dir */
65932 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
65933 +{
65934 +       coord_t dup;
65935 +       coord_t *coord;
65936 +       int result;
65937 +
65938 +       int (*coord_dir) (coord_t *);
65939 +       int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
65940 +       void (*coord_init) (coord_t *, const znode *);
65941 +       ON_DEBUG(int (*coord_check) (const coord_t *));
65942 +
65943 +       assert("nikita-2556", tap != NULL);
65944 +       assert("nikita-2557", tap->coord != NULL);
65945 +       assert("nikita-2558", tap->lh != NULL);
65946 +       assert("nikita-2559", tap->coord->node != NULL);
65947 +
65948 +       tap_check(tap);
65949 +       if (dir == LEFT_SIDE) {
65950 +               coord_dir = units_p ? coord_prev_unit : coord_prev_item;
65951 +               get_dir_neighbor = reiser4_get_left_neighbor;
65952 +               coord_init = coord_init_last_unit;
65953 +       } else {
65954 +               coord_dir = units_p ? coord_next_unit : coord_next_item;
65955 +               get_dir_neighbor = reiser4_get_right_neighbor;
65956 +               coord_init = coord_init_first_unit;
65957 +       }
65958 +       ON_DEBUG(coord_check =
65959 +                units_p ? coord_is_existing_unit : coord_is_existing_item);
65960 +       assert("nikita-2560", coord_check(tap->coord));
65961 +       /* first try the step on a scratch copy of the coord */
65962 +       coord = tap->coord;
65963 +       coord_dup(&dup, coord);
65964 +       if (coord_dir(&dup) != 0) {
65965 +               do {
65966 +                       /* move to the neighboring node on the @dir side */
65967 +                       lock_handle nlh;
65968 +
65969 +                       init_lh(&nlh);
65970 +                       result =
65971 +                           get_dir_neighbor(&nlh, coord->node, (int)tap->mode,
65972 +                                            GN_CAN_USE_UPPER_LEVELS);
65973 +                       if (result == 0) {
65974 +                               result = reiser4_tap_move(tap, &nlh);
65975 +                               if (result == 0)
65976 +                                       coord_init(tap->coord, nlh.node);
65977 +                               done_lh(&nlh);
65978 +                       }
65979 +                       /* skip empty nodes */
65980 +               } while ((result == 0) && node_is_empty(coord->node));
65981 +       } else {
65982 +               result = 0; /* step stayed inside the node: adopt the copy */
65983 +               coord_dup(coord, &dup);
65984 +       }
65985 +       assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
65986 +       tap_check(tap);
65987 +       return result;
65988 +}
65989 +
65990 +/**
65991 + * move @tap to the next unit, transparently crossing item and node
65992 + * boundaries. Returns the result of go_dir_el(@tap, RIGHT_SIDE, 1).
65993 + */
65994 +int go_next_unit(tap_t * tap)
65995 +{
65996 +       return go_dir_el(tap, RIGHT_SIDE, 1);
65997 +}
65998 +
65999 +/**
66000 + * move @tap to the previous unit, transparently crossing item and node
66001 + * boundaries. Returns the result of go_dir_el(@tap, LEFT_SIDE, 1).
66002 + */
66003 +int go_prev_unit(tap_t * tap)
66004 +{
66005 +       return go_dir_el(tap, LEFT_SIDE, 1);
66006 +}
66007 +
66008 +/**
66009 + * @shift times apply @actor to the @tap. This is used to move @tap by
66010 + * @shift units (or items, or nodes) in either direction. Stops on first error.
66011 + */
66012 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
66013 +{
66014 +       int result;
66015 +
66016 +       assert("nikita-2555", shift >= 0);
66017 +       assert("nikita-2562", tap->coord->node == tap->lh->node);
66018 +
66019 +       tap_check(tap);
66020 +       result = reiser4_tap_load(tap);
66021 +       if (result != 0)
66022 +               return result;
66023 +
66024 +       for (; shift > 0; --shift) {
66025 +               result = actor(tap);
66026 +               assert("nikita-2563", tap->coord->node == tap->lh->node);
66027 +               if (result != 0)
66028 +                       break;
66029 +       }
66030 +       reiser4_tap_relse(tap);
66031 +       tap_check(tap);
66032 +       return result;
66033 +}
66034 +
66035 +/** move @tap @shift units rightward, one go_next_unit() call per unit */
66036 +int rewind_right(tap_t * tap, int shift)
66037 +{
66038 +       return rewind_to(tap, go_next_unit, shift);
66039 +}
66040 +
66041 +/** move @tap @shift units leftward, one go_prev_unit() call per unit */
66042 +int rewind_left(tap_t * tap, int shift)
66043 +{
66044 +       return rewind_to(tap, go_prev_unit, shift);
66045 +}
66046 +
66047 +#if REISER4_DEBUG
66048 +/** debugging function: printk @tap state; tolerates NULL @tap and NULL ->lh */
66049 +static void print_tap(const char *prefix, const tap_t * tap)
66050 +{
66051 +       if (tap == NULL) {
66052 +               printk("%s: null tap\n", prefix);
66053 +               return;
66054 +       }
66055 +       printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
66056 +              tap->loaded,
66057 +              !list_empty(&tap->linkage), /* self-linked means NOT in a list */
66058 +              tap->lh != NULL ? tap->lh->node : NULL, /* lh may be the breakage */
66059 +              lock_mode_name(tap->mode));
66060 +       print_coord("\tcoord", tap->coord, 0);
66061 +}
66062 +
66063 +/** check [tap-sane] invariant: 0 if sane, else number of the failed check */
66064 +static int tap_invariant(const tap_t * tap)
66065 +{
66066 +       /* [tap-sane] invariant */
66067 +
66068 +       if (tap == NULL)
66069 +               return 1;
66070 +       /* tap->mode is one of
66071 +        *
66072 +        * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
66073 +        */
66074 +       if (tap->mode != ZNODE_NO_LOCK &&
66075 +           tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
66076 +               return 2;
66077 +       /* tap->coord != NULL, and */
66078 +       if (tap->coord == NULL)
66079 +               return 3;
66080 +       /* tap->lh != NULL, and */
66081 +       if (tap->lh == NULL)
66082 +               return 4;
66083 +       /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
66084 +       if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
66085 +               return 5;
66086 +       /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
66087 +       if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
66088 +               return 6;
66089 +       return 0;
66090 +}
66091 +
66092 +/** debugging function: print @tap and reiser4_panic() if tap_invariant() fails */
66093 +static void tap_check(const tap_t * tap)
66094 +{
66095 +       int result;
66096 +
66097 +       result = tap_invariant(tap);
66098 +       if (result != 0) {
66099 +               print_tap("broken", tap);
66100 +               reiser4_panic("nikita-2831", "tap broken: %i\n", result);
66101 +       }
66102 +}
66103 +#endif
66104 +
66105 +/* Make Linus happy.
66106 +   Local variables:
66107 +   c-indentation-style: "K&R"
66108 +   mode-name: "LC"
66109 +   c-basic-offset: 8
66110 +   tab-width: 8
66111 +   fill-column: 120
66112 +   scroll-step: 1
66113 +   End:
66114 +*/
66115 diff -urN linux-2.6.27.orig/fs/reiser4/tap.h linux-2.6.27/fs/reiser4/tap.h
66116 --- linux-2.6.27.orig/fs/reiser4/tap.h  1970-01-01 03:00:00.000000000 +0300
66117 +++ linux-2.6.27/fs/reiser4/tap.h       2008-10-12 18:20:01.000000000 +0400
66118 @@ -0,0 +1,70 @@
66119 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
66120 +
66121 +/* Tree Access Pointers. See tap.c for more details. */
66122 +
66123 +#if !defined( __REISER4_TAP_H__ )
66124 +#define __REISER4_TAP_H__
66125 +
66126 +#include "forward.h"
66127 +#include "readahead.h"
66128 +
66129 +/**
66130 +    tree_access_pointer aka tap. Data structure combining coord_t and lock
66131 +    handle.
66132 +    Invariants involving this data-type, see doc/lock-ordering for details:
66133 +
66134 +      [tap-sane]
66135 + */
66136 +struct tree_access_pointer {
66137 +       /* coord tap is at */
66138 +       coord_t *coord;
66139 +       /* lock handle on ->coord->node */
66140 +       lock_handle *lh;
66141 +       /* mode of lock acquired by this tap */
66142 +       znode_lock_mode mode;
66143 +       /* incremented by reiser4_tap_load().
66144 +          Decremented by reiser4_tap_relse(). */
66145 +       int loaded;
66146 +       /* linkage into the list of taps, see reiser4_taps_list() */
66147 +       struct list_head linkage;
66148 +       /* read-ahead hint */
66149 +       ra_info_t ra_info;
66150 +};
66151 +
66152 +typedef int (*go_actor_t) (tap_t * tap);
66153 +
66154 +extern int reiser4_tap_load(tap_t * tap);
66155 +extern void reiser4_tap_relse(tap_t * tap);
66156 +extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
66157 +                    znode_lock_mode mode);
66158 +extern void reiser4_tap_monitor(tap_t * tap);
66159 +extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
66160 +extern void reiser4_tap_done(tap_t * tap);
66161 +extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
66162 +extern int tap_to_coord(tap_t * tap, coord_t * target);
66163 +
66164 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
66165 +extern int go_next_unit(tap_t * tap);
66166 +extern int go_prev_unit(tap_t * tap);
66167 +extern int rewind_right(tap_t * tap, int shift);
66168 +extern int rewind_left(tap_t * tap, int shift);
66169 +
66170 +extern struct list_head *reiser4_taps_list(void);
66171 +/* walk @tap over each entry of reiser4_taps_list(), re-read on every step */
66172 +#define for_all_taps(tap)                                                     \
66173 +       for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage);      \
66174 +            reiser4_taps_list() != &tap->linkage;                             \
66175 +            tap = list_entry(tap->linkage.next, tap_t, linkage))
66176 +
66177 +/* __REISER4_TAP_H__ */
66178 +#endif
66179 +/* Make Linus happy.
66180 +   Local variables:
66181 +   c-indentation-style: "K&R"
66182 +   mode-name: "LC"
66183 +   c-basic-offset: 8
66184 +   tab-width: 8
66185 +   fill-column: 120
66186 +   scroll-step: 1
66187 +   End:
66188 +*/
66189 diff -urN linux-2.6.27.orig/fs/reiser4/tree.c linux-2.6.27/fs/reiser4/tree.c
66190 --- linux-2.6.27.orig/fs/reiser4/tree.c 1970-01-01 03:00:00.000000000 +0300
66191 +++ linux-2.6.27/fs/reiser4/tree.c      2008-10-12 18:20:01.000000000 +0400
66192 @@ -0,0 +1,1876 @@
66193 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66194 + * reiser4/README */
66195 +
66196 +/*
66197 + * KEYS IN A TREE.
66198 + *
66199 + * The tree consists of nodes located on the disk. Node in the tree is either
66200 + * formatted or unformatted. Formatted node is one that has structure
66201 + * understood by the tree balancing and traversal code. Formatted nodes are
66202 + * further classified into leaf and internal nodes. Latter distinction is
66203 + * (almost) of only historical importance: general structure of leaves and
66204 + * internal nodes is the same in Reiser4. Unformatted nodes contain raw data
66205 + * that are part of bodies of ordinary files and attributes.
66206 + *
66207 + * Each node in the tree spans some interval in the key space. Key ranges for
66208 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
66209 + * sense, because of the non-unique keys: intersection of key ranges for
66210 + * different nodes is either empty, or consists of exactly one key.
66211 + *
66212 + * Formatted node consists of a sequence of items. Each item spans some
66213 + * interval in key space. Key ranges for all items in a tree are disjoint,
66214 + * modulo non-unique keys again. Items within nodes are ordered in the key
66215 + * order of the smallest key in a item.
66216 + *
66217 + * Particular type of item can be further split into units. Unit is piece of
66218 + * item that can be cut from item and moved into another item of the same
66219 + * type. Units are used by balancing code to repack data during balancing.
66220 + *
66221 + * Unit can be further split into smaller entities (for example, extent unit
66222 + * represents several pages, and it is natural for extent code to operate on
66223 + * particular pages and even bytes within one unit), but this is of no
66224 + * relevance to the generic balancing and lookup code.
66225 + *
66226 + * Although item is said to "span" range or interval of keys, it is not
66227 + * necessary that item contains piece of data addressable by each and every
66228 + * key in this range. For example, compound directory item, consisting of
66229 + * units corresponding to directory entries and keyed by hashes of file names,
66230 + * looks more as having "discrete spectrum": only some disjoint keys inside
66231 + * range occupied by this item really address data.
66232 + *
66233 + * Nonetheless, each item always has well-defined least (minimal) key, that
66234 + * is recorded in item header, stored in the node this item is in. Also, item
66235 + * plugin can optionally define method ->max_key_inside() returning maximal
66236 + * key that can _possibly_ be located within this item. This method is used
66237 + * (mainly) to determine when given piece of data should be merged into
66238 + * existing item, instead of creating new one. Because of this, even though
66239 + * ->max_key_inside() can be larger than any key actually located in the item,
66240 + * intervals
66241 + *
66242 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
66243 + *
66244 + * are still disjoint for all items within the _same_ node.
66245 + *
66246 + * In memory node is represented by znode. It plays several roles:
66247 + *
66248 + *  . something locks are taken on
66249 + *
66250 + *  . something tracked by transaction manager (this is going to change)
66251 + *
66252 + *  . something used to access node data
66253 + *
66254 + *  . something used to maintain tree structure in memory: sibling and
66255 + *  parental linkage.
66256 + *
66257 + *  . something used to organize nodes into "slums"
66258 + *
66259 + * More on znodes see in znode.[ch]
66260 + *
66261 + * DELIMITING KEYS
66262 + *
66263 + *   To simplify balancing, allow some flexibility in locking and speed up
66264 + *   important coord cache optimization, we keep delimiting keys of nodes in
66265 + *   memory. Depending on disk format (implemented by appropriate node plugin)
66266 + *   node on disk can record both left and right delimiting key, only one of
66267 + *   them, or none. Still, our balancing and tree traversal code keep both
66268 + *   delimiting keys for a node that is in memory stored in the znode. When
66269 + *   node is first brought into memory during tree traversal, its left
66270 + *   delimiting key is taken from its parent, and its right delimiting key is
66271 + *   either next key in its parent, or is right delimiting key of parent if
66272 + *   node is the rightmost child of parent.
66273 + *
66274 + *   Physical consistency of delimiting key is protected by special dk
66275 + *   read-write lock. That is, delimiting keys can only be inspected or
66276 + *   modified under this lock. But dk lock is only sufficient for fast
66277 + *   "pessimistic" check, because to simplify code and to decrease lock
66278 + *   contention, balancing (carry) only updates delimiting keys right before
66279 + *   unlocking all locked nodes on the given tree level. For example,
66280 + *   coord-by-key cache scans LRU list of recently accessed znodes. For each
66281 + *   node it first does fast check under dk spin lock. If key looked for is
66282 + *   not between delimiting keys for this node, next node is inspected and so
66283 + *   on. If key is inside of the key range, long term lock is taken on node
66284 + *   and key range is rechecked.
66285 + *
66286 + * COORDINATES
66287 + *
66288 + *   To find something in the tree, you supply a key, and the key is resolved
66289 + *   by coord_by_key() into a coord (coordinate) that is valid as long as the
66290 + *   node the coord points to remains locked.  As mentioned above trees
66291 + *   consist of nodes that consist of items that consist of units. A unit is
66292 + *   the smallest and indivisible piece of tree as far as balancing and tree
66293 + *   search are concerned. Each node, item, and unit can be addressed by
66294 + *   giving its level in the tree and the key occupied by this entity.  A node
66295 + *   knows what the key ranges are of the items within it, and how to find its
66296 + *   items and invoke their item handlers, but it does not know how to access
66297 + *   individual units within its items except through the item handlers.
66298 + *   coord is a structure containing a pointer to the node, the ordinal number
66299 + *   of the item within this node (a sort of item offset), and the ordinal
66300 + *   number of the unit within this item.
66301 + *
66302 + * TREE LOOKUP
66303 + *
66304 + *   There are two types of access to the tree: lookup and modification.
66305 + *
66306 + *   Lookup is a search for the key in the tree. Search can look for either
66307 + *   exactly the key given to it, or for the largest key that is not greater
66308 + *   than the key given to it. This distinction is determined by "bias"
66309 + *   parameter of search routine (coord_by_key()). coord_by_key() either
66310 + *   returns error (key is not in the tree, or some kind of external error
66311 + *   occurred), or successfully resolves key into coord.
66312 + *
66313 + *   This resolution is done by traversing tree top-to-bottom from root level
66314 + *   to the desired level. On levels above twig level (level one above the
66315 + *   leaf level) nodes consist exclusively of internal items. Internal item is
66316 + *   nothing more than pointer to the tree node on the child level. On twig
66317 + *   level nodes consist of internal items intermixed with extent
66318 + *   items. Internal items form normal search tree structure used by traversal
66319 + * to descend through the tree.
66320 + *
66321 + * TREE LOOKUP OPTIMIZATIONS
66322 + *
66323 + * Tree lookup described above is expensive even if all nodes traversed are
66324 + * already in the memory: for each node binary search within it has to be
66325 + * performed and binary searches are CPU consuming and tend to destroy CPU
66326 + * caches.
66327 + *
66328 + * Several optimizations are used to work around this:
66329 + *
66330 + *   . cbk_cache (look-aside cache for tree traversals, see search.c for
66331 + *   details)
66332 + *
66333 + *   . seals (see seal.[ch])
66334 + *
66335 + *   . vroot (see search.c)
66336 + *
66337 + * General search-by-key is layered thusly:
66338 + *
66339 + *                   [check seal, if any]   --ok--> done
66340 + *                           |
66341 + *                         failed
66342 + *                           |
66343 + *                           V
66344 + *                     [vroot defined] --no--> node = tree_root
66345 + *                           |                   |
66346 + *                          yes                  |
66347 + *                           |                   |
66348 + *                           V                   |
66349 + *                       node = vroot            |
66350 + *                                 |             |
66351 + *                                 |             |
66352 + *                                 |             |
66353 + *                                 V             V
66354 + *                            [check cbk_cache for key]  --ok--> done
66355 + *                                        |
66356 + *                                      failed
66357 + *                                        |
66358 + *                                        V
66359 + *                       [start tree traversal from node]
66360 + *
66361 + */
66362 +
66363 +#include "forward.h"
66364 +#include "debug.h"
66365 +#include "dformat.h"
66366 +#include "key.h"
66367 +#include "coord.h"
66368 +#include "plugin/item/static_stat.h"
66369 +#include "plugin/item/item.h"
66370 +#include "plugin/node/node.h"
66371 +#include "plugin/plugin.h"
66372 +#include "txnmgr.h"
66373 +#include "jnode.h"
66374 +#include "znode.h"
66375 +#include "block_alloc.h"
66376 +#include "tree_walk.h"
66377 +#include "carry.h"
66378 +#include "carry_ops.h"
66379 +#include "tap.h"
66380 +#include "tree.h"
66381 +#include "vfs_ops.h"
66382 +#include "page_cache.h"
66383 +#include "super.h"
66384 +#include "reiser4.h"
66385 +#include "inode.h"
66386 +
66387 +#include <linux/fs.h>          /* for struct super_block  */
66388 +#include <linux/spinlock.h>
66389 +
66390 +/* Disk address (block number) never ever used for any real tree node. This is
66391 +   used as block number of "uber" znode.
66392 +
66393 +   Invalid block addresses are 0 by tradition.
66394 +
66395 +*/
66396 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
66397 +
66398 +#define CUT_TREE_MIN_ITERATIONS 64
66399 +
66400 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
66401 +
66402 +/* return node plugin (->nplug) of coord->node; neither may be NULL */
66403 +node_plugin *node_plugin_by_coord(const coord_t * coord)
66404 +{
66405 +       assert("vs-1", coord != NULL);
66406 +       assert("vs-2", coord->node != NULL);
66407 +
66408 +       return coord->node->nplug;
66409 +}
66410 +
66411 +/* insert item into tree unless @key is already there (IBK_ALREADY_EXISTS).
66412 + * Fields of @coord are updated so that a subsequent insert can use them. */
66413 +insert_result insert_by_key(reiser4_tree * tree        /* tree to insert new item
66414 +                                                * into */ ,
66415 +                           const reiser4_key * key /* key of new item */ ,
66416 +                           reiser4_item_data * data    /* parameters for item
66417 +                                                        * creation */ ,
66418 +                           coord_t * coord /* resulting insertion coord */ ,
66419 +                           lock_handle * lh    /* resulting lock
66420 +                                                * handle */ ,
66421 +                           tree_level stop_level /** level where to insert */ ,
66422 +                           __u32 flags /* flags for coord_by_key() lookup */ )
66423 +{
66424 +       int result;
66425 +
66426 +       assert("nikita-358", tree != NULL);
66427 +       assert("nikita-360", coord != NULL);
66428 +
66429 +       result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
66430 +                             FIND_EXACT, stop_level, stop_level,
66431 +                             flags | CBK_FOR_INSERT, NULL /*ra_info */ );
66432 +       switch (result) {
66433 +       default:        /* any other lookup result is returned as is */
66434 +               break;
66435 +       case CBK_COORD_FOUND:
66436 +               result = IBK_ALREADY_EXISTS;
66437 +               break;
66438 +       case CBK_COORD_NOTFOUND:
66439 +               assert("nikita-2017", coord->node != NULL);
66440 +               result = insert_by_coord(coord, data, key, lh, 0 /* @flags are not passed down */ );
66441 +               break;
66442 +       }
66443 +       return result;
66444 +}
66445 +
66446 +/* insert item by calling carry. Helper function called if short-cut
66447 +   insertion is not possible, and for COP_EXTENT insertion  */
66448 +static insert_result insert_with_carry_by_coord(coord_t * coord,       /* coord where to insert */
66449 +                                               lock_handle * lh,       /* lock handle of insertion
66450 +                                                                        * node */
66451 +                                               reiser4_item_data * data,       /* parameters of new
66452 +                                                                                * item */
66453 +                                               const reiser4_key * key,        /* key of new item */
66454 +                                               carry_opcode cop,       /* carry operation to perform */
66455 +                                               cop_insert_flag flags
66456 +                                               /* carry flags; 0 selects tree's ->carry.insert_flags */ )
66457 +{
66458 +       int result;
66459 +       carry_pool *pool;
66460 +       carry_level *lowest_level;
66461 +       carry_insert_data *cdata;
66462 +       carry_op *op;
66463 +
66464 +       assert("umka-314", coord != NULL);
66465 +
66466 +       /* allocate carry_pool, 3 carry_level-s and carry_insert_data at once */
66467 +       pool =
66468 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66469 +                           sizeof(*cdata));
66470 +       if (IS_ERR(pool))
66471 +               return PTR_ERR(pool);
66472 +       lowest_level = (carry_level *) (pool + 1);      /* levels follow the pool */
66473 +       init_carry_level(lowest_level, pool);
66474 +
66475 +       op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
66476 +       if (IS_ERR(op) || (op == NULL)) {
66477 +               done_carry_pool(pool);
66478 +               return RETERR(op ? PTR_ERR(op) : -EIO);
66479 +       }
66480 +       cdata = (carry_insert_data *) (lowest_level + 3);       /* right behind the 3 levels */
66481 +       cdata->coord = coord;
66482 +       cdata->data = data;
66483 +       cdata->key = key;
66484 +       op->u.insert.d = cdata;
66485 +       if (flags == 0)
66486 +               flags = znode_get_tree(coord->node)->carry.insert_flags;
66487 +       op->u.insert.flags = flags;
66488 +       op->u.insert.type = COPT_ITEM_DATA;
66489 +       op->u.insert.child = NULL;
66490 +       if (lh != NULL) {
66491 +               assert("nikita-3245", lh->node == coord->node);
66492 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
66493 +               lowest_level->tracked = lh;
66494 +       }
66495 +
66496 +       result = reiser4_carry(lowest_level, NULL);
66497 +       done_carry_pool(pool);
66498 +
66499 +       return result;
66500 +}
66501 +
66502 +/* form carry queue to perform paste of @data with @key at @coord, and launch
66503 +   its execution by calling carry().
66504 +
66505 +   Instruct carry to update @lh if after balancing insertion coord moves into
66506 +   different block.
66507 +
66508 +*/
66509 +static int paste_with_carry(coord_t * coord,   /* coord of paste */
66510 +                           lock_handle * lh,   /* lock handle of node
66511 +                                                * where item is
66512 +                                                * pasted */
66513 +                           reiser4_item_data * data,   /* parameters of new
66514 +                                                        * item */
66515 +                           const reiser4_key * key,    /* key of new item */
66516 +                           unsigned flags /* paste flags; 0 selects tree's ->carry.paste_flags */ )
66517 +{
66518 +       int result;
66519 +       carry_pool *pool;
66520 +       carry_level *lowest_level;
66521 +       carry_insert_data *cdata;
66522 +       carry_op *op;
66523 +
66524 +       assert("umka-315", coord != NULL);
66525 +       assert("umka-316", key != NULL);
66526 +
66527 +       pool =
66528 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66529 +                           sizeof(*cdata));
66530 +       if (IS_ERR(pool))
66531 +               return PTR_ERR(pool);
66532 +       lowest_level = (carry_level *) (pool + 1);      /* levels follow the pool */
66533 +       init_carry_level(lowest_level, pool);
66534 +
66535 +       op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
66536 +       if (IS_ERR(op) || (op == NULL)) {
66537 +               done_carry_pool(pool);
66538 +               return RETERR(op ? PTR_ERR(op) : -EIO);
66539 +       }
66540 +       cdata = (carry_insert_data *) (lowest_level + 3);       /* right behind the 3 levels */
66541 +       cdata->coord = coord;
66542 +       cdata->data = data;
66543 +       cdata->key = key;
66544 +       op->u.paste.d = cdata;
66545 +       if (flags == 0)
66546 +               flags = znode_get_tree(coord->node)->carry.paste_flags;
66547 +       op->u.paste.flags = flags;
66548 +       op->u.paste.type = COPT_ITEM_DATA;
66549 +       if (lh != NULL) {
66550 +               lowest_level->track_type = CARRY_TRACK_CHANGE;
66551 +               lowest_level->tracked = lh;
66552 +       }
66553 +
66554 +       result = reiser4_carry(lowest_level, NULL);
66555 +       done_carry_pool(pool);
66556 +
66557 +       return result;
66558 +}
66559 +
66560 +/* insert item at the given coord.
66561 +
66562 +   First try to skip carry by directly calling ->create_item() method of node
66563 +   plugin. If this is impossible (there is not enough free space in the node,
66564 +   or leftmost item in the node is created), call insert_with_carry_by_coord()
66565 +   that will do full carry().
66566 +   -E_NODE_FULL: item does not fit and @flags forbid both shifts and allocation.
66567 +*/
66568 +insert_result insert_by_coord(coord_t * coord  /* coord where to
66569 +                                                * insert. coord->node has
66570 +                                                * to be write locked by
66571 +                                                * caller */ ,
66572 +                             reiser4_item_data * data  /* data to be
66573 +                                                        * inserted */ ,
66574 +                             const reiser4_key * key /* key of new item */ ,
66575 +                             lock_handle * lh  /* lock handle of write
66576 +                                                * lock on node */ ,
66577 +                             __u32 flags /* insertion flags */ )
66578 +{
66579 +       unsigned item_size;
66580 +       int result;
66581 +       znode *node;
66582 +
66583 +       assert("vs-247", coord != NULL);
66584 +       assert("vs-248", data != NULL);
66585 +       assert("vs-249", data->length >= 0);
66586 +       assert("nikita-1191", znode_is_write_locked(coord->node));
66587 +
66588 +       node = coord->node;
66589 +       coord_clear_iplug(coord);
66590 +       result = zload(node);
66591 +       if (result != 0)
66592 +               return result;
66593 +
66594 +       item_size = space_needed(node, NULL, data, 1);
66595 +       if (item_size > znode_free_space(node) &&
66596 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66597 +           && (flags & COPI_DONT_ALLOCATE)) {
66598 +               /* we are forced to use free space of coord->node and new item
66599 +                  does not fit into it.
66600 +
66601 +                  Currently we get here only when we allocate and copy units
66602 +                  of extent item from a node to its left neighbor during
66603 +                  "squalloc"-ing.  If @node (this is left neighbor) does not
66604 +                  have enough free space - we do not want to attempt any
66605 +                  shifting and allocations because we are in squeezing and
66606 +                  everything to the left of @node is tightly packed.
66607 +                */
66608 +               result = -E_NODE_FULL;
66609 +       } else if ((item_size <= znode_free_space(node)) &&
66610 +                  !coord_is_before_leftmost(coord) &&
66611 +                  (node_plugin_by_node(node)->fast_insert != NULL)
66612 +                  && node_plugin_by_node(node)->fast_insert(coord)) {
66613 +               /* shortcut insertion without carry() overhead.
66614 +
66615 +                  Only possible if:
66616 +
66617 +                  - there is enough free space
66618 +
66619 +                  - insertion is not into the leftmost position in a node
66620 +                  (otherwise it would require updating of delimiting key in a
66621 +                  parent)
66622 +
66623 +                  - node plugin agrees with this (->fast_insert() is set and true)
66624 +
66625 +                */
66626 +               result =
66627 +                   node_plugin_by_node(node)->create_item(coord, key, data,
66628 +                                                          NULL);
66629 +               znode_make_dirty(node);
66630 +       } else {
66631 +               /* otherwise do full-fledged carry(). */
66632 +               result =
66633 +                   insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
66634 +                                              flags);
66635 +       }
66636 +       zrelse(node);   /* pairs with zload() above */
66637 +       return result;
66638 +}
66639 +
66640 +/* @coord is set to leaf level and @data is to be inserted to twig level */
66641 +insert_result
66642 +insert_extent_by_coord(coord_t *
66643 +                      coord
66644 +                      /* coord where to insert. coord->node has to be write locked by caller */
66645 +                      ,
66646 +                      reiser4_item_data * data /* data to be inserted */ ,
66647 +                      const reiser4_key * key /* key of new item */ ,
66648 +                      lock_handle *
66649 +                      lh /* lock handle of write lock on node */ )
66650 +{
66651 +       assert("vs-405", coord != NULL);
66652 +       assert("vs-406", data != NULL);
66653 +       assert("vs-407", data->length > 0);
66654 +       assert("vs-408", znode_is_write_locked(coord->node));
66655 +       assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
66656 +
66657 +       return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
66658 +                                         0 /*flags */ );
66659 +}
66660 +
66661 +/* Insert into the item at the given coord.
66662 +
66663 +   First try to skip carry by directly calling ->paste() method of item
66664 +   plugin. If this is impossible (there is not enough free space in the node,
66665 +   or we are pasting into leftmost position in the node), call
66666 +   paste_with_carry() that will do full carry().
66667 +
66668 +*/
66669 +/* paste_into_item */
66670 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
66671 +                    lock_handle * lh /* lock handle on node involved */ ,
66672 +                    const reiser4_key * key /* key of unit being pasted */ ,
66673 +                    reiser4_item_data * data /* parameters for new unit */ ,
66674 +                    unsigned flags /* insert/paste flags */ )
66675 +{
66676 +       int result;
66677 +       int size_change;
66678 +       node_plugin *nplug;
66679 +       item_plugin *iplug;
66680 +
66681 +       assert("umka-317", coord != NULL);
66682 +       assert("umka-318", key != NULL);
66683 +
66684 +       iplug = item_plugin_by_coord(coord);
66685 +       nplug = node_plugin_by_coord(coord);
66686 +
66687 +       assert("nikita-1480", iplug == data->iplug);
66688 +
66689 +       size_change = space_needed(coord->node, coord, data, 0);
66690 +       if (size_change > (int)znode_free_space(coord->node) &&
66691 +           (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
66692 +           && (flags & COPI_DONT_ALLOCATE)) {
66693 +               /* we are forced to use free space of coord->node and new data
66694 +                  does not fit into it. */
66695 +               return -E_NODE_FULL;
66696 +       }
66697 +
66698 +       /* shortcut paste without carry() overhead.
66699 +
66700 +          Only possible if:
66701 +
66702 +          - there is enough free space
66703 +
66704 +          - paste is not into the leftmost unit in a node (otherwise
66705 +          it would require updating of delimiting key in a parent)
66706 +
66707 +          - node plugin agrees with this
66708 +
66709 +          - item plugin agrees with us
66710 +          NOTE(review): ->unit_pos != 0 below already implies the disjunction before it */
66711 +       if (size_change <= (int)znode_free_space(coord->node) &&
66712 +           (coord->item_pos != 0 ||
66713 +            coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
66714 +           coord->unit_pos != 0 && nplug->fast_paste != NULL &&
66715 +           nplug->fast_paste(coord) &&
66716 +           iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
66717 +               if (size_change > 0)    /* grow item before pasting into it */
66718 +                       nplug->change_item_size(coord, size_change);
66719 +               /* NOTE-NIKITA: huh? where @key is used? */
66720 +               result = iplug->b.paste(coord, data, NULL);
66721 +               if (size_change < 0)    /* shrink item only after paste is done */
66722 +                       nplug->change_item_size(coord, size_change);
66723 +               znode_make_dirty(coord->node);
66724 +       } else
66725 +               /* otherwise do full-fledged carry(). */
66726 +               result = paste_with_carry(coord, lh, data, key, flags);
66727 +       return result;
66728 +}
66729 +
66730 +/* this either appends to (data->length > 0) or truncates (< 0) item @coord */
66731 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
66732 +                       reiser4_item_data * data /* parameters of resize */ ,
66733 +                       reiser4_key * key /* key of new unit */ ,
66734 +                       lock_handle * lh        /* lock handle of node
66735 +                                                * being modified */ ,
66736 +                       cop_insert_flag flags /* carry flags */ )
66737 +{
66738 +       int result;
66739 +       znode *node;
66740 +
66741 +       assert("nikita-362", coord != NULL);
66742 +       assert("nikita-363", data != NULL);
66743 +       assert("vs-245", data->length != 0);
66744 +
66745 +       node = coord->node;
66746 +       coord_clear_iplug(coord);
66747 +       result = zload(node);
66748 +       if (result != 0)
66749 +               return result;
66750 +
66751 +       if (data->length < 0)   /* truncate: @key, @lh and @flags are unused */
66752 +               result = node_plugin_by_coord(coord)->shrink_item(coord,
66753 +                                                                 -data->length);
66754 +       else
66755 +               result = insert_into_item(coord, lh, key, data, flags);
66756 +
66757 +       zrelse(node);   /* pairs with zload() above */
66758 +       return result;
66759 +}
66760 +
66761 +/* insert flow @f at @coord by posting COP_INSERT_FLOW to carry; @lh is tracked */
66762 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
66763 +{
66764 +       int result;
66765 +       carry_pool *pool;
66766 +       carry_level *lowest_level;
66767 +       reiser4_item_data *data;
66768 +       carry_op *op;
66769 +
66770 +       pool =
66771 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
66772 +                           sizeof(*data));
66773 +       if (IS_ERR(pool))
66774 +               return PTR_ERR(pool);
66775 +       lowest_level = (carry_level *) (pool + 1);      /* levels follow the pool */
66776 +       init_carry_level(lowest_level, pool);
66777 +
66778 +       op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
66779 +                       0 /* operate directly on coord -> node */ );
66780 +       if (IS_ERR(op) || (op == NULL)) {
66781 +               done_carry_pool(pool);
66782 +               return RETERR(op ? PTR_ERR(op) : -EIO);
66783 +       }
66784 +
66785 +       /* these are permanent during insert_flow */
66786 +       data = (reiser4_item_data *) (lowest_level + 3);        /* right behind the 3 levels */
66787 +       data->user = 1;
66788 +       data->iplug = item_plugin_by_id(FORMATTING_ID);
66789 +       data->arg = NULL;
66790 +       /* data.length and data.data will be set before calling paste or
66791 +          insert */
66792 +       data->length = 0;
66793 +       data->data = NULL;
66794 +
66795 +       op->u.insert_flow.flags = 0;
66796 +       op->u.insert_flow.insert_point = coord;
66797 +       op->u.insert_flow.flow = f;
66798 +       op->u.insert_flow.data = data;
66799 +       op->u.insert_flow.new_nodes = 0;
66800 +
66801 +       lowest_level->track_type = CARRY_TRACK_CHANGE;
66802 +       lowest_level->tracked = lh;
66803 +
66804 +       result = reiser4_carry(lowest_level, NULL);
66805 +       done_carry_pool(pool);
66806 +
66807 +       return result;
66808 +}
66809 +
66810 +/* Given a coord in parent node, obtain a znode for the corresponding child (error pointer on misuse) */
66811 +znode *child_znode(const coord_t * parent_coord        /* coord of pointer to
66812 +                                                * child */ ,
66813 +                  znode * parent /* parent of child */ ,
66814 +                  int incore_p /* if !0 only return child if already in
66815 +                                * memory */ ,
66816 +                  int setup_dkeys_p    /* if !0 update delimiting keys of
66817 +                                        * child */ )
66818 +{
66819 +       znode *child;
66820 +
66821 +       assert("nikita-1374", parent_coord != NULL);
66822 +       assert("nikita-1482", parent != NULL);
66823 +#if REISER4_DEBUG
66824 +       if (setup_dkeys_p)
66825 +               assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
66826 +#endif
66827 +       assert("nikita-2947", znode_is_any_locked(parent));
66828 +
66829 +       if (znode_get_level(parent) <= LEAF_LEVEL) {
66830 +               /* trying to get child of leaf node */
66831 +               warning("nikita-1217", "Child of maize?");
66832 +               return ERR_PTR(RETERR(-EIO));
66833 +       }
66834 +       if (item_is_internal(parent_coord)) {
66835 +               reiser4_block_nr addr;
66836 +               item_plugin *iplug;
66837 +               reiser4_tree *tree;
66838 +
66839 +               iplug = item_plugin_by_coord(parent_coord);
66840 +               assert("vs-512", iplug->s.internal.down_link);
66841 +               iplug->s.internal.down_link(parent_coord, NULL, &addr);        /* fills in @addr */
66842 +
66843 +               tree = znode_get_tree(parent);
66844 +               if (incore_p)
66845 +                       child = zlook(tree, &addr);
66846 +               else
66847 +                       child =
66848 +                           zget(tree, &addr, parent,
66849 +                                znode_get_level(parent) - 1,
66850 +                                reiser4_ctx_gfp_mask_get());
66851 +               if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
66852 +                       set_child_delimiting_keys(parent, parent_coord, child);
66853 +       } else {
66854 +               warning("nikita-1483", "Internal item expected");
66855 +               child = ERR_PTR(RETERR(-EIO));
66856 +       }
66857 +       return child;
66858 +}
66859 +
66860 +/* Remove znode from its transaction: release its block (deferred dealloc for a real block, fake_allocated2free() for a fake one), then uncapture its page or, if it has no page, its jnode. Called from forget_znode(). */
66861 +static void uncapture_znode(znode * node)
66862 +{
66863 +       struct page *page;
66864 +
66865 +       assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66866 +
66867 +       if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
66868 +               int ret;
66869 +
66870 +               /* An already allocated block goes right to the atom's delete set. */
66871 +               ret =
66872 +                   reiser4_dealloc_block(znode_get_block(node), 0,
66873 +                                         BA_DEFER | BA_FORMATTED);
66874 +               if (ret)
66875 +                       warning("zam-942",
66876 +                               "can\'t add a block (%llu) number to atom's delete set\n",
66877 +                               (unsigned long long)(*znode_get_block(node)));
66878 +
66879 +               spin_lock_znode(node);
66880 +               /* Here we return flush reserved block which was reserved at the
66881 +                * moment when this allocated node was marked dirty and still
66882 +                * not used by flush in node relocation procedure.  */
66883 +               if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
66884 +                       txn_atom *atom;
66885 +
66886 +                       atom = jnode_get_atom(ZJNODE(node));    /* NOTE(review): apparently returns the atom spin-locked (it is unlocked below) - confirm */
66887 +                       assert("zam-939", atom != NULL);
66888 +                       spin_unlock_znode(node);
66889 +                       flush_reserved2grabbed(atom, (__u64) 1);
66890 +                       spin_unlock_atom(atom);
66891 +               } else
66892 +                       spin_unlock_znode(node);
66893 +       } else {
66894 +               /* znode has assigned block which is counted as "fake
66895 +                  allocated". Return it back to "free blocks" */
66896 +               fake_allocated2free((__u64) 1, BA_FORMATTED);
66897 +       }
66898 +
66899 +       /*
66900 +        * uncapture page from transaction. There is a possibility of a race
66901 +        * with ->releasepage(): reiser4_releasepage() detaches page from this
66902 +        * jnode and we have nothing to uncapture. To avoid this, get
66903 +        * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
66904 +        * will deal with released page itself.
66905 +        */
66906 +       spin_lock_znode(node);
66907 +       page = znode_page(node);
66908 +       if (likely(page != NULL)) {
66909 +               /*
66910 +                * reiser4_uncapture_page() can only be called when we are sure
66911 +                * that znode is pinned in memory, which we are, because
66912 +                * forget_znode() is only called from longterm_unlock_znode().
66913 +                */
66914 +               page_cache_get(page);   /* pin the page before dropping the znode spin lock */
66915 +               spin_unlock_znode(node);
66916 +               lock_page(page);
66917 +               reiser4_uncapture_page(page);
66918 +               unlock_page(page);
66919 +               page_cache_release(page);
66920 +       } else {
66921 +               txn_atom *atom;
66922 +
66923 +               /* handle "flush queued" znodes: wait on the atom while the node is flush queued and the atom still has running queues */
66924 +               while (1) {
66925 +                       atom = jnode_get_atom(ZJNODE(node));
66926 +                       assert("zam-943", atom != NULL);
66927 +
66928 +                       if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
66929 +                           || !atom->nr_running_queues)
66930 +                               break;
66931 +
66932 +                       spin_unlock_znode(node);
66933 +                       reiser4_atom_wait_event(atom);
66934 +                       spin_lock_znode(node);
66935 +               }
66936 +
66937 +               reiser4_uncapture_block(ZJNODE(node));  /* NOTE(review): no spin_unlock_znode() follows - presumably this call drops the jnode spin lock; confirm */
66938 +               spin_unlock_atom(atom);
66939 +               zput(node);
66940 +       }
66941 +}
66942 +
66943 +/* This is called from longterm_unlock_znode() when last lock is released from
66944 +   the node that has been removed from the tree. It unlinks the node from the
66945 +   sibling list, invalidates its lock and uncaptures it from the transaction. */
66946 +void forget_znode(lock_handle * handle)
66947 +{
66948 +       znode *node;
66949 +       reiser4_tree *tree;
66950 +
66951 +       assert("umka-319", handle != NULL);
66952 +
66953 +       node = handle->node;
66954 +       tree = znode_get_tree(node);
66955 +
66956 +       assert("vs-164", znode_is_write_locked(node));
66957 +       assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66958 +       assert_rw_locked(&(node->lock.guard));
66959 +
66960 +       /* We assume that this node was detached from its parent before
66961 +        * unlocking, it gives no way to reach this node from parent through a
66962 +        * down link.  The node should have no children and, thereby, can't be
66963 +        * reached from them by their parent pointers.  The only way to obtain a
66964 +        * reference to the node is to use sibling pointers from its left and
66965 +        * right neighbors.  In the next several lines we remove the node from
66966 +        * the sibling list. */
66967 +
66968 +       write_lock_tree(tree);
66969 +       sibling_list_remove(node);
66970 +       znode_remove(node, tree);
66971 +       write_unlock_tree(tree);
66972 +
66973 +       /* Here we set JNODE_DYING and cancel all pending lock requests.  It
66974 +        * forces all lock requestor threads to repeat iterations of getting
66975 +        * lock on a child, neighbor or parent node.  But, those threads can't
66976 +        * come to this node again, because this node is no longer a child,
66977 +        * neighbor or parent of any other node.  This order of znode
66978 +        * invalidation does not allow other threads to waste cpu time in a busy
66979 +        * loop, trying to lock dying object.  The exception is in the flush
66980 +        * code when we take node directly from atom's capture list.*/
66981 +       reiser4_invalidate_lock(handle);
66982 +       uncapture_znode(node);
66983 +}
66984 +
66985 +/* Verify that @pointer addresses an internal item whose down link is the block
66986 +   of @child. Returns NS_FOUND on a match and NS_NOT_FOUND otherwise. */
66987 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to @child */ ,
66988 +                      const znode * child /* child znode */ )
66989 +{
66990 +       item_plugin *plug;
66991 +       reiser4_block_nr down_addr;
66992 +
66993 +       assert("nikita-1016", pointer != NULL);
66994 +       assert("nikita-1017", child != NULL);
66995 +       assert("nikita-1018", pointer->node != NULL);
66996 +
66997 +       assert("nikita-1325", znode_is_any_locked(pointer->node));
66998 +
66999 +       assert("nikita-2985",
67000 +              znode_get_level(pointer->node) == znode_get_level(child) + 1);
67001 +
67002 +       coord_clear_iplug((coord_t *) pointer);
67003 +
67004 +       /* anything but an existing unit of an internal item cannot be a
67005 +          pointer to @child */
67006 +       if (!coord_is_existing_unit(pointer) || !item_is_internal(pointer))
67007 +               return NS_NOT_FOUND;
67008 +
67009 +       plug = item_plugin_by_coord(pointer);
67010 +       assert("vs-513", plug->s.internal.down_link);
67011 +       plug->s.internal.down_link(pointer, NULL, &down_addr);
67012 +
67013 +       /* the item is a pointer to @child only if the block numbers agree */
67014 +       if (!disk_addr_eq(&down_addr, znode_get_block(child)))
67015 +               return NS_NOT_FOUND;
67016 +       return NS_FOUND;
67017 +}
67018 +
67019 +/* find coord where pointer to new @child will be inserted in @parent.
67020 +
67021 +   The new pointer goes right after the pointer to @left, so the pointer to
67022 +   @left is looked up and @result is marked AFTER_UNIT. Returns
67023 +   RETERR(NS_NOT_FOUND) when that position was found, RETERR(-EIO) otherwise.
67024 +*/
67025 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
67026 +                      znode *
67027 +                      child UNUSED_ARG /* child znode, passed locked */ ,
67028 +                      znode * left /* left brother of new node */ ,
67029 +                      coord_t * result /* where result is stored in */ )
67030 +{
67031 +       int left_res;
67032 +
67033 +       assert("nikita-1486", parent != NULL);
67034 +       assert("nikita-1487", child != NULL);
67035 +       assert("nikita-1488", result != NULL);
67036 +
67037 +       left_res = find_child_ptr(parent, left, result);
67038 +       if (left_res == NS_FOUND) {
67039 +               /* insertion point is just after the left brother */
67040 +               result->between = AFTER_UNIT;
67041 +               return RETERR(NS_NOT_FOUND);
67042 +       }
67043 +       warning("nikita-1489", "Cannot find brother position: %i", left_res);
67044 +       return RETERR(-EIO);
67045 +}
67046 +
67047 +/* find coord of pointer to @child in @parent.
67048 +
67049 +   Tries, in order: the position cached in child->in_parent, a lookup in @parent by the left delimiting key of @child, and a scan of @parent by block number. Returns NS_FOUND with @result set, otherwise the result of the failed lookup or scan.
67050 +
67051 +*/
67052 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
67053 +                  znode * child /* child znode, passed locked */ ,
67054 +                  coord_t * result /* where result is stored in */ )
67055 +{
67056 +       int lookup_res;
67057 +       node_plugin *nplug;
67058 +       /* left delimiting key of a child */
67059 +       reiser4_key ld;
67060 +       reiser4_tree *tree;
67061 +
67062 +       assert("nikita-934", parent != NULL);
67063 +       assert("nikita-935", child != NULL);
67064 +       assert("nikita-936", result != NULL);
67065 +       assert("zam-356", znode_is_loaded(parent));
67066 +
67067 +       coord_init_zero(result);
67068 +       result->node = parent;
67069 +
67070 +       nplug = parent->nplug;
67071 +       assert("nikita-939", nplug != NULL);
67072 +
67073 +       tree = znode_get_tree(parent);
67074 +       /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
67075 +        * not aliased to ->in_parent of some znode. Otherwise,
67076 +        * parent_coord_to_coord() below would modify data protected by tree
67077 +        * lock. */
67078 +       read_lock_tree(tree);
67079 +       /* fast path. Try to use cached value, unless it holds the (unsigned short)~0 "invalid" mark stored below; the mark is compared directly because "item_pos + 1 != 0" cannot match it once a 16-bit item_pos is promoted to int. Lock tree to keep
67080 +          node->pos_in_parent and pos->*_blocknr consistent. */
67081 +       if (child->in_parent.item_pos != (unsigned short)~0) {
67082 +               parent_coord_to_coord(&child->in_parent, result);
67083 +               if (check_tree_pointer(result, child) == NS_FOUND) {
67084 +                       read_unlock_tree(tree);
67085 +                       return NS_FOUND;
67086 +               }
67087 +
67088 +               child->in_parent.item_pos = (unsigned short)~0; /* cached position is stale: invalidate it. NOTE(review): written with only the tree read lock held */
67089 +       }
67090 +       read_unlock_tree(tree);
67091 +
67092 +       /* if above failed, find some key from @child. We are looking for the
67093 +          least key in a child. */
67094 +       read_lock_dk(tree);
67095 +       ld = *znode_get_ld_key(child);
67096 +       read_unlock_dk(tree);
67097 +       /*
67098 +        * now, lookup parent with key just found. Note, that left delimiting
67099 +        * key doesn't identify node uniquely, because (in extremely rare
67100 +        * case) two nodes can have equal left delimiting keys, if one of them
67101 +        * is completely filled with directory entries that all happened to be
67102 +        * hash collision. But, we check block number in check_tree_pointer()
67103 +        * and, so, are safe.
67104 +        */
67105 +       lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
67106 +       /* update cached pos_in_node */
67107 +       if (lookup_res == NS_FOUND) {
67108 +               write_lock_tree(tree);
67109 +               coord_to_parent_coord(result, &child->in_parent);
67110 +               write_unlock_tree(tree);
67111 +               lookup_res = check_tree_pointer(result, child);
67112 +       }
67113 +       if (lookup_res == NS_NOT_FOUND)
67114 +               lookup_res = find_child_by_addr(parent, child, result); /* last resort: scan @parent by block number */
67115 +       return lookup_res;
67116 +}
67117 +
67118 +/* find coord of pointer to @child in @parent by scanning
67119 +
67120 +   Walk the units of @parent and let check_tree_pointer() compare each of
67121 +   them with @child. On a hit the position is stored in child->in_parent
67122 +   under the tree write lock and NS_FOUND is returned, else NS_NOT_FOUND.
67123 +
67124 +*/
67125 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
67126 +                             znode * child /* child znode, passed locked */ ,
67127 +                             coord_t * result /* where result is stored in */ )
67128 +{
67129 +       reiser4_tree *tree;
67130 +
67131 +       assert("nikita-1320", parent != NULL);
67132 +       assert("nikita-1321", child != NULL);
67133 +       assert("nikita-1322", result != NULL);
67134 +
67135 +       for_all_units(result, parent) {
67136 +               if (check_tree_pointer(result, child) == NS_FOUND) {
67137 +                       /* remember where the pointer was found */
67138 +                       tree = znode_get_tree(parent);
67139 +                       write_lock_tree(tree);
67140 +                       coord_to_parent_coord(result, &child->in_parent);
67141 +                       write_unlock_tree(tree);
67142 +                       return NS_FOUND;
67143 +               }
67144 +       }
67145 +       /* no unit of @parent points to @child */
67146 +       return NS_NOT_FOUND;
67147 +}
67148 +
67149 +/* true if @addr is an "unallocated block number", i.e. an address with the highest bit set. */
67150 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to check */ )
67151 +{
67152 +       reiser4_block_nr status;
67153 +
67154 +       assert("nikita-1766", addr != NULL);
67155 +       cassert(sizeof(reiser4_block_nr) == 8);
67156 +       status = *addr & REISER4_BLOCKNR_STATUS_BIT_MASK;
67157 +       return status == REISER4_UNALLOCATED_STATUS_VALUE;
67158 +}
67159 +
67160 +/* returns true if removing the key range [from_key, to_key] removes the whole
67161 +   item @from, i.e. the range starts no later than its first key and covers its last byte */
67162 +static int
67163 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
67164 +                       const reiser4_key * to_key)
67165 +{
67166 +       item_plugin *plug;
67167 +       reiser4_key item_key;   /* holds the first, then the last key of @from */
67168 +
67169 +       assert("umka-325", from != NULL);
67170 +       assert("", item_is_extent(from));
67171 +
67172 +       /* the range must not start after the first key of the item */
67173 +       item_key_by_coord(from, &item_key);
67174 +       if (keygt(from_key, &item_key))
67175 +               return 0;
67176 +
67177 +       /* key of the last byte: the key the plugin reports via append_key,
67178 +          with its offset decreased by one */
67179 +       plug = item_plugin_by_coord(from);
67180 +       assert("vs-611", plug && plug->s.file.append_key);
67181 +
67182 +       plug->s.file.append_key(from, &item_key);
67183 +       set_key_offset(&item_key, get_key_offset(&item_key) - 1);
67184 +
67185 +       /* the item goes away completely only if the last byte is removed
67186 +          too, that is, @to_key is not less than the key of that byte */
67187 +       return !keylt(to_key, &item_key);
67188 +}
67189 +
67190 +/* helper function for prepare_twig_kill(): @left and @right are formatted
67191 + * neighbors of the extent item being removed (either may be NULL). Load and read-lock them,
67192 + * storing lock handles into @kdata->left and @kdata->right for later use by kill_hook_extent(). On failure everything taken here is released and the error is returned. */
67193 +static int
67194 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
67195 +{
67196 +       int result;
67197 +       int left_loaded;
67198 +       int right_loaded;
67199 +
67200 +       result = 0;
67201 +       left_loaded = right_loaded = 0;
67202 +
67203 +       if (left != NULL) {
67204 +               result = zload(left);
67205 +               if (result == 0) {
67206 +                       left_loaded = 1;
67207 +                       result = longterm_lock_znode(kdata->left, left,
67208 +                                                    ZNODE_READ_LOCK,
67209 +                                                    ZNODE_LOCK_LOPRI);
67210 +               }
67211 +       }
67212 +       if (result == 0 && right != NULL) {
67213 +               result = zload(right);
67214 +               if (result == 0) {
67215 +                       right_loaded = 1;
67216 +                       result = longterm_lock_znode(kdata->right, right,
67217 +                                                    ZNODE_READ_LOCK,
67218 +                                                    ZNODE_LOCK_HIPRI |
67219 +                                                    ZNODE_LOCK_NONBLOCK);
67220 +               }
67221 +       }
67222 +       if (result != 0) {
67223 +               done_lh(kdata->left);   /* error: undo whatever was locked or loaded above */
67224 +               done_lh(kdata->right);
67225 +               if (left_loaded != 0)
67226 +                       zrelse(left);
67227 +               if (right_loaded != 0)
67228 +                       zrelse(right);
67229 +       }
67230 +       return result;
67231 +}
67232 +
67233 +static void done_children(carry_kill_data * kdata)      /* zrelse the nodes held in @kdata->left and @kdata->right and release their lock handles */
67234 +{
67235 +       if (kdata->left != NULL && kdata->left->node != NULL) { /* skip a handle that is absent or holds no node */
67236 +               zrelse(kdata->left->node);
67237 +               done_lh(kdata->left);
67238 +       }
67239 +       if (kdata->right != NULL && kdata->right->node != NULL) {
67240 +               zrelse(kdata->right->node);
67241 +               done_lh(kdata->right);
67242 +       }
67243 +}
67244 +
67245 +/* part of cut_node. It is called when cut_node is called to remove or cut part
67246 +   of extent item. When head of that item is removed - we have to update right
67247 +   delimiting key of left neighbor of extent. When item is removed completely - we
67248 +   have to set sibling link between left and right neighbor of removed
67249 +   extent. This may return -E_DEADLOCK because of trying to get left neighbor
67250 +   locked. So, caller should repeat an attempt. Returns 0 when nothing has to be prepared or preparation succeeded.
67251 +*/
67252 +/* Audited by: umka (2002.06.16) */
67253 +static int
67254 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
67255 +{
67256 +       int result;
67257 +       reiser4_key key;
67258 +       lock_handle left_lh;
67259 +       lock_handle right_lh;
67260 +       coord_t left_coord;
67261 +       coord_t *from;
67262 +       znode *left_child;
67263 +       znode *right_child;
67264 +       reiser4_tree *tree;
67265 +       int left_zloaded_here, right_zloaded_here;
67266 +
67267 +       from = kdata->params.from;
67268 +       assert("umka-326", from != NULL);
67269 +       assert("umka-327", kdata->params.to != NULL);
67270 +
67271 +       /* for one extent item only yet */
67272 +       assert("vs-591", item_is_extent(from));
67273 +       assert("vs-592", from->item_pos == kdata->params.to->item_pos);
67274 +
67275 +       if ((kdata->params.from_key
67276 +            && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
67277 +           || from->unit_pos != 0) {
67278 +               /* head of item @from is not removed, there is nothing to
67279 +                  worry about */
67280 +               return 0;
67281 +       }
67282 +
67283 +       result = 0;
67284 +       left_zloaded_here = 0;
67285 +       right_zloaded_here = 0;
67286 +
67287 +       left_child = right_child = NULL;
67288 +
67289 +       coord_dup(&left_coord, from);
67290 +       init_lh(&left_lh);
67291 +       init_lh(&right_lh);
67292 +       if (coord_prev_unit(&left_coord)) {
67293 +               /* @from is leftmost item in its node */
67294 +               if (!locked_left_neighbor) {
67295 +                       result =
67296 +                           reiser4_get_left_neighbor(&left_lh, from->node,
67297 +                                                     ZNODE_READ_LOCK,
67298 +                                                     GN_CAN_USE_UPPER_LEVELS);
67299 +                       switch (result) {
67300 +                       case 0:
67301 +                               break;
67302 +                       case -E_NO_NEIGHBOR:
67303 +                               /* there is no formatted node to the left of
67304 +                                  from->node */
67305 +                               warning("vs-605",
67306 +                                       "extent item has smallest key in "
67307 +                                       "the tree and it is about to be removed");
67308 +                               return 0;
67309 +                       case -E_DEADLOCK:
67310 +                               /* need to restart; deliberately falls through to return the error */
67311 +                       default:
67312 +                               return result;
67313 +                       }
67314 +
67315 +                       /* we have acquired left neighbor of from->node */
67316 +                       result = zload(left_lh.node);
67317 +                       if (result)
67318 +                               goto done;
67319 +
67320 +                       locked_left_neighbor = left_lh.node;
67321 +               } else {
67322 +                       /* squalloc_right_twig_cut should have supplied locked
67323 +                        * left neighbor */
67324 +                       assert("vs-834",
67325 +                              znode_is_write_locked(locked_left_neighbor));
67326 +                       result = zload(locked_left_neighbor);
67327 +                       if (result)
67328 +                               return result;
67329 +               }
67330 +
67331 +               left_zloaded_here = 1;
67332 +               coord_init_last_unit(&left_coord, locked_left_neighbor);
67333 +       }
67334 +
67335 +       if (!item_is_internal(&left_coord)) {
67336 +               /* what else but extent can be on twig level */
67337 +               assert("vs-606", item_is_extent(&left_coord));
67338 +
67339 +               /* there is no left formatted child */
67340 +               if (left_zloaded_here)
67341 +                       zrelse(locked_left_neighbor);
67342 +               done_lh(&left_lh);
67343 +               return 0;
67344 +       }
67345 +
67346 +       tree = znode_get_tree(left_coord.node);
67347 +       left_child = child_znode(&left_coord, left_coord.node, 1, 0);  /* in-core lookup only; left_child is tested for NULL at "done" */
67348 +
67349 +       if (IS_ERR(left_child)) {
67350 +               result = PTR_ERR(left_child);
67351 +               goto done;
67352 +       }
67353 +
67354 +       /* left child is acquired, calculate new right delimiting key for it
67355 +          and get right child if it is necessary */
67356 +       if (item_removed_completely
67357 +           (from, kdata->params.from_key, kdata->params.to_key)) {
67358 +               /* try to get right child of removed item */
67359 +               coord_t right_coord;
67360 +
67361 +               assert("vs-607",
67362 +                      kdata->params.to->unit_pos ==
67363 +                      coord_last_unit_pos(kdata->params.to));
67364 +               coord_dup(&right_coord, kdata->params.to);
67365 +               if (coord_next_unit(&right_coord)) {
67366 +                       /* @to is rightmost unit in the node */
67367 +                       result =
67368 +                           reiser4_get_right_neighbor(&right_lh, from->node,
67369 +                                                      ZNODE_READ_LOCK,
67370 +                                                      GN_CAN_USE_UPPER_LEVELS);
67371 +                       switch (result) {
67372 +                       case 0:
67373 +                               result = zload(right_lh.node);
67374 +                               if (result)
67375 +                                       goto done;
67376 +
67377 +                               right_zloaded_here = 1;
67378 +                               coord_init_first_unit(&right_coord,
67379 +                                                     right_lh.node);
67380 +                               item_key_by_coord(&right_coord, &key);
67381 +                               break;
67382 +
67383 +                       case -E_NO_NEIGHBOR:
67384 +                               /* there is no formatted node to the right of
67385 +                                  from->node */
67386 +                               read_lock_dk(tree);
67387 +                               key = *znode_get_rd_key(from->node);
67388 +                               read_unlock_dk(tree);
67389 +                               right_coord.node = NULL;
67390 +                               result = 0;
67391 +                               break;
67392 +                       default:
67393 +                               /* real error */
67394 +                               goto done;
67395 +                       }
67396 +               } else {
67397 +                       /* there is an item to the right of @from - take its key */
67398 +                       item_key_by_coord(&right_coord, &key);
67399 +               }
67400 +
67401 +               /* try to get right child of @from */
67402 +               if (right_coord.node && /* there is right neighbor of @from */
67403 +                   item_is_internal(&right_coord)) {   /* it is internal item */
67404 +                       right_child = child_znode(&right_coord,
67405 +                                                 right_coord.node, 1, 0);
67406 +
67407 +                       if (IS_ERR(right_child)) {
67408 +                               result = PTR_ERR(right_child);
67409 +                               goto done;
67410 +                       }
67411 +
67412 +               }
67413 +               /* whole extent is removed between znodes left_child and right_child. Prepare them for linking and
67414 +                  update of right delimiting key of left_child */
67415 +               result = prepare_children(left_child, right_child, kdata);
67416 +       } else {
67417 +               /* only the head of the item is removed. left_child has to get its right delimiting key updated. Prepare it for that */
67418 +               result = prepare_children(left_child, NULL, kdata);
67419 +       }
67420 +
67421 +      done:
67422 +       if (right_child)
67423 +               zput(right_child);
67424 +       if (right_zloaded_here)
67425 +               zrelse(right_lh.node);
67426 +       done_lh(&right_lh);
67427 +
67428 +       if (left_child)
67429 +               zput(left_child);
67430 +       if (left_zloaded_here)
67431 +               zrelse(locked_left_neighbor);
67432 +       done_lh(&left_lh);
67433 +       return result;
67434 +}
67435 +
67436 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
67437 +   are to be cut completely. Posts a COP_CUT carry operation on @from->node and returns the result of reiser4_carry(), or the error from carry pool/operation setup. */
67438 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
67439 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,       /* first key to be removed */
67440 +                    const reiser4_key * to_key,        /* last key to be removed */
67441 +                    reiser4_key *
67442 +                    smallest_removed /* smallest key actually removed */ )
67443 +{
67444 +       int result;
67445 +       carry_pool *pool;
67446 +       carry_level *lowest_level;
67447 +       carry_cut_data *cut_data;
67448 +       carry_op *op;
67449 +
67450 +       assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
67451 +
67452 +       pool =
67453 +           init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67454 +                           sizeof(*cut_data));
67455 +       if (IS_ERR(pool))
67456 +               return PTR_ERR(pool);
67457 +       lowest_level = (carry_level *) (pool + 1);      /* the carry levels live right behind the pool header */
67458 +       init_carry_level(lowest_level, pool);
67459 +
67460 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67461 +       assert("vs-1509", op != NULL);
67462 +       if (IS_ERR(op) || op == NULL) { /* also reject NULL when asserts are compiled out: @op is dereferenced below */
67463 +               done_carry_pool(pool);
67464 +               return op ? PTR_ERR(op) : RETERR(-EIO);
67465 +       }
67466 +
67467 +       cut_data = (carry_cut_data *) (lowest_level + 3);       /* cut data follows the 3 carry levels allocated above */
67468 +       cut_data->params.from = from;
67469 +       cut_data->params.to = to;
67470 +       cut_data->params.from_key = from_key;
67471 +       cut_data->params.to_key = to_key;
67472 +       cut_data->params.smallest_removed = smallest_removed;
67473 +
67474 +       op->u.cut_or_kill.is_cut = 1;
67475 +       op->u.cut_or_kill.u.cut = cut_data;
67476 +
67477 +       result = reiser4_carry(lowest_level, NULL);
67478 +       done_carry_pool(pool);
67479 +
67480 +       return result;
67481 +}
67482 +
67483 +/* kill part of the node
67484 +
67485 +   Kill part or whole content of node.
67486 +
67487 +   kill data between @from and @to of @from->node (COP_CUT carry operation with
67488 +   is_cut == 0) and call reiser4_carry() to make corresponding changes in the
67489 +   tree. @from->node may become empty. If so - pointer to it will be removed.
67490 +   Smallest removed key is stored in @smallest_removed. Returns 0 or error code.
67491 +
67492 +*/
67493 +int kill_node_content(coord_t * from,  /* coord of the first unit/item that will be eliminated */
67494 +                     coord_t * to,     /* coord of the last unit/item that will be eliminated */
67495 +                     const reiser4_key * from_key,     /* first key to be removed */
67496 +                     const reiser4_key * to_key,       /* last key to be removed */
67497 +                     reiser4_key * smallest_removed,   /* smallest key actually removed */
67498 +                     znode * locked_left_neighbor,     /* this is set when kill_node_content is called with left neighbor
67499 +                                                        * locked (in squalloc_right_twig_cut, namely) */
67500 +                     struct inode *inode,      /* inode of file whose item (or its part) is to be killed. This is necessary to
67501 +                                                  invalidate pages together with item pointing to them */
67502 +                     int truncate)     /* non-zero when this call is made for file truncate */
67503 +{
67504 +       int result;
67505 +       carry_pool *pool;
67506 +       carry_level *lowest_level;
67507 +       carry_kill_data *kdata;
67508 +       lock_handle *left_child;
67509 +       lock_handle *right_child;
67510 +       carry_op *op;
67511 +
67512 +       assert("umka-328", from != NULL);
67513 +       assert("vs-316", !node_is_empty(from->node));
67514 +       assert("nikita-1812", coord_is_existing_unit(from)
67515 +              && coord_is_existing_unit(to));
67516 +
67517 +       /* one allocation: carry_pool, 3 carry_level-s, carry_kill_data, 2 lock handles and buffer for kill_hook_extent */
67518 +       pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
67519 +                              sizeof(carry_kill_data) +
67520 +                              2 * sizeof(lock_handle) +
67521 +                              5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
67522 +       if (IS_ERR(pool))
67523 +               return PTR_ERR(pool);
67524 +
67525 +       lowest_level = (carry_level *) (pool + 1);      /* carry levels follow the pool header */
67526 +       init_carry_level(lowest_level, pool);
67527 +
67528 +       kdata = (carry_kill_data *) (lowest_level + 3); /* kill data follows the 3 carry levels */
67529 +       left_child = (lock_handle *) (kdata + 1);       /* then the two lock handles */
67530 +       right_child = left_child + 1;
67531 +
67532 +       init_lh(left_child);
67533 +       init_lh(right_child);
67534 +
67535 +       kdata->params.from = from;
67536 +       kdata->params.to = to;
67537 +       kdata->params.from_key = from_key;
67538 +       kdata->params.to_key = to_key;
67539 +       kdata->params.smallest_removed = smallest_removed;
67540 +       kdata->params.truncate = truncate;
67541 +       kdata->flags = 0;
67542 +       kdata->inode = inode;
67543 +       kdata->left = left_child;
67544 +       kdata->right = right_child;
67545 +       /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
67546 +       kdata->buf = (char *)(right_child + 1);
67547 +
67548 +       if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
67549 +               /* left child of extent item may have to get updated right
67550 +                  delimiting key and to get linked with right child of extent
67551 +                  @from if it will be removed completely */
67552 +               result = prepare_twig_kill(kdata, locked_left_neighbor);
67553 +               if (result) {
67554 +                       done_children(kdata);
67555 +                       done_carry_pool(pool);
67556 +                       return result;
67557 +               }
67558 +       }
67559 +
67560 +       op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
67561 +       if (IS_ERR(op) || (op == NULL)) {
67562 +               done_children(kdata);
67563 +               done_carry_pool(pool);
67564 +               return RETERR(op ? PTR_ERR(op) : -EIO); /* NULL op is reported as -EIO */
67565 +       }
67566 +
67567 +       op->u.cut_or_kill.is_cut = 0;   /* kill, not cut */
67568 +       op->u.cut_or_kill.u.kill = kdata;
67569 +
67570 +       result = reiser4_carry(lowest_level, NULL);
67571 +
67572 +       done_children(kdata);   /* presumably releases kdata->left/right lock handles - confirm */
67573 +       done_carry_pool(pool);
67574 +       return result;
67575 +}
67576 +
67577 +void
67578 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
67579 +{      /* stand-in for the tail kill hook, used when the node itself is not read */
67580 +       if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
67581 +               pgoff_t start_pg, end_pg;
67582 +
67583 +               start_pg = start >> PAGE_CACHE_SHIFT;
67584 +               end_pg = (end - 1) >> PAGE_CACHE_SHIFT; /* page holding the last killed byte */
67585 +
67586 +               if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
67587 +                       /*
67588 +                        * @start is page aligned: kill up to the page boundary.
67589 +                        */
67590 +                       assert("vs-123456", start_pg == end_pg);
67591 +                       reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
67592 +                                                truncate);
67593 +               } else if (start_pg != end_pg) {
67594 +                       /* page boundary is within killed portion of node: the page
67595 +                        * holding @end is invalidated. NOTE(review): literal 1 is
67596 +                        * passed below instead of @truncate - confirm intended. */
67597 +                       assert("vs-654321", end_pg - start_pg == 1);
67598 +                       reiser4_invalidate_pages(inode->i_mapping, end_pg,
67599 +                                                end_pg - start_pg, 1);
67600 +               }
67601 +       }
67602 +       inode_sub_bytes(inode, end - start);    /* done whether or not the file is mmapped */
67603 +}
67604 +
67605 +/**
67606 + * Delete whole @node from the reiser4 tree without loading it.
67607 + *
67608 + * @node: write-locked node to be deleted,
67609 + * @smallest_removed: in: smallest key removed so far (read only if @object is
67610 + *                    set); out: left delimiting key of @node,
67611 + * @object: inode pointer, if we truncate a file body.
67612 + * @truncate: true if called for file truncate.
67613 + *
67614 + * @return: 0 if success, error code otherwise.
67615 + *
67616 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
67617 + * contains the right value of the smallest removed key from the previous
67618 + * cut_worker() iteration.  This is needed for proper accounting of
67619 + * "i_blocks" and "i_bytes" fields of the @object.
67620 + */
67621 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
67622 +                       struct inode *object, int truncate)
67623 +{
67624 +       lock_handle parent_lock;
67625 +       coord_t cut_from;
67626 +       coord_t cut_to;
67627 +       reiser4_tree *tree;
67628 +       int ret;
67629 +
67630 +       assert("zam-937", node != NULL);
67631 +       assert("zam-933", znode_is_write_locked(node));
67632 +       assert("zam-999", smallest_removed != NULL);
67633 +
67634 +       init_lh(&parent_lock);
67635 +
67636 +       ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
67637 +       if (ret)
67638 +               return ret;
67639 +
67640 +       assert("zam-934", !znode_above_root(parent_lock.node));
67641 +
67642 +       ret = zload(parent_lock.node);
67643 +       if (ret)
67644 +               goto failed_nozrelse;
67645 +
67646 +       ret = find_child_ptr(parent_lock.node, node, &cut_from);
67647 +       if (ret)
67648 +               goto failed;
67649 +
67650 +       /* decrement child counter and set parent pointer to NULL before
67651 +          deleting the pointer from parent node because of checks in
67652 +          internal_kill_item_hook (we can delete the last item from the parent
67653 +          node, the parent node is going to be deleted and its c_count should
67654 +          be zero). */
67655 +
67656 +       tree = znode_get_tree(node);
67657 +       write_lock_tree(tree);
67658 +       init_parent_coord(&node->in_parent, NULL);
67659 +       --parent_lock.node->c_count;
67660 +       write_unlock_tree(tree);
67661 +
67662 +       assert("zam-989", item_is_internal(&cut_from));
67663 +
67664 +       /* @node should be deleted after unlocking. */
67665 +       ZF_SET(node, JNODE_HEARD_BANSHEE);
67666 +
67667 +       /* remove a pointer from the parent node to the node being deleted. */
67668 +       coord_dup(&cut_to, &cut_from);
67669 +       /* FIXME: shouldn't this be kill_node_content */
67670 +       ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
67671 +       if (ret)
67672 +               /* FIXME(Zam): Should we re-connect the node to its parent if
67673 +                * cut_node fails? */
67674 +               goto failed;
67675 +
67676 +       {
67677 +               reiser4_tree *tree = current_tree;      /* shadows the outer @tree */
67678 +               __u64 start_offset = 0, end_offset = 0;
67679 +
67680 +               read_lock_tree(tree);
67681 +               write_lock_dk(tree);
67682 +               if (object) {
67683 +                       /* We use @smallest_removed and the left delimiting of
67684 +                        * the current node for @object->i_blocks, i_bytes
67685 +                        * calculation.  We assume that the items after the
67686 +                        * *@smallest_removed key have been deleted from the
67687 +                        * file body. */
67688 +                       start_offset = get_key_offset(znode_get_ld_key(node));
67689 +                       end_offset = get_key_offset(smallest_removed);
67690 +               }
67691 +
67692 +               assert("zam-1021", znode_is_connected(node));
67693 +               if (node->left) /* left neighbor takes over the right delimiting key of @node */
67694 +                       znode_set_rd_key(node->left, znode_get_rd_key(node));
67695 +
67696 +               *smallest_removed = *znode_get_ld_key(node);
67697 +
67698 +               write_unlock_dk(tree);
67699 +               read_unlock_tree(tree);
67700 +
67701 +               if (object) {
67702 +                       /* we used to perform actions which are to be performed on items on their removal from tree in
67703 +                          special item method - kill_hook. Here for optimization reasons we avoid reading node
67704 +                          containing item we remove and can not call item's kill hook. Instead we call function which
67705 +                          does exactly the same things as tail kill hook in assumption that node we avoid reading
67706 +                          contains only one item and that item is a tail one. */
67707 +                       fake_kill_hook_tail(object, start_offset, end_offset,
67708 +                                           truncate);
67709 +               }
67710 +       }
67711 +      failed:
67712 +       zrelse(parent_lock.node);
67713 +      failed_nozrelse:
67714 +       done_lh(&parent_lock);
67715 +
67716 +       return ret;
67717 +}
67718 +
67719 +static int can_delete(const reiser4_key *key, znode *node)
67720 +{      /* non-zero iff @key <= left delimiting key of @node (read under dk lock) */
67721 +       int result;
67722 +
67723 +       read_lock_dk(current_tree);
67724 +       result = keyle(key, znode_get_ld_key(node));
67725 +       read_unlock_dk(current_tree);
67726 +       return result;
67727 +}
67728 +
67729 +/**
67730 + * Cut items of one node at a time, moving leftwards. Not optimal, but simple.
67731 + *
67732 + * @tap: the point deletion process begins from,
67733 + * @from_key: the beginning of the deleted key range,
67734 + * @to_key: the end of the deleted key range,
67735 + * @smallest_removed: the smallest removed key,
67736 + * @object: inode whose items are cut, may be NULL,
67737 + * @truncate: true if called for file truncate.
67738 + * @progress: number of completed iterations (non-zero if progress was made),
67739 + *            @smallest_removed value is actual in that case.
67740 + *
67741 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long
67742 + * reiser4_cut_tree operation was interrupted for allowing atom commit.
67743 + */
67744 +int
67745 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
67746 +                      const reiser4_key * to_key,
67747 +                      reiser4_key * smallest_removed, struct inode *object,
67748 +                      int truncate, int *progress)
67749 +{
67750 +       lock_handle next_node_lock;
67751 +       coord_t left_coord;
67752 +       int result;
67753 +
67754 +       assert("zam-931", tap->coord->node != NULL);
67755 +       assert("zam-932", znode_is_write_locked(tap->coord->node));
67756 +
67757 +       *progress = 0;
67758 +       init_lh(&next_node_lock);
67759 +
67760 +       while (1) {
67761 +               znode *node;    /* node from which items are cut */
67762 +               node_plugin *nplug;     /* node plugin for @node */
67763 +
67764 +               node = tap->coord->node;
67765 +
67766 +               /* Move next_node_lock to the next node on the left. */
67767 +               result =
67768 +                   reiser4_get_left_neighbor(&next_node_lock, node,
67769 +                                             ZNODE_WRITE_LOCK,
67770 +                                             GN_CAN_USE_UPPER_LEVELS);
67771 +               if (result != 0 && result != -E_NO_NEIGHBOR)
67772 +                       break;
67773 +               /* Check can we delete the node as a whole. */
67774 +               if (*progress && znode_get_level(node) == LEAF_LEVEL &&
67775 +                   can_delete(from_key, node)) {
67776 +                       result = reiser4_delete_node(node, smallest_removed,
67777 +                                                    object, truncate);
67778 +               } else {
67779 +                       result = reiser4_tap_load(tap);
67780 +                       if (result)
67781 +                               break;  /* not return: next_node_lock must be released below */
67782 +
67783 +                       /* Prepare the second (right) point for cut_node() */
67784 +                       if (*progress)
67785 +                               coord_init_last_unit(tap->coord, node);
67786 +
67787 +                       else if (item_plugin_by_coord(tap->coord)->b.lookup ==
67788 +                                NULL)
67789 +                               /* set rightmost unit for the items without lookup method */
67790 +                               tap->coord->unit_pos =
67791 +                                   coord_last_unit_pos(tap->coord);
67792 +
67793 +                       nplug = node->nplug;
67794 +
67795 +                       assert("vs-686", nplug);
67796 +                       assert("vs-687", nplug->lookup);
67797 +
67798 +                       /* left_coord is leftmost unit cut from @node */
67799 +                       result = nplug->lookup(node, from_key,
67800 +                                              FIND_MAX_NOT_MORE_THAN,
67801 +                                              &left_coord);
67802 +
67803 +                       if (IS_CBKERR(result))
67804 +                               break;
67805 +
67806 +                       /* adjust coordinates so that they are set to existing units */
67807 +                       if (coord_set_to_right(&left_coord)
67808 +                           || coord_set_to_left(tap->coord)) {
67809 +                               result = 0;
67810 +                               break;
67811 +                       }
67812 +
67813 +                       if (coord_compare(&left_coord, tap->coord) ==
67814 +                           COORD_CMP_ON_RIGHT) {
67815 +                               /* keys from @from_key to @to_key are not in the tree */
67816 +                               result = 0;
67817 +                               break;
67818 +                       }
67819 +
67820 +                       if (left_coord.item_pos != tap->coord->item_pos) {
67821 +                               /* do not allow to cut more than one item. It is added to solve problem of truncating
67822 +                                  partially converted files. If file is partially converted there may exist a twig node
67823 +                                  containing both internal item or items pointing to leaf nodes with formatting items
67824 +                                  and extent item. We do not want to kill internal items being at twig node here
67825 +                                  because cut_tree_worker assumes killing them from the leaf level */
67826 +                               coord_dup(&left_coord, tap->coord);
67827 +                               assert("vs-1652",
67828 +                                      coord_is_existing_unit(&left_coord));
67829 +                               left_coord.unit_pos = 0;
67830 +                       }
67831 +
67832 +                       /* cut data from one node */
67833 +                       // *smallest_removed = *reiser4_min_key();
67834 +                       result =
67835 +                           kill_node_content(&left_coord, tap->coord, from_key,
67836 +                                             to_key, smallest_removed,
67837 +                                             next_node_lock.node, object,
67838 +                                             truncate);
67839 +                       reiser4_tap_relse(tap);
67840 +               }
67841 +               if (result)
67842 +                       break;
67843 +
67844 +               ++(*progress);
67845 +
67846 +               /* Check whether all items with keys >= from_key were removed
67847 +                * from the tree. */
67848 +               if (keyle(smallest_removed, from_key))
67849 +                       /* result = 0; */
67850 +                       break;
67851 +
67852 +               if (next_node_lock.node == NULL)
67853 +                       break;
67854 +
67855 +               result = reiser4_tap_move(tap, &next_node_lock);
67856 +               done_lh(&next_node_lock);
67857 +               if (result)
67858 +                       break;
67859 +
67860 +               /* Break long reiser4_cut_tree operation (deletion of a large
67861 +                  file) if atom requires commit. */
67862 +               if (*progress > CUT_TREE_MIN_ITERATIONS
67863 +                   && current_atom_should_commit()) {
67864 +                       result = -E_REPEAT;
67865 +                       break;
67866 +               }
67867 +       }
67868 +       done_lh(&next_node_lock);       /* common exit: drop the left neighbor lock, if still held */
67869 +       // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
67870 +       return result;
67871 +}
67872 +
67873 +/* there is a fundamental problem with optimizing deletes: VFS does it
67874 +   one file at a time.  Another problem is that if an item can be
67875 +   anything, then deleting items must be done one at a time.  It just
67876 +   seems clean to write this to specify a from and a to key, and cut
67877 +   everything between them though.  */
67878 +
67879 +/* use this function with care if deleting more than what is part of a single file. */
67880 +/* do not use this when cutting a single item, it is suboptimal for that */
67881 +
67882 +/* You are encouraged to write plugin specific versions of this.  It
67883 +   cannot be optimal for all plugins because it works item at a time,
67884 +   and some plugins could sometimes work node at a time. Regular files
67885 +   however are not optimizable to work node at a time because of
67886 +   extents needing to free the blocks they point to.
67887 +
67888 +   Optimizations compared to v3 code:
67889 +
67890 +   It does not balance (that task is left to memory pressure code).
67891 +
67892 +   Nodes are deleted only if empty.
67893 +
67894 +   Uses extents.
67895 +
67896 +   Performs read-ahead of formatted nodes whose contents are part of
67897 +   the deletion.
67898 +*/
67899 +
67900 +/**
67901 + * Delete everything from the reiser4 tree between keys @from_key and @to_key.
67902 + *
67903 + * @tree: tree to cut from (only asserted to be non-NULL here),
67904 + * @from_key: the beginning of the deleted key range,
67905 + * @to_key: the end of the deleted key range,
67906 + * @smallest_removed_p: where to store the smallest removed key, may be NULL,
67907 + * @object: owner of cutting items.
67908 + * @truncate: true if called for file truncate.
67909 + * @progress: return true if a progress in file items deletions was made,
67910 + *            @smallest_removed_p value is actual in that case.
67911 + *
67912 + * @return: 0 if success, error code otherwise, -E_REPEAT means that long cut_tree
67913 + * operation was interrupted for allowing atom commit .
67914 + */
67915 +
67916 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
67917 +                           const reiser4_key * to_key,
67918 +                           reiser4_key * smallest_removed_p,
67919 +                           struct inode *object, int truncate, int *progress)
67920 +{
67921 +       lock_handle lock;
67922 +       int result;
67923 +       tap_t tap;
67924 +       coord_t right_coord;
67925 +       reiser4_key smallest_removed;   /* scratch key used when the caller passes NULL */
67926 +       int (*cut_tree_worker) (tap_t *, const reiser4_key *,
67927 +                               const reiser4_key *, reiser4_key *,
67928 +                               struct inode *, int, int *);
67929 +       STORE_COUNTERS;
67930 +
67931 +       assert("umka-329", tree != NULL);
67932 +       assert("umka-330", from_key != NULL);
67933 +       assert("umka-331", to_key != NULL);
67934 +       assert("zam-936", keyle(from_key, to_key));
67935 +
67936 +       if (smallest_removed_p == NULL)
67937 +               smallest_removed_p = &smallest_removed;
67938 +
67939 +       init_lh(&lock);
67940 +
67941 +       do {
67942 +               /* Find rightmost item to cut away from the tree. */
67943 +               result = reiser4_object_lookup(object, to_key, &right_coord,
67944 +                                              &lock, ZNODE_WRITE_LOCK,
67945 +                                              FIND_MAX_NOT_MORE_THAN,
67946 +                                              TWIG_LEVEL, LEAF_LEVEL,
67947 +                                              CBK_UNIQUE, NULL /*ra_info */);
67948 +               if (result != CBK_COORD_FOUND)
67949 +                       break;
67950 +               if (object == NULL
67951 +                   || inode_file_plugin(object)->cut_tree_worker == NULL)
67952 +                       cut_tree_worker = cut_tree_worker_common;       /* no plugin-specific worker */
67953 +               else
67954 +                       cut_tree_worker =
67955 +                           inode_file_plugin(object)->cut_tree_worker;
67956 +               reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
67957 +               result =
67958 +                   cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
67959 +                                   object, truncate, progress);
67960 +               reiser4_tap_done(&tap);
67961 +
67962 +               reiser4_preempt_point();
67963 +
67964 +       } while (0);    /* single pass; the loop only provides a break target */
67965 +
67966 +       done_lh(&lock);
67967 +
67968 +       if (result) {
67969 +               switch (result) {
67970 +               case -E_NO_NEIGHBOR:
67971 +                       result = 0;     /* not reported to the caller as an error */
67972 +                       break;
67973 +               case -E_DEADLOCK:
67974 +                       result = -E_REPEAT;     /* fall through */
67975 +               case -E_REPEAT:
67976 +               case -ENOMEM:
67977 +               case -ENOENT:
67978 +                       break;
67979 +               default:
67980 +                       warning("nikita-2861", "failure: %i", result);
67981 +               }
67982 +       }
67983 +
67984 +       CHECK_COUNTERS;
67985 +       return result;
67986 +}
67987 +
67988 +/* repeat reiser4_cut_tree_object while it returns -E_REPEAT, i.e. until
67989 + * everything is deleted or an error occurs. unlike cut_file_items, it does not
67990 + * end current transaction if -E_REPEAT is returned by cut_tree_object. */
67991 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
67992 +                    const reiser4_key * to, struct inode *inode, int truncate)
67993 +{
67994 +       int result;
67995 +       int progress;   /* filled by the callee; not examined here */
67996 +
67997 +       do {
67998 +               result = reiser4_cut_tree_object(tree, from, to, NULL,
67999 +                                                inode, truncate, &progress);
68000 +       } while (result == -E_REPEAT);
68001 +
68002 +       return result;
68003 +}
68004 +
68005 +/* finish initialization of @tree; returns 0 on success, error code otherwise */
68006 +int reiser4_init_tree(reiser4_tree * tree      /* pointer to structure being
68007 +                                        * initialized */ ,
68008 +             const reiser4_block_nr * root_block       /* address of a root block
68009 +                                                        * on a disk */ ,
68010 +             tree_level height /* height of a tree */ ,
68011 +             node_plugin * nplug /* default node plugin */ )
68012 +{
68013 +       int result;
68014 +
68015 +       assert("nikita-306", tree != NULL);
68016 +       assert("nikita-307", root_block != NULL);
68017 +       assert("nikita-308", height > 0);
68018 +       assert("nikita-309", nplug != NULL);
68019 +       assert("zam-587", tree->super != NULL);  /* caller must have set tree->super already */
68020 +
68021 +       tree->root_block = *root_block;
68022 +       tree->height = height;
68023 +       tree->estimate_one_insert = calc_estimate_one_insert(height);
68024 +       tree->nplug = nplug;
68025 +
68026 +       tree->znode_epoch = 1ull;
68027 +
68028 +       cbk_cache_init(&tree->cbk_cache);
68029 +
68030 +       result = znodes_tree_init(tree);
68031 +       if (result == 0)
68032 +               result = jnodes_tree_init(tree);
68033 +       if (result == 0) {
68034 +               tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
68035 +                                 reiser4_ctx_gfp_mask_get());
68036 +               if (IS_ERR(tree->uber)) {
68037 +                       result = PTR_ERR(tree->uber);
68038 +                       tree->uber = NULL;      /* never leave an ERR_PTR in the tree */
68039 +               }
68040 +       }
68041 +       return result;
68042 +}
68043 +
68044 +/* release resources associated with @tree; NULL @tree is a no-op */
68045 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
68046 +{
68047 +       if (tree == NULL)
68048 +               return;
68049 +
68050 +       if (tree->uber != NULL) {
68051 +               zput(tree->uber);
68052 +               tree->uber = NULL;      /* cleared after the reference is dropped */
68053 +       }
68054 +       znodes_tree_done(tree);
68055 +       jnodes_tree_done(tree);
68056 +       cbk_cache_done(&tree->cbk_cache);
68057 +}
68058 +
68059 +/* Make Linus happy.
68060 +   Local variables:
68061 +   c-indentation-style: "K&R"
68062 +   mode-name: "LC"
68063 +   c-basic-offset: 8
68064 +   tab-width: 8
68065 +   fill-column: 120
68066 +   scroll-step: 1
68067 +   End:
68068 +*/
68069 diff -urN linux-2.6.27.orig/fs/reiser4/tree.h linux-2.6.27/fs/reiser4/tree.h
68070 --- linux-2.6.27.orig/fs/reiser4/tree.h 1970-01-01 03:00:00.000000000 +0300
68071 +++ linux-2.6.27/fs/reiser4/tree.h      2008-10-12 18:20:01.000000000 +0400
68072 @@ -0,0 +1,577 @@
68073 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68074 + * reiser4/README */
68075 +
68076 +/* Tree operations. See fs/reiser4/tree.c for comments */
68077 +
68078 +#if !defined( __REISER4_TREE_H__ )
68079 +#define __REISER4_TREE_H__
68080 +
68081 +#include "forward.h"
68082 +#include "debug.h"
68083 +#include "dformat.h"
68084 +#include "plugin/node/node.h"
68085 +#include "plugin/plugin.h"
68086 +#include "znode.h"
68087 +#include "tap.h"
68088 +
68089 +#include <linux/types.h>       /* for __u??  */
68090 +#include <linux/fs.h>          /* for struct super_block  */
68091 +#include <linux/spinlock.h>
68092 +#include <linux/sched.h>       /* for struct task_struct */
68093 +
68094 +/* fictive block number never actually used */
68095 +extern const reiser4_block_nr UBER_TREE_ADDR;
68096 +
68097 +/* &cbk_cache_slot - entry in a coord cache.
68098 +
68099 +   This is an entry in the coord_by_key (cbk) cache, represented by
68100 +   &cbk_cache.
68101 +
68102 +*/
68103 +typedef struct cbk_cache_slot {
68104 +       /* cached node */
68105 +       znode *node;
68106 +       /* linkage to the next cbk cache slot in LRU order */
68107 +       struct list_head lru;
68108 +} cbk_cache_slot;
68109 +
68110 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
68111 +
68112 +   cbk_cache is supposed to speed up tree lookups by caching results of recent
68113 +   successful lookups (we don't cache negative results as dentry cache
68114 +   does). Cache consists of relatively small number of entries kept in a LRU
68115 +   order. Each entry (&cbk_cache_slot) contains a pointer to znode, from
68116 +   which we can obtain a range of keys that are covered by this znode. Before
68117 +   embarking into real tree traversal we scan cbk_cache slot by slot and for
68118 +   each slot check whether key we are looking for is between minimal and
68119 +   maximal keys for node pointed to by this slot. If no match is found, real
68120 +   tree traversal is performed and if result is successful, appropriate entry
68121 +   is inserted into cache, possibly pulling least recently used entry out of
68122 +   it.
68123 +
68124 +   Tree spin lock is used to protect coord cache. If contention for this
68125 +   lock proves to be too high, finer grained locking can be added.
68126 +
68127 +   Invariants involving parts of this data-type:
68128 +
68129 +      [cbk-cache-invariant]
68130 +*/
68131 +typedef struct cbk_cache {
68132 +       /* lock serializing access to the cache */
68133 +       rwlock_t guard;
68134 +       int nr_slots;   /* presumably the number of entries in @slot - confirm */
68135 +       /* head of LRU list of cache slots */
68136 +       struct list_head lru;
68137 +       /* actual array of slots */
68138 +       cbk_cache_slot *slot;
68139 +} cbk_cache;
68140 +
68141 +/* level_lookup_result - possible outcome of looking up a key at some level.
68142 +   This is used by coord_by_key when traversing the tree downward. */
68143 +typedef enum {
68144 +       /* continue to the next level */
68145 +       LOOKUP_CONT,
68146 +       /* done. Either required item was found, or we can prove it
68147 +          doesn't exist, or some error occurred. */
68148 +       LOOKUP_DONE,
68149 +       /* restart traversal from the root. Infamous "repetition". */
68150 +       LOOKUP_REST
68151 +} level_lookup_result;
68152 +
68153 +/*    This is a representation of internal reiser4 tree where all file-system
68154 +   data and meta-data are stored. This structure is passed to all tree
68155 +   manipulation functions. It's different from the super block because:
68156 +   we don't want to limit ourselves to strictly one to one mapping
68157 +   between super blocks and trees, and, because they are logically
68158 +   different: there are things in a super block that have no relation to
68159 +   the tree (bitmaps, journalling area, mount options, etc.) and there
68160 +   are things in a tree that bear no relation to the super block, like
68161 +   tree of znodes.
68162 +
68163 +   At this time, there is only one tree
68164 +   per filesystem, and this struct is part of the super block.  We only
68165 +   call the super block the super block for historical reasons (most
68166 +   other filesystems call the per filesystem metadata the super block).
68167 +*/
68168 +
68169 +struct reiser4_tree {
68170 +       /* block_nr == 0 is fake znode. Write lock it, while changing
68171 +          tree height. */
68172 +       /* disk address of root node of a tree */
68173 +       reiser4_block_nr root_block;
68174 +
68175 +       /* level of the root node. If this is 1, tree consists of root
68176 +          node only */
68177 +       tree_level height;
68178 +
68179 +       /*
68180 +        * this is cached here to avoid calling plugins through function
68181 +        * dereference all the time.
68182 +        */
68183 +       __u64 estimate_one_insert;
68184 +
68185 +       /* cache of recent tree lookup results */
68186 +       cbk_cache cbk_cache;
68187 +
68188 +       /* hash table to look up znodes by block number. */
68189 +       z_hash_table zhash_table;
68190 +       z_hash_table zfake_table;
68191 +       /* hash table to look up jnodes by inode and offset. */
68192 +       j_hash_table jhash_table;
68193 +
68194 +       /* lock protecting:
68195 +          - parent pointers,
68196 +          - sibling pointers,
68197 +          - znode hash table
68198 +          - coord cache
68199 +        */
68200 +       /* NOTE: The "giant" tree lock can be replaced by more spin locks,
68201 +          hoping they will be less contended. We can use one spin lock per one
68202 +          znode hash bucket.  With adding of some code complexity, sibling
68203 +          pointers can be protected by both znode spin locks.  However it looks
68204 +          more SMP scalable we should test this locking change on n-ways (n >
68205 +          4) SMP machines.  Current 4-ways machine test does not show that tree
68206 +          lock is contended and it is a bottleneck (2003.07.25). */
68207 +
68208 +       rwlock_t tree_lock;
68209 +
68210 +       /* lock protecting delimiting keys */
68211 +       rwlock_t dk_lock;
68212 +
68213 +       /* spin lock protecting znode_epoch */
68214 +       spinlock_t epoch_lock;
68215 +       /* version stamp used to mark znode updates. See seal.[ch] for more
68216 +        * information. */
68217 +       __u64 znode_epoch;
68218 +
68219 +       znode *uber;    /* znode obtained for UBER_TREE_ADDR by reiser4_init_tree() */
68220 +       node_plugin *nplug;     /* default node plugin, set by reiser4_init_tree() */
68221 +       struct super_block *super;      /* presumably the owning super block - confirm */
68222 +       struct {
68223 +               /* carry flags used for insertion of new nodes */
68224 +               __u32 new_node_flags;
68225 +               /* carry flags used for insertion of new extents */
68226 +               __u32 new_extent_flags;
68227 +               /* carry flags used for paste operations */
68228 +               __u32 paste_flags;
68229 +               /* carry flags used for insert operations */
68230 +               __u32 insert_flags;
68231 +       } carry;
68232 +};
68233 +
68234 +extern int reiser4_init_tree(reiser4_tree * tree,
68235 +                            const reiser4_block_nr * root_block,
68236 +                            tree_level height, node_plugin * default_plugin);
68237 +extern void reiser4_done_tree(reiser4_tree * tree);
68238 +
68239 +/* cbk flags: bit-mask options for coord_by_key(); each value is one bit */
68240 +typedef enum {
68241 +       /* coord_by_key() is called for insertion. This is necessary because
68242 +          of extents being located at the twig level. For explanation, see
68243 +          comment just above is_next_item_internal().
68244 +        */
68245 +       CBK_FOR_INSERT = (1 << 0),
68246 +       /* coord_by_key() is called with a key that is known to be unique */
68247 +       CBK_UNIQUE = (1 << 1),
68248 +       /* coord_by_key() can trust delimiting keys. This option is not user
68249 +          accessible. coord_by_key() will set it automatically. It will
68250 +          only be cleared by the special case in extents-on-the-twig-level
68251 +          handling where it is necessary to insert an item with a key smaller
68252 +          than the leftmost key in a node. This is necessary because of extents
68253 +          being located at the twig level. For explanation, see comment just
68254 +          above is_next_item_internal().
68255 +        */
68256 +       CBK_TRUST_DK = (1 << 2),
68257 +       CBK_READA = (1 << 3),   /* original: readahead leaves which contain items of certain file */
68258 +       CBK_READDIR_RA = (1 << 4),      /* readdir: readahead whole directory and all its stat datas */
68259 +       CBK_DKSET = (1 << 5),
68260 +       CBK_EXTENDED_COORD = (1 << 6),  /* coord_t is actually ... NOTE(review): comment truncated in source */
68261 +       CBK_IN_CACHE = (1 << 7),        /* node is already in cache */
68262 +       CBK_USE_CRABLOCK = (1 << 8)     /* use crab_lock instead of long term
68263 +                                        * lock */
68264 +} cbk_flags;
68265 +
68266 +/* insertion outcome. IBK = insert by key */
68267 +typedef enum {
68268 +       IBK_INSERT_OK = 0,
68269 +       IBK_ALREADY_EXISTS = -EEXIST,
68270 +       IBK_IO_ERROR = -EIO,
68271 +       IBK_NO_SPACE = -E_NODE_FULL,
68272 +       IBK_OOM = -ENOMEM
68273 +} insert_result;
68274 +
68275 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
68276 +
68277 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
68278 +                                    lock_handle * lh, void *arg);
68279 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
68280 +                               lock_handle * lh,
68281 +                               tree_iterate_actor_t actor, void *arg,
68282 +                               znode_lock_mode mode, int through_units_p);
68283 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
68284 +                         znode_lock_request pri, lock_handle * lh);
68285 +
68286 +/* return node plugin of @node */
68287 +static inline node_plugin *node_plugin_by_node(const znode *
68288 +                                              node /* node to query */ )
68289 +{
68290 +       assert("vs-213", node != NULL);
68291 +       assert("vs-214", znode_is_loaded(node));
68292 +
68293 +       return node->nplug;
68294 +}
68295 +
68296 +/* number of items in @node */
68297 +static inline pos_in_node_t node_num_items(const znode * node)
68298 +{
68299 +       assert("nikita-2754", znode_is_loaded(node));
68300 +       assert("nikita-2468",
68301 +              node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
68302 +
68303 +       return node->nr_items;
68304 +}
68305 +
68306 +/* Return the number of items at the present node.  Asserts coord->node !=
68307 +   NULL. */
68308 +static inline unsigned coord_num_items(const coord_t * coord)
68309 +{
68310 +       assert("jmacd-9805", coord->node != NULL);
68311 +
68312 +       return node_num_items(coord->node);
68313 +}
68314 +
68315 +/* true if @node is empty */
68316 +static inline int node_is_empty(const znode * node)
68317 +{
68318 +       return node_num_items(node) == 0;
68319 +}
68320 +
68321 +typedef enum {
68322 +       SHIFTED_SOMETHING = 0,
68323 +       SHIFT_NO_SPACE = -E_NODE_FULL,
68324 +       SHIFT_IO_ERROR = -EIO,
68325 +       SHIFT_OOM = -ENOMEM,
68326 +} shift_result;
68327 +
68328 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
68329 +extern int is_coord_in_node(const coord_t * coord);
68330 +extern int key_in_node(const reiser4_key *, const coord_t *);
68331 +extern void coord_item_move_to(coord_t * coord, int items);
68332 +extern void coord_unit_move_to(coord_t * coord, int units);
68333 +
68334 +/* there are two types of repetitive accesses (ra): intra-syscall
68335 +   (local) and inter-syscall (global). Local ra is used when
68336 +   during single syscall we add/delete several items and units in the
68337 +   same place in a tree. Note that plan-A fragments local ra by
68338 +   separating stat-data and file body in key-space. Global ra is
68339 +   used when user does repetitive modifications in the same place in a
68340 +   tree.
68341 +
68342 +   Our ra implementation serves following purposes:
68343 +    1 it affects balancing decisions so that next operation in a row
68344 +      can be performed faster;
68345 +    2 it affects lower-level read-ahead in page-cache;
68346 +    3 it avoids unnecessary lookups by maintaining some state
68347 +      across several operations (this is only for local ra);
68348 +    4 it leaves room for lazy-micro-balancing: when we start a sequence of
68349 +      operations they are performed without actually doing any intra-node
68350 +      shifts, until we finish sequence or scope of sequence leaves
68351 +      current node, only then we really pack node (local ra only).
68352 +*/
68353 +
68354 +/* another thing that can be useful is to keep per-tree and/or
68355 +   per-process cache of recent lookups. This cache can be organised as a
68356 +   list of block numbers of formatted nodes sorted by starting key in
68357 +   this node. Balancings should invalidate appropriate parts of this
68358 +   cache.
68359 +*/
68360 +
68361 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
68362 +                          coord_t * coord, lock_handle * handle,
68363 +                          znode_lock_mode lock, lookup_bias bias,
68364 +                          tree_level lock_level, tree_level stop_level,
68365 +                          __u32 flags, ra_info_t *);
68366 +
68367 +lookup_result reiser4_object_lookup(struct inode *object,
68368 +                                   const reiser4_key * key,
68369 +                                   coord_t * coord,
68370 +                                   lock_handle * lh,
68371 +                                   znode_lock_mode lock_mode,
68372 +                                   lookup_bias bias,
68373 +                                   tree_level lock_level,
68374 +                                   tree_level stop_level,
68375 +                                   __u32 flags, ra_info_t * info);
68376 +
68377 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
68378 +                           reiser4_item_data * data, coord_t * coord,
68379 +                           lock_handle * lh,
68380 +                           tree_level stop_level, __u32 flags);
68381 +insert_result insert_by_coord(coord_t * coord,
68382 +                             reiser4_item_data * data, const reiser4_key * key,
68383 +                             lock_handle * lh, __u32);
68384 +insert_result insert_extent_by_coord(coord_t * coord,
68385 +                                    reiser4_item_data * data,
68386 +                                    const reiser4_key * key, lock_handle * lh);
68387 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
68388 +                    const reiser4_key * to_key,
68389 +                    reiser4_key * smallest_removed);
68390 +int kill_node_content(coord_t * from, coord_t * to,
68391 +                     const reiser4_key * from_key, const reiser4_key * to_key,
68392 +                     reiser4_key * smallest_removed,
68393 +                     znode * locked_left_neighbor, struct inode *inode,
68394 +                     int truncate);
68395 +
68396 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
68397 +                       reiser4_key * key, lock_handle * lh, cop_insert_flag);
68398 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
68399 +                    reiser4_item_data * data, unsigned);
68400 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
68401 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
68402 +                      coord_t * result);
68403 +
68404 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
68405 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
68406 +
68407 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
68408 +
68409 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
68410 +                                 const reiser4_key *, reiser4_key *,
68411 +                                 struct inode *, int, int *);
68412 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
68413 +                                  const reiser4_key *, reiser4_key *,
68414 +                                  struct inode *, int, int *);
68415 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
68416 +                           const reiser4_key * to, struct inode *, int);
68417 +
68418 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
68419 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
68420 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
68421 +                             znode * left, coord_t * result);
68422 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
68423 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
68424 +                                    znode * child);
68425 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
68426 +                         int incore_p, int setup_dkeys_p);
68427 +
68428 +extern int cbk_cache_init(cbk_cache * cache);
68429 +extern void cbk_cache_done(cbk_cache * cache);
68430 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
68431 +
68432 +extern char *sprint_address(const reiser4_block_nr * block);
68433 +
68434 +#if REISER4_DEBUG
68435 +extern void print_coord_content(const char *prefix, coord_t * p);
68436 +extern void reiser4_print_address(const char *prefix,
68437 +                       const reiser4_block_nr * block);
68438 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
68439 +                          __u32 flags);
68440 +extern void check_dkeys(znode *node);
68441 +#else
68442 +#define print_coord_content(p, c) noop
68443 +#define reiser4_print_address(p, b) noop
68444 +#endif
68445 +
68446 +extern void forget_znode(lock_handle * handle);
68447 +extern int deallocate_znode(znode * node);
68448 +
68449 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
68450 +
68451 +/* struct used internally to pack all numerous arguments of tree lookup.
68452 +    Used to avoid passing a lot of arguments to helper functions. */
68453 +typedef struct cbk_handle {
68454 +       /* tree we are in */
68455 +       reiser4_tree *tree;
68456 +       /* key we are going after */
68457 +       const reiser4_key *key;
68458 +       /* coord we will store result in */
68459 +       coord_t *coord;
68460 +       /* type of lock to take on target node */
68461 +       znode_lock_mode lock_mode;
68462 +       /* lookup bias. See comments at the declaration of lookup_bias */
68463 +       lookup_bias bias;
68464 +       /* lock level: level starting from which tree traversal starts taking
68465 +        * write locks. */
68466 +       tree_level lock_level;
68467 +       /* level where search will stop. Either item will be found between
68468 +          lock_level and stop_level, or CBK_COORD_NOTFOUND will be
68469 +          returned.
68470 +        */
68471 +       tree_level stop_level;
68472 +       /* level we are currently at */
68473 +       tree_level level;
68474 +       /* block number of @active node. Tree traversal operates on two
68475 +          nodes: active and parent.  */
68476 +       reiser4_block_nr block;
68477 +       /* put here error message to be printed by caller */
68478 +       const char *error;
68479 +       /* result passed back to caller */
68480 +       lookup_result result;
68481 +       /* lock handles for active and parent */
68482 +       lock_handle *parent_lh;
68483 +       lock_handle *active_lh;
68484 +       reiser4_key ld_key;
68485 +       reiser4_key rd_key;
68486 +       /* flags, passed to the cbk routine. Bits of this bitmask are defined
68487 +          in tree.h:cbk_flags enum. */
68488 +       __u32 flags;
68489 +       ra_info_t *ra_info;
68490 +       struct inode *object;
68491 +} cbk_handle;
68492 +
68493 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
68494 +
68495 +/* eottl.c */
68496 +extern int handle_eottl(cbk_handle *h, int *outcome);
68497 +
68498 +int lookup_multikey(cbk_handle * handle, int nr_keys);
68499 +int lookup_couple(reiser4_tree * tree,
68500 +                 const reiser4_key * key1, const reiser4_key * key2,
68501 +                 coord_t * coord1, coord_t * coord2,
68502 +                 lock_handle * lh1, lock_handle * lh2,
68503 +                 znode_lock_mode lock_mode, lookup_bias bias,
68504 +                 tree_level lock_level, tree_level stop_level, __u32 flags,
68505 +                 int *result1, int *result2);
68506 +
68507 +static inline void read_lock_tree(reiser4_tree *tree)
68508 +{
68509 +       /* check that tree is not locked */
68510 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68511 +                   LOCK_CNT_NIL(read_locked_tree) &&
68512 +                   LOCK_CNT_NIL(write_locked_tree)));
68513 +       /* check that spinlocks of lower priorities are not held */
68514 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68515 +                   LOCK_CNT_NIL(rw_locked_dk) &&
68516 +                   LOCK_CNT_NIL(spin_locked_stack)));
68517 +
68518 +       read_lock(&(tree->tree_lock));
68519 +
68520 +       LOCK_CNT_INC(read_locked_tree);
68521 +       LOCK_CNT_INC(rw_locked_tree);
68522 +       LOCK_CNT_INC(spin_locked);
68523 +}
68524 +
68525 +static inline void read_unlock_tree(reiser4_tree *tree)
68526 +{
68527 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
68528 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68529 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68530 +
68531 +       LOCK_CNT_DEC(read_locked_tree);
68532 +       LOCK_CNT_DEC(rw_locked_tree);
68533 +       LOCK_CNT_DEC(spin_locked);
68534 +
68535 +       read_unlock(&(tree->tree_lock));
68536 +}
68537 +
68538 +static inline void write_lock_tree(reiser4_tree *tree)
68539 +{
68540 +       /* check that tree is not locked */
68541 +       assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
68542 +                   LOCK_CNT_NIL(read_locked_tree) &&
68543 +                   LOCK_CNT_NIL(write_locked_tree)));
68544 +       /* check that spinlocks of lower priorities are not held */
68545 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
68546 +                   LOCK_CNT_NIL(rw_locked_dk) &&
68547 +                   LOCK_CNT_NIL(spin_locked_stack)));
68548 +
68549 +       write_lock(&(tree->tree_lock));
68550 +
68551 +       LOCK_CNT_INC(write_locked_tree);
68552 +       LOCK_CNT_INC(rw_locked_tree);
68553 +       LOCK_CNT_INC(spin_locked);
68554 +}
68555 +
68556 +static inline void write_unlock_tree(reiser4_tree *tree)
68557 +{
68558 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
68559 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
68560 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68561 +
68562 +       LOCK_CNT_DEC(write_locked_tree);
68563 +       LOCK_CNT_DEC(rw_locked_tree);
68564 +       LOCK_CNT_DEC(spin_locked);
68565 +
68566 +       write_unlock(&(tree->tree_lock));
68567 +}
68568 +
68569 +static inline void read_lock_dk(reiser4_tree *tree)
68570 +{
68571 +       /* check that dk is not locked */
68572 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68573 +                   LOCK_CNT_NIL(read_locked_dk) &&
68574 +                   LOCK_CNT_NIL(write_locked_dk)));
68575 +       /* check that spinlocks of lower priorities are not held */
68576 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
68577 +
68578 +       read_lock(&((tree)->dk_lock));
68579 +
68580 +       LOCK_CNT_INC(read_locked_dk);
68581 +       LOCK_CNT_INC(rw_locked_dk);
68582 +       LOCK_CNT_INC(spin_locked);
68583 +}
68584 +
68585 +static inline void read_unlock_dk(reiser4_tree *tree)
68586 +{
68587 +       assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
68588 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68589 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68590 +
68591 +       LOCK_CNT_DEC(read_locked_dk);
68592 +       LOCK_CNT_DEC(rw_locked_dk);
68593 +       LOCK_CNT_DEC(spin_locked);
68594 +
68595 +       read_unlock(&(tree->dk_lock));
68596 +}
68597 +
68598 +static inline void write_lock_dk(reiser4_tree *tree)
68599 +{
68600 +       /* check that dk is not locked */
68601 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
68602 +                   LOCK_CNT_NIL(read_locked_dk) &&
68603 +                   LOCK_CNT_NIL(write_locked_dk)));
68604 +       /* check that spinlocks of lower priorities are not held */
68605 +       assert("", LOCK_CNT_NIL(spin_locked_stack));
68606 +
68607 +       write_lock(&((tree)->dk_lock));
68608 +
68609 +       LOCK_CNT_INC(write_locked_dk);
68610 +       LOCK_CNT_INC(rw_locked_dk);
68611 +       LOCK_CNT_INC(spin_locked);
68612 +}
68613 +
68614 +static inline void write_unlock_dk(reiser4_tree *tree)
68615 +{
68616 +       assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
68617 +       assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
68618 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
68619 +
68620 +       LOCK_CNT_DEC(write_locked_dk);
68621 +       LOCK_CNT_DEC(rw_locked_dk);
68622 +       LOCK_CNT_DEC(spin_locked);
68623 +
68624 +       write_unlock(&(tree->dk_lock));
68625 +}
68626 +
68627 +/* estimate api. Implementation is in estimate.c */
68628 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
68629 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
68630 +reiser4_block_nr estimate_insert_flow(tree_level);
68631 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
68632 +reiser4_block_nr calc_estimate_one_insert(tree_level);
68633 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
68634 +reiser4_block_nr estimate_insert_cluster(struct inode *);
68635 +reiser4_block_nr estimate_update_cluster(struct inode *);
68636 +
68637 +/* __REISER4_TREE_H__ */
68638 +#endif
68639 +
68640 +/* Make Linus happy.
68641 +   Local variables:
68642 +   c-indentation-style: "K&R"
68643 +   mode-name: "LC"
68644 +   c-basic-offset: 8
68645 +   tab-width: 8
68646 +   fill-column: 120
68647 +   scroll-step: 1
68648 +   End:
68649 +*/
68650 diff -urN linux-2.6.27.orig/fs/reiser4/tree_mod.c linux-2.6.27/fs/reiser4/tree_mod.c
68651 --- linux-2.6.27.orig/fs/reiser4/tree_mod.c     1970-01-01 03:00:00.000000000 +0300
68652 +++ linux-2.6.27/fs/reiser4/tree_mod.c  2008-10-12 18:20:01.000000000 +0400
68653 @@ -0,0 +1,386 @@
68654 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68655 + * reiser4/README */
68656 +
68657 +/*
68658 + * Functions to add/delete new nodes to/from the tree.
68659 + *
68660 + * Functions from this file are used by carry (see carry*) to handle:
68661 + *
68662 + *     . insertion of new formatted node into tree
68663 + *
68664 + *     . addition of new tree root, increasing tree height
68665 + *
68666 + *     . removing tree root, decreasing tree height
68667 + *
68668 + */
68669 +
68670 +#include "forward.h"
68671 +#include "debug.h"
68672 +#include "dformat.h"
68673 +#include "key.h"
68674 +#include "coord.h"
68675 +#include "plugin/plugin.h"
68676 +#include "jnode.h"
68677 +#include "znode.h"
68678 +#include "tree_mod.h"
68679 +#include "block_alloc.h"
68680 +#include "tree_walk.h"
68681 +#include "tree.h"
68682 +#include "super.h"
68683 +
68684 +#include <linux/err.h>
68685 +
68686 +static int add_child_ptr(znode * parent, znode * child);
68687 +/* warning only issued if error is not -E_REPEAT */
68688 +#define ewarning( error, ... )                 \
68689 +       if( ( error ) != -E_REPEAT )            \
68690 +               warning( __VA_ARGS__ )
68691 +
68692 +/* allocate new znode at @level in @brother's tree, using a block number from assign_fake_blocknr_formatted(); returns it or ERR_PTR() */
68693 +znode * reiser4_new_node(znode * brother /* existing left neighbor of new
68694 +                                         * node; only its tree is used here */,
68695 +                        tree_level level /* tree level at which new node is to
68696 +                                          * be allocated */)
68697 +{
68698 +       znode *result;
68699 +       int retcode;
68700 +       reiser4_block_nr blocknr;
68701 +
68702 +       assert("nikita-930", brother != NULL);
68703 +       assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
68704 +
68705 +       retcode = assign_fake_blocknr_formatted(&blocknr);
68706 +       if (retcode == 0) {
68707 +               result =
68708 +                   zget(znode_get_tree(brother), &blocknr, NULL, level,
68709 +                        reiser4_ctx_gfp_mask_get());
68710 +               if (IS_ERR(result)) {
68711 +                       ewarning(PTR_ERR(result), "nikita-929",
68712 +                                "Cannot allocate znode for carry: %li",
68713 +                                PTR_ERR(result));
68714 +                       return result;
68715 +               }
68716 +               /* zget() must have created a fresh znode; cheap test, runs even when debugging is off */
68717 +               if (!znode_just_created(result)) {
68718 +                       warning("nikita-2213",
68719 +                               "Allocated already existing block: %llu",
68720 +                               (unsigned long long)blocknr);
68721 +                       zput(result);
68722 +                       return ERR_PTR(RETERR(-EIO));
68723 +               }
68724 +
68725 +               assert("nikita-931", result != NULL);
68726 +               result->nplug = znode_get_tree(brother)->nplug; /* new node uses the tree's node plugin */
68727 +               assert("nikita-933", result->nplug != NULL);
68728 +
68729 +               retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
68730 +               if (retcode == 0) {
68731 +                       ZF_SET(result, JNODE_CREATED);
68732 +                       zrelse(result);
68733 +               } else {
68734 +                       zput(result); /* presumably drops the zget() reference -- confirm */
68735 +                       result = ERR_PTR(retcode);
68736 +               }
68737 +       } else {
68738 +               /* failure to allocate new node during balancing.
68739 +                  This should never happen. Ever. Returning -E_REPEAT
68740 +                  is not a viable solution, because "out of disk space"
68741 +                  is not a transient error that will go away by itself.
68742 +                */
68743 +               ewarning(retcode, "nikita-928",
68744 +                        "Cannot allocate block for carry: %i", retcode);
68745 +               result = ERR_PTR(retcode);
68746 +       }
68747 +       assert("nikita-1071", result != NULL);
68748 +       return result;
68749 +}
68750 +
68751 +/* allocate new root and add it to the tree; called by add_new_root().
68752 +   Returns new root (holding a single pointer to @old_root) or ERR_PTR().
68753 +   NOTE(review): when zload() or longterm_lock_znode() fails, new_root is
68754 +   replaced by ERR_PTR() without a zput() here -- confirm the reference
68755 +   obtained in reiser4_new_node() is not leaked. */
68756 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
68757 +                    znode * fake /* "fake" znode */ )
68758 +{
68759 +       reiser4_tree *tree = znode_get_tree(old_root);
68760 +       znode *new_root = NULL; /* to shut gcc up */
68761 +       int result;
68762 +
68763 +       assert("nikita-1069", old_root != NULL);
68764 +       assert("umka-262", fake != NULL);
68765 +       assert("umka-263", tree != NULL);
68766 +
68767 +       /* "fake" znode---one always hanging just above current root. This
68768 +          node is locked when new root is created or existing root is
68769 +          deleted. Downward tree traversal takes lock on it before taking
68770 +          lock on a root node. This avoids race conditions with root
68771 +          manipulations.
68772 +          The asserts below check that @fake and @old_root are those nodes.
68773 +        */
68774 +       assert("nikita-1348", znode_above_root(fake));
68775 +       assert("nikita-1211", znode_is_root(old_root));
68776 +
68777 +       result = 0;
68778 +       if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
68779 +               warning("nikita-1344", "Tree is too tall: %i", tree->height);
68780 +               /* ext2 returns -ENOSPC when it runs out of free inodes with a
68781 +                  following comment (fs/ext2/ialloc.c:441): Is it really
68782 +                  ENOSPC?
68783 +
68784 +                  -EXFULL? -EINVAL?
68785 +                */
68786 +               result = RETERR(-ENOSPC);
68787 +       } else {
68788 +               /* Allocate block for new root. It's not that important where it
68789 +                  will be allocated, as root is almost always in memory. Moreover,
68790 +                  allocate on flush can be going here. If reiser4_new_node() fails,
68791 +                  result stays 0 and its ERR_PTR() is returned unchanged.
68792 +                */
68793 +               assert("nikita-1448", znode_is_root(old_root));
68794 +               new_root = reiser4_new_node(fake, tree->height + 1);
68795 +               if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
68796 +                       lock_handle rlh;
68797 +
68798 +                       init_lh(&rlh);
68799 +                       result =
68800 +                           longterm_lock_znode(&rlh, new_root,
68801 +                                               ZNODE_WRITE_LOCK,
68802 +                                               ZNODE_LOCK_LOPRI);
68803 +                       if (result == 0) {
68804 +                               parent_coord_t *in_parent;
68805 +
68806 +                               znode_make_dirty(fake);
68807 +
68808 +                               /* new root is a child of "fake" node */
68809 +                               write_lock_tree(tree);
68810 +
68811 +                               ++tree->height;
68812 +
68813 +                               /* recalculate max balance overhead */
68814 +                               tree->estimate_one_insert =
68815 +                                   estimate_one_insert_item(tree);
68816 +
68817 +                               tree->root_block = *znode_get_block(new_root);
68818 +                               in_parent = &new_root->in_parent;
68819 +                               init_parent_coord(in_parent, fake);
68820 +                               /* manually insert new root into sibling
68821 +                                * list. With this all nodes involved in
68822 +                                * balancing are connected after balancing is
68823 +                                * done---useful invariant to check. */
68824 +                               sibling_list_insert_nolock(new_root, NULL);
68825 +                               write_unlock_tree(tree);
68826 +
68827 +                               /* insert into the (still empty) new root a
68828 +                                  pointer to the @old_root. */
68829 +                               assert("nikita-1110",
68830 +                                      WITH_DATA(new_root,
68831 +                                                node_is_empty(new_root)));
68832 +                               write_lock_dk(tree);
68833 +                               znode_set_ld_key(new_root, reiser4_min_key());
68834 +                               znode_set_rd_key(new_root, reiser4_max_key());
68835 +                               write_unlock_dk(tree);
68836 +                               if (REISER4_DEBUG) {
68837 +                                       ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
68838 +                                       ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
68839 +                                       ZF_SET(old_root, JNODE_ORPHAN);
68840 +                               }
68841 +                               result = add_child_ptr(new_root, old_root);
68842 +                               done_lh(&rlh);
68843 +                       }
68844 +                       zrelse(new_root); /* pairs with the successful zload() above */
68845 +               }
68846 +       }
68847 +       if (result != 0)
68848 +               new_root = ERR_PTR(result);
68849 +       return new_root;
68850 +}
68851 +
68852 +/* build &reiser4_item_data for inserting child pointer
68853 +
68854 +   Build &reiser4_item_data that can be later used to insert pointer to @child
68855 +   in its parent. Fills ->data, ->user, ->length and ->iplug of @data; no
68856 +   other field of @data is written here.
68857 +*/
68858 +void build_child_ptr_data(znode * child        /* node pointer to which will be
68859 +                                        * inserted */ ,
68860 +                         reiser4_item_data * data /* where to store result */ )
68861 +{
68862 +       assert("nikita-1116", child != NULL);
68863 +       assert("nikita-1117", data != NULL);
68864 +
68865 +       /*
68866 +        * NOTE: use address of child's blocknr as address of data to be
68867 +        * inserted. As a result, the data gets into the on-disk structure in
68868 +        * cpu byte order. internal's create_hook converts it to little endian
68869 +        * byte order.
68870 +        */
68871 +       data->data = (char *)znode_get_block(child);
68872 +       /* data->data points to kernel space, not user space */
68873 +       data->user = 0;
68874 +       data->length = sizeof(reiser4_block_nr);
68875 +       /* FIXME-VS: hardcoded internal item? */
68876 +
68877 +       /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
68878 +       data->iplug = item_plugin_by_id(NODE_POINTER_ID);
68879 +}
68880 +
68881 +/* add pointer to @child into empty @parent.
68882 +
68883 +   This is used when pointer to old root is inserted into new root which is
68884 +   empty. Returns zload() error, or result of node plugin's ->create_item().
68885 +*/
68886 +static int add_child_ptr(znode * parent, znode * child)
68887 +{
68888 +       coord_t coord;
68889 +       reiser4_item_data data;
68890 +       int result;
68891 +       reiser4_key key;
68892 +
68893 +       assert("nikita-1111", parent != NULL);
68894 +       assert("nikita-1112", child != NULL);
68895 +       assert("nikita-1115",
68896 +              znode_get_level(parent) == znode_get_level(child) + 1);
68897 +
68898 +       result = zload(parent);
68899 +       if (result != 0)
68900 +               return result;
68901 +       assert("nikita-1113", node_is_empty(parent));
68902 +       coord_init_first_unit(&coord, parent);
68903 +
68904 +       build_child_ptr_data(child, &data);
68905 +       data.arg = NULL; /* not set by build_child_ptr_data() */
68906 +
68907 +       read_lock_dk(znode_get_tree(parent));
68908 +       key = *znode_get_ld_key(child); /* copy @child's left delimiting key under dk lock */
68909 +       read_unlock_dk(znode_get_tree(parent));
68910 +
68911 +       result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
68912 +                                                         NULL);
68913 +       znode_make_dirty(parent); /* NOTE(review): done even when create_item() failed */
68914 +       zrelse(parent); /* pairs with zload() above */
68915 +       return result;
68916 +}
68917 +
68918 +/* actually remove tree root; called by reiser4_kill_tree_root() */
68919 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
68920 +                                                 * being removed */,
68921 +                            znode * old_root /* root node that is being
68922 +                                              * removed */ ,
68923 +                            znode * new_root   /* new root---sole child of
68924 +                                                * @old_root */,
68925 +                    const reiser4_block_nr * new_root_blk /* disk address of
68926 +                                                           * @new_root */)
68927 +{
68928 +       znode *uber;
68929 +       int result;
68930 +       lock_handle handle_for_uber;
68931 +
68932 +       assert("umka-265", tree != NULL);
68933 +       assert("nikita-1198", new_root != NULL);
68934 +       assert("nikita-1199",
68935 +              znode_get_level(new_root) + 1 == znode_get_level(old_root));
68936 +
68937 +       assert("nikita-1201", znode_is_write_locked(old_root));
68938 +
68939 +       assert("nikita-1203",
68940 +              disk_addr_eq(new_root_blk, znode_get_block(new_root)));
68941 +
68942 +       init_lh(&handle_for_uber);
68943 +       /* obtain and lock "fake" znode protecting changes in tree height. */
68944 +       result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
68945 +                               &handle_for_uber);
68946 +       if (result == 0) {
68947 +               uber = handle_for_uber.node;
68948 +
68949 +               znode_make_dirty(uber);
68950 +
68951 +               /* don't take long term lock on @new_root. Take spinlock. */
68952 +
68953 +               write_lock_tree(tree);
68954 +
68955 +               tree->root_block = *new_root_blk;
68956 +               --tree->height;
68957 +
68958 +               /* recalculate max balance overhead */
68959 +               tree->estimate_one_insert = estimate_one_insert_item(tree);
68960 +
68961 +               assert("nikita-1202",
68962 +                      tree->height == znode_get_level(new_root));
68963 +
68964 +               /* new root is child of "fake" node */
68965 +               init_parent_coord(&new_root->in_parent, uber);
68966 +               ++uber->c_count;
68967 +
68968 +               /* sibling_list_insert_nolock(new_root, NULL); */
68969 +               write_unlock_tree(tree);
68970 +
68971 +               /* reinitialise old root; it is made dirty even if init fails. */
68972 +               result = node_plugin_by_node(old_root)->init(old_root);
68973 +               znode_make_dirty(old_root);
68974 +               if (result == 0) {
68975 +                       assert("nikita-1279", node_is_empty(old_root));
68976 +                       ZF_SET(old_root, JNODE_HEARD_BANSHEE);
68977 +                       old_root->c_count = 0;
68978 +               }
68979 +       }
68980 +       done_lh(&handle_for_uber);
68981 +
68982 +       return result;
68983 +}
68984 +
68985 +/* remove tree root
68986 +
68987 +   This function removes tree root, decreasing tree height by one.  Tree root
68988 +   and its only child (that is going to become new tree root) are write locked
68989 +   at the entry.
68990 +
68991 +   To remove tree root we need to take lock on special "fake" znode that
68992 +   protects changes of tree height. See comments in reiser4_add_tree_root() for
68993 +   more on this.
68994 +
68995 +   Also parent pointers have to be updated in old and new root. To simplify
68996 +   code, function is split into two parts: outer reiser4_kill_tree_root()
68997 +   collects all necessary arguments and calls reiser4_kill_root() to do the
68998 +   actual job. Returns its result, or the error reported by child_znode().
68999 +
69000 +*/
69001 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
69002 +                                              removing*/)
69003 +{
69004 +       int result;
69005 +       coord_t down_link;
69006 +       znode *new_root;
69007 +       reiser4_tree *tree;
69008 +
69009 +       assert("umka-266", current_tree != NULL);
69010 +       assert("nikita-1194", old_root != NULL);
69011 +       assert("nikita-1196", znode_is_root(old_root));
69012 +       assert("nikita-1200", node_num_items(old_root) == 1);
69013 +       assert("nikita-1401", znode_is_write_locked(old_root));
69014 +
69015 +       coord_init_first_unit(&down_link, old_root);
69016 +
69017 +       tree = znode_get_tree(old_root);
69018 +       new_root = child_znode(&down_link, old_root, 0, 1);
69019 +       if (!IS_ERR(new_root)) {
69020 +               result =
69021 +                       reiser4_kill_root(tree, old_root, new_root,
69022 +                                         znode_get_block(new_root));
69023 +               zput(new_root);
69024 +       } else
69025 +               result = PTR_ERR(new_root);
69026 +
69027 +       return result;
69028 +}
69029 +
69030 +/* Make Linus happy.
69031 +   Local variables:
69032 +   c-indentation-style: "K&R"
69033 +   mode-name: "LC"
69034 +   c-basic-offset: 8
69035 +   tab-width: 8
69036 +   fill-column: 120
69037 +   scroll-step: 1
69038 +   End:
69039 +*/
69040 diff -urN linux-2.6.27.orig/fs/reiser4/tree_mod.h linux-2.6.27/fs/reiser4/tree_mod.h
69041 --- linux-2.6.27.orig/fs/reiser4/tree_mod.h     1970-01-01 03:00:00.000000000 +0300
69042 +++ linux-2.6.27/fs/reiser4/tree_mod.h  2008-10-12 18:20:01.000000000 +0400
69043 @@ -0,0 +1,29 @@
69044 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69045 + * reiser4/README */
69046 +
69047 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
69048 + * comments. */
69049 +
69050 +#if !defined( __REISER4_TREE_MOD_H__ )
69051 +#define __REISER4_TREE_MOD_H__
69052 +
69053 +#include "forward.h"
69054 +
69055 +znode *reiser4_new_node(znode * brother, tree_level level);
69056 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
69057 +int reiser4_kill_tree_root(znode * old_root);
69058 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
69059 +
69060 +/* __REISER4_TREE_MOD_H__ */
69061 +#endif
69062 +
69063 +/* Make Linus happy.
69064 +   Local variables:
69065 +   c-indentation-style: "K&R"
69066 +   mode-name: "LC"
69067 +   c-basic-offset: 8
69068 +   tab-width: 8
69069 +   fill-column: 120
69070 +   scroll-step: 1
69071 +   End:
69072 +*/
69073 diff -urN linux-2.6.27.orig/fs/reiser4/tree_walk.c linux-2.6.27/fs/reiser4/tree_walk.c
69074 --- linux-2.6.27.orig/fs/reiser4/tree_walk.c    1970-01-01 03:00:00.000000000 +0300
69075 +++ linux-2.6.27/fs/reiser4/tree_walk.c 2008-10-12 18:20:01.000000000 +0400
69076 @@ -0,0 +1,927 @@
69077 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69078 + * reiser4/README */
69079 +
69080 +/* Routines and macros to:
69081 +
69082 +   get_left_neighbor()
69083 +
69084 +   get_right_neighbor()
69085 +
69086 +   get_parent()
69087 +
69088 +   get_first_child()
69089 +
69090 +   get_last_child()
69091 +
69092 +   various routines to walk the whole tree and do things to it like
69093 +   repack it, or move it to tertiary storage.  Please make them as
69094 +   generic as is reasonable.
69095 +
69096 +*/
69097 +
69098 +#include "forward.h"
69099 +#include "debug.h"
69100 +#include "dformat.h"
69101 +#include "coord.h"
69102 +#include "plugin/item/item.h"
69103 +#include "jnode.h"
69104 +#include "znode.h"
69105 +#include "tree_walk.h"
69106 +#include "tree.h"
69107 +#include "super.h"
69108 +
69109 +/* These macros are used internally in tree_walk.c to let lock_neighbor()
69110 +   serve parent, left and right neighbor locking alike: the znode pointer
69111 +   field to follow is selected by its byte offset within znode. */
69112 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
69113 +#define FIELD_OFFSET(name)  offsetof(znode, name)
69114 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
69115 +#define LEFT_PTR_OFFSET   FIELD_OFFSET(left)
69116 +#define RIGHT_PTR_OFFSET  FIELD_OFFSET(right)
69117 +
69118 +/* This is the generic procedure to get and lock `generic' neighbor (left or
69119 +    right neighbor or parent). The algorithm is common for all cases, only the
69120 +    znode field differs: @ptr_offset is the byte offset of the pointer to the
69121 +    desired neighbor within @node's znode structure. Must be called with the
69122 +    tree lock held, for read if @rlocked is set, for write otherwise; the lock
69123 +    is dropped and re-taken around longterm_lock_znode(), held on return. */
69124 +static int lock_neighbor(
69125 +                               /* resulting lock handle */
69126 +                               lock_handle * result,
69127 +                               /* znode whose neighbor (or parent) is to be locked */
69128 +                               znode * node,
69129 +                               /* pointer to neighbor (or parent) znode field offset, in bytes from
69130 +                                  the base address of znode structure  */
69131 +                               int ptr_offset,
69132 +                               /* lock mode for longterm_lock_znode call */
69133 +                               znode_lock_mode mode,
69134 +                               /* lock request for longterm_lock_znode call */
69135 +                               znode_lock_request req,
69136 +                               /* GN_* flags; @rlocked: tree lock is held for read */
69137 +                               int flags, int rlocked)
69138 +{
69139 +       reiser4_tree *tree = znode_get_tree(node);
69140 +       znode *neighbor;
69141 +       int ret;
69142 +
69143 +       assert("umka-236", node != NULL);
69144 +       assert("umka-237", tree != NULL);
69145 +       assert_rw_locked(&(tree->tree_lock));
69146 +
69147 +       if (flags & GN_TRY_LOCK)
69148 +               req |= ZNODE_LOCK_NONBLOCK;
69149 +       if (flags & GN_SAME_ATOM)
69150 +               req |= ZNODE_LOCK_DONT_FUSE;
69151 +
69152 +       /* get neighbor's address by using the sibling (or parent) link, quit
69153 +          the while loop (and return) if the link is not available. */
69154 +       while (1) {
69155 +               neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
69156 +
69157 +               /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
69158 +                * node pointed by it is not connected.
69159 +                *
69160 +                * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
69161 +                * check and allows passing reference to not connected znode to
69162 +                * subsequent longterm_lock_znode() call.  This kills possible
69163 +                * busy loop if we are trying to get longterm lock on locked but
69164 +                * not yet connected parent node. */
69165 +               if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
69166 +                                         || znode_is_connected(neighbor))) {
69167 +                       return RETERR(-E_NO_NEIGHBOR);
69168 +               }
69169 +
69170 +               /* protect it from deletion. */
69171 +               zref(neighbor);
69172 +
69173 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69174 +
69175 +               ret = longterm_lock_znode(result, neighbor, mode, req);
69176 +
69177 +               /* The lock handle obtains its own reference, release the one from above. */
69178 +               zput(neighbor);
69179 +
69180 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69181 +
69182 +               /* restart if node we got reference to is being
69183 +                  invalidated. we should not get reference to this node
69184 +                  again. */
69185 +               if (ret == -EINVAL)
69186 +                       continue;
69187 +               if (ret)
69188 +                       return ret;
69189 +
69190 +               /* check if neighbor link still points to just locked znode;
69191 +                  the link could have been changed while the process slept. */
69192 +               if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
69193 +                       return 0;
69194 +
69195 +               /* znode was locked by mistake; unlock it and restart locking
69196 +                  process from beginning. */
69197 +               rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
69198 +               longterm_unlock_znode(result);
69199 +               rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
69200 +       }
69201 +}
69202 +
69203 +/* lock parent of @node with a long term lock of @mode; accepts GN_* flags. */
69204 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
69205 +                            znode * node /* child node */ ,
69206 +                            znode_lock_mode mode /* read or write lock */ ,
69207 +                            int flags /* GN_* flags */ )
69208 +{
69209 +       reiser4_tree *tree = znode_get_tree(node);
69210 +       int ret;
69211 +
69212 +       read_lock_tree(tree);
69213 +       ret = lock_neighbor(lh, node, PARENT_PTR_OFFSET,
69214 +                           mode, ZNODE_LOCK_HIPRI, flags, 1);
69215 +       read_unlock_tree(tree);
69216 +       return ret;
69217 +}
69218 +
69219 +/* Lock left or right neighbor of @node; the direction is chosen by GN_GO_LEFT
69220 +   bit in @flags parameter (right when the bit is clear) */
69221 +/* Audited by: umka (2002.06.14) */
69222 +static inline int
69223 +lock_side_neighbor(lock_handle * result,
69224 +                  znode * node, znode_lock_mode mode, int flags, int rlocked)
69225 +{
69226 +       /* non-zero when the left neighbor is requested */
69227 +       const int leftward = (flags & GN_GO_LEFT) != 0;
69228 +       /* sibling link to follow, as byte offset within znode */
69229 +       int link_offset;
69230 +       /* request passed down to lock_neighbor() */
69231 +       znode_lock_request prio;
69232 +       int status;
69233 +
69234 +       link_offset = leftward ? LEFT_PTR_OFFSET : RIGHT_PTR_OFFSET;
69235 +       prio = leftward ? ZNODE_LOCK_LOPRI : ZNODE_LOCK_HIPRI;
69236 +
69237 +       /* @rlocked (state of the tree lock) is handed over unchanged */
69238 +       status = lock_neighbor(result, node, link_offset, mode, prio, flags,
69239 +                              rlocked);
69240 +       if (status != -E_NO_NEIGHBOR)
69241 +               return status;
69242 +
69243 +       /*
69244 +        * If we walk left or right, -E_NO_NEIGHBOR does not guarantee that
69245 +        * neighbor is absent in the tree; in this case we return -ENOENT,
69246 +        * which means that neighbor at least was not found in cache.
69247 +        */
69248 +       return RETERR(-ENOENT);
69249 +}
69250 +
69251 +#if REISER4_DEBUG
69252 +/* debugging check: assert that sibling links around @node are mutual; returns 1 */
69253 +int check_sibling_list(znode * node)
69254 +{
69255 +       znode *scan;
69256 +       znode *next;
69257 +
69258 +       assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
69259 +
69260 +       if (node == NULL)
69261 +               return 1;
69262 +       /* nodes with JNODE_RIP set are not checked */
69263 +       if (ZF_ISSET(node, JNODE_RIP))
69264 +               return 1;
69265 +
69266 +       assert("nikita-3270", node != NULL);
69267 +       assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
69268 +       /* walk left while left-connected; stop at NULL or JNODE_RIP neighbor */
69269 +       for (scan = node; znode_is_left_connected(scan); scan = next) {
69270 +               next = scan->left;
69271 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69272 +                       assert("nikita-3271", znode_is_right_connected(next));
69273 +                       assert("nikita-3272", next->right == scan);
69274 +               } else
69275 +                       break;
69276 +       }       /* now the same walk, to the right */
69277 +       for (scan = node; znode_is_right_connected(scan); scan = next) {
69278 +               next = scan->right;
69279 +               if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
69280 +                       assert("nikita-3273", znode_is_left_connected(next));
69281 +                       assert("nikita-3274", next->left == scan);
69282 +               } else
69283 +                       break;
69284 +       }
69285 +       return 1;
69286 +}
69287 +
69288 +#endif
69289 +
69290 +/* Znode sibling pointers maintenance. */
69291 +
69292 +/* Znode sibling pointers are established between any neighboring nodes which
69293 +   are in cache.  There are two znode state bits (JNODE_LEFT_CONNECTED,
69294 +   JNODE_RIGHT_CONNECTED): if left or right sibling pointer contains actual
69295 +   value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
69296 +
69297 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
69298 +   take care of searching (hash table lookup may be required) for znode
69299 +   neighbors, establishing sibling pointers between them and setting
69300 +   JNODE_*_CONNECTED state bits. */
69301 +
69302 +/* Adjust sibling pointers and `connected' states for two neighbors; works
69303 +   if one neighbor is NULL (was not found). */
69304 +
69305 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
69306 +void link_left_and_right(znode * left, znode * right)
69307 +{
69308 +       assert("nikita-3275", check_sibling_list(left));
69309 +       assert("nikita-3275", check_sibling_list(right));
69310 +
69311 +       if (left != NULL) {
69312 +               if (left->right == NULL) {
69313 +                       left->right = right;
69314 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
69315 +
69316 +                       ON_DEBUG(left->right_version =
69317 +                                atomic_inc_return(&delim_key_version);
69318 +                           );
69319 +
69320 +               } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
69321 +                          && left->right != right) {
69322 +                       /* old right neighbor has JNODE_HEARD_BANSHEE: detach it, install @right */
69323 +                       ON_DEBUG(left->right->left_version =
69324 +                                atomic_inc_return(&delim_key_version);
69325 +                                left->right_version =
69326 +                                atomic_inc_return(&delim_key_version););
69327 +
69328 +                       left->right->left = NULL;
69329 +                       left->right = right;
69330 +                       ZF_SET(left, JNODE_RIGHT_CONNECTED);
69331 +               } else
69332 +                       /*
69333 +                        * there is a race condition in renew_sibling_link()
69334 +                        * and the assertions below check that it is the only
69335 +                        * one there. Thread T1 calls renew_sibling_link()
69336 +                        * without GN_NO_ALLOC flag. zlook() doesn't find
69337 +                        * neighbor node, but before T1 gets to the
69338 +                        * link_left_and_right(), another thread T2 creates
69339 +                        * neighbor node and connects it. The check for
69340 +                        * left->right == NULL above protects T1 from
69341 +                        * overwriting correct left->right pointer installed
69342 +                        * by T2.
69343 +                        */
69344 +                       assert("nikita-3302",
69345 +                              right == NULL || left->right == right);
69346 +       }
69347 +       if (right != NULL) {
69348 +               if (right->left == NULL) {
69349 +                       right->left = left;
69350 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
69351 +
69352 +                       ON_DEBUG(right->left_version =
69353 +                                atomic_inc_return(&delim_key_version);
69354 +                           );
69355 +
69356 +               } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
69357 +                          && right->left != left) {
69358 +                       /* old left neighbor has JNODE_HEARD_BANSHEE: detach it, install @left */
69359 +                       ON_DEBUG(right->left->right_version =
69360 +                                atomic_inc_return(&delim_key_version);
69361 +                                right->left_version =
69362 +                                atomic_inc_return(&delim_key_version););
69363 +
69364 +                       right->left->right = NULL;
69365 +                       right->left = left;
69366 +                       ZF_SET(right, JNODE_LEFT_CONNECTED);
69367 +
69368 +               } else
69369 +                       assert("nikita-3303",
69370 +                              left == NULL || right->left == left);
69371 +       }
69372 +       assert("nikita-3275", check_sibling_list(left));
69373 +       assert("nikita-3275", check_sibling_list(right));
69374 +}
69375 +
69376 +/* link @second to the left (@to_left) or right of @first; audited: umka 2002.06.14 */
69377 +static void link_znodes(znode * first, znode * second, int to_left)
69378 +{
69379 +       znode *const lhs = to_left ? second : first;
69380 +       znode *const rhs = to_left ? first : second;
69381 +
69382 +       link_left_and_right(lhs, rhs);
69383 +}
69384 +
69385 +/* Move @coord to the next unit position (to the left or to the right,
69386 +   depending on GN_GO_LEFT bit in @flags) in horizontal direction, even across
69387 +   node boundary. Should be called under tree write lock; the lock protects
69388 +   nonexistence of sibling link on parent level, if lock_side_neighbor() fails
69389 +   with -ENOENT. */
69390 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
69391 +{
69392 +       int ret;
69393 +       znode *node;
69394 +       reiser4_tree *tree;
69395 +
69396 +       assert("umka-243", coord != NULL);
69397 +       assert("umka-244", handle != NULL);
69398 +       assert("zam-1069", handle->node == NULL);
69399 +
69400 +       ret =
69401 +           (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
69402 +           coord_next_unit(coord);
69403 +       if (!ret)
69404 +               return 0;       /* zero result: no neighbor node is locked */
69405 +
69406 +       ret =
69407 +           lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
69408 +       if (ret)
69409 +               return ret;
69410 +
69411 +       node = handle->node;
69412 +       tree = znode_get_tree(node);
69413 +       write_unlock_tree(tree);
69414 +
69415 +       coord_init_zero(coord);
69416 +
69417 +       /* We avoid synchronous read here if it is specified by flag. */
69418 +       if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
69419 +               ret = jstartio(ZJNODE(handle->node));
69420 +               if (!ret)
69421 +                       ret = -E_REPEAT;
69422 +               goto error_locked;
69423 +       }
69424 +
69425 +       /* corresponding zrelse() should be called by the clients of
69426 +          far_next_coord(), in place when this node gets unlocked. */
69427 +       ret = zload(handle->node);
69428 +       if (ret)
69429 +               goto error_locked;
69430 +
69431 +       if (flags & GN_GO_LEFT)
69432 +               coord_init_last_unit(coord, node);
69433 +       else
69434 +               coord_init_first_unit(coord, node);
69435 +
69436 +       if (0) {        /* entered only via goto error_locked */
69437 +             error_locked:
69438 +               longterm_unlock_znode(handle);
69439 +       }
69440 +       write_lock_tree(tree);  /* tree lock is held again on return */
69441 +       return ret;
69442 +}
69443 +
69444 +/* Very significant function which performs a step in horizontal direction
69445 +   when sibling pointer is not available.  Actually, it is the only function
69446 +   which does it.
69447 +   Note: this function does not restore locking status at exit,
69448 +   caller should take care of proper unlocking and zrelsing */
69449 +static int
69450 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
69451 +                  tree_level level, int flags, int *nr_locked)
69452 +{
69453 +       int ret;
69454 +       int to_left = flags & GN_GO_LEFT;
69455 +       reiser4_block_nr da;
69456 +       /* parent of the neighbor node; we set it to parent until not sharing
69457 +          of one parent between child and neighbor node is detected */
69458 +       znode *side_parent = coord->node;
69459 +       reiser4_tree *tree = znode_get_tree(child);
69460 +       znode *neighbor = NULL;
69461 +
69462 +       assert("umka-245", coord != NULL);
69463 +       assert("umka-246", handle != NULL);
69464 +       assert("umka-247", child != NULL);
69465 +       assert("umka-303", tree != NULL);
69466 +
69467 +       init_lh(handle);
69468 +       write_lock_tree(tree);
69469 +       ret = far_next_coord(coord, handle, flags);
69470 +       /* -ENOENT is not fatal: @child gets linked with NULL neighbor below */
69471 +       if (ret) {
69472 +               if (ret != -ENOENT) {
69473 +                       write_unlock_tree(tree);
69474 +                       return ret;
69475 +               }
69476 +       } else {
69477 +               item_plugin *iplug;
69478 +
69479 +               if (handle->node != NULL) {
69480 +                       (*nr_locked)++;
69481 +                       side_parent = handle->node;
69482 +               }
69483 +
69484 +               /* does coord object point to internal item? We do not
69485 +                  support sibling pointers between znode for formatted and
69486 +                  unformatted nodes and return -E_NO_NEIGHBOR in that case. */
69487 +               iplug = item_plugin_by_coord(coord);
69488 +               if (!item_is_internal(coord)) {
69489 +                       link_znodes(child, NULL, to_left);
69490 +                       write_unlock_tree(tree);
69491 +                       /* we know there can't be formatted neighbor */
69492 +                       return RETERR(-E_NO_NEIGHBOR);
69493 +               }
69494 +               write_unlock_tree(tree);
69495 +
69496 +               iplug->s.internal.down_link(coord, NULL, &da);
69497 +
69498 +               if (flags & GN_NO_ALLOC) {
69499 +                       neighbor = zlook(tree, &da);
69500 +               } else {
69501 +                       neighbor =
69502 +                           zget(tree, &da, side_parent, level,
69503 +                                reiser4_ctx_gfp_mask_get());
69504 +               }
69505 +
69506 +               if (IS_ERR(neighbor)) {
69507 +                       ret = PTR_ERR(neighbor);
69508 +                       return ret;
69509 +               }
69510 +
69511 +               if (neighbor)
69512 +                       /* update delimiting keys */
69513 +                       set_child_delimiting_keys(coord->node, coord, neighbor);
69514 +
69515 +               write_lock_tree(tree);
69516 +       }
69517 +
69518 +       if (likely(neighbor == NULL ||
69519 +                  (znode_get_level(child) == znode_get_level(neighbor)
69520 +                   && child != neighbor)))
69521 +               link_znodes(child, neighbor, to_left);
69522 +       else {
69523 +               warning("nikita-3532",
69524 +                       "Sibling nodes on the different levels: %i != %i\n",
69525 +                       znode_get_level(child), znode_get_level(neighbor));
69526 +               ret = RETERR(-EIO);
69527 +       }
69528 +
69529 +       write_unlock_tree(tree);
69530 +
69531 +       /* if GN_NO_ALLOC isn't set we keep reference to neighbor znode */
69532 +       if (neighbor != NULL && (flags & GN_NO_ALLOC))
69533 +               /* atomic_dec(&ZJNODE(neighbor)->x_count); */
69534 +               zput(neighbor);
69535 +
69536 +       return ret;
69537 +}
69538 +
69539 +/* Establish one side relation for @node. -ENOENT and -E_NO_NEIGHBOR from
69540 +   renew_sibling_link() are turned into 0. Audited by: umka (2002.06.14) */
69541 +static int connect_one_side(coord_t * coord, znode * node, int flags)
69542 +{
69543 +       coord_t local;
69544 +       lock_handle handle;
69545 +       int nr_locked = 0;      /* incremented by renew_sibling_link() */
69546 +       int ret;
69547 +
69548 +       assert("umka-248", coord != NULL);
69549 +       assert("umka-249", node != NULL);
69550 +
69551 +       coord_dup_nocheck(&local, coord);
69552 +
69553 +       init_lh(&handle);
69554 +
69555 +       ret =
69556 +           renew_sibling_link(&local, &handle, node, znode_get_level(node),
69557 +                              flags | GN_NO_ALLOC, &nr_locked);
69558 +
69559 +       if (handle.node != NULL) {
69560 +               /* complementary operations for zload() and lock() in far_next_coord() */
69561 +               zrelse(handle.node);
69562 +               longterm_unlock_znode(&handle);
69563 +       }
69564 +
69565 +       /* we catch error codes which are not interesting for us because we
69566 +          run renew_sibling_link() only for znode connection. */
69567 +       if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
69568 +               return 0;
69569 +
69570 +       return ret;
69571 +}
69572 +
69573 +/* if @child is not in `connected' state, performs hash searches for left and
69574 +   right neighbor nodes and establishes horizontal sibling links */
69575 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69576 +int connect_znode(coord_t * parent_coord, znode * child)
69577 +{
69578 +       reiser4_tree *tree = znode_get_tree(child);
69579 +       int ret = 0;
69580 +
69581 +       assert("zam-330", parent_coord != NULL);
69582 +       assert("zam-331", child != NULL);
69583 +       assert("zam-332", parent_coord->node != NULL);
69584 +       assert("umka-305", tree != NULL);
69585 +
69586 +       /* it is trivial to `connect' root znode because it can't have
69587 +          neighbors */
69588 +       if (znode_above_root(parent_coord->node)) {
69589 +               child->left = NULL;
69590 +               child->right = NULL;
69591 +               ZF_SET(child, JNODE_LEFT_CONNECTED);
69592 +               ZF_SET(child, JNODE_RIGHT_CONNECTED);
69593 +
69594 +               ON_DEBUG(child->left_version =
69595 +                        atomic_inc_return(&delim_key_version);
69596 +                        child->right_version =
69597 +                        atomic_inc_return(&delim_key_version););
69598 +
69599 +               return 0;
69600 +       }
69601 +
69602 +       /* load parent node; it is released at zrelse_and_ret below */
69603 +       coord_clear_iplug(parent_coord);
69604 +       ret = zload(parent_coord->node);
69605 +
69606 +       if (ret != 0)
69607 +               return ret;
69608 +
69609 +       /* protect `connected' state check by tree_lock */
69610 +       read_lock_tree(tree);
69611 +
69612 +       if (!znode_is_right_connected(child)) {
69613 +               read_unlock_tree(tree);
69614 +               /* connect right (default is right) */
69615 +               ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
69616 +               if (ret)
69617 +                       goto zrelse_and_ret;
69618 +
69619 +               read_lock_tree(tree);
69620 +       }
69621 +
69622 +       ret = znode_is_left_connected(child);   /* used as a boolean here */
69623 +
69624 +       read_unlock_tree(tree);
69625 +
69626 +       if (!ret) {
69627 +               ret =
69628 +                   connect_one_side(parent_coord, child,
69629 +                                    GN_NO_ALLOC | GN_GO_LEFT);
69630 +       } else
69631 +               ret = 0;        /* already left-connected: not an error */
69632 +
69633 +      zrelse_and_ret:
69634 +       zrelse(parent_coord->node);
69635 +
69636 +       return ret;
69637 +}
69638 +
69639 +/* Like renew_sibling_link(), but allocates the neighbor znode if it does not
69640 +   exist yet and `connects' it. This may take two steps in the horizontal
69641 +   direction: the first one finds/allocates the neighbor node, the second one
69642 +   looks for the neighbor's own neighbor in order to connect the freshly
69643 +   allocated znode. Returns 0 or the error of a renew_sibling_link() call. */
69644 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69645 +static int
69646 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
69647 +{
69648 +       coord_t local;
69649 +       lock_handle empty[2];
69650 +       reiser4_tree *tree = znode_get_tree(node);
69651 +       znode *neighbor = NULL;
69652 +       int nr_locked = 0;
69653 +       int ret;
69654 +
69655 +       assert("umka-250", coord != NULL);
69656 +       assert("umka-251", node != NULL);
69657 +       assert("umka-307", tree != NULL);
69658 +       assert("umka-308", level <= tree->height);
69659 +
69660 +       /* umka (2002.06.14)
69661 +          Here probably should be a check for the validity of the given "level".
69662 +          Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
69663 +        */
69664 +
69665 +       coord_dup(&local, coord);       /* both calls below use this copy, not @coord */
69666 +
69667 +       ret =
69668 +           renew_sibling_link(&local, &empty[0], node, level,
69669 +                              flags & ~GN_NO_ALLOC, &nr_locked);
69670 +       if (ret)
69671 +               goto out;
69672 +
69673 +       /* tree lock is not needed here because we keep parent node(s) locked
69674 +          and the reference to the neighbor znode incremented */
69675 +       neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
69676 +
69677 +       read_lock_tree(tree);
69678 +       ret = znode_is_connected(neighbor);
69679 +       read_unlock_tree(tree);
69680 +       if (ret) {
69681 +               ret = 0;        /* neighbor is already connected: nothing more to do */
69682 +               goto out;
69683 +       }
69684 +
69685 +       ret =
69686 +           renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
69687 +                              flags | GN_NO_ALLOC, &nr_locked);
69688 +       /* second renew_sibling_link() call is used for znode connection only,
69689 +          so we can live with these errors */
69690 +       if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
69691 +               ret = 0;
69692 +
69693 +      out:
69694 +
69695 +       for (--nr_locked; nr_locked >= 0; --nr_locked) {        /* unwind empty[] entries counted in nr_locked, last first */
69696 +               zrelse(empty[nr_locked].node);
69697 +               longterm_unlock_znode(&empty[nr_locked]);
69698 +       }
69699 +
69700 +       if (neighbor != NULL)
69701 +               /* decrement znode reference counter without actually
69702 +                  releasing it. */
69703 +               atomic_dec(&ZJNODE(neighbor)->x_count);
69704 +
69705 +       return ret;
69706 +}
69707 +
69708 +/*
69709 +   reiser4_get_neighbor() -- lock node's neighbor.
69710 +
69711 +   reiser4_get_neighbor() locks node's neighbor (left or right one, depends on
69712 +   given parameter) using sibling link to it. If sibling link is not available
69713 +   (i.e. neighbor znode is not in cache) and flags allow read blocks, we go one
69714 +   level up for information about neighbor's disk address. We lock node's
69715 +   parent, if it is common parent for both 'node' and its neighbor, neighbor's
69716 +   disk address is in next (to left or to right) down link from link that points
69717 +   to original node. If not, we need to lock parent's neighbor, read its content
69718 +   and take first(last) downlink with neighbor's disk address.  That locking
69719 +   could be done by using sibling link and lock_neighbor() function, if sibling
69720 +   link exists. In another case we have to go level up again until we find
69721 +   common parent or valid sibling link. Then go down
69722 +   allocating/connecting/locking/reading nodes until neighbor of first one is
69723 +   locked.
69724 +
69725 +   @neighbor:  result lock handle,
69726 +   @node: a node which we lock neighbor of,
69727 +   @lock_mode: lock mode {LM_READ, LM_WRITE},
69728 +   @flags: logical OR of {GN_*} (see tree_walk.h) subset; with GN_LOAD_NEIGHBOR
69729 +   the locked neighbor (@neighbor->node) is also loaded via zload().
69730 +   @return: 0 if success, negative value if lock was impossible due to an error
69731 +   or lack of neighbor node.
69732 +*/
69733 +
69734 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
69735 +int
69736 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
69737 +                    znode_lock_mode lock_mode, int flags)
69738 +{
69739 +       reiser4_tree *tree = znode_get_tree(node);
69740 +       lock_handle path[REAL_MAX_ZTREE_HEIGHT];
69741 +
69742 +       coord_t coord;
69743 +
69744 +       tree_level base_level;
69745 +       tree_level h = 0;
69746 +       int ret;
69747 +
69748 +       assert("umka-252", tree != NULL);
69749 +       assert("umka-253", neighbor != NULL);
69750 +       assert("umka-254", node != NULL);
69751 +
69752 +       base_level = znode_get_level(node);
69753 +
69754 +       assert("umka-310", base_level <= tree->height);
69755 +
69756 +       coord_init_zero(&coord);
69757 +
69758 +      again:
69759 +       /* first, we try to use simple lock_neighbor() which requires sibling
69760 +          link existence */
69761 +       read_lock_tree(tree);
69762 +       ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
69763 +       read_unlock_tree(tree);
69764 +       if (!ret) {
69765 +               /* load the locked neighbor's content if it was requested */
69766 +               if (flags & GN_LOAD_NEIGHBOR) {
69767 +                       ret = zload(neighbor->node);
69768 +                       if (ret)
69769 +                               longterm_unlock_znode(neighbor);
69770 +               }
69771 +               return ret;
69772 +       }
69773 +
69774 +       /* only -ENOENT means we may look upward and try to connect
69775 +          @node with its neighbor (if @flags allow us to do it) */
69776 +       if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
69777 +               return ret;
69778 +
69779 +       /* before establishing of sibling link we lock parent node; it is
69780 +          required by renew_neighbor() to work.  */
69781 +       init_lh(&path[0]);
69782 +       ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
69783 +       if (ret)
69784 +               return ret;
69785 +       if (znode_above_root(path[0].node)) {
69786 +               longterm_unlock_znode(&path[0]);
69787 +               return RETERR(-E_NO_NEIGHBOR);
69788 +       }
69789 +
69790 +       while (1) {
69791 +               znode *child = (h == 0) ? node : path[h - 1].node;
69792 +               znode *parent = path[h].node;
69793 +
69794 +               ret = zload(parent);
69795 +               if (ret)
69796 +                       break;
69797 +
69798 +               ret = find_child_ptr(parent, child, &coord);
69799 +
69800 +               if (ret) {
69801 +                       zrelse(parent);
69802 +                       break;
69803 +               }
69804 +
69805 +               /* try to establish missing sibling link */
69806 +               ret = renew_neighbor(&coord, child, h + base_level, flags);
69807 +
69808 +               zrelse(parent);
69809 +
69810 +               switch (ret) {
69811 +               case 0:
69812 +                       /* unlocking of parent znode prevents simple
69813 +                          deadlock situation */
69814 +                       done_lh(&path[h]);
69815 +
69816 +                       /* depending on the tree level we stay on, we repeat
69817 +                          the first locking attempt ...  */
69818 +                       if (h == 0)
69819 +                               goto again;
69820 +
69821 +                       /* ... or repeat establishing of sibling link at
69822 +                          one level below. */
69823 +                       --h;
69824 +                       break;
69825 +
69826 +               case -ENOENT:
69827 +                       /* sibling link is not available -- we go
69828 +                          upward. */
69829 +                       init_lh(&path[h + 1]);
69830 +                       ret =
69831 +                           reiser4_get_parent(&path[h + 1], parent,
69832 +                                              ZNODE_READ_LOCK);
69833 +                       if (ret)
69834 +                               goto fail;
69835 +                       ++h;
69836 +                       if (znode_above_root(path[h].node)) {
69837 +                               ret = RETERR(-E_NO_NEIGHBOR);
69838 +                               goto fail;
69839 +                       }
69840 +                       break;
69841 +
69842 +               case -E_DEADLOCK:
69843 +                       /* there was lock request from hi-pri locker. if
69844 +                          it is possible we unlock last parent node and
69845 +                          re-lock it again. */
69846 +                       for (; reiser4_check_deadlock(); h--) {
69847 +                               done_lh(&path[h]);
69848 +                               if (h == 0)
69849 +                                       goto fail;
69850 +                       }
69851 +
69852 +                       break;
69853 +
69854 +               default:        /* other errors. */
69855 +                       goto fail;
69856 +               }
69857 +       }
69858 +      fail:
69859 +       ON_DEBUG(check_lock_node_data(node));
69860 +       ON_DEBUG(check_lock_data());
69861 +
69862 +       /* unlock path: release path[h] down to path[0] */
69863 +       do {
69864 +               /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
69865 +                  fail; path[0] is already done_lh-ed, therefore
69866 +                  longterm_unlock_znode(&path[h]); is not applicable */
69867 +               done_lh(&path[h]);
69868 +               --h;
69869 +       } while (h + 1 != 0);
69870 +
69871 +       return ret;
69872 +}
69873 +
69874 +/* remove @node from sibling list, linking its left and right neighbors (if any) to each other; the tree lock must be write-locked */
69875 +/* Audited by: umka (2002.06.14) */
69876 +void sibling_list_remove(znode * node)
69877 +{
69878 +       reiser4_tree *tree;
69879 +
69880 +       assert("umka-255", node != NULL);       /* checked before @node is first used */
69881 +       tree = znode_get_tree(node);
69882 +       assert_rw_write_locked(&(tree->tree_lock));
69883 +       assert("nikita-3275", check_sibling_list(node));
69884 +
69885 +       write_lock_dk(tree);    /* delimiting keys are only modified between write_lock_dk()/write_unlock_dk() */
69886 +       if (znode_is_right_connected(node) && node->right != NULL &&
69887 +           znode_is_left_connected(node) && node->left != NULL) {
69888 +               assert("zam-32245",
69889 +                      keyeq(znode_get_rd_key(node),
69890 +                            znode_get_ld_key(node->right)));
69891 +               znode_set_rd_key(node->left, znode_get_ld_key(node->right));
69892 +       }
69893 +       write_unlock_dk(tree);
69894 +
69895 +       if (znode_is_right_connected(node) && node->right != NULL) {
69896 +               assert("zam-322", znode_is_left_connected(node->right));
69897 +               node->right->left = node->left; /* right neighbor now points past @node */
69898 +               ON_DEBUG(node->right->left_version =
69899 +                        atomic_inc_return(&delim_key_version);
69900 +                   );
69901 +       }
69902 +       if (znode_is_left_connected(node) && node->left != NULL) {
69903 +               assert("zam-323", znode_is_right_connected(node->left));
69904 +               node->left->right = node->right;        /* left neighbor now points past @node */
69905 +               ON_DEBUG(node->left->right_version =
69906 +                        atomic_inc_return(&delim_key_version);
69907 +                   );
69908 +       }
69909 +
69910 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
69911 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);    /* @node is no longer marked connected on either side */
69912 +       ON_DEBUG(node->left = node->right = NULL;
69913 +                node->left_version = atomic_inc_return(&delim_key_version);
69914 +                node->right_version = atomic_inc_return(&delim_key_version););
69915 +       assert("nikita-3276", check_sibling_list(node));
69916 +}
69917 +
69918 +/* disconnect @node from sibling list: neighbors' links to @node are set to NULL (the neighbors are not linked to each other) */
69919 +void sibling_list_drop(znode * node)
69920 +{
69921 +       znode *right;
69922 +       znode *left;
69923 +
69924 +       assert("nikita-2464", node != NULL);
69925 +       assert("nikita-3277", check_sibling_list(node));
69926 +
69927 +       right = node->right;
69928 +       if (right != NULL) {
69929 +               assert("nikita-2465", znode_is_left_connected(right));
69930 +               right->left = NULL;
69931 +               ON_DEBUG(right->left_version =
69932 +                        atomic_inc_return(&delim_key_version);
69933 +                   );
69934 +       }
69935 +       left = node->left;
69936 +       if (left != NULL) {
69937 +               assert("zam-323", znode_is_right_connected(left));      /* NOTE(review): label "zam-323" is also used in sibling_list_remove() */
69938 +               left->right = NULL;
69939 +               ON_DEBUG(left->right_version =
69940 +                        atomic_inc_return(&delim_key_version);
69941 +                   );
69942 +       }
69943 +       ZF_CLR(node, JNODE_LEFT_CONNECTED);
69944 +       ZF_CLR(node, JNODE_RIGHT_CONNECTED);    /* @node is no longer marked connected on either side */
69945 +       ON_DEBUG(node->left = node->right = NULL;
69946 +                node->left_version = atomic_inc_return(&delim_key_version);
69947 +                node->right_version = atomic_inc_return(&delim_key_version););
69948 +}
69949 +
69950 +/* Insert @new into the sibling list. Regular balancing inserts the new node
69951 +   after (to the right of) an existing, locked node (@before); the one exception
69952 +   is adding a new tree root node, in which case @before should be NULL. */
69953 +void sibling_list_insert_nolock(znode * new, znode * before)
69954 +{
69955 +       assert("zam-334", new != NULL);
69956 +       assert("nikita-3298", !znode_is_left_connected(new));
69957 +       assert("nikita-3299", !znode_is_right_connected(new));
69958 +       assert("nikita-3300", new->left == NULL);
69959 +       assert("nikita-3301", new->right == NULL);
69960 +       assert("nikita-3278", check_sibling_list(new));
69961 +       assert("nikita-3279", check_sibling_list(before));
69962 +
69963 +       if (before != NULL) {
69964 +               assert("zam-333", znode_is_connected(before));
69965 +               new->right = before->right;     /* @new goes between @before and its old right neighbor */
69966 +               new->left = before;
69967 +               ON_DEBUG(new->right_version =
69968 +                        atomic_inc_return(&delim_key_version);
69969 +                        new->left_version =
69970 +                        atomic_inc_return(&delim_key_version););
69971 +               if (before->right != NULL) {
69972 +                       before->right->left = new;
69973 +                       ON_DEBUG(before->right->left_version =
69974 +                                atomic_inc_return(&delim_key_version);
69975 +                           );
69976 +               }
69977 +               before->right = new;
69978 +               ON_DEBUG(before->right_version =
69979 +                        atomic_inc_return(&delim_key_version);
69980 +                   );
69981 +       } else {        /* @before == NULL: @new gets no siblings */
69982 +               new->right = NULL;
69983 +               new->left = NULL;
69984 +               ON_DEBUG(new->right_version =
69985 +                        atomic_inc_return(&delim_key_version);
69986 +                        new->left_version =
69987 +                        atomic_inc_return(&delim_key_version););
69988 +       }
69989 +       ZF_SET(new, JNODE_LEFT_CONNECTED);
69990 +       ZF_SET(new, JNODE_RIGHT_CONNECTED);     /* in both cases @new ends up marked connected on both sides */
69991 +       assert("nikita-3280", check_sibling_list(new));
69992 +       assert("nikita-3281", check_sibling_list(before));
69993 +}
69994 +
69995 +/*
69996 +   Local variables:
69997 +   c-indentation-style: "K&R"
69998 +   mode-name: "LC"
69999 +   c-basic-offset: 8
70000 +   tab-width: 8
70001 +   fill-column: 80
70002 +   End:
70003 +*/
70004 diff -urN linux-2.6.27.orig/fs/reiser4/tree_walk.h linux-2.6.27/fs/reiser4/tree_walk.h
70005 --- linux-2.6.27.orig/fs/reiser4/tree_walk.h    1970-01-01 03:00:00.000000000 +0300
70006 +++ linux-2.6.27/fs/reiser4/tree_walk.h 2008-10-12 18:20:01.000000000 +0400
70007 @@ -0,0 +1,125 @@
70008 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
70009 +
70010 +/* definitions of reiser4 tree walk functions */
70011 +
70012 +#ifndef __FS_REISER4_TREE_WALK_H__
70013 +#define __FS_REISER4_TREE_WALK_H__
70014 +
70015 +#include "debug.h"
70016 +#include "forward.h"
70017 +
70018 +/* establishes horizontal links between cached znodes */
70019 +int connect_znode(coord_t * coord, znode * node);
70020 +
70021 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
70022 +  share the following conventions:
70023 +
70024 +  return codes:
70025 +
70026 +  @return : 0        - OK,
70027 +
70028 +ZAM-FIXME-HANS: wrong return code name.  Change them all.
70029 +           -ENOENT  - neighbor is not in cache, what is detected by sibling
70030 +                      link absence.
70031 +
70032 +            -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
70033 +                       found (because we are left-/right- most node of the
70034 +                      tree, for example). Also, this return code is for
70035 +                      reiser4_get_parent() when we see no parent link -- it
70036 +                      means that our node is root node.
70037 +
70038 +            -E_DEADLOCK - deadlock detected (request from high-priority process
70039 +                      received); other error codes conform to
70040 +                      /usr/include/asm/errno.h .
70041 +*/
70042 +
70043 +int
70044 +reiser4_get_parent_flags(lock_handle * result, znode * node,
70045 +                        znode_lock_mode mode, int flags);
70046 +
70047 +/* bit definitions for the `flags' argument of reiser4_get_neighbor(). */
70048 +typedef enum {
70049 +       /* If sibling pointer is NULL, this flag allows get_neighbor() to try to
70050 +        * find a not yet allocated, not connected neighbor by going through
70051 +        * upper levels */
70052 +       GN_CAN_USE_UPPER_LEVELS = 0x1,
70053 +       /* locking left neighbor instead of right one */
70054 +       GN_GO_LEFT = 0x2,
70055 +       /* automatically load neighbor node content */
70056 +       GN_LOAD_NEIGHBOR = 0x4,
70057 +       /* return -E_REPEAT if can't lock  */
70058 +       GN_TRY_LOCK = 0x8,
70059 +       /* used internally in tree_walk.c, causes renew_sibling to not
70060 +          allocate neighbor znode, but only search for it in znode cache */
70061 +       GN_NO_ALLOC = 0x10,
70062 +       /* do not go across atom boundaries */
70063 +       GN_SAME_ATOM = 0x20,
70064 +       /* allow locking of not connected nodes */
70065 +       GN_ALLOW_NOT_CONNECTED = 0x40,
70066 +       /* Avoid synchronous jload; instead, call jstartio() and return -E_REPEAT. */
70067 +       GN_ASYNC = 0x80
70068 +} znode_get_neigbor_flags;
70069 +
70070 +/* A commonly used wrapper for reiser4_get_parent_flags() that always passes GN_ALLOW_NOT_CONNECTED as @flags. */
70071 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
70072 +                                    znode_lock_mode mode)
70073 +{
70074 +       return reiser4_get_parent_flags(result, node, mode,
70075 +                                       GN_ALLOW_NOT_CONNECTED);
70076 +}
70077 +
70078 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
70079 +                        znode_lock_mode lock_mode, int flags);
70080 +
70081 +/* wrappers for the most common usages of reiser4_get_neighbor(); this one adds GN_GO_LEFT to @flags */
70082 +static inline int
70083 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
70084 +                         int flags)
70085 +{
70086 +       return reiser4_get_neighbor(result, node, lock_mode,
70087 +                                   flags | GN_GO_LEFT);
70088 +}
70089 +
70090 +static inline int
70091 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
70092 +                          int flags)
70093 +{
70094 +       ON_DEBUG(check_lock_node_data(node));   /* debug-only checks, wrapped in ON_DEBUG() */
70095 +       ON_DEBUG(check_lock_data());
70096 +       return reiser4_get_neighbor(result, node, lock_mode,
70097 +                                   flags & (~GN_GO_LEFT));     /* GN_GO_LEFT cleared: lock the right neighbor */
70098 +}
70099 +
70100 +extern void sibling_list_remove(znode * node);
70101 +extern void sibling_list_drop(znode * node);
70102 +extern void sibling_list_insert_nolock(znode * new, znode * before);
70103 +extern void link_left_and_right(znode * left, znode * right);
70104 +
70105 +/* Callbacks invoked by tree_walk() when tree_walk() ...  */
70106 +struct tree_walk_actor {
70107 +       /* ... meets a formatted node, */
70108 +       int (*process_znode) (tap_t *, void *);
70109 +       /* ... meets an extent, */
70110 +       int (*process_extent) (tap_t *, void *);
70111 +       /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
70112 +        * the node or extent processing functions. */
70113 +       int (*before) (void *);
70114 +};
70115 +
70116 +#if REISER4_DEBUG
70117 +int check_sibling_list(znode * node);
70118 +#else /* !REISER4_DEBUG: the check expands to a constant 1 and @n is not evaluated */
70119 +#define check_sibling_list(n) (1)
70120 +#endif /* REISER4_DEBUG */
70121 +
70122 +#endif                         /* __FS_REISER4_TREE_WALK_H__ */
70123 +
70124 +/*
70125 +   Local variables:
70126 +   c-indentation-style: "K&R"
70127 +   mode-name: "LC"
70128 +   c-basic-offset: 8
70129 +   tab-width: 8
70130 +   fill-column: 120
70131 +   End:
70132 +*/
70133 diff -urN linux-2.6.27.orig/fs/reiser4/txnmgr.c linux-2.6.27/fs/reiser4/txnmgr.c
70134 --- linux-2.6.27.orig/fs/reiser4/txnmgr.c       1970-01-01 03:00:00.000000000 +0300
70135 +++ linux-2.6.27/fs/reiser4/txnmgr.c    2008-10-12 18:20:01.000000000 +0400
70136 @@ -0,0 +1,3164 @@
70137 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
70138 + * reiser4/README */
70139 +
70140 +/* Joshua MacDonald wrote the first draft of this code. */
70141 +
70142 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
70143 +filesystem scales only as well as its worst locking design.  You need to
70144 +substantially restructure this code. Josh was not as experienced a programmer
70145 +as you.  Particularly review how the locking style differs from what you did
70146 +for znodes using hi-lo priority locking, and present to me an opinion on
70147 +whether the differences are well founded.  */
70148 +
70149 +/* I cannot help but to disagree with the sentiment above. Locking of
70150 + * transaction manager is _not_ badly designed, and, at the very least, is not
70151 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
70152 + * locking on znodes, especially on the root node of the tree. --nikita,
70153 + * 2003.10.13 */
70154 +
70155 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles.  The
70156 +   txnmgr processes capture_block requests and manages the relationship between jnodes and
70157 +   atoms through the various stages of a transcrash, and it also oversees the fusion and
70158 +   capture-on-copy processes.  The main difficulty with this task is maintaining a
70159 +   deadlock-free lock ordering between atoms and jnodes/handles.  The reason for the
70160 +   difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
70161 +   must be broken.  The main requirement is that atom-fusion be deadlock free, so once you
70162 +   hold the atom_lock you may then wait to acquire any jnode or handle lock.  This implies
70163 +   that any time you check the atom-pointer of a jnode or handle and then try to lock that
70164 +   atom, you must use trylock() and possibly reverse the order.
70165 +
70166 +   This code implements the design documented at:
70167 +
70168 +     http://namesys.com/txn-doc.html
70169 +
70170 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
70171 +above document and reference the new.  Be sure to provide some credit to Josh.  I already have some writings on this
70172 +topic in v4.html, but they are lacking in details present in the above.  Cure that.  Remember to write for the bright 12
70173 +year old --- define all technical terms used.
70174 +
70175 +*/
70176 +
70177 +/* Thoughts on the external transaction interface:
70178 +
70179 +   In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
70180 +   creates state that lasts for the duration of a system call and is called at the start
70181 +   of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
70182 +   occupying the scope of a single system call.  We wish to give certain applications an
70183 +   interface to begin and close (commit) transactions.  Since our implementation of
70184 +   transactions does not yet support isolation, allowing an application to open a
70185 +   transaction implies trusting it to later close the transaction.  Part of the
70186 +   transaction interface will be aimed at enabling that trust, but the interface for
70187 +   actually using transactions is fairly narrow.
70188 +
70189 +   BEGIN_TRANSCRASH: Returns a transcrash identifier.  It should be possible to translate
70190 +   this identifier into a string that a shell-script could use, allowing you to start a
70191 +   transaction by issuing a command.  Once open, the transcrash should be set in the task
70192 +   structure, and there should be options (I suppose) to allow it to be carried across
70193 +   fork/exec.  A transcrash has several options:
70194 +
70195 +     - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
70196 +     on writes (WRITE_FUSING) and allow "dirty reads".  If the application wishes to
70197 +     capture on reads as well, it should set READ_FUSING.
70198 +
70199 +     - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
70200 +     eventually close (or else the machine must crash).  If the application dies an
70201 +     unexpected death with an open transcrash, for example, or if it hangs for a long
70202 +     duration, one solution (to avoid crashing the machine) is to simply close it anyway.
70203 +     This is a dangerous option, but it is one way to solve the problem until isolated
70204 +     transcrashes are available for untrusted applications.
70205 +
70206 +     It seems to be what databases do, though it is unclear how one avoids a DoS attack
70207 +     creating a vulnerability based on resource starvation.  Guaranteeing that some
70208 +     minimum amount of computational resources are made available would seem more correct
70209 +     than guaranteeing some amount of time.  When we again have someone to code the work,
70210 +     this issue should be considered carefully.  -Hans
70211 +
70212 +   RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
70213 +   many dirty blocks it expects.  The reserve_blocks interface should be called at a point
70214 +   where it is safe for the application to fail, because the system may not be able to
70215 +   grant the allocation and the application must be able to back-out.  For this reason,
70216 +   the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
70217 +   the application may also wish to extend the allocation after beginning its transcrash.
70218 +
70219 +   CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
70220 +   modifications that require transaction protection.  When isolated transactions are
70221 +   supported the CLOSE operation is replaced by either COMMIT or ABORT.  For example, if a
70222 +   RESERVE_BLOCKS call fails for the application, it should "abort" by calling
70223 +   CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
70224 +   why, for safety, the application should call RESERVE_BLOCKS before making any changes).
70225 +
70226 +   For actually implementing these out-of-system-call-scoped transcrashes, the
70227 +   reiser4_context has a "txn_handle *trans" pointer that may be set to an open
70228 +   transcrash.  Currently there are no dynamically-allocated transcrashes, but there is a
70229 +   "struct kmem_cache *_txnh_slab" created for that purpose in this file.
70230 +*/
70231 +
70232 +/* Extending the other system call interfaces for future transaction features:
70233 +
70234 +   Specialized applications may benefit from passing flags to the ordinary system call
70235 +   interface such as read(), write(), or stat().  For example, the application specifies
70236 +   WRITE_FUSING by default but wishes to add that a certain read() command should be
70237 +   treated as READ_FUSING.  But which read?  Is it the directory-entry read, the stat-data
70238 +   read, or the file-data read?  These issues are straight-forward, but there are a lot of
70239 +   them and adding the necessary flags-passing code will be tedious.
70240 +
70241 +   When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
70242 +   flag, which specifies that although it is a read operation being requested, a
70243 +   write-lock should be taken.  The reason is that read-locks are shared while write-locks
70244 +   are exclusive, so taking a read-lock when a later-write is known in advance will often
70245 +   lead to deadlock.  If a reader knows it will write later, it should issue read
70246 +   requests with the RMW flag set.
70247 +*/
70248 +
70249 +/*
70250 +   The znode/atom deadlock avoidance.
70251 +
70252 +   FIXME(Zam): writing of this comment is in progress.
70253 +
70254 +   The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of atom's
70255 +   long-term locking, which makes reiser4 locking scheme more complex.  It had
70256 +   deadlocks until we implemented deadlock avoidance algorithms.  Those deadlocks
70257 +   looked like the following: one stopped thread waits for a long-term lock on a
70258 +   znode, while the thread that owns that lock waits until fusion with another
70259 +   atom is allowed.
70260 +
70261 +   The source of the deadlocks is an optimization of not capturing index nodes
70262 +   for read.  Let's prove it.  Suppose we have dumb node capturing scheme which
70263 +   unconditionally captures each block before locking it.
70264 +
70265 +   That scheme has no deadlocks.  Let's begin with a thread whose stage is
70266 +   ASTAGE_CAPTURE_WAIT and which waits for a znode lock.  The thread can't wait for
70267 +   a capture because its stage allows fusion with any atom except those which are
70268 +   being committed currently. The process of atom commit can't deadlock because
70269 +   the atom commit procedure does not acquire locks and does not fuse with other
70270 +   atoms.  Reiser4 does capturing right before going to sleep inside the
70271 +   longterm_lock_znode() function; this means the znode which we want to lock is
70272 +   already captured and its atom is in the ASTAGE_CAPTURE_WAIT stage.  If we
70273 +   continue the analysis we see that no process in the sequence may
70274 +   wait for atom fusion.  Therefore there are no deadlocks of the described kind.
70275 +
70276 +   The capturing optimization makes the deadlocks possible.  A thread can wait for
70277 +   a lock whose owner did not capture that node.  The lock owner's current atom
70278 +   is not fused with the first atom and it does not get an ASTAGE_CAPTURE_WAIT
70279 +   state. A deadlock is possible when that atom meets another one which is in
70280 +   ASTAGE_CAPTURE_WAIT already.
70281 +
70282 +   The deadlock avoidance scheme includes two algorithms:
70283 +
70284 +   First algorithm is used when a thread captures a node which is locked but not
70285 +   captured by another thread.  Those nodes are marked MISSED_IN_CAPTURE at the
70286 +   moment we skip their capturing.  If such a node (marked MISSED_IN_CAPTURE) is
70287 +   being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT, the
70288 +   routine which forces all lock owners to join with current atom is executed.
70289 +
70290 +   Second algorithm does not allow to skip capturing of already captured nodes.
70291 +
70292 +   Both algorithms together prevent waiting for a longterm lock without atom fusion
70293 +   with the atoms of all lock owners, which is the key condition for atom/znode
70294 +   locking deadlocks.
70295 +*/
70296 +
70297 +/*
70298 + * Transactions and mmap(2).
70299 + *
70300 + *     1. Transactions are not supported for accesses through mmap(2), because
70301 + *     this would effectively amount to user-level transactions whose duration
70302 + *     is beyond control of the kernel.
70303 + *
70304 + *     2. That said, we still want to preserve some decency with regard to
70305 + *     mmap(2). During normal write(2) call, following sequence of events
70306 + *     happens:
70307 + *
70308 + *         1. page is created;
70309 + *
70310 + *         2. jnode is created, dirtied and captured into current atom.
70311 + *
70312 + *         3. extent is inserted and modified.
70313 + *
70314 + *     Steps (2) and (3) take place under long term lock on the twig node.
70315 + *
70316 + *     When file is accessed through mmap(2) page is always created during
70317 + *     page fault.
70318 + *     After this (in reiser4_readpage()->reiser4_readpage_extent()):
70319 + *
70320 + *         1. if access is made to non-hole page new jnode is created, (if
70321 + *         necessary)
70322 + *
70323 + *         2. if access is made to the hole page, jnode is not created (XXX
70324 + *         not clear why).
70325 + *
70326 + *     Also, even if page is created by write page fault it is not marked
70327 + *     dirty immediately by handle_mm_fault(). Probably this is to avoid races
70328 + *     with page write-out.
70329 + *
70330 + *     Dirty bit installed by hardware is only transferred to the struct page
70331 + *     later, when page is unmapped (in zap_pte_range(), or
70332 + *     try_to_unmap_one()).
70333 + *
70334 + *     So, with mmap(2) we have to handle following irksome situations:
70335 + *
70336 + *         1. there exists modified page (clean or dirty) without jnode
70337 + *
70338 + *         2. there exists modified page (clean or dirty) with clean jnode
70339 + *
70340 + *         3. clean page which is a part of atom can be transparently modified
70341 + *         at any moment through mapping without becoming dirty.
70342 + *
70343 + *     (1) and (2) can lead to the out-of-memory situation: ->writepage()
70344 + *     doesn't know what to do with such pages and ->sync_sb()/->writepages()
70345 + *     don't see them, because these methods operate on atoms.
70346 + *
70347 + *     (3) can lead to the loss of data: suppose we have a dirty page with a
70348 + *     dirty jnode captured by some atom. As part of early flush (for
70349 + *     example) page was written out. Dirty bit was cleared on both page and
70350 + *     jnode. After this page is modified through mapping, but kernel doesn't
70351 + *     notice and just discards page and jnode as part of commit. (XXX
70352 + *     actually it doesn't, because to reclaim page ->releasepage() has to be
70353 + *     called and before this dirty bit will be transferred to the struct
70354 + *     page).
70355 + *
70356 + */
70357 +
70358 +#include "debug.h"
70359 +#include "txnmgr.h"
70360 +#include "jnode.h"
70361 +#include "znode.h"
70362 +#include "block_alloc.h"
70363 +#include "tree.h"
70364 +#include "wander.h"
70365 +#include "ktxnmgrd.h"
70366 +#include "super.h"
70367 +#include "page_cache.h"
70368 +#include "reiser4.h"
70369 +#include "vfs_ops.h"
70370 +#include "inode.h"
70371 +#include "flush.h"
70372 +
70373 +#include <asm/atomic.h>
70374 +#include <linux/types.h>
70375 +#include <linux/fs.h>
70376 +#include <linux/mm.h>
70377 +#include <linux/slab.h>
70378 +#include <linux/pagemap.h>
70379 +#include <linux/writeback.h>
70380 +#include <linux/swap.h>                /* for totalram_pages */
70381 +
70382 +static void atom_free(txn_atom * atom);
70383 +
70384 +static int commit_txnh(txn_handle * txnh);
70385 +
70386 +static void wakeup_atom_waitfor_list(txn_atom * atom);
70387 +static void wakeup_atom_waiting_list(txn_atom * atom);
70388 +
70389 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
70390 +
70391 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
70392 +
70393 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
70394 +
70395 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
70396 +                              txn_capture mode);
70397 +
70398 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
70399 +
70400 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
70401 +
70402 +void reiser4_invalidate_list(struct list_head *);
70403 +
70404 +/* GENERIC STRUCTURES */
70405 +
70406 +typedef struct _txn_wait_links txn_wait_links;
70407 +/* Per-waiter links and callbacks. NOTE(review): users are outside this chunk; field notes are assumptions. */
70408 +struct _txn_wait_links {
70409 +       lock_stack *_lock_stack;        /* presumably the waiter's lock stack - TODO confirm */
70410 +       struct list_head _fwaitfor_link;        /* presumably linked into atom->fwaitfor_list */
70411 +       struct list_head _fwaiting_link;        /* presumably linked into atom->fwaiting_list */
70412 +       int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70413 +       int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
70414 +};
70415 +
70416 +/* FIXME: In theory, we should be using the slab cache init & destructor
70417 +   methods instead of, e.g., jnode_init, etc. */
70418 +static struct kmem_cache *_atom_slab = NULL;   /* "txn_atom" cache, see init_txnmgr_static() */
70419 +/* this is for user-visible, cross system-call transactions. */
70420 +static struct kmem_cache *_txnh_slab = NULL;   /* "txn_handle" cache, see init_txnmgr_static() */
70421 +
70422 +/**
70423 + * init_txnmgr_static - set up the slab caches used by the transaction manager
70424 + *
70425 + * Creates the "txn_atom" and "txn_handle" caches.  Returns 0 on success or
70426 + * RETERR(-ENOMEM) if either cache cannot be created; no cache is left behind.
70427 + */
70428 +int init_txnmgr_static(void)
70429 +{
70430 +       assert("jmacd-600", _atom_slab == NULL);
70431 +       assert("jmacd-601", _txnh_slab == NULL);
70432 +
70433 +       ON_DEBUG(atomic_set(&flush_cnt, 0));
70434 +
70435 +       _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
70436 +                                      SLAB_HWCACHE_ALIGN |
70437 +                                      SLAB_RECLAIM_ACCOUNT, NULL);
70438 +       if (!_atom_slab)
70439 +               return RETERR(-ENOMEM);
70440 +
70441 +       _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
70442 +                                      SLAB_HWCACHE_ALIGN, NULL);
70443 +       if (_txnh_slab != NULL)
70444 +               return 0;
70445 +
70446 +       /* the handle cache failed: roll back the atom cache created above */
70447 +       kmem_cache_destroy(_atom_slab);
70448 +       _atom_slab = NULL;
70449 +       return RETERR(-ENOMEM);
70450 +}
70451 +
70452 +/**
70453 + * done_txnmgr_static - delete txn_atom and txn_handle caches
70454 + *
70455 + * Counterpart of init_txnmgr_static(); called on module unload or shutdown.
70456 + */
70457 +void done_txnmgr_static(void)
70458 +{
70459 +       destroy_reiser4_cache(&_atom_slab);     /* "txn_atom" cache */
70460 +       destroy_reiser4_cache(&_txnh_slab);     /* "txn_handle" cache */
70461 +}
70462 +
70463 +/**
70464 + * reiser4_init_txnmgr - initialize a new transaction manager
70465 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70466 + *
70467 + * This is called on mount. Resets the counters, the atom list and the locks.
70468 + */
70469 +void reiser4_init_txnmgr(txn_mgr *mgr)
70470 +{
70471 +       assert("umka-169", mgr != NULL);
70472 +
70473 +       mgr->atom_count = 0;
70474 +       mgr->id_count = 1;      /* atom_id handed to the first atom created */
70475 +       INIT_LIST_HEAD(&mgr->atoms_list);
70476 +       spin_lock_init(&mgr->tmgr_lock);
70477 +       mutex_init(&mgr->commit_mutex);
70478 +}
70479 +
70480 +/**
70481 + * reiser4_done_txnmgr - stop transaction manager
70482 + * @mgr: pointer to transaction manager embedded in reiser4 super block
70483 + *
70484 + * This is called on umount. Only asserts that no atoms are left; frees nothing.
70485 + */
70486 +void reiser4_done_txnmgr(txn_mgr *mgr)
70487 +{
70488 +       assert("umka-170", mgr != NULL);
70489 +       assert("umka-1701", list_empty_careful(&mgr->atoms_list));
70490 +       assert("umka-1702", mgr->atom_count == 0);
70491 +}
70492 +
70493 +/* Put @txnh into its pristine state: no atom, no flags, capture mode @mode. */
70494 +/* Audited by: umka (2002.06.13) */
70495 +static void txnh_init(txn_handle * txnh, txn_mode mode)
70496 +{
70497 +       assert("umka-171", txnh != NULL);
70498 +
70499 +       reiser4_ctx_gfp_mask_set();
70500 +       spin_lock_init(&txnh->hlock);
70501 +       INIT_LIST_HEAD(&txnh->txnh_link);
70502 +       txnh->atom = NULL;
70503 +       txnh->flags = 0;
70504 +       txnh->mode = mode;
70505 +}
70506 +
70507 +#if REISER4_DEBUG
70508 +/* Debug only: true when @txnh has no atom and LOCK_CNT_NIL(spin_locked_txnh) holds. */
70509 +static int txnh_isclean(txn_handle * txnh)
70510 +{
70511 +       assert("umka-172", txnh != NULL);
70512 +       return txnh->atom == NULL &&
70513 +               LOCK_CNT_NIL(spin_locked_txnh);
70514 +}
70515 +#endif
70516 +
70517 +/* Zero @atom and bring it into the ASTAGE_FREE state with all lists empty. */
70518 +static void atom_init(txn_atom * atom)
70519 +{
70520 +       int lvl;
70521 +
70522 +       assert("umka-173", atom != NULL);
70523 +
70524 +       memset(atom, 0, sizeof(*atom));
70525 +
70526 +       atom->stage = ASTAGE_FREE;
70527 +       atom->start_time = jiffies;
70528 +
70529 +       for (lvl = 0; lvl <= REAL_MAX_ZTREE_HEIGHT; ++lvl)
70530 +               INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, lvl));
70531 +
70532 +       spin_lock_init(&(atom->alock));
70533 +       /* per-atom lists other than the per-level dirty ones */
70534 +       INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
70535 +       INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
70536 +       INIT_LIST_HEAD(ATOM_WB_LIST(atom));
70537 +       INIT_LIST_HEAD(&atom->inodes);
70538 +       /* transaction handles, then the link into the txnmgr's list of atoms */
70539 +       INIT_LIST_HEAD(&atom->txnh_list);
70540 +       INIT_LIST_HEAD(&atom->atom_link);
70541 +       INIT_LIST_HEAD(&atom->fwaitfor_list);
70542 +       INIT_LIST_HEAD(&atom->fwaiting_list);
70543 +       blocknr_set_init(&atom->delete_set);
70544 +       blocknr_set_init(&atom->wandered_map);
70545 +
70546 +       init_atom_fq_parts(atom);
70547 +}
70548 +
70549 +#if REISER4_DEBUG
70550 +/* Debug only: 1 if @atom looks exactly like a freshly initialized one, else 0. */
70551 +static int atom_isclean(txn_atom * atom)
70552 +{
70553 +       int lvl;
70554 +
70555 +       assert("umka-174", atom != NULL);
70556 +
70557 +       for (lvl = 0; lvl <= REAL_MAX_ZTREE_HEIGHT; ++lvl)
70558 +               if (!list_empty_careful(ATOM_DIRTY_LIST(atom, lvl)))
70559 +                       return 0;
70560 +
70561 +       /* stage and counters must all be at their initial values */
70562 +       if (atom->stage != ASTAGE_FREE || atom->txnh_count != 0 ||
70563 +           atom->capture_count != 0 || atomic_read(&atom->refcount) != 0)
70564 +               return 0;
70565 +
70566 +       /* atom_link must point back at itself in both directions */
70567 +       return  atom->atom_link.next == &atom->atom_link &&
70568 +               atom->atom_link.prev == &atom->atom_link &&
70569 +               list_empty_careful(&atom->txnh_list) &&
70570 +               list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
70571 +               list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
70572 +               list_empty_careful(ATOM_WB_LIST(atom)) &&
70573 +               list_empty_careful(&atom->fwaitfor_list) &&
70574 +               list_empty_careful(&atom->fwaiting_list) &&
70575 +               atom_fq_parts_are_clean(atom);
70576 +}
70577 +#endif
70578 +
70579 +/* Start a transaction for @context.  The handle is the trans_in_ctx member embedded
70580 +   in the reiser4_context, so no allocation takes place.  Eventually this will be
70581 +   extended to allow transaction handles to span several contexts. */
70582 +/* Audited by: umka (2002.06.13) */
70583 +void reiser4_txn_begin(reiser4_context * context)
70584 +{
70585 +       txn_handle *handle = &context->trans_in_ctx;
70586 +
70587 +       assert("jmacd-544", context->trans == NULL);
70588 +       context->trans = handle;
70589 +       /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
70590 +          transcrash.  Default should be TXN_WRITE_FUSING.  Also, the handle is
70591 +          embedded in the context right now, but we would like to allow for
70592 +          dynamically allocated transcrashes that span multiple system calls.
70593 +        */
70594 +       txnh_init(handle, TXN_WRITE_FUSING);
70595 +}
70596 +
70597 +/* Close @context's handle; returns commit_txnh()'s result, or 0 if there was no atom to commit. */
70598 +int reiser4_txn_end(reiser4_context * context)
70599 +{
70600 +       int ret = 0;    /* same type as commit_txnh() and as our own return value */
70601 +       txn_handle *txnh;
70602 +
70603 +       assert("umka-283", context != NULL);
70604 +       assert("nikita-3012", reiser4_schedulable());
70605 +       assert("vs-24", context == get_current_context());
70606 +       assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
70607 +
70608 +       txnh = context->trans;
70609 +       if (txnh != NULL) {
70610 +               if (txnh->atom != NULL)
70611 +                       ret = commit_txnh(txnh);
70612 +               assert("jmacd-633", txnh_isclean(txnh));
70613 +               context->trans = NULL;
70614 +       }
70615 +       return ret;
70616 +}
70617 +
70618 +void reiser4_txn_restart(reiser4_context * context)
70619 +{
70620 +       reiser4_txn_end(context);       /* NOTE(review): the commit result is discarded */
70621 +       reiser4_preempt_point();
70622 +       reiser4_txn_begin(context);     /* fresh handle in the same context */
70623 +}
70624 +
70625 +void reiser4_txn_restart_current(void)
70626 +{
70627 +       reiser4_txn_restart(get_current_context());     /* end + begin on get_current_context() */
70628 +}
70629 +
70630 +/* TXN_ATOM */
70631 +
70632 +/* Return the atom of @txnh (NULL if it has none).  @txnh must be unlocked on entry and is
70633 +   returned locked; a non-NULL atom is returned locked as well.  The atom lock is tried first
70634 +   and, on failure, both locks are retaken atom-first, which breaks the lock-ordering cycle. */
70635 +static txn_atom *txnh_get_atom(txn_handle * txnh)
70636 +{
70637 +       txn_atom *atom;
70638 +
70639 +       assert("umka-180", txnh != NULL);
70640 +       assert_spin_not_locked(&(txnh->hlock));
70641 +
70642 +       while (1) {
70643 +               spin_lock_txnh(txnh);
70644 +               atom = txnh->atom;
70645 +
70646 +               if (atom == NULL)
70647 +                       break;
70648 +
70649 +               if (spin_trylock_atom(atom))
70650 +                       break;
70651 +
70652 +               atomic_inc(&atom->refcount);    /* pin the atom while the txnh lock is dropped */
70653 +
70654 +               spin_unlock_txnh(txnh);
70655 +               spin_lock_atom(atom);
70656 +               spin_lock_txnh(txnh);
70657 +
70658 +               if (txnh->atom == atom) {       /* unchanged: drop the pin, keep both locks */
70659 +                       atomic_dec(&atom->refcount);
70660 +                       break;
70661 +               }
70662 +
70663 +               spin_unlock_txnh(txnh);
70664 +               atom_dec_and_unlock(atom);      /* txnh->atom changed meanwhile: retry */
70665 +       }
70666 +
70667 +       return atom;
70668 +}
70669 +
70670 +/* Return the current context's atom with its spinlock held, or NULL if there is none. */
70671 +txn_atom *get_current_atom_locked_nocheck(void)
70672 +{
70673 +       reiser4_context *ctx = get_current_context();
70674 +       txn_handle *handle;
70675 +       txn_atom *result;
70676 +
70677 +       assert("zam-437", ctx != NULL);
70678 +       handle = ctx->trans;
70679 +       assert("zam-435", handle != NULL);
70680 +
70681 +       /* txnh_get_atom() leaves the handle locked; only the atom lock (if
70682 +        * any) is handed on to the caller, so the handle is released here */
70683 +       result = txnh_get_atom(handle);
70684 +       spin_unlock_txnh(handle);
70685 +
70686 +       return result;
70687 +}
70688 +
70689 +/* Return the atom of @node, which the caller has already spin-locked.  On
70690 +   return the jnode is still locked and a non-NULL atom is locked too; NULL is
70691 +   returned when the jnode belongs to no atom.  The atom lock is tried first and
70692 +   both locks are retaken atom-first on failure (breaks the lock-ordering cycle). */
70693 +txn_atom *jnode_get_atom(jnode * node)
70694 +{
70695 +       txn_atom *atom;
70696 +
70697 +       assert("umka-181", node != NULL);
70698 +
70699 +       while (1) {
70700 +               assert_spin_locked(&(node->guard));
70701 +
70702 +               atom = node->atom;
70703 +               /* node is not in any atom */
70704 +               if (atom == NULL)
70705 +                       break;
70706 +
70707 +               /* If atom is not locked, grab the lock and return */
70708 +               if (spin_trylock_atom(atom))
70709 +                       break;
70710 +
70711 +               /* At least one jnode belongs to this atom, which guarantees that
70712 +                * atom->refcount > 0, so we can safely increment refcount. */
70713 +               atomic_inc(&atom->refcount);
70714 +               spin_unlock_jnode(node);
70715 +
70716 +               /* re-acquire spin locks in the right order */
70717 +               spin_lock_atom(atom);
70718 +               spin_lock_jnode(node);
70719 +
70720 +               /* check if node still points to the same atom. */
70721 +               if (node->atom == atom) {
70722 +                       atomic_dec(&atom->refcount);
70723 +                       break;
70724 +               }
70725 +
70726 +               /* releasing the atom lock and reference requires not holding
70727 +                * locks on jnodes.  */
70728 +               spin_unlock_jnode(node);
70729 +
70730 +               /* We are not sure that this atom has references other than
70731 +                * ours, so we must call the proper function, which may free the
70732 +                * atom if the last reference is released. */
70733 +               atom_dec_and_unlock(atom);
70734 +
70735 +               /* lock jnode again to get a valid node->atom pointer
70736 +                * value. */
70737 +               spin_lock_jnode(node);
70738 +       }
70739 +
70740 +       return atom;
70741 +}
70742 +
70743 +/* Returns true if @check is dirty and captured by the same atom as @node (for znodes also
70744 +   connected; with @alloc_check set, flushprepped state must equal @alloc_value).  Used by
70745 +   flush code to tell whether the next node (in some direction) is suitable for flushing. */
70746 +int
70747 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
70748 +{
70749 +       int compat;
70750 +       txn_atom *atom;
70751 +
70752 +       assert("umka-182", node != NULL);
70753 +       assert("umka-183", check != NULL);
70754 +
70755 +       /* Not sure what this function is supposed to do if supplied with @check that is
70756 +          neither formatted nor unformatted (bitmap or so). */
70757 +       assert("nikita-2373", jnode_is_znode(check)
70758 +              || jnode_is_unformatted(check));
70759 +
70760 +       /* Need a lock on CHECK to get its atom and to check various state bits.
70761 +          Don't need a lock on NODE once we get the atom lock. */
70762 +       /* It is not enough to lock the two nodes and compare (node->atom ==
70763 +          check->atom): an atom may be locked and in the middle of a fusion, and
70764 +          jnodes of such an atom can point to different objects although the
70765 +          atom is the same. */
70766 +       spin_lock_jnode(check);
70767 +
70768 +       atom = jnode_get_atom(check);
70769 +
70770 +       if (atom == NULL) {
70771 +               compat = 0;
70772 +       } else {
70773 +               compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
70774 +
70775 +               if (compat && jnode_is_znode(check)) {
70776 +                       compat &= znode_is_connected(JZNODE(check));    /* NOTE(review): bitwise &, keeps only bit 0 */
70777 +               }
70778 +
70779 +               if (compat && alloc_check) {
70780 +                       compat &= (alloc_value == jnode_is_flushprepped(check));
70781 +               }
70782 +
70783 +               spin_unlock_atom(atom);
70784 +       }
70785 +
70786 +       spin_unlock_jnode(check);
70787 +
70788 +       return compat;
70789 +}
70790 +
70791 +/* Drop one reference to the locked @atom, freeing it on the last one; @atom is unlocked on return. */
70792 +void atom_dec_and_unlock(txn_atom * atom)
70793 +{
70794 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70795 +
70796 +       assert("umka-186", atom != NULL);
70797 +       assert_spin_locked(&(atom->alock));
70798 +       assert("zam-1039", atomic_read(&atom->refcount) > 0);
70799 +
70800 +       if (atomic_dec_and_test(&atom->refcount)) {
70801 +               /* take txnmgr lock and atom lock in proper order. */
70802 +               if (!spin_trylock_txnmgr(mgr)) {
70803 +                       /* The atom must still exist after we re-acquire its
70804 +                        * spinlock, so we increment its reference counter. */
70805 +                       atomic_inc(&atom->refcount);
70806 +                       spin_unlock_atom(atom);
70807 +                       spin_lock_txnmgr(mgr);
70808 +                       spin_lock_atom(atom);
70809 +
70810 +                       if (!atomic_dec_and_test(&atom->refcount)) {    /* a new reference appeared */
70811 +                               spin_unlock_atom(atom);
70812 +                               spin_unlock_txnmgr(mgr);
70813 +                               return;
70814 +                       }
70815 +               }
70816 +               assert_spin_locked(&(mgr->tmgr_lock));
70817 +               atom_free(atom);        /* also releases the atom lock */
70818 +               spin_unlock_txnmgr(mgr);
70819 +       } else
70820 +               spin_unlock_atom(atom);
70821 +}
70822 +
70823 +/* Create a new atom and connect it to @txnh.  The atom is added to the
70824 +   transaction manager's list with one artificial reference that is kept until
70825 +   it commits.  @atom_alloc caches the allocation so that it never happens under
70826 +   jnode & txnh spinlocks.  Never returns 0: RETERR(-ENOMEM) or -E_REPEAT. */
70827 +
70828 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
70829 +{
70830 +       txn_atom *atom;
70831 +       txn_mgr *mgr;
70832 +
70833 +       if (REISER4_DEBUG && rofs_tree(current_tree)) {
70834 +               warning("nikita-3366", "Creating atom on rofs");
70835 +               dump_stack();
70836 +       }
70837 +
70838 +       if (*atom_alloc == NULL) {
70839 +               (*atom_alloc) = kmem_cache_alloc(_atom_slab,
70840 +                                                reiser4_ctx_gfp_mask_get());
70841 +
70842 +               if (*atom_alloc == NULL)
70843 +                       return RETERR(-ENOMEM);
70844 +       }
70845 +
70846 +       /* the txnmgr spin lock has to be taken before the jnode and txnh
70847 +          locks. */
70848 +       mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70849 +       spin_lock_txnmgr(mgr);
70850 +       spin_lock_txnh(txnh);
70851 +
70852 +       /* Check whether a new atom is still needed */
70853 +       if (txnh->atom != NULL) {
70854 +               /* NOTE-NIKITA it would probably be better to free atom_alloc
70855 +                * here than to thread it up to reiser4_try_capture() */
70856 +
70857 +               spin_unlock_txnh(txnh);
70858 +               spin_unlock_txnmgr(mgr);
70859 +
70860 +               return -E_REPEAT;
70861 +       }
70862 +
70863 +       atom = *atom_alloc;
70864 +       *atom_alloc = NULL;
70865 +
70866 +       atom_init(atom);
70867 +
70868 +       assert("jmacd-17", atom_isclean(atom));
70869 +
70870 +        /*
70871 +        * lock ordering is broken here. It is ok, as long as @atom is new
70872 +        * and inaccessible for others. We can't use spin_lock_atom or
70873 +        * spin_lock(&atom->alock) because they care about locking
70874 +        * dependencies. spin_trylock_atom doesn't.
70875 +        */
70876 +       check_me("", spin_trylock_atom(atom));
70877 +
70878 +       /* add atom to the end of transaction manager's list of atoms */
70879 +       list_add_tail(&atom->atom_link, &mgr->atoms_list);
70880 +       atom->atom_id = mgr->id_count++;
70881 +       mgr->atom_count += 1;
70882 +
70883 +       /* Release txnmgr lock */
70884 +       spin_unlock_txnmgr(mgr);
70885 +
70886 +       /* One reference until it commits. */
70887 +       atomic_inc(&atom->refcount);
70888 +       atom->stage = ASTAGE_CAPTURE_FUSE;
70889 +       atom->super = reiser4_get_current_sb();
70890 +       capture_assign_txnh_nolock(atom, txnh);
70891 +
70892 +       spin_unlock_atom(atom);
70893 +       spin_unlock_txnh(txnh);
70894 +
70895 +       return -E_REPEAT;       /* returned even though the atom was created and assigned */
70896 +}
70897 +
70898 +/* An atom is "open" while its stage is above 0 and below ASTAGE_PRE_COMMIT. */
70899 +static int atom_isopen(const txn_atom * atom)
70900 +{
70901 +       assert("umka-185", atom != NULL);
70902 +
70903 +       return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
70904 +}
70905 +
70906 +/* Return the number of pointers to this atom that must be updated during fusion.  This
70907 +   approximates the amount of work to be done.  Fusion chooses the atom with fewer
70908 +   pointers to fuse into the atom with more pointers. */
70909 +static int atom_pointer_count(const txn_atom * atom)
70910 +{
70911 +       assert("umka-187", atom != NULL);
70912 +
70913 +       /* Sum of the two counters: transaction handles (txnh_count) plus
70914 +        * captured nodes (capture_count). */
70915 +       return atom->txnh_count + atom->capture_count;
70916 +}
70917 +
70918 +/* Called holding both the atom lock and the txnmgr lock (asserted below): unlinks the atom
70919 +   from the transaction manager list, drops the atom lock and frees the atom. */
70920 +static void atom_free(txn_atom * atom)
70921 +{
70922 +       txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
70923 +
70924 +       assert("umka-188", atom != NULL);
70925 +       assert_spin_locked(&(atom->alock));
70926 +
70927 +       /* Remove from the txn_mgr's atom list */
70928 +       assert_spin_locked(&(mgr->tmgr_lock));
70929 +       mgr->atom_count -= 1;
70930 +       list_del_init(&atom->atom_link);
70931 +
70932 +       /* Only an invalidated or fully committed atom may be freed */
70933 +       assert("jmacd-16",
70934 +              (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
70935 +       atom->stage = ASTAGE_FREE;
70936 +
70937 +       blocknr_set_destroy(&atom->delete_set);
70938 +       blocknr_set_destroy(&atom->wandered_map);
70939 +
70940 +       assert("jmacd-16", atom_isclean(atom));  /* NOTE(review): label duplicates the assert above */
70941 +
70942 +       spin_unlock_atom(atom);
70943 +
70944 +       kmem_cache_free(_atom_slab, atom);
70945 +}
70946 +
70947 +static int atom_is_dotard(const txn_atom * atom)
70948 +{
70949 +       return time_after(jiffies, atom->start_time +   /* now later than start + age limit? */
70950 +                         get_current_super_private()->tmgr.atom_max_age);
70951 +}
70952 +
70953 +static int atom_can_be_committed(txn_atom * atom)
70954 +{
70955 +       assert_spin_locked(&(atom->alock));
70956 +       assert("zam-885", atom->txnh_count > atom->nr_waiters);
70957 +       return atom->txnh_count == atom->nr_waiters + 1;        /* exactly one handle is not a waiter */
70958 +}
70959 +
70960 +/* Return 1 if @atom should commit now: it is flagged ATOM_FORCE_COMMIT, holds
70961 +   more pointers than tmgr.atom_max_size, or is too old.  Otherwise return 0. */
70962 +static int atom_should_commit(const txn_atom * atom)
70963 +{
70964 +       assert("umka-189", atom != NULL);
70965 +       if (atom->flags & ATOM_FORCE_COMMIT)
70966 +               return 1;
70967 +       return (unsigned)atom_pointer_count(atom) >
70968 +              get_current_super_private()->tmgr.atom_max_size ||
70969 +              atom_is_dotard(atom);
70970 +}
70971 +
70972 +/* Return atom_should_commit() for the current atom, or 0 when there is none. */
70973 +int current_atom_should_commit(void)
70974 +{
70975 +       txn_atom *cur = get_current_atom_locked_nocheck();
70976 +       int verdict;
70977 +
70978 +       if (cur == NULL)
70979 +               return 0;
70980 +
70981 +       verdict = atom_should_commit(cur);
70982 +       spin_unlock_atom(cur);
70983 +       return verdict;
70984 +}
70985 +
70986 +static int atom_should_commit_asap(const txn_atom * atom)
70987 +{
70988 +       unsigned int nodes, pinned;
70989 +
70990 +       assert("nikita-3309", atom != NULL);
70991 +
70992 +       nodes = (unsigned)atom->capture_count;
70993 +       /* NOTE(review): the shift comes first, so fewer than 2^PAGE_CACHE_SHIFT nodes count as 0 */
70994 +       pinned = (nodes >> PAGE_CACHE_SHIFT) * sizeof(znode);
70995 +
70996 +       return (atom->flushed > 100) || (pinned > (totalram_pages >> 3));
70997 +}
70998 +
70999 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
71000 +{
71001 +       const int committing = flags & JNODE_FLUSH_COMMIT;
71002 +       jnode *cand;
71003 +
71004 +       list_for_each_entry(cand, head, capture_link) {
71005 +               /*
71006 +                * with JNODE_FLUSH_COMMIT every node qualifies; otherwise pass
71007 +                * over nodes which "heard banshee" or have active I/O
71008 +                */
71009 +               if (committing ||
71010 +                   (!JF_ISSET(cand, JNODE_HEARD_BANSHEE) &&
71011 +                    !JF_ISSET(cand, JNODE_WRITEBACK)))
71012 +                       return cand;
71013 +       }
71014 +
71015 +       return NULL;
71016 +}
71017 +
71018 +/* Return the first suitable dirty node of the locked @atom, searching levels 1 and up
71019 +   first and list #0 last; NULL if none of the dirty lists yields a node. */
71020 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
71021 +{
71022 +       tree_level lvl;
71023 +
71024 +       assert_spin_locked(&(atom->alock));
71025 +
71026 +       /* The flush starts from LEAF_LEVEL (=1) and moves up level by level. */
71027 +       for (lvl = 1; lvl <= REAL_MAX_ZTREE_HEIGHT; ++lvl) {
71028 +               struct list_head *dirty = ATOM_DIRTY_LIST(atom, lvl);
71029 +               jnode *hit;
71030 +
71031 +               if (list_empty_careful(dirty))
71032 +                       continue;
71033 +
71034 +               hit = find_first_dirty_in_list(dirty, flags);
71035 +               if (hit != NULL)
71036 +                       return hit;
71037 +       }
71038 +
71039 +       /* znode-above-root is on the list #0. */
71040 +       return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
71041 +}
71042 +
71043 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
71044 +{
71045 +       struct list_head *wb = ATOM_WB_LIST(atom);
71046 +       jnode *node, *following;
71047 +
71048 +       assert("zam-905", atom_is_protected(atom));
71049 +
71050 +       /* @following is fetched before the body runs, so @node may leave the list */
71051 +       list_for_each_entry_safe(node, following, wb, capture_link) {
71052 +               spin_lock_jnode(node);
71053 +
71054 +               if (JF_ISSET(node, JNODE_WRITEBACK)) {
71055 +                       /* JNODE_WRITEBACK still set: leave it on the list */
71056 +               } else if (JF_ISSET(node, JNODE_DIRTY)) {
71057 +                       /* dirty and not under writeback: hand it to @fq */
71058 +                       queue_jnode(fq, node);
71059 +               } else {
71060 +                       /* move jnode to atom's clean list */
71061 +                       list_move_tail(&node->capture_link,
71062 +                                      ATOM_CLEAN_LIST(atom));
71063 +               }
71064 +
71065 +               spin_unlock_jnode(node);
71066 +       }
71067 +}
71068 +
71069 +/* Walk the current atom's writeback list and write out, through a flush
71070 + * queue, the jnodes that are dirty and not under writeback. */
71071 +static int submit_wb_list(void)
71072 +{
71073 +       flush_queue_t *queue = get_fq_for_current_atom();
71074 +       int status;
71075 +
71076 +       if (IS_ERR(queue))
71077 +               return PTR_ERR(queue);
71078 +
71079 +       /* the atom is unlocked below, so it presumably comes back locked */
71080 +       dispatch_wb_list(queue->atom, queue);
71081 +       spin_unlock_atom(queue->atom);
71082 +
71083 +       status = reiser4_write_fq(queue, NULL, 1);
71084 +       reiser4_fq_put(queue);
71085 +
71086 +       return status;
71087 +}
71088 +
71089 +/* Wait for completion of all writes, re-submitting the atom's writeback list as needed. */
71090 +static int current_atom_complete_writes(void)
71091 +{
71092 +       int pass;
71093 +       int rc = 0;
71094 +
71095 +       /*
71096 +        * Pass 0: jnodes on the writeback list were modified and dirtied while
71097 +        * an i/o request for them was already running, so resubmit them.
71098 +        * Pass 1: all i/o should be completed by now; scan the list again
71099 +        * and push out whatever is still dirty.
71100 +        */
71101 +       for (pass = 0; pass < 2; pass++) {
71102 +               /* only negative results of the submit step are errors */
71103 +               rc = submit_wb_list();
71104 +               if (rc < 0)
71105 +                       return rc;
71106 +
71107 +               /* wait for all i/o, including what was just submitted */
71108 +               rc = current_atom_finish_all_fq();
71109 +               if (rc)
71110 +                       return rc;
71111 +       }
71112 +
71113 +       return rc;
71114 +}
71115 +
71116 +#if REISER4_DEBUG
71117 +/* Debug aid: printk() the counters and stage of @atom, each line tagged with @prefix. */
71118 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
71119 +{
71120 +       if (atom == NULL) {
71121 +               printk("%s: no atom\n", prefix);
71122 +               return;
71123 +       }
71124 +
71125 +       printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
71126 +              " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
71127 +              atomic_read(&atom->refcount), atom->atom_id, atom->flags,
71128 +              atom->txnh_count, atom->capture_count, atom->stage,
71129 +              atom->start_time, atom->flushed);
71130 +}
71131 +
71132 +#else  /*  REISER4_DEBUG  */
71133 +/* Without REISER4_DEBUG the helper is an empty stub. */
71134 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
71135 +
71136 +#endif  /*  REISER4_DEBUG  */
71137 +
71138 +#define TOOMANYFLUSHES (1 << 13) /* flush retries before commit_current_atom() warns */
71139 +
71140 +/* Called with the atom locked and no open "active" transaction handles except
71141 +   ours, this function calls flush_current_atom() until all dirty nodes are
71142 +   processed.  Then it initiates commit processing.
71143 +
71144 +   Called by the single remaining open "active" txnh, which is closing. Other
71145 +   open txnhs belong to processes which wait for atom commit in commit_txnh()
71146 +   routine. They are counted as "waiters" in atom->nr_waiters.  Therefore as
71147 +   long as we hold the atom lock none of the jnodes can be captured and/or
71148 +   locked.
71149 +
71150 +   Returns an error code if commit fails; -E_REPEAT (atom unlocked) means the
71151 +   atom cannot be committed yet.  On success the atom is DONE and locked. */
71152 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
71153 +{
71154 +       reiser4_super_info_data *sbinfo = get_current_super_private();
71155 +       long ret = 0; /* NOTE(review): returned through an int */
71156 +       /* how many times jnode_flush() was called as a part of attempt to
71157 +        * commit this atom. */
71158 +       int flushiters;
71159 +
71160 +       assert("zam-888", atom != NULL && *atom != NULL);
71161 +       assert_spin_locked(&((*atom)->alock));
71162 +       assert("zam-887", get_current_context()->trans->atom == *atom);
71163 +       assert("jmacd-151", atom_isopen(*atom));
71164 +
71165 +       assert("nikita-3184",
71166 +              get_current_super_private()->delete_mutex_owner != current);
71167 +
71168 +       for (flushiters = 0;; ++flushiters) {
71169 +               ret =
71170 +                   flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
71171 +                                      JNODE_FLUSH_COMMIT,
71172 +                                      LONG_MAX /* nr_to_write */ ,
71173 +                                      nr_submitted, atom, NULL);
71174 +               if (ret != -E_REPEAT)
71175 +                       break;
71176 +
71177 +               /* if atom's dirty list contains one znode which is
71178 +                  HEARD_BANSHEE and is locked we have to allow lock owner to
71179 +                  continue and uncapture that znode */
71180 +               reiser4_preempt_point();
71181 +
71182 +               *atom = get_current_atom_locked();
71183 +               if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
71184 +                       warning("nikita-3176",
71185 +                               "Flushing like mad: %i", flushiters);
71186 +                       reiser4_info_atom("atom", *atom);
71187 +                       DEBUGON(flushiters > (1 << 20));
71188 +               }
71189 +       }
71190 +
71191 +       if (ret)
71192 +               return ret;
71193 +
71194 +       assert_spin_locked(&((*atom)->alock));
71195 +
71196 +       if (!atom_can_be_committed(*atom)) {
71197 +               spin_unlock_atom(*atom);
71198 +               return RETERR(-E_REPEAT);
71199 +       }
71200 +
71201 +       if ((*atom)->capture_count == 0)
71202 +               goto done;
71203 +
71204 +       /* Up to this point we have been flushing, and a flush attempt may
71205 +          return -E_REPEAT.  Now we can commit.  We cannot return -E_REPEAT
71206 +          at this point, commit should be successful. */
71207 +       reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
71208 +       ON_DEBUG(((*atom)->committer = current));
71209 +       spin_unlock_atom(*atom);
71210 +
71211 +       ret = current_atom_complete_writes();
71212 +       if (ret)
71213 +               return ret;
71214 +
71215 +       assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
71216 +
71217 +       /* isolate critical code path which should be executed by only one
71218 +        * thread using tmgr mutex */
71219 +       mutex_lock(&sbinfo->tmgr.commit_mutex);
71220 +
71221 +       ret = reiser4_write_logs(nr_submitted);
71222 +       if (ret < 0)
71223 +               reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
71224 +
71225 +       /* The atom->ovrwr_nodes list is processed under commit mutex held
71226 +          because of bitmap nodes which are captured in a special way in
71227 +          reiser4_pre_commit_hook_bitmap(); that way does not include
71228 +          capture_fuse_wait() as the capturing of other nodes does -- the
71229 +          commit mutex is used for transaction isolation instead. */
71230 +       reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
71231 +       mutex_unlock(&sbinfo->tmgr.commit_mutex);
71232 +
71233 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
71234 +       reiser4_invalidate_list(ATOM_WB_LIST(*atom));
71235 +       assert("zam-927", list_empty(&(*atom)->inodes));
71236 +
71237 +       spin_lock_atom(*atom);
71238 + done:
71239 +       reiser4_atom_set_stage(*atom, ASTAGE_DONE);
71240 +       ON_DEBUG((*atom)->committer = NULL);
71241 +
71242 +       /* Atom's state changes, so wake up everybody waiting for this
71243 +          event. */
71244 +       wakeup_atom_waiting_list(*atom);
71245 +
71246 +       /* Decrement the "until commit" reference, at least one txnh (the caller) is
71247 +          still open. */
71248 +       atomic_dec(&(*atom)->refcount);
71249 +
71250 +       assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
71251 +       assert("jmacd-1062", (*atom)->capture_count == 0);
71252 +       BUG_ON((*atom)->capture_count != 0);
71253 +       assert_spin_locked(&((*atom)->alock));
71254 +
71255 +       return ret;
71256 +}
71257 +
71258 +/* TXN_TXNH */
71259 +
71260 +/**
71261 + * force_commit_atom - commit current atom and wait for commit completion
71262 + * @txnh: transaction handle whose atom (txnh->atom) is to be committed
71263 + *
71264 + * Both @txnh and its atom have to be spinlocked before the call; this function
71265 + * unlocks them, then restarts the current transaction.  Always returns 0.
71266 + */
71267 +int force_commit_atom(txn_handle *txnh)
71268 +{
71269 +       txn_atom *atom;
71270 +
71271 +       assert("zam-837", txnh != NULL);
71272 +       assert_spin_locked(&(txnh->hlock));
71273 +       assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
71274 +
71275 +       atom = txnh->atom;
71276 +
71277 +       assert("zam-834", atom != NULL);
71278 +       assert_spin_locked(&(atom->alock));
71279 +
71280 +       /*
71281 +        * Set flags for atom and txnh: forcing atom commit and waiting for
71282 +        * commit completion
71283 +        */
71284 +       txnh->flags |= TXNH_WAIT_COMMIT;
71285 +       atom->flags |= ATOM_FORCE_COMMIT;
71286 +
71287 +       spin_unlock_txnh(txnh);
71288 +       spin_unlock_atom(atom);
71289 +
71290 +       /* commit is here */
71291 +       reiser4_txn_restart_current();
71292 +       return 0;
71293 +}
71294 +
71295 +/* Called to force commit of any outstanding atoms.  @commit_all_atoms controls
71296 + * whether we also commit atoms which are created after this function is
71297 + * called (non-zero) or only those started no later than the call. */
71298 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
71299 +{
71300 +       int ret;
71301 +       txn_atom *atom;
71302 +       txn_mgr *mgr;
71303 +       txn_handle *txnh;
71304 +       unsigned long start_time = jiffies;
71305 +       reiser4_context *ctx = get_current_context();
71306 +
71307 +       assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
71308 +       assert("nikita-3058", reiser4_commit_check_locks());
71309 +
71310 +       reiser4_txn_restart_current();
71311 +
71312 +       mgr = &get_super_private(super)->tmgr;
71313 +
71314 +       txnh = ctx->trans;
71315 +
71316 +      again:
71317 +
71318 +       spin_lock_txnmgr(mgr);
71319 +
71320 +       list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
71321 +               spin_lock_atom(atom);
71322 +
71323 +               /* Commit any atom which can be committed.  If @commit_all_atoms
71324 +                * is not set we commit only atoms which were created before
71325 +                * this call is started. */
71326 +               if (commit_all_atoms
71327 +                   || time_before_eq(atom->start_time, start_time)) {
71328 +                       if (atom->stage <= ASTAGE_POST_COMMIT) {
71329 +                               spin_unlock_txnmgr(mgr);
71330 +
71331 +                               if (atom->stage < ASTAGE_PRE_COMMIT) {
71332 +                                       spin_lock_txnh(txnh);
71333 +                                       /* Add force-context txnh */
71334 +                                       capture_assign_txnh_nolock(atom, txnh);
71335 +                                       ret = force_commit_atom(txnh);
71336 +                                       if (ret)
71337 +                                               return ret;
71338 +                               } else
71339 +                                       /* wait for atom commit */
71340 +                                       reiser4_atom_wait_event(atom);
71341 +
71342 +                               goto again; /* locks were dropped: rescan the list */
71343 +                       }
71344 +               }
71345 +
71346 +               spin_unlock_atom(atom);
71347 +       }
71348 +
71349 +#if REISER4_DEBUG
71350 +       if (commit_all_atoms) {
71351 +               reiser4_super_info_data *sbinfo = get_super_private(super);
71352 +               spin_lock_reiser4_super(sbinfo);
71353 +               assert("zam-813",
71354 +                      sbinfo->blocks_fake_allocated_unformatted == 0);
71355 +               assert("zam-812", sbinfo->blocks_fake_allocated == 0);
71356 +               spin_unlock_reiser4_super(sbinfo);
71357 +       }
71358 +#endif
71359 +
71360 +       spin_unlock_txnmgr(mgr);
71361 +
71362 +       return 0;
71363 +}
71364 +
71365 +/* Check whether commit_some_atoms() can commit @atom: not yet in PRE_COMMIT,
71366 + * all open txnhs are waiters, atom_should_commit(). Locking is up to caller. */
71367 +static int atom_is_committable(txn_atom * atom)
71368 +{
71369 +       return
71370 +           atom->stage < ASTAGE_PRE_COMMIT &&
71371 +           atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
71372 +}
71373 +
71374 +/* Called periodically from ktxnmgrd to commit old atoms. Releases the ktxnmgrd
71375 + * spin lock (mgr->daemon->guard) at exit.  Always returns 0. */
71376 +int commit_some_atoms(txn_mgr * mgr)
71377 +{
71378 +       int ret = 0;
71379 +       txn_atom *atom;
71380 +       txn_handle *txnh;
71381 +       reiser4_context *ctx;
71382 +       struct list_head *pos, *tmp;
71383 +
71384 +       ctx = get_current_context();
71385 +       assert("nikita-2444", ctx != NULL);
71386 +
71387 +       txnh = ctx->trans;
71388 +       spin_lock_txnmgr(mgr);
71389 +
71390 +       /*
71391 +        * this is to keep gcc from complaining that atom might be used
71392 +        * uninitialized
71393 +        */
71394 +       atom = NULL;
71395 +
71396 +       /* look for atom to commit */
71397 +       list_for_each_safe(pos, tmp, &mgr->atoms_list) {
71398 +               atom = list_entry(pos, txn_atom, atom_link);
71399 +               /*
71400 +                * first test without taking atom spin lock, whether it is
71401 +                * eligible for committing at all
71402 +                */
71403 +               if (atom_is_committable(atom)) {
71404 +                       /* now, take spin lock and re-check */
71405 +                       spin_lock_atom(atom);
71406 +                       if (atom_is_committable(atom))
71407 +                               break;
71408 +                       spin_unlock_atom(atom);
71409 +               }
71410 +       }
71411 +
71412 +       ret = (&mgr->atoms_list == pos); /* walked the whole list: no candidate */
71413 +       spin_unlock_txnmgr(mgr);
71414 +
71415 +       if (ret) {
71416 +               /* nothing found */
71417 +               spin_unlock(&mgr->daemon->guard);
71418 +               return 0;
71419 +       }
71420 +
71421 +       spin_lock_txnh(txnh);
71422 +
71423 +       BUG_ON(atom == NULL);
71424 +       /* Set the atom to force committing */
71425 +       atom->flags |= ATOM_FORCE_COMMIT;
71426 +
71427 +       /* Add force-context txnh */
71428 +       capture_assign_txnh_nolock(atom, txnh);
71429 +
71430 +       spin_unlock_txnh(txnh);
71431 +       spin_unlock_atom(atom);
71432 +
71433 +       /* we are about to release daemon spin lock, notify daemon it
71434 +          has to rescan atoms */
71435 +       mgr->daemon->rescan = 1;
71436 +       spin_unlock(&mgr->daemon->guard);
71437 +       reiser4_txn_restart_current();
71438 +       return 0;
71439 +}
71440 +
71441 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
71442 +{
71443 +       int atom_stage;
71444 +       txn_atom *atom_2;
71445 +       int repeat;
71446 +       /* Fuse @atom with another atom that is not committing yet, if any. */
71447 +       assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
71448 +
71449 +       atom_stage = atom->stage;
71450 +       repeat = 0;
71451 +       /* on trylock failure: drop the atom lock, take txnmgr then atom again */
71452 +       if (!spin_trylock_txnmgr(tmgr)) {
71453 +               atomic_inc(&atom->refcount);
71454 +               spin_unlock_atom(atom);
71455 +               spin_lock_txnmgr(tmgr);
71456 +               spin_lock_atom(atom);
71457 +               repeat = 1;
71458 +               if (atom->stage != atom_stage) {
71459 +                       spin_unlock_txnmgr(tmgr);
71460 +                       atom_dec_and_unlock(atom);
71461 +                       return -E_REPEAT;
71462 +               }
71463 +               atomic_dec(&atom->refcount);
71464 +       }
71465 +
71466 +       list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
71467 +               if (atom == atom_2)
71468 +                       continue;
71469 +               /*
71470 +                * if trylock does not succeed we just do not fuse with that
71471 +                * atom.
71472 +                */
71473 +               if (spin_trylock_atom(atom_2)) {
71474 +                       if (atom_2->stage < ASTAGE_PRE_COMMIT) {
71475 +                               spin_unlock_txnmgr(tmgr);
71476 +                               capture_fuse_into(atom_2, atom);
71477 +                               /* all locks are lost we can only repeat here */
71478 +                               return -E_REPEAT;
71479 +                       }
71480 +                       spin_unlock_atom(atom_2);
71481 +               }
71482 +       }
71483 +       atom->flags |= ATOM_CANCEL_FUSION; /* no partner found; flag checked by flush_some_atom() */
71484 +       spin_unlock_txnmgr(tmgr);
71485 +       if (repeat) {
71486 +               spin_unlock_atom(atom);
71487 +               return -E_REPEAT;
71488 +       }
71489 +       return 0; /* nothing fused; @atom is still locked */
71490 +}
71491 +
71492 +/* Calls jnode_flush for the current atom if it exists; if not, just takes
71493 +   another atom and calls jnode_flush() for it.  If the current transaction
71494 +   handle already has an atom assigned (current atom) we have to close the
71495 +   current transaction prior to switching to another atom or doing something
71496 +   with the current atom. This code tries to flush the current atom.
71497 +
71498 +   flush_some_atom() is called as part of the memory clearing process. It is
71499 +   invoked from balance_dirty_pages(), pdflushd, and entd.
71500 +
71501 +   If we can flush no nodes, the atom is committed, because this frees memory.
71502 +
71503 +   If the atom is too large or too old it is committed also.
71504 +*/
71505 +int
71506 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
71507 +               int flags)
71508 +{
71509 +       reiser4_context *ctx = get_current_context();
71510 +       txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
71511 +       txn_handle *txnh = ctx->trans;
71512 +       txn_atom *atom;
71513 +       int ret;
71514 +
71515 +       BUG_ON(wbc->nr_to_write == 0);
71516 +       BUG_ON(*nr_submitted != 0);
71517 +       assert("zam-1042", txnh != NULL);
71518 +      repeat:
71519 +       if (txnh->atom == NULL) {
71520 +               /* current atom is not available, take first from txnmgr */
71521 +               spin_lock_txnmgr(tmgr);
71522 +
71523 +               /* traverse the list of all atoms */
71524 +               list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71525 +                       /* lock atom before checking its state */
71526 +                       spin_lock_atom(atom);
71527 +
71528 +                       /*
71529 +                        * we need an atom which is not being committed and
71530 +                        * which has no flushers (jnode_flush() adds one flusher
71531 +                        * at the beginning and subtracts one at the end).
71532 +                        */
71533 +                       if (atom->stage < ASTAGE_PRE_COMMIT &&
71534 +                           atom->nr_flushers == 0) {
71535 +                               spin_lock_txnh(txnh);
71536 +                               capture_assign_txnh_nolock(atom, txnh);
71537 +                               spin_unlock_txnh(txnh);
71538 +
71539 +                               goto found;
71540 +                       }
71541 +
71542 +                       spin_unlock_atom(atom);
71543 +               }
71544 +
71545 +               /*
71546 +                * Write throttling in case no atom can be
71547 +                * flushed/committed.
71548 +                */
71549 +               if (!current_is_pdflush() && !wbc->nonblocking) {
71550 +                       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
71551 +                               spin_lock_atom(atom);
71552 +                               /* Repeat the check from the above. */
71553 +                               if (atom->stage < ASTAGE_PRE_COMMIT
71554 +                                   && atom->nr_flushers == 0) {
71555 +                                       spin_lock_txnh(txnh);
71556 +                                       capture_assign_txnh_nolock(atom, txnh);
71557 +                                       spin_unlock_txnh(txnh);
71558 +
71559 +                                       goto found;
71560 +                               }
71561 +                               if (atom->stage <= ASTAGE_POST_COMMIT) {
71562 +                                       spin_unlock_txnmgr(tmgr);
71563 +                                       /*
71564 +                                        * we just wait until atom's flusher
71565 +                                        * makes progress in flushing or
71566 +                                        * committing the atom
71567 +                                        */
71568 +                                       reiser4_atom_wait_event(atom);
71569 +                                       goto repeat;
71570 +                               }
71571 +                               spin_unlock_atom(atom);
71572 +                       }
71573 +               }
71574 +               spin_unlock_txnmgr(tmgr);
71575 +               return 0;
71576 +             found:
71577 +               spin_unlock_txnmgr(tmgr);
71578 +       } else
71579 +               atom = get_current_atom_locked();
71580 +
71581 +       BUG_ON(atom->super != ctx->super);
71582 +       assert("vs-35", atom->super == ctx->super);
71583 +       if (start) {
71584 +               spin_lock_jnode(start);
71585 +               ret = (atom == start->atom) ? 1 : 0;
71586 +               spin_unlock_jnode(start);
71587 +               if (ret == 0)
71588 +                       start = NULL;
71589 +       }
71590 +       ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
71591 +       if (ret == 0) {
71592 +               /* flush_current_atom returns 0 only if it submitted nothing
71593 +                  for write (so the *nr_submitted == 0 test below holds) */
71594 +               BUG_ON(*nr_submitted != 0);
71595 +               if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
71596 +                       if (atom->capture_count < tmgr->atom_min_size &&
71597 +                           !(atom->flags & ATOM_CANCEL_FUSION)) {
71598 +                               ret = txn_try_to_fuse_small_atom(tmgr, atom);
71599 +                               if (ret == -E_REPEAT) {
71600 +                                       reiser4_preempt_point();
71601 +                                       goto repeat;
71602 +                               }
71603 +                       }
71604 +                       /* if early flushing could not make more nodes clean,
71605 +                        * or atom is too old/large,
71606 +                        * we force current atom to commit */
71607 +                       /* wait for commit completion but only if this
71608 +                        * wouldn't stall pdflushd and ent thread. */
71609 +                       if (!wbc->nonblocking && !ctx->entd)
71610 +                               txnh->flags |= TXNH_WAIT_COMMIT;
71611 +                       atom->flags |= ATOM_FORCE_COMMIT;
71612 +               }
71613 +               spin_unlock_atom(atom);
71614 +       } else if (ret == -E_REPEAT) {
71615 +               if (*nr_submitted == 0) {
71616 +                       /* let others who hamper flushing (hold longterm locks,
71617 +                          for instance) free the way for flush */
71618 +                       reiser4_preempt_point();
71619 +                       goto repeat;
71620 +               }
71621 +               ret = 0;
71622 +       }
71623 +/*
71624 +       if (*nr_submitted > wbc->nr_to_write)
71625 +               warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
71626 +*/
71627 +       reiser4_txn_restart(ctx);
71628 +
71629 +       return ret;
71630 +}
71631 +
71632 +/* Uncapture every jnode on @head (removing it from the transaction) and jput() it. */
71633 +void reiser4_invalidate_list(struct list_head *head)
71634 +{
71635 +       while (!list_empty(head)) {
71636 +               jnode *node;
71637 +
71638 +               node = list_entry(head->next, jnode, capture_link);
71639 +               spin_lock_jnode(node);
71640 +               reiser4_uncapture_block(node); /* presumably drops the jnode lock and unlinks node - confirm */
71641 +               jput(node);
71642 +       }
71643 +}
71644 +
71645 +static void init_wlinks(txn_wait_links * wlinks)
71646 +{
71647 +       wlinks->_lock_stack = get_current_lock_stack(); /* caller's lock stack */
71648 +       INIT_LIST_HEAD(&wlinks->_fwaitfor_link); /* both links start unlinked */
71649 +       INIT_LIST_HEAD(&wlinks->_fwaiting_link);
71650 +       wlinks->waitfor_cb = NULL; /* no callbacks installed */
71651 +       wlinks->waiting_cb = NULL;
71652 +}
71653 +
71654 +/* Queue the caller on @atom's waitfor list and sleep until somebody wakes us up */
71655 +void reiser4_atom_wait_event(txn_atom * atom)
71656 +{
71657 +       txn_wait_links _wlinks;
71658 +
71659 +       assert_spin_locked(&(atom->alock));
71660 +       assert("nikita-3156",
71661 +              lock_stack_isclean(get_current_lock_stack()) ||
71662 +              atom->nr_running_queues > 0);
71663 +
71664 +       init_wlinks(&_wlinks);
71665 +       list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
71666 +       atomic_inc(&atom->refcount); /* reference held across the sleep */
71667 +       spin_unlock_atom(atom);
71668 +
71669 +       reiser4_prepare_to_sleep(_wlinks._lock_stack);
71670 +       reiser4_go_to_sleep(_wlinks._lock_stack);
71671 +
71672 +       spin_lock_atom(atom);
71673 +       list_del(&_wlinks._fwaitfor_link);
71674 +       atom_dec_and_unlock(atom); /* drops our reference; atom is unlocked on return */
71675 +}
71676 +
71677 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
71678 +{
71679 +       assert("nikita-3535", atom != NULL);
71680 +       assert_spin_locked(&(atom->alock));
71681 +       assert("nikita-3536", stage <= ASTAGE_INVALID);
71682 +       /* Excelsior! The stage only ever advances; waiters are woken on change. */
71683 +       assert("nikita-3537", stage >= atom->stage);
71684 +       if (atom->stage != stage) {
71685 +               atom->stage = stage;
71686 +               reiser4_atom_send_event(atom);
71687 +       }
71688 +}
71689 +
71690 +/* Wake all threads waiting on @atom's waitfor list; @atom must be locked. */
71691 +void reiser4_atom_send_event(txn_atom * atom)
71692 +{
71693 +       assert_spin_locked(&(atom->alock));
71694 +       wakeup_atom_waitfor_list(atom);
71695 +}
71696 +
71697 +/* Tells whether the owner of this txn_handle should wait for atom commit completion
71698 +   (for example, because it does fsync(2)): non-zero iff TXNH_WAIT_COMMIT is set. */
71699 +static int should_wait_commit(txn_handle * h)
71700 +{
71701 +       return h->flags & TXNH_WAIT_COMMIT;
71702 +}
71703 +
71704 +typedef struct commit_data {
71705 +       txn_atom *atom; /* atom of ->txnh, re-read by every try_commit_txnh() */
71706 +       txn_handle *txnh; /* the handle being closed */
71707 +       long nr_written; /* passed to flush/commit as their nr_submitted counter */
71708 +       /* as an optimization we start committing atom by first trying to
71709 +        * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
71710 +        * reduces stalls due to other threads waiting for atom in
71711 +        * ASTAGE_CAPTURE_WAIT stage. ->preflush is the counter of these
71712 +        * preliminary flushes. */
71713 +       int preflush;
71714 +       /* have we waited on atom. */
71715 +       int wait;
71716 +       int failed; /* commit_current_atom() returned an error other than -E_REPEAT */
71717 +       int wake_ktxnmgrd_up; /* commit_txnh() kicks ktxnmgrd when set */
71718 +} commit_data;
71719 +
71720 +/*
71721 + * Called from commit_txnh() repeatedly until it returns 0, i.e. until either
71722 + * an error has happened or nothing more is to be done; atom is then locked.
71723 + */
71724 +static int try_commit_txnh(commit_data * cd)
71725 +{
71726 +       int result;
71727 +
71728 +       assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
71729 +
71730 +       /* Get the atom and txnh locked. */
71731 +       cd->atom = txnh_get_atom(cd->txnh);
71732 +       assert("jmacd-309", cd->atom != NULL);
71733 +       spin_unlock_txnh(cd->txnh);
71734 +
71735 +       if (cd->wait) {
71736 +               cd->atom->nr_waiters--;
71737 +               cd->wait = 0;
71738 +       }
71739 +
71740 +       if (cd->atom->stage == ASTAGE_DONE)
71741 +               return 0;
71742 +
71743 +       if (cd->failed)
71744 +               return 0;
71745 +
71746 +       if (atom_should_commit(cd->atom)) {
71747 +               /* if atom is _very_ large schedule it for commit as soon as
71748 +                * possible. */
71749 +               if (atom_should_commit_asap(cd->atom)) {
71750 +                       /*
71751 +                        * When atom is in PRE_COMMIT or later stage following
71752 +                        * invariant (encoded   in    atom_can_be_committed())
71753 +                        * holds:  there is exactly one non-waiter transaction
71754 +                        * handle opened  on this atom.  When  thread wants to
71755 +                        * wait  until atom  commits (for  example  sync()) it
71756 +                        * waits    on    atom  event     after     increasing
71757 +                        * atom->nr_waiters (see below in  this  function). It
71758 +                        * cannot be guaranteed that atom is already committed
71759 +                        * after    receiving event,  so     loop has   to  be
71760 +                        * re-started. But  if  atom switched into  PRE_COMMIT
71761 +                        * stage and became  too  large, we cannot  change its
71762 +                        * state back   to CAPTURE_WAIT (atom  stage can  only
71763 +                        * increase monotonically), hence this check.
71764 +                        */
71765 +                       if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
71766 +                               reiser4_atom_set_stage(cd->atom,
71767 +                                                      ASTAGE_CAPTURE_WAIT);
71768 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
71769 +               }
71770 +               if (cd->txnh->flags & TXNH_DONT_COMMIT) {
71771 +                       /*
71772 +                        * this  thread (transaction  handle  that is) doesn't
71773 +                        * want to commit  atom. Notify waiters that handle is
71774 +                        * closed. This can happen, for  example, when we  are
71775 +                        * under  VFS directory lock  and don't want to commit
71776 +                        * atom  right   now to  avoid  stalling other threads
71777 +                        * working in the same directory.
71778 +                        */
71779 +
71780 +                       /* Wake  the ktxnmgrd up if  the ktxnmgrd is needed to
71781 +                        * commit this  atom: no  atom  waiters  and only  one
71782 +                        * (our) open transaction handle. */
71783 +                       cd->wake_ktxnmgrd_up =
71784 +                           cd->atom->txnh_count == 1 &&
71785 +                           cd->atom->nr_waiters == 0;
71786 +                       reiser4_atom_send_event(cd->atom);
71787 +                       result = 0;
71788 +               } else if (!atom_can_be_committed(cd->atom)) {
71789 +                       if (should_wait_commit(cd->txnh)) {
71790 +                               /* sync(): wait for commit */
71791 +                               cd->atom->nr_waiters++;
71792 +                               cd->wait = 1;
71793 +                               reiser4_atom_wait_event(cd->atom);
71794 +                               result = RETERR(-E_REPEAT);
71795 +                       } else {
71796 +                               result = 0;
71797 +                       }
71798 +               } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
71799 +                       /*
71800 +                        * optimization: flush  atom without switching it into
71801 +                        * ASTAGE_CAPTURE_WAIT.
71802 +                        *
71803 +                        * But don't  do this for  ktxnmgrd, because  ktxnmgrd
71804 +                        * should never block on atom fusion.
71805 +                        */
71806 +                       result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
71807 +                                                   LONG_MAX, &cd->nr_written,
71808 +                                                   &cd->atom, NULL);
71809 +                       if (result == 0) {
71810 +                               spin_unlock_atom(cd->atom);
71811 +                               cd->preflush = 0;
71812 +                               result = RETERR(-E_REPEAT);
71813 +                       } else  /* Atom wasn't flushed
71814 +                                * completely. Rinse. Repeat. */
71815 +                               --cd->preflush;
71816 +               } else {
71817 +                       /* We change   atom state  to   ASTAGE_CAPTURE_WAIT to
71818 +                          prevent atom fusion and count  ourself as an active
71819 +                          flusher */
71820 +                       reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
71821 +                       cd->atom->flags |= ATOM_FORCE_COMMIT;
71822 +
71823 +                       result =
71824 +                           commit_current_atom(&cd->nr_written, &cd->atom);
71825 +                       if (result != 0 && result != -E_REPEAT)
71826 +                               cd->failed = 1;
71827 +               }
71828 +       } else
71829 +               result = 0;
71830 +
71831 +#if REISER4_DEBUG
71832 +       if (result == 0)
71833 +               assert_spin_locked(&(cd->atom->alock));
71834 +#endif
71835 +
71836 +       /* perfectly valid assertion, except that when atom/txnh is not locked
71837 +        * fusion can take place, and cd->atom points nowhere. */
71838 +       /*
71839 +          assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
71840 +        */
71841 +       return result;
71842 +}
71843 +
71844 +/* Called to commit a transaction handle.  This decrements the atom's number of open
71845 +   handles and if it is the last handle to commit and the atom should commit, initiates
71846 +   atom commit (via try_commit_txnh()).  Always returns 0; cd.nr_written is not returned. */
71847 +static int commit_txnh(txn_handle * txnh)
71848 +{
71849 +       commit_data cd;
71850 +       assert("umka-192", txnh != NULL);
71851 +
71852 +       memset(&cd, 0, sizeof cd);
71853 +       cd.txnh = txnh;
71854 +       cd.preflush = 10;
71855 +
71856 +       /* call try_commit_txnh() until it returns 0; every non-zero result
71857 +        * is simply retried after reiser4_preempt_point() */
71858 +       while (try_commit_txnh(&cd) != 0)
71859 +               reiser4_preempt_point();
71860 +
71861 +       spin_lock_txnh(txnh);
71862 +
71863 +       cd.atom->txnh_count -= 1;       /* cd.atom is still spin-locked: try_commit_txnh() returns 0 with it held */
71864 +       txnh->atom = NULL;
71865 +       /* remove transaction handle from atom's list of transaction handles */
71866 +       list_del_init(&txnh->txnh_link);
71867 +
71868 +       spin_unlock_txnh(txnh);
71869 +       atom_dec_and_unlock(cd.atom);
71870 +       /* if we don't want to do a commit (TXNH_DONT_COMMIT is set, probably
71871 +        * because it takes time) by current thread, we do that work
71872 +        * asynchronously by ktxnmgrd daemon. */
71873 +       if (cd.wake_ktxnmgrd_up)
71874 +               ktxnmgrd_kick(&get_current_super_private()->tmgr);
71875 +
71876 +       return 0;
71877 +}
71878 +
71879 +/* TRY_CAPTURE */
71880 +
71881 +/* This routine attempts a single block-capture request.  It may return -E_REPEAT if some
71882 +   condition indicates that the request should be retried, and it may block if the
71883 +   txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
71884 +
71885 +   This routine encodes the basic logic of block capturing described by:
71886 +
71887 +     http://namesys.com/v4/v4.html
71888 +
71889 +   Our goal here is to ensure that any two blocks that contain dependent modifications
71890 +   should commit at the same time.  This function enforces this discipline by initiating
71891 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
71892 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
71893 +
71894 +   In addition, this routine handles the initial assignment of atoms to blocks and
71895 +   transaction handles.  These are possible outcomes of this function:
71896 +
71897 +   1. The block and handle are already part of the same atom: return immediate success
71898 +
71899 +   2. The block is assigned but the handle is not: call capture_assign_txnh to assign
71900 +      the handle to the block's atom.
71901 +
71902 +   3. The handle is assigned but the block is not: call capture_assign_block to assign
71903 +      the block to the handle's atom.
71904 +
71905 +   4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
71906 +      to fuse atoms.
71907 +
71908 +   5. Neither block nor handle are assigned: create a new atom and assign them both.
71909 +
71910 +   6. A read request for a non-captured block: return immediate success.
71911 +
71912 +   This function acquires and releases the handle's spinlock.  This function is called
71913 +   under the jnode lock and if the return value is 0, it returns with the jnode lock still
71914 +   held.  If the return is -E_REPEAT or some other error condition, the jnode lock is
71915 +   released.  The external interface (reiser4_try_capture) manages re-acquiring the jnode
71916 +   lock in the failure case.
71917 +*/
71918 +static int try_capture_block(
71919 +       txn_handle * txnh, jnode * node, txn_capture mode,
71920 +       txn_atom ** atom_alloc)
71921 +{
71922 +       txn_atom *block_atom;
71923 +       txn_atom *txnh_atom;
71924 +
71925 +       /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
71926 +       assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
71927 +
71928 +       /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
71929 +        * node->tree somewhere. */
71930 +       assert("umka-194", txnh != NULL);
71931 +       assert("umka-195", node != NULL);
71932 +
71933 +       /* The jnode is already locked!  Being called from reiser4_try_capture(). */
71934 +       assert_spin_locked(&(node->guard));
71935 +       block_atom = node->atom;
71936 +
71937 +       /* Get txnh spinlock, this allows us to compare txn_atom pointers but it doesn't
71938 +          let us touch the atoms themselves. */
71939 +       spin_lock_txnh(txnh);
71940 +       txnh_atom = txnh->atom;
71941 +       /* Capturing proceeds along one of four branches, depending on which
71942 +          of the two atoms (block atom (node->atom), current atom
71943 +          (txnh->atom)) exist. */
71944 +       if (txnh_atom == NULL) {
71945 +               if (block_atom == NULL) {
71946 +                       spin_unlock_txnh(txnh);
71947 +                       spin_unlock_jnode(node);
71948 +                       /* assign empty atom to the txnh and repeat */
71949 +                       return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
71950 +               } else {
71951 +                       atomic_inc(&block_atom->refcount);
71952 +                       /* node spin-lock isn't needed anymore */
71953 +                       spin_unlock_jnode(node);
71954 +                       if (!spin_trylock_atom(block_atom)) {
71955 +                               spin_unlock_txnh(txnh);
71956 +                               spin_lock_atom(block_atom);
71957 +                               spin_lock_txnh(txnh);
71958 +                       }
71959 +                       /* re-check state after getting txnh and the node
71960 +                        * atom spin-locked */
71961 +                       if (node->atom != block_atom || txnh->atom != NULL) {
71962 +                               spin_unlock_txnh(txnh);
71963 +                               atom_dec_and_unlock(block_atom);
71964 +                               return RETERR(-E_REPEAT);
71965 +                       }
71966 +                       atomic_dec(&block_atom->refcount);
71967 +                       if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
71968 +                           (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
71969 +                            block_atom->txnh_count != 0))
71970 +                               return capture_fuse_wait(txnh, block_atom, NULL, mode);
71971 +                       capture_assign_txnh_nolock(block_atom, txnh);
71972 +                       spin_unlock_txnh(txnh);
71973 +                       spin_unlock_atom(block_atom);
71974 +                       return RETERR(-E_REPEAT);
71975 +               }
71976 +       } else {
71977 +               /* It is time to perform deadlock prevention check over the
71978 +                  node we want to capture.  It is possible this node was locked
71979 +                  for read without capturing it. The optimization which allows
71980 +                  to do it helps us in keeping atoms independent as long as
71981 +                  possible but it may cause lock/fuse deadlock problems.
71982 +
71983 +                  A number of similar deadlock situations with locked but not
71984 +                  captured nodes were found.  In each situation there are two
71985 +                  or more threads: one of them does flushing while another one
71986 +                  does routine balancing or tree lookup.  The flushing thread
71987 +                  (F) sleeps in a long term locking request for node (N), another
71988 +                  thread (A) sleeps trying to capture some node already
71989 +                  belonging to the atom of F, which has a state that prevents
71990 +                  immediate fusion.
71991 +
71992 +                  Deadlocks of this kind cannot happen if node N was properly
71993 +                  captured by thread A. The F thread fuses atoms before locking,
71994 +                  therefore current atom of thread F and current atom of thread
71995 +                  A become the same atom and thread A may proceed.  This does
71996 +                  not work if node N was not captured because the fusion of
71997 +                  atoms does not happen.
71998 +
71999 +                  The following scheme solves the deadlock: If
72000 +                  longterm_lock_znode locks and does not capture a znode, that
72001 +                  znode is marked as MISSED_IN_CAPTURE.  A node marked this way
72002 +                  is processed by the code below which restores the missed
72003 +                  capture and fuses current atoms of all the node lock owners
72004 +                  by calling the fuse_not_fused_lock_owners() function. */
72005 +               if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
72006 +                       JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
72007 +                       if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
72008 +                               spin_unlock_txnh(txnh);
72009 +                               spin_unlock_jnode(node);
72010 +                               fuse_not_fused_lock_owners(txnh, JZNODE(node));
72011 +                               return RETERR(-E_REPEAT);
72012 +                       }
72013 +               }
72014 +               if (block_atom == NULL) {
72015 +                       atomic_inc(&txnh_atom->refcount);
72016 +                       spin_unlock_txnh(txnh);
72017 +                       if (!spin_trylock_atom(txnh_atom)) {
72018 +                               spin_unlock_jnode(node);
72019 +                               spin_lock_atom(txnh_atom);
72020 +                               spin_lock_jnode(node);
72021 +                       }
72022 +                       if (txnh->atom != txnh_atom || node->atom != NULL
72023 +                               || JF_ISSET(node, JNODE_IS_DYING)) {
72024 +                               spin_unlock_jnode(node);
72025 +                               atom_dec_and_unlock(txnh_atom);
72026 +                               return RETERR(-E_REPEAT);
72027 +                       }
72028 +                       atomic_dec(&txnh_atom->refcount);
72029 +                       capture_assign_block_nolock(txnh_atom, node);
72030 +                       spin_unlock_atom(txnh_atom);
72031 +               } else {
72032 +                       if (txnh_atom != block_atom) {
72033 +                               if (mode & TXN_CAPTURE_DONT_FUSE) {
72034 +                                       spin_unlock_txnh(txnh);
72035 +                                       spin_unlock_jnode(node);
72036 +                                       /* we are in a "no-fusion" mode and @node is
72037 +                                        * already part of transaction. */
72038 +                                       return RETERR(-E_NO_NEIGHBOR);
72039 +                               }
72040 +                               return capture_init_fusion(node, txnh, mode);
72041 +                       }
72042 +                       spin_unlock_txnh(txnh);
72043 +               }
72044 +       }
72045 +       return 0;
72046 +}
72047 +
72048 +static txn_capture
72049 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
72050 +{
72051 +       txn_capture cap_mode;
72052 +
72053 +       assert_spin_locked(&(node->guard));
72054 +
72055 +       /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
72056 +       /* Returns 0 when no capture is needed, otherwise a capture type plus the passed-through flags. */
72057 +       if (lock_mode == ZNODE_WRITE_LOCK) {
72058 +               cap_mode = TXN_CAPTURE_WRITE;
72059 +       } else if (node->atom != NULL) {
72060 +               cap_mode = TXN_CAPTURE_WRITE;   /* node already belongs to an atom: capture as for a write */
72061 +       } else if (0 &&         /* txnh->mode == TXN_READ_FUSING && */
72062 +                  jnode_get_level(node) == LEAF_LEVEL) {
72063 +               /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
72064 +               /* We only need a READ_FUSING capture at the leaf level.  This
72065 +                  is because the internal levels of the tree (twigs included)
72066 +                  are redundant from the point of the user that asked for a
72067 +                  read-fusing transcrash.  The user only wants to read-fuse
72068 +                  atoms due to reading uncommitted data that another user has
72069 +                  written.  It is the file system that reads/writes the
72070 +                  internal tree levels, the user only reads/writes leaves. */
72071 +               cap_mode = TXN_CAPTURE_READ_ATOMIC;
72072 +       } else {
72073 +               /* In this case (read lock on a node that no atom has captured;
72074 +                * the READ_FUSING leaf branch above is disabled) no capture is needed. */
72075 +               /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
72076 +               return 0;
72077 +       }
72078 +
72079 +       cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
72080 +       assert("nikita-3186", cap_mode != 0);
72081 +       return cap_mode;
72082 +}
72083 +
72084 +/* This is an external interface to try_capture_block(), it calls
72085 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
72086 +
72087 +   @node:         node to capture,
72088 +   @lock_mode:    read or write lock is used in capture mode calculation,
72089 +   @flags:        see txn_capture flags enumeration,
72090 +   @can_coc     : can copy-on-capture
72091 +
72092 +   @return: 0 - node was successfully captured, -E_REPEAT - capture request
72093 +            cannot be processed immediately as it was requested in flags,
72094 +           < 0 - other errors.
72095 +*/
72096 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
72097 +                       txn_capture flags)
72098 +{
72099 +       txn_atom *atom_alloc = NULL;
72100 +       txn_capture cap_mode;
72101 +       txn_handle *txnh = get_current_context()->trans;
72102 +       int ret;
72103 +
72104 +       assert_spin_locked(&(node->guard));
72105 +
72106 +      repeat:
72107 +       if (JF_ISSET(node, JNODE_IS_DYING))
72108 +               return RETERR(-EINVAL);
72109 +       if (node->atom != NULL && txnh->atom == node->atom)
72110 +               return 0;
72111 +       cap_mode = build_capture_mode(node, lock_mode, flags);
72112 +       if (cap_mode == 0 ||
72113 +           (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
72114 +               /* Mark this node as "MISSED".  It helps in further deadlock
72115 +                * analysis */
72116 +               if (jnode_is_znode(node))
72117 +                       JF_SET(node, JNODE_MISSED_IN_CAPTURE);
72118 +               return 0;
72119 +       }
72120 +       /* Repeat try_capture as long as -E_REPEAT is returned. */
72121 +       ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
72122 +       /* Regardless of non_blocking:
72123 +
72124 +          If ret == 0 then jnode is still locked.
72125 +          If ret != 0 then jnode is unlocked.
72126 +        */
72127 +#if REISER4_DEBUG
72128 +       if (ret == 0)
72129 +               assert_spin_locked(&(node->guard));
72130 +       else
72131 +               assert_spin_not_locked(&(node->guard));
72132 +#endif
72133 +       assert_spin_not_locked(&(txnh->guard));
72134 +
72135 +       if (ret == -E_REPEAT) {
72136 +               /* E_REPEAT implies all locks were released, therefore we need
72137 +                  to take the jnode's lock again. */
72138 +               spin_lock_jnode(node);
72139 +
72140 +               /* Although this may appear to be a busy loop, it is not.
72141 +                  There are several conditions that cause E_REPEAT to be
72142 +                  returned by the call to try_capture_block, all cases
72143 +                  indicating some kind of state change that means you should
72144 +                  retry the request and will get a different result.  In some
72145 +                  cases this could be avoided with some extra code, but
72146 +                  generally it is done because the necessary locks were
72147 +                  released as a result of the operation and repeating is the
72148 +                  simplest thing to do (less bug potential).  The cases are:
72149 +                  atom fusion returns E_REPEAT after it completes (jnode and
72150 +                  txnh were unlocked); race conditions in assign_block,
72151 +                  assign_txnh, and init_fusion return E_REPEAT (trylock
72152 +                  failure); after going to sleep in capture_fuse_wait
72153 +                  (request was blocked but may now succeed).  I'm not quite
72154 +                  sure how capture_copy works yet, but it may also return
72155 +                  E_REPEAT.  When the request is legitimately blocked, the
72156 +                  requestor goes to sleep in fuse_wait, so this is not a busy
72157 +                  loop. */
72158 +               /* NOTE-NIKITA: still don't understand:
72159 +
72160 +                  try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
72161 +
72162 +                  looks like busy loop?
72163 +                */
72164 +               goto repeat;
72165 +       }
72166 +
72167 +       /* free extra atom object that was possibly allocated by
72168 +          try_capture_block().
72169 +
72170 +          Do this before acquiring jnode spin lock to
72171 +          minimize time spent under lock. --nikita */
72172 +       if (atom_alloc != NULL) {
72173 +               kmem_cache_free(_atom_slab, atom_alloc);
72174 +       }
72175 +
72176 +       if (ret != 0) {
72177 +               if (ret == -E_BLOCK) {
72178 +                       assert("nikita-3360",
72179 +                              cap_mode & TXN_CAPTURE_NONBLOCKING);
72180 +                       ret = -E_REPEAT;
72181 +               }
72182 +
72183 +               /* Failure means jnode is not locked.  FIXME_LATER_JMACD May
72184 +                  want to fix the above code to avoid releasing the lock and
72185 +                  re-acquiring it, but there are cases were failure occurs
72186 +                  when the lock is not held, and those cases would need to be
72187 +                  modified to re-take the lock. */
72188 +               spin_lock_jnode(node);
72189 +       }
72190 +
72191 +       /* Jnode is still locked. */
72192 +       assert_spin_locked(&(node->guard));
72193 +       return ret;
72194 +}
72195 +
72196 +static void release_two_atoms(txn_atom *one, txn_atom *two)
72197 +{
72198 +       spin_unlock_atom(one);          /* both atoms are spin-locked on entry (see fuse_not_fused_lock_owners()) */
72199 +       atom_dec_and_unlock(two);       /* @two is released while @one is not locked */
72200 +       spin_lock_atom(one);            /* @one is re-locked only to be handed to atom_dec_and_unlock() */
72201 +       atom_dec_and_unlock(one);
72202 +}
72203 +
72204 +/* NOTE: this comment describes reiser4_try_capture() above, not the function below.  It
72205 +   sets up a call to try_capture_block and repeats as long as -E_REPEAT is returned by that
72206 +   routine.  The txn_capture request mode is computed by build_capture_mode() from the lock
72207 +   request and the flags.  This is called from the depths of the lock manager with the jnode
72208 +   lock held and it always returns with the jnode lock held.
72209 +*/
72210 +
72211 +/* fuse all 'active' atoms of lock owners of given node with the atom of @txnh (neither the jnode nor @txnh may be locked). */
72212 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
72213 +{
72214 +       lock_handle *lh;
72215 +       int repeat;
72216 +       txn_atom *atomh, *atomf;
72217 +       reiser4_context *me = get_current_context();
72218 +       reiser4_context *ctx = NULL;
72219 +
72220 +       assert_spin_not_locked(&(ZJNODE(node)->guard));
72221 +       assert_spin_not_locked(&(txnh->hlock));
72222 +
72223 + repeat:
72224 +       repeat = 0;
72225 +       atomh = txnh_get_atom(txnh);    /* NOTE(review): presumably returns with txnh and atomh locked -- both are unlocked below */
72226 +       spin_unlock_txnh(txnh);
72227 +       assert("zam-692", atomh != NULL);
72228 +
72229 +       spin_lock_zlock(&node->lock);
72230 +       /* inspect list of lock owners */
72231 +       list_for_each_entry(lh, &node->lock.owners, owners_link) {
72232 +               ctx = get_context_by_lock_stack(lh->owner);
72233 +               if (ctx == me)
72234 +                       continue;
72235 +               /* below we use two assumptions to avoid additional spin-locks
72236 +                  for checking the condition :
72237 +
72238 +                  1) if the lock stack has lock, the transaction should be
72239 +                  opened, i.e. ctx->trans != NULL;
72240 +
72241 +                  2) reading of well-aligned ctx->trans->atom is atomic, if it
72242 +                  equals to the address of spin-locked atomh, we take that
72243 +                  the atoms are the same, nothing has to be captured. */
72244 +               if (atomh != ctx->trans->atom) {
72245 +                       reiser4_wake_up(lh->owner);
72246 +                       repeat = 1;
72247 +                       break;
72248 +               }
72249 +       }
72250 +       if (repeat) {
72251 +               if (!spin_trylock_txnh(ctx->trans)) {
72252 +                       spin_unlock_zlock(&node->lock);
72253 +                       spin_unlock_atom(atomh);
72254 +                       goto repeat;
72255 +               }
72256 +               atomf = ctx->trans->atom;
72257 +               if (atomf == NULL) {
72258 +                       capture_assign_txnh_nolock(atomh, ctx->trans);
72259 +                       /* release zlock lock _after_ assigning the atom to the
72260 +                        * transaction handle, otherwise the lock owner thread
72261 +                        * may unlock all znodes, exit kernel context and here
72262 +                        * we would access an invalid transaction handle. */
72263 +                       spin_unlock_zlock(&node->lock);
72264 +                       spin_unlock_atom(atomh);
72265 +                       spin_unlock_txnh(ctx->trans);
72266 +                       goto repeat;
72267 +               }
72268 +               assert("zam-1059", atomf != atomh);
72269 +               spin_unlock_zlock(&node->lock);
72270 +               atomic_inc(&atomh->refcount);
72271 +               atomic_inc(&atomf->refcount);
72272 +               spin_unlock_txnh(ctx->trans);
72273 +               if (atomf > atomh) {    /* the two atom locks are always taken in ascending address order */
72274 +                       spin_lock_atom_nested(atomf);
72275 +               } else {
72276 +                       spin_unlock_atom(atomh);
72277 +                       spin_lock_atom(atomf);
72278 +                       spin_lock_atom_nested(atomh);
72279 +               }
72280 +               if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
72281 +                       release_two_atoms(atomf, atomh);
72282 +                       goto repeat;
72283 +               }
72284 +               atomic_dec(&atomh->refcount);
72285 +               atomic_dec(&atomf->refcount);
72286 +               capture_fuse_into(atomf, atomh);
72287 +               goto repeat;
72288 +       }
72289 +       spin_unlock_zlock(&node->lock);
72290 +       spin_unlock_atom(atomh);
72291 +}
72292 +
72293 +/* This is the interface to capture unformatted nodes via their struct page
72294 +   reference. Currently it is only used in reiser4_invalidatepage */
72295 +int try_capture_page_to_invalidate(struct page *pg)
72296 +{
72297 +       int ret;
72298 +       jnode *node;
72299 +
72300 +       assert("umka-292", pg != NULL);
72301 +       assert("nikita-2597", PageLocked(pg));
72302 +
72303 +       if (IS_ERR(node = jnode_of_page(pg))) {
72304 +               return PTR_ERR(node);   /* no jnode for the page: pass the error on, @pg stays locked */
72305 +       }
72306 +
72307 +       spin_lock_jnode(node);
72308 +       unlock_page(pg);        /* the page lock is not held across the capture (which may block) */
72309 +
72310 +       ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
72311 +       spin_unlock_jnode(node);        /* reiser4_try_capture() always returns with the jnode locked */
72312 +       jput(node);     /* NOTE(review): presumably drops the reference obtained by jnode_of_page() -- confirm */
72313 +       lock_page(pg);  /* re-take the page lock that was held on entry */
72314 +       return ret;
72315 +}
72316 +
72317 +/* This informs the transaction manager when a node is deleted.  Add the block to the
72318 +   atom's delete set and uncapture the block.
72319 +
72320 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
72321 +explanations.  find all the functions that use it, and unless there is some very
72322 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
72323 +move the loop to inside the function.
72324 +
72325 +VS-FIXME-HANS: can this code be at all streamlined?  In particular, can you lock and unlock the jnode fewer times?
72326 +  */
72327 +void reiser4_uncapture_page(struct page *pg)
72328 +{
72329 +       jnode *node;
72330 +       txn_atom *atom;
72331 +
72332 +       assert("umka-199", pg != NULL);
72333 +       assert("nikita-3155", PageLocked(pg));
72334 +
72335 +       clear_page_dirty_for_io(pg);
72336 +
72337 +       reiser4_wait_page_writeback(pg);
72338 +
72339 +       node = jprivate(pg);
72340 +       BUG_ON(node == NULL);
72341 +
72342 +       spin_lock_jnode(node);
72343 +
72344 +       atom = jnode_get_atom(node);
72345 +       if (atom == NULL) {
72346 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72347 +               spin_unlock_jnode(node);
72348 +               return;
72349 +       }
72350 +
72351 +       /* We can remove jnode from transaction even if it is on flush queue
72352 +        * prepped list, we only need to be sure that flush queue is not being
72353 +        * written by reiser4_write_fq().  reiser4_write_fq() does not use atom
72354 +        * spin lock for protection of the prepped nodes list, instead
72355 +        * write_fq() increments atom's nr_running_queues counters for the time
72356 +        * when prepped list is not protected by spin lock.  Here we check this
72357 +        * counter if we want to remove jnode from flush queue and, if the
72358 +        * counter is not zero, wait for all reiser4_write_fq() for this atom to
72359 +        * complete. This is not significant overhead. */
72360 +       while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
72361 +               spin_unlock_jnode(node);
72362 +               /*
72363 +                * at this moment we want to wait for "atom event", viz. wait
72364 +                * until @node can be removed from flush queue. But
72365 +                * reiser4_atom_wait_event() cannot be called with page locked,
72366 +                * because it deadlocks with jnode_extent_write(). Unlock page,
72367 +                * after making sure (through page_cache_get()) that it cannot
72368 +                * be released from memory.
72369 +                */
72370 +               page_cache_get(pg);
72371 +               unlock_page(pg);
72372 +               reiser4_atom_wait_event(atom);
72373 +               lock_page(pg);
72374 +               /*
72375 +                * page may have been detached by ->writepage()->releasepage().
72376 +                */
72377 +               reiser4_wait_page_writeback(pg);
72378 +               spin_lock_jnode(node);
72379 +               page_cache_release(pg);
72380 +               atom = jnode_get_atom(node);
72381 +/* VS-FIXME-HANS: improve the commenting in this function */
72382 +               if (atom == NULL) {
72383 +                       spin_unlock_jnode(node);
72384 +                       return;
72385 +               }
72386 +       }
72387 +       reiser4_uncapture_block(node);
72388 +       spin_unlock_atom(atom);
72389 +       jput(node);     /* NOTE(review): presumably the reference the atom took at capture time (see capture_assign_block_nolock()) */
72390 +}
72391 +
72392 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
72393 + * inode's tree of jnodes.  @node must be spin-locked and must have no page attached. */
72394 +void reiser4_uncapture_jnode(jnode * node)
72395 +{
72396 +       txn_atom *atom;
72397 +
72398 +       assert_spin_locked(&(node->guard));
72399 +       assert("", node->pg == 0);
72400 +
72401 +       atom = jnode_get_atom(node);
72402 +       if (atom == NULL) {
72403 +               /* not captured by any atom: nothing to undo, just drop the jnode lock */
72404 +               assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
72405 +               spin_unlock_jnode(node);
72406 +               return;
72407 +       }
72408 +
72409 +       reiser4_uncapture_block(node);
72410 +       spin_unlock_atom(atom);
72411 +       jput(node);     /* NOTE(review): presumably the reference the atom took at capture time (see capture_assign_block_nolock()) */
72412 +}
72412 +
72413 +/* No-locking version of assign_txnh: the caller holds both txnh->hlock and atom->alock.  Sets the
72414 +   (still unassigned) handle's atom pointer, increases atom refcount and txnh_count, adds to txnh_list. */
72415 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
72416 +{
72417 +       assert("umka-200", atom != NULL);
72418 +       assert("umka-201", txnh != NULL);
72419 +
72420 +       assert_spin_locked(&(txnh->hlock));
72421 +       assert_spin_locked(&(atom->alock));
72422 +       assert("jmacd-824", txnh->atom == NULL);
72423 +       assert("nikita-3540", atom_isopen(atom));
72424 +       BUG_ON(txnh->atom != NULL);     /* same condition as "jmacd-824", but not limited to assert-enabled builds */
72425 +
72426 +       atomic_inc(&atom->refcount);    /* the txnh -> atom pointer is counted in atom->refcount */
72427 +       txnh->atom = atom;
72428 +       reiser4_ctx_gfp_mask_set();
72429 +       list_add_tail(&txnh->txnh_link, &atom->txnh_list);
72430 +       atom->txnh_count += 1;
72431 +}
72432 +
72433 +/* No-locking version of assign_block (caller holds the jnode and atom spin locks).  Sets the block's atom
72434 +   pointer, references the block, adds the (not dirty) node to the atom's clean list, increments capture_count. */
72435 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
72436 +{
72437 +       assert("umka-202", atom != NULL);
72438 +       assert("umka-203", node != NULL);
72439 +       assert_spin_locked(&(node->guard));
72440 +       assert_spin_locked(&(atom->alock));
72441 +       assert("jmacd-323", node->atom == NULL);
72442 +       BUG_ON(!list_empty_careful(&node->capture_link));
72443 +       assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
72444 +
72445 +       /* Pointer from jnode to atom is not counted in atom->refcount. */
72446 +       node->atom = atom;
72447 +
72448 +       list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
72449 +       atom->capture_count += 1;
72450 +       /* reference to jnode is acquired by atom. */
72451 +       jref(node);
72452 +
72453 +       ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
72454 +
72455 +       LOCK_CNT_INC(t_refs);
72456 +}
72457 +
72458 +/* common code for dirtying both unformatted jnodes and formatted znodes; node and atom are spin-locked, node is not yet dirty. */
72459 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
72460 +{
72461 +       assert_spin_locked(&(node->guard));
72462 +       assert_spin_locked(&(atom->alock));
72463 +       assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
72464 +
72465 +       JF_SET(node, JNODE_DIRTY);
72466 +
72467 +       get_current_context()->nr_marked_dirty++;
72468 +
72469 +       /* We grab2flush_reserve one additional block only if node is a
72470 +          leaf, is not a cluster page, was not CREATED and jnode_flush
72471 +          did not sort it into either the relocate or the overwrite set.
72472 +          If node is in overwrite or relocate set we assume that atom's
72473 +          flush reserved counter was already adjusted. */
72474 +       if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
72475 +           && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
72476 +           && !jnode_is_cluster_page(node)) {
72477 +               assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
72478 +               assert("vs-1506", *jnode_get_block(node) != 0);
72479 +               grabbed2flush_reserved_nolock(atom, (__u64) 1);
72480 +               JF_SET(node, JNODE_FLUSH_RESERVED);
72481 +       }
72482 +
72483 +       if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
72484 +               /* If the atom is not set yet, it will be added to the appropriate list in
72485 +                  capture_assign_block_nolock. */
72486 +               /* Sometimes a node is set dirty before being captured -- the case for new
72487 +                  jnodes.  In that case the jnode will be added to the appropriate list
72488 +                  in capture_assign_block_nolock. Another reason not to re-link jnode is
72489 +                  that jnode is on a flush queue (see flush.c for details) */
72490 +
72491 +               int level = jnode_get_level(node);
72492 +
72493 +               assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
72494 +               assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
72495 +               assert("nikita-2607", 0 <= level);
72496 +               assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
72497 +
72498 +               /* move node to the atom's dirty list for its tree level */
72499 +               list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
72500 +               ON_DEBUG(count_jnode
72501 +                        (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
72502 +       }
72503 +}
72504 +
72505 +/* Mark @node dirty; the caller holds the jnode spin lock. */
72506 +void jnode_make_dirty_locked(jnode * node)
72507 +{
72508 +       txn_atom *atom;
72509 +
72510 +       assert("umka-204", node != NULL);
72511 +       assert_spin_locked(&(node->guard));
72512 +
72513 +       if (REISER4_DEBUG && rofs_jnode(node)) {
72514 +               warning("nikita-3365", "Dirtying jnode on rofs");
72515 +               dump_stack();
72516 +       }
72517 +
72518 +       /* Cheap early exit: nothing to do for an already dirty node */
72519 +       if (JF_ISSET(node, JNODE_DIRTY))
72520 +               return;
72521 +
72522 +       atom = jnode_get_atom(node);
72523 +       assert("vs-1094", atom);
72524 +       /* look again: jnode_get_atom() might have released the jnode lock */
72525 +       if (likely(!JF_ISSET(node, JNODE_DIRTY)))
72526 +               do_jnode_make_dirty(node, atom);
72527 +       spin_unlock_atom(atom);
72528 +}
72529 +
72530 +/* Set the dirty status for this write-locked znode, dirty its page (if any) and bump the znode version. */
72531 +void znode_make_dirty(znode * z)
72532 +{
72533 +       jnode *node;
72534 +       struct page *page;
72535 +
72536 +       assert("umka-204", z != NULL);
72537 +       assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
72538 +       assert("nikita-3560", znode_is_write_locked(z));
72539 +
72540 +       node = ZJNODE(z);
72541 +       /* znode is longterm locked, we can check dirty bit without spinlock */
72542 +       if (JF_ISSET(node, JNODE_DIRTY)) {
72543 +               /* znode is dirty already. All we have to do is to change znode version */
72544 +               z->version = znode_build_version(jnode_get_tree(node));
72545 +               return;
72546 +       }
72547 +
72548 +       spin_lock_jnode(node);
72549 +       jnode_make_dirty_locked(node);
72550 +       page = jnode_page(node);
72551 +       if (page != NULL) {
72552 +               /* this is useful assertion (allows one to check that no
72553 +                * modifications are lost due to update of in-flight page),
72554 +                * but it requires locking on page to check PG_writeback
72555 +                * bit. */
72556 +               /* assert("nikita-3292",
72557 +                  !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
72558 +               page_cache_get(page);
72559 +
72560 +               /* the page reference is taken above; the jnode lock is not
72561 +                * needed for the rest of znode_make_dirty(). */
72562 +               spin_unlock_jnode(node);
72563 +               /* reiser4 file write code calls set_page_dirty for
72564 +                * unformatted nodes, for formatted nodes we do it here. */
72565 +               reiser4_set_page_dirty_internal(page);
72566 +               page_cache_release(page);
72567 +               /* bump version counter in znode */
72568 +               z->version = znode_build_version(jnode_get_tree(node));
72569 +       } else {
72570 +               /* only the znode above the root is expected to have no page */
72571 +               assert("zam-596", znode_above_root(JZNODE(node)));
72572 +               spin_unlock_jnode(node);
72573 +       }
72574 +
72575 +       assert("nikita-1900", znode_is_write_locked(z));
72576 +       assert("jmacd-9777", node->atom != NULL);
72577 +}
72577 +
72578 +int reiser4_sync_atom(txn_atom * atom)
72579 +{
72580 +       int result;
72581 +       txn_handle *txnh;
72582 +       /* Returns 0, the force_commit_atom() result, or -E_REPEAT after waiting. */
72583 +       txnh = get_current_context()->trans;
72584 +       /* NOTE(review): @atom looks to be spin locked on entry (the last branch unlocks it) -- confirm */
72585 +       result = 0;
72586 +       if (atom != NULL) {
72587 +               if (atom->stage < ASTAGE_PRE_COMMIT) {
72588 +                       /* not committing yet: attach the current handle and force a commit */
72589 +                       spin_lock_txnh(txnh);
72590 +                       capture_assign_txnh_nolock(atom, txnh);
72591 +                       result = force_commit_atom(txnh);
72592 +               } else if (atom->stage < ASTAGE_POST_COMMIT) {
72593 +                       /* wait atom commit */
72594 +                       reiser4_atom_wait_event(atom);
72595 +                       /* try once more */
72596 +                       result = RETERR(-E_REPEAT);
72597 +               } else
72598 +                       spin_unlock_atom(atom);
72599 +       }
72600 +       return result;
72601 +}
72601 +
72602 +#if REISER4_DEBUG
72603 +
72604 +/* Debug accounting: move jnode from one per-list counter of the atom to another.
72605 +   Call this after atom->capture_count is updated. */
72606 +void
72607 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
72608 +           atom_list new_list, int check_lists)
72609 +{
72610 +       struct list_head *pos;
72611 +
72612 +       assert("zam-1018", atom_is_protected(atom));
72613 +       assert_spin_locked(&(node->guard));
72614 +       assert("", NODE_LIST(node) == old_list);
72615 +       /* take the node off the counter of the list it is leaving */
72616 +       switch (NODE_LIST(node)) {
72617 +       case NOT_CAPTURED:
72618 +               break;
72619 +       case DIRTY_LIST:
72620 +               assert("", atom->dirty > 0);
72621 +               atom->dirty--;
72622 +               break;
72623 +       case CLEAN_LIST:
72624 +               assert("", atom->clean > 0);
72625 +               atom->clean--;
72626 +               break;
72627 +       case FQ_LIST:
72628 +               assert("", atom->fq > 0);
72629 +               atom->fq--;
72630 +               break;
72631 +       case WB_LIST:
72632 +               assert("", atom->wb > 0);
72633 +               atom->wb--;
72634 +               break;
72635 +       case OVRWR_LIST:
72636 +               assert("", atom->ovrwr > 0);
72637 +               atom->ovrwr--;
72638 +               break;
72639 +       default:
72640 +               impossible("", "");
72641 +       }
72642 +       /* account the node on the list it is joining */
72643 +       switch (new_list) {
72644 +       case NOT_CAPTURED:
72645 +               break;
72646 +       case DIRTY_LIST:
72647 +               atom->dirty++;
72648 +               break;
72649 +       case CLEAN_LIST:
72650 +               atom->clean++;
72651 +               break;
72652 +       case FQ_LIST:
72653 +               atom->fq++;
72654 +               break;
72655 +       case WB_LIST:
72656 +               atom->wb++;
72657 +               break;
72658 +       case OVRWR_LIST:
72659 +               atom->ovrwr++;
72660 +               break;
72661 +       default:
72662 +               impossible("", "");
72663 +       }
72664 +       ASSIGN_NODE_LIST(node, new_list);
72665 +       if (0 && check_lists) {	/* full list walk; never runs because of the constant 0 */
72666 +               int count;
72667 +               tree_level level;
72668 +
72669 +               count = 0;
72670 +
72671 +               /* flush queue list */
72672 +               /* reiser4_check_fq(atom); */
72673 +
72674 +               /* dirty list */
72675 +               count = 0;
72676 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
72677 +                       list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
72678 +                               count++;
72679 +               }
72680 +               if (count != atom->dirty)
72681 +                       warning("", "dirty counter %d, real %d\n", atom->dirty,
72682 +                               count);
72683 +
72684 +               /* clean list */
72685 +               count = 0;
72686 +               list_for_each(pos, ATOM_CLEAN_LIST(atom))
72687 +                       count++;
72688 +               if (count != atom->clean)
72689 +                       warning("", "clean counter %d, real %d\n", atom->clean,
72690 +                               count);
72691 +
72692 +               /* wb list */
72693 +               count = 0;
72694 +               list_for_each(pos, ATOM_WB_LIST(atom))
72695 +                       count++;
72696 +               if (count != atom->wb)
72697 +                       warning("", "wb counter %d, real %d\n", atom->wb,
72698 +                               count);
72699 +
72700 +               /* overwrite list */
72701 +               count = 0;
72702 +               list_for_each(pos, ATOM_OVRWR_LIST(atom))
72703 +                       count++;
72704 +
72705 +               if (count != atom->ovrwr)
72706 +                       warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
72707 +                               count);
72708 +       }
72709 +       assert("vs-1624", atom->num_queued == atom->fq);
72710 +       if (atom->capture_count !=
72711 +           atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
72712 +               printk
72713 +                   ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
72714 +                    atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
72715 +                    atom->wb, atom->fq);
72716 +               assert("vs-1622",
72717 +                      atom->capture_count ==
72718 +                      atom->dirty + atom->clean + atom->ovrwr + atom->wb +
72719 +                      atom->fq);
72720 +       }
72721 +}
72722 +
72723 +#endif
72724 +
72725 +/* Set JNODE_OVRWR on a dirty node and move it to the tail of its atom's overwrite
72726 + * list; atom lock and jnode lock should be taken before calling this function. */
72727 +void jnode_make_wander_nolock(jnode * node)
72728 +{
72729 +       txn_atom *atom;
72730 +
72731 +       assert("nikita-2431", node != NULL);
72732 +       assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
72733 +       assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
72734 +       assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72735 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72736 +
72737 +       atom = node->atom;
72738 +
72739 +       assert("zam-895", atom != NULL);
72740 +       assert("zam-894", atom_is_protected(atom));
72741 +
72742 +       JF_SET(node, JNODE_OVRWR);
72743 +       /* move node from the dirty list to the atom's overwrite list */
72744 +       list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
72745 +       ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
72746 +}
72747 +
72748 +/* Same as jnode_make_wander_nolock, but the jnode spin lock and the atom lock are
72749 + * taken and released inside this function. */
72750 +void jnode_make_wander(jnode * node)
72751 +{
72752 +       txn_atom *atom;
72753 +
72754 +       spin_lock_jnode(node);
72755 +       atom = jnode_get_atom(node);	/* atom is unlocked below */
72756 +       assert("zam-913", atom != NULL);
72757 +       assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
72758 +
72759 +       jnode_make_wander_nolock(node);
72760 +       spin_unlock_atom(atom);
72761 +       spin_unlock_jnode(node);
72762 +}
72763 +
72764 +/* This just sets the RELOC bit on a dirty node; jnode must be spin locked, @fq is not used here. */
72765 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
72766 +{
72767 +       assert_spin_locked(&(node->guard));
72768 +       assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
72769 +       assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
72770 +       assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
72771 +       assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
72772 +       assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
72773 +       jnode_set_reloc(node);
72774 +}
72775 +
72776 +/* Make znode RELOC and put it on flush queue @fq; takes and releases the jnode and atom locks itself */
72777 +void znode_make_reloc(znode * z, flush_queue_t * fq)
72778 +{
72779 +       jnode *node;
72780 +       txn_atom *atom;
72781 +
72782 +       node = ZJNODE(z);
72783 +       spin_lock_jnode(node);
72784 +
72785 +       atom = jnode_get_atom(node);	/* atom is unlocked below */
72786 +       assert("zam-919", atom != NULL);
72787 +
72788 +       jnode_make_reloc_nolock(fq, node);
72789 +       queue_jnode(fq, node);
72790 +
72791 +       spin_unlock_atom(atom);
72792 +       spin_unlock_jnode(node);
72793 +
72794 +}
72795 +
72796 +/* Make unformatted node RELOC and put it on flush queue; takes no locks itself, the jnode must already be spin locked */
72797 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
72798 +{
72799 +       assert("vs-1479", jnode_is_unformatted(node));
72800 +
72801 +       jnode_make_reloc_nolock(fq, node);	/* asserts the jnode spin lock */
72802 +       queue_jnode(fq, node);
72803 +}
72804 +
72805 +int reiser4_capture_super_block(struct super_block *s)
72806 +{
72807 +       int result;
72808 +       lock_handle lh;
72809 +
72810 +       /* Write-lock the uber znode of @s, grab one block for the super block and
72811 +          mark the uber znode dirty. Returns 0 or the error of the failed step. */
72812 +       init_lh(&lh);
72813 +       result = get_uber_znode(reiser4_get_tree(s),
72814 +                               ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
72815 +       if (result)
72816 +               return result;
72817 +
72818 +       /* Grabbing one block for superblock */
72819 +       result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
72820 +       if (result == 0)
72821 +               znode_make_dirty(lh.node);
72822 +
72823 +       /* release the lock handle on the failure path as well, so that a
72824 +          failed grab does not leave the uber znode write-locked */
72825 +       done_lh(&lh);
72826 +       return result;
72827 +}
72828 +
72829 +/* Wake up the handles on the atom's WAITFOR list, unless their callback objects */
72830 +static void wakeup_atom_waitfor_list(txn_atom * atom)
72831 +{
72832 +       txn_wait_links *waiter;
72833 +
72834 +       assert("umka-210", atom != NULL);
72835 +
72836 +       /* atom is locked; a callback returning zero vetoes the wake-up */
72837 +       list_for_each_entry(waiter, &atom->fwaitfor_list, _fwaitfor_link) {
72838 +               if (waiter->waitfor_cb != NULL &&
72839 +                   !waiter->waitfor_cb(atom, waiter))
72840 +                       continue;
72841 +               reiser4_wake_up(waiter->_lock_stack);
72842 +       }
72843 +}
72844 +
72845 +/* Wake up the handles on the atom's WAITING list, unless their callback objects */
72846 +static void wakeup_atom_waiting_list(txn_atom * atom)
72847 +{
72848 +       txn_wait_links *waiter;
72849 +
72850 +       assert("umka-211", atom != NULL);
72851 +
72852 +       /* atom is locked; a callback returning zero vetoes the wake-up */
72853 +       list_for_each_entry(waiter, &atom->fwaiting_list, _fwaiting_link) {
72854 +               if (waiter->waiting_cb != NULL &&
72855 +                   !waiter->waiting_cb(atom, waiter))
72856 +                       continue;
72857 +               reiser4_wake_up(waiter->_lock_stack);
72858 +       }
72859 +}
72860 +
72861 +/* wake-up filter installed by capture_fuse_wait() to avoid "spurious wake-ups" */
72862 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
72863 +{
72864 +       assert("nikita-3330", atom != NULL);
72865 +       assert_spin_locked(&(atom->alock));
72866 +       /* txnh_count == 1 wakes waiters up when the last txn handle is released */
72867 +       if (atom->stage != ASTAGE_CAPTURE_WAIT)
72868 +               return 1;
72869 +       return atom->txnh_count == 1;
72870 +}
72871 +
72872 +/* The general purpose of this function is to wait on the first of two possible events.
72873 +   The situation is that a handle (and its atom atomh) is blocked trying to capture a
72874 +   block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state.  The
72875 +   handle's atom (atomh) is not in the CAPTURE_WAIT state.  However, atomh could fuse with
72876 +   another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
72877 +   needs to unblock the handle to avoid deadlock.  When the txnh is unblocked it will
72878 +   proceed and fuse the two atoms in the CAPTURE_WAIT state.
72879 +
72880 +   In other words, if either atomh or atomf change state, the handle will be awakened,
72881 +   thus there are two lists per atom: WAITING and WAITFOR.
72882 +
72883 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
72884 +   close but it is not assigned to an atom of its own.
72885 +
72886 +   Locking: TXNH_LOCK and the atom locks (atomf, and atomh if not NULL) are held on entry and released
72887 +   here. Returns -E_BLOCK if non-blocking, else -E_REPEAT after sleeping or the prepare_to_sleep error.
72888 +*/
72889 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
72890 +                   txn_atom * atomh, txn_capture mode)
72891 +{
72892 +       int ret;
72893 +       txn_wait_links wlinks;
72894 +
72895 +       assert("umka-213", txnh != NULL);
72896 +       assert("umka-214", atomf != NULL);
72897 +
72898 +       if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
72899 +               spin_unlock_txnh(txnh);
72900 +               spin_unlock_atom(atomf);
72901 +
72902 +               if (atomh) {
72903 +                       spin_unlock_atom(atomh);
72904 +               }
72905 +
72906 +               return RETERR(-E_BLOCK);
72907 +       }
72908 +
72909 +       /* Initialize the waiting list links. */
72910 +       init_wlinks(&wlinks);
72911 +
72912 +       /* Add txnh to atomf's waitfor list, take an atom reference, unlock atomf. */
72913 +       list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
72914 +       wlinks.waitfor_cb = wait_for_fusion;
72915 +       atomic_inc(&atomf->refcount);
72916 +       spin_unlock_atom(atomf);
72917 +
72918 +       if (atomh) {
72919 +               /* Add txnh to atomh's waiting list, take an atom reference, unlock atomh. */
72920 +               list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
72921 +               atomic_inc(&atomh->refcount);
72922 +               spin_unlock_atom(atomh);
72923 +       }
72924 +
72925 +       /* Go to sleep. */
72926 +       spin_unlock_txnh(txnh);
72927 +
72928 +       ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
72929 +       if (ret == 0) {
72930 +               reiser4_go_to_sleep(wlinks._lock_stack);
72931 +               ret = RETERR(-E_REPEAT);
72932 +       }
72933 +
72934 +       /* Remove from the waitfor list and drop the reference taken above. */
72935 +       spin_lock_atom(atomf);
72936 +
72937 +       list_del(&wlinks._fwaitfor_link);
72938 +       atom_dec_and_unlock(atomf);
72939 +
72940 +       if (atomh) {
72941 +               /* Remove from the waiting list. */
72942 +               spin_lock_atom(atomh);
72943 +               list_del(&wlinks._fwaiting_link);
72944 +               atom_dec_and_unlock(atomh);
72945 +       }
72946 +       return ret;
72947 +}
72948 +
72949 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
72950 +{
72951 +       txn_atom *first;
72952 +       txn_atom *second;
72953 +
72954 +       assert("zam-1067", one != two);
72955 +
72956 +       /* Lock both atoms, the one with the lesser address first. */
72957 +       first = (one < two) ? one : two;
72958 +       second = (one < two) ? two : one;
72959 +       spin_lock_atom(first);
72960 +       spin_lock_atom_nested(second);
72961 +}
72962 +
72963 +/* Perform the necessary work to prepare for fusing two atoms, which involves
72964 + * acquiring two atom locks in the proper order.  The txnh and jnode locks
72965 + * (presumably held by the caller) are dropped first; if either atom pointer
72966 + * changed meanwhile, -E_REPEAT is returned.  The handle's atom is fused into
72967 + * the node's atom when its stage is not lower, or when the node's atom is in
72968 + * CAPTURE_WAIT with no handles; -E_REPEAT is returned after the fusion.
72969 + * Otherwise the request is passed to capture_fuse_wait().
72970 + */
72971 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
72972 +{
72973 +       txn_atom * txnh_atom = txnh->atom;
72974 +       txn_atom * block_atom = node->atom;
72975 +       /* hold a reference on both atoms while no lock is held */
72976 +       atomic_inc(&txnh_atom->refcount);
72977 +       atomic_inc(&block_atom->refcount);
72978 +
72979 +       spin_unlock_txnh(txnh);
72980 +       spin_unlock_jnode(node);
72981 +
72982 +       lock_two_atoms(txnh_atom, block_atom);
72983 +       /* the atoms may have changed while the locks were dropped */
72984 +       if (txnh->atom != txnh_atom || node->atom != block_atom ) {
72985 +               release_two_atoms(txnh_atom, block_atom);
72986 +               return RETERR(-E_REPEAT);
72987 +       }
72988 +       /* both atoms are locked now; drop the temporary references */
72989 +       atomic_dec(&txnh_atom->refcount);
72990 +       atomic_dec(&block_atom->refcount);
72991 +
72992 +       assert ("zam-1066", atom_isopen(txnh_atom));
72993 +
72994 +       if (txnh_atom->stage >= block_atom->stage ||
72995 +           (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
72996 +               capture_fuse_into(txnh_atom, block_atom);
72997 +               return RETERR(-E_REPEAT);
72998 +       }
72999 +       spin_lock_txnh(txnh);
73000 +       return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
73001 +}
73002 +
73003 +/* Re-point every jnode on @small_head at atom @large, then append that list to the
73004 +   tail of @large_head.  Returns how many jnodes were moved. */
73005 +static int
73006 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
73007 +                        struct list_head *small_head)
73008 +{
73009 +       jnode *cur;
73010 +       int moved = 0;
73011 +
73012 +       assert("umka-218", large != NULL);
73013 +       assert("umka-219", large_head != NULL);
73014 +       assert("umka-220", small_head != NULL);
73015 +       /* small atom should be locked also. */
73016 +       assert_spin_locked(&(large->alock));
73017 +
73018 +       list_for_each_entry(cur, small_head, capture_link) {
73019 +               /* the atom pointer is only changed under the jnode lock */
73020 +               spin_lock_jnode(cur);
73021 +               cur->atom = large;
73022 +               spin_unlock_jnode(cur);
73023 +
73024 +               moved++;
73025 +       }
73026 +
73027 +       /* small_head is left empty; its entries go in after the current
73028 +          last entry of large_head */
73029 +       list_splice_init(small_head, large_head->prev);
73030 +
73031 +       return moved;
73032 +}
73033 +
73034 +/* Re-point every txn handle on @small_head at atom @large, then append that list to
73035 +   the tail of @large_head.  Returns how many handles were moved. */
73036 +static int
73037 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
73038 +                       struct list_head *small_head)
73039 +{
73040 +       txn_handle *handle;
73041 +       int moved = 0;
73042 +
73043 +       assert("umka-221", large != NULL);
73044 +       assert("umka-222", large_head != NULL);
73045 +       assert("umka-223", small_head != NULL);
73046 +
73047 +       list_for_each_entry(handle, small_head, txnh_link) {
73048 +               /* the atom pointer is only changed under the txnh lock */
73049 +               spin_lock_txnh(handle);
73050 +               handle->atom = large;
73051 +               spin_unlock_txnh(handle);
73052 +
73053 +               moved++;
73054 +       }
73055 +
73056 +       /* small_head is left empty; its entries go in after the current
73057 +          last entry of large_head */
73058 +       list_splice_init(small_head, large_head->prev);
73059 +
73060 +       return moved;
73061 +}
73062 +
73063 +/* This function fuses two atoms.  The captured nodes and handles belonging to SMALL are
73064 +   added to LARGE and their ->atom pointers are all updated.  The associated counts are
73065 +   updated as well, and waiting handles are awakened.  Both atoms must be spin locked by
73066 +   the caller; both locks are released here and the smaller atom's refcount is decremented.
73067 +*/
73068 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
73069 +{
73070 +       int level;
73071 +       unsigned zcount = 0;
73072 +       unsigned tcount = 0;
73073 +
73074 +       assert("umka-224", small != NULL);
73075 +       assert("umka-225", large != NULL);
73076 +
73077 +       assert_spin_locked(&(large->alock));
73078 +       assert_spin_locked(&(small->alock));
73079 +
73080 +       assert("jmacd-201", atom_isopen(small));
73081 +       assert("jmacd-202", atom_isopen(large));
73082 +
73083 +       /* Splice and update the per-level dirty jnode lists */
73084 +       for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73085 +               zcount +=
73086 +                   capture_fuse_jnode_lists(large,
73087 +                                            ATOM_DIRTY_LIST(large, level),
73088 +                                            ATOM_DIRTY_LIST(small, level));
73089 +       }
73090 +
73091 +       /* Splice and update the clean, overwrite, writeback and inode jnode lists and the txnh list */
73092 +       zcount +=
73093 +           capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
73094 +                                    ATOM_CLEAN_LIST(small));
73095 +       zcount +=
73096 +           capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
73097 +                                    ATOM_OVRWR_LIST(small));
73098 +       zcount +=
73099 +           capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
73100 +                                    ATOM_WB_LIST(small));
73101 +       zcount +=
73102 +           capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
73103 +       tcount +=
73104 +           capture_fuse_txnh_lists(large, &large->txnh_list,
73105 +                                   &small->txnh_list);
73106 +
73107 +       /* Check our accounting: queued nodes are not on the lists spliced above. */
73108 +       assert("jmacd-1063",
73109 +              zcount + small->num_queued == small->capture_count);
73110 +       assert("jmacd-1065", tcount == small->txnh_count);
73111 +
73112 +       /* sum numbers of waiters threads */
73113 +       large->nr_waiters += small->nr_waiters;
73114 +       small->nr_waiters = 0;
73115 +
73116 +       /* splice flush queues */
73117 +       reiser4_fuse_fq(large, small);
73118 +
73119 +       /* update the debug counters of jnodes on each of the atom's lists */
73120 +       ON_DEBUG(large->dirty += small->dirty;
73121 +                small->dirty = 0;
73122 +                large->clean += small->clean;
73123 +                small->clean = 0;
73124 +                large->ovrwr += small->ovrwr;
73125 +                small->ovrwr = 0;
73126 +                large->wb += small->wb;
73127 +                small->wb = 0;
73128 +                large->fq += small->fq;
73129 +                small->fq = 0;);
73130 +
73131 +       /* count flushers in result atom */
73132 +       large->nr_flushers += small->nr_flushers;
73133 +       small->nr_flushers = 0;
73134 +
73135 +       /* update counts of flushed nodes */
73136 +       large->flushed += small->flushed;
73137 +       small->flushed = 0;
73138 +
73139 +       /* Transfer list counts to large. */
73140 +       large->txnh_count += small->txnh_count;
73141 +       large->capture_count += small->capture_count;
73142 +
73143 +       /* Move all txnh references from small to large. */
73144 +       atomic_add(small->txnh_count, &large->refcount);
73145 +       atomic_sub(small->txnh_count, &small->refcount);
73146 +
73147 +       /* Reset small counts */
73148 +       small->txnh_count = 0;
73149 +       small->capture_count = 0;
73150 +
73151 +       /* Assign the oldest start_time, merge flags. */
73152 +       large->start_time = min(large->start_time, small->start_time);
73153 +       large->flags |= small->flags;
73154 +
73155 +       /* Merge blocknr sets. */
73156 +       blocknr_set_merge(&small->delete_set, &large->delete_set);
73157 +       blocknr_set_merge(&small->wandered_map, &large->wandered_map);
73158 +
73159 +       /* Merge allocated/deleted file counts */
73160 +       large->nr_objects_deleted += small->nr_objects_deleted;
73161 +       large->nr_objects_created += small->nr_objects_created;
73162 +
73163 +       small->nr_objects_deleted = 0;
73164 +       small->nr_objects_created = 0;
73165 +
73166 +       /* Merge allocated blocks counts */
73167 +       large->nr_blocks_allocated += small->nr_blocks_allocated;
73168 +
73169 +       large->nr_running_queues += small->nr_running_queues;
73170 +       small->nr_running_queues = 0;
73171 +
73172 +       /* Merge blocks reserved for overwrite set. */
73173 +       large->flush_reserved += small->flush_reserved;
73174 +       small->flush_reserved = 0;
73175 +
73176 +       if (large->stage < small->stage) {
73177 +               /* Large only needs to notify if it has changed state. */
73178 +               reiser4_atom_set_stage(large, small->stage);
73179 +               wakeup_atom_waiting_list(large);
73180 +       }
73181 +
73182 +       reiser4_atom_set_stage(small, ASTAGE_INVALID);
73183 +
73184 +       /* Notify any waiters--small needs to unload its wait lists.  Waiters
73185 +          actually remove themselves from the list before returning from the
73186 +          fuse_wait function. */
73187 +       wakeup_atom_waiting_list(small);
73188 +
73189 +       /* Unlock atoms */
73190 +       spin_unlock_atom(large);
73191 +       atom_dec_and_unlock(small);
73192 +}
73193 +
73194 +/* TXNMGR STUFF */
73195 +
73196 +/* Release a block from the atom, reversing the effects of being captured.
73197 +   The jnode must be spin locked and the atom protected on entry; the jnode
73198 +   spin lock is released here.  Currently this is only called when the atom commits.
73199 +
73200 +   NOTE: this function does not release a (journal) reference to jnode
73201 +   due to locking optimizations, you should call jput() somewhere after
73202 +   calling reiser4_uncapture_block(). */
73203 +void reiser4_uncapture_block(jnode * node)
73204 +{
73205 +       txn_atom *atom;
73206 +
73207 +       assert("umka-226", node != NULL);
73208 +       atom = node->atom;
73209 +       assert("umka-228", atom != NULL);
73210 +
73211 +       assert("jmacd-1021", node->atom == atom);
73212 +       assert_spin_locked(&(node->guard));
73213 +       assert("jmacd-1023", atom_is_protected(atom));
73214 +       /* clear the state bits that are cleared on uncapture */
73215 +       JF_CLR(node, JNODE_DIRTY);
73216 +       JF_CLR(node, JNODE_RELOC);
73217 +       JF_CLR(node, JNODE_OVRWR);
73218 +       JF_CLR(node, JNODE_CREATED);
73219 +       JF_CLR(node, JNODE_WRITEBACK);
73220 +       JF_CLR(node, JNODE_REPACK);
73221 +       /* unlink from whichever atom list the node is on */
73222 +       list_del_init(&node->capture_link);
73223 +       if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
73224 +               assert("zam-925", atom_isopen(atom));
73225 +               assert("vs-1623", NODE_LIST(node) == FQ_LIST);
73226 +               ON_DEBUG(atom->num_queued--);
73227 +               JF_CLR(node, JNODE_FLUSH_QUEUED);
73228 +       }
73229 +       atom->capture_count -= 1;
73230 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
73231 +       node->atom = NULL;
73232 +
73233 +       spin_unlock_jnode(node);
73234 +       LOCK_CNT_DEC(t_refs);
73235 +}
73236 +
73237 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
73238 +   bitmap-based allocator code for adding modified bitmap blocks to the
73239 +   transaction. @atom and @node are spin locked; @node must not be captured yet */
73240 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
73241 +{
73242 +       assert("zam-538", atom_is_protected(atom));
73243 +       assert_spin_locked(&(node->guard));
73244 +       assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
73245 +       assert("zam-543", node->atom == NULL);
73246 +       assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
73247 +       /* link at the head of the overwrite list; the atom takes a jnode reference */
73248 +       list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
73249 +       jref(node);
73250 +       node->atom = atom;
73251 +       atom->capture_count++;
73252 +       ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
73253 +}
73254 +
73255 +static int count_deleted_blocks_actor(txn_atom * atom,
73256 +                                     const reiser4_block_nr * a,
73257 +                                     const reiser4_block_nr * b, void *data)
73258 +{
73259 +       /* blocknr_set_iterator() actor: @data is the running block total.
73260 +          A NULL @b adds one block, otherwise *@b blocks are added. */
73261 +       reiser4_block_nr *total = data;
73262 +
73263 +       assert("zam-995", data != NULL);
73264 +       assert("zam-996", a != NULL);
73265 +
73266 +       *total += (b != NULL) ? *b : 1;
73267 +       return 0;
73268 +}
73269 +
73270 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
73271 +{
73272 +       reiser4_block_nr result;
73273 +       txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73274 +       txn_atom *atom;
73275 +
73276 +       result = 0;
73277 +
73278 +       spin_lock_txnmgr(tmgr);
73279 +       list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
73280 +               spin_lock_atom(atom);
73281 +               if (atom_isopen(atom))
73282 +                       blocknr_set_iterator(
73283 +                               atom, &atom->delete_set,
73284 +                               count_deleted_blocks_actor, &result, 0);
73285 +               spin_unlock_atom(atom);
73286 +       }
73287 +       spin_unlock_txnmgr(tmgr);
73288 +
73289 +       return result;
73290 +}
73291 +
73292 +/*
73293 + * Local variables:
73294 + * c-indentation-style: "K&R"
73295 + * mode-name: "LC"
73296 + * c-basic-offset: 8
73297 + * tab-width: 8
73298 + * fill-column: 79
73299 + * End:
73300 + */
73301 diff -urN linux-2.6.27.orig/fs/reiser4/txnmgr.h linux-2.6.27/fs/reiser4/txnmgr.h
73302 --- linux-2.6.27.orig/fs/reiser4/txnmgr.h       1970-01-01 03:00:00.000000000 +0300
73303 +++ linux-2.6.27/fs/reiser4/txnmgr.h    2008-10-12 18:20:01.000000000 +0400
73304 @@ -0,0 +1,701 @@
73305 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73306 + * reiser4/README */
73307 +
73308 +/* data-types and function declarations for transaction manager. See txnmgr.c
73309 + * for details. */
73310 +
73311 +#ifndef __REISER4_TXNMGR_H__
73312 +#define __REISER4_TXNMGR_H__
73313 +
73314 +#include "forward.h"
73315 +#include "dformat.h"
73316 +
73317 +#include <linux/fs.h>
73318 +#include <linux/mm.h>
73319 +#include <linux/types.h>
73320 +#include <linux/spinlock.h>
73321 +#include <asm/atomic.h>
73322 +#include <linux/wait.h>
73323 +
73324 +/* TYPE DECLARATIONS */
73325 +
73326 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
73327 +   A capture request dynamically assigns a block to the calling thread's transaction
73328 +   handle. */
73329 +typedef enum {
73330 +       /* A READ_ATOMIC request indicates that a block will be read and that the caller's
73331 +          atom should fuse in order to ensure that the block commits atomically with the
73332 +          caller. */
73333 +       TXN_CAPTURE_READ_ATOMIC = (1 << 0),
73334 +
73335 +       /* A READ_NONCOM request indicates that a block will be read and that the caller is
73336 +          willing to read a non-committed block without causing atoms to fuse. */
73337 +       TXN_CAPTURE_READ_NONCOM = (1 << 1),
73338 +
73339 +       /* A READ_MODIFY request indicates that a block will be read but that the caller
73340 +          wishes for the block to be captured as it will be written.  This capture request
73341 +          mode is not currently used, but eventually it will be useful for preventing
73342 +          deadlock in read-modify-write cycles. */
73343 +       TXN_CAPTURE_READ_MODIFY = (1 << 2),
73344 +
73345 +       /* A WRITE capture request indicates that a block will be modified and that atoms
73346 +          should fuse to make the commit atomic. */
73347 +       TXN_CAPTURE_WRITE = (1 << 3),
73348 +
73349 +       /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
73350 +          exclusive type designation from extra bits that may be supplied -- see
73351 +          below. */
73352 +       TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
73353 +                            TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
73354 +                            TXN_CAPTURE_WRITE),
73355 +
73356 +       /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
73357 +          indicate modification will occur. */
73358 +       TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
73359 +
73360 +       /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
73361 +          prefer not to sleep waiting for an aging atom to commit. */
73362 +       TXN_CAPTURE_NONBLOCKING = (1 << 4),
73363 +
73364 +       /* An option to reiser4_try_capture to prevent atom fusion, just simple
73365 +          capturing is allowed */
73366 +       TXN_CAPTURE_DONT_FUSE = (1 << 5)
73367 +
73368 +       /* This macro selects only the exclusive capture request types, stripping out any
73369 +          options that were supplied (i.e., NONBLOCKING). */
73370 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
73371 +} txn_capture;
73372 +
73373 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING, the only
73374 +   difference is in the handling of read requests.  A WRITE_FUSING transaction handle
73375 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
73376 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
73377 +typedef enum {
73378 +       TXN_WRITE_FUSING = (1 << 0),
73379 +       TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING,  /* READ implies WRITE */
73380 +} txn_mode;
73381 +
73382 +/* Every atom has a stage, which is one of these exclusive values: */
73383 +typedef enum {
73384 +       /* Initially an atom is free. */
73385 +       ASTAGE_FREE = 0,
73386 +
73387 +       /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
73388 +          blocks and fuse with other atoms. */
73389 +       ASTAGE_CAPTURE_FUSE = 1,
73390 +
73391 +       /* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk where X > 1. */
73392 +
73393 +       /* When an atom reaches a certain age it must do all it can to commit.  An atom in
73394 +          the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
73395 +          atoms in the CAPTURE_FUSE stage. */
73396 +       ASTAGE_CAPTURE_WAIT = 2,
73397 +
73398 +       /* Waiting for I/O before commit.  Copy-on-capture (see
73399 +          http://namesys.com/v4/v4.html). */
73400 +       ASTAGE_PRE_COMMIT = 3,
73401 +
73402 +       /* Post-commit overwrite I/O.  Steal-on-capture. */
73403 +       ASTAGE_POST_COMMIT = 4,
73404 +
73405 +       /* Atom which waits for the removal of the last reference to (it? ) to
73406 +        * be deleted from memory  */
73407 +       ASTAGE_DONE = 5,
73408 +
73409 +       /* invalid atom. */
73410 +       ASTAGE_INVALID = 6,
73411 +
73412 +} txn_stage;
73413 +
73414 +/* Certain flags may be set in the txn_atom->flags field. */
73415 +typedef enum {
73416 +       /* Indicates that the atom should commit as soon as possible. */
73417 +       ATOM_FORCE_COMMIT = (1 << 0),
73418 +       /* to avoid endless loop, mark the atom (which was considered as too
73419 +        * small) after failed attempt to fuse it. */
73420 +       ATOM_CANCEL_FUSION = (1 << 1)
73421 +} txn_flags;
73422 +
73423 +/* Flags for controlling commit_txnh */
73424 +typedef enum {
73425 +       /* Wait for atom commit completion in commit_txnh */
73426 +       TXNH_WAIT_COMMIT = 0x2,
73427 +       /* Don't commit atom when this handle is closed */
73428 +       TXNH_DONT_COMMIT = 0x4
73429 +} txn_handle_flags_t;
73430 +
73431 +/* TYPE DEFINITIONS */
73432 +
73433 +/* A note on lock ordering: the handle & jnode spinlock protects reading of their ->atom
73434 +   fields, so typically an operation on the atom through either of these objects must (1)
73435 +   lock the object, (2) read the atom pointer, (3) lock the atom.
73436 +
73437 +   During atom fusion, the process holds locks on both atoms at once.  Then, it iterates
73438 +   through the list of handles and pages held by the smaller of the two atoms.  For each
73439 +   handle and page referencing the smaller atom, the fusing process must: (1) lock the
73440 +   object, and (2) update the atom pointer.
73441 +
73442 +   You can see that there is a conflict of lock ordering here, so the more-complex
73443 +   procedure should have priority, i.e., the fusing process has priority so that it is
73444 +   guaranteed to make progress and to avoid restarts.
73445 +
73446 +   This decision, however, means additional complexity for acquiring the atom lock in the
73447 +   first place.
73448 +
73449 +   The general original procedure followed in the code was:
73450 +
73451 +       TXN_OBJECT *obj = ...;
73452 +       TXN_ATOM   *atom;
73453 +
73454 +       spin_lock (& obj->_lock);
73455 +
73456 +       atom = obj->_atom;
73457 +
73458 +       if (! spin_trylock_atom (atom))
73459 +         {
73460 +           spin_unlock (& obj->_lock);
73461 +           RESTART OPERATION, THERE WAS A RACE;
73462 +         }
73463 +
73464 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73465 +
73466 +   It has however been found that this wastes CPU a lot in a manner that is
73467 +   hard to profile. So, proper refcounting was added to atoms, and the new
73468 +   standard locking sequence is as follows:
73469 +
73470 +       TXN_OBJECT *obj = ...;
73471 +       TXN_ATOM   *atom;
73472 +
73473 +       spin_lock (& obj->_lock);
73474 +
73475 +       atom = obj->_atom;
73476 +
73477 +       if (! spin_trylock_atom (atom))
73478 +         {
73479 +           atomic_inc (& atom->refcount);
73480 +           spin_unlock (& obj->_lock);
73481 +           spin_lock (&atom->_lock);
73482 +           atomic_dec (& atom->refcount);
73483 +           // HERE atom is locked
73484 +           spin_unlock (&atom->_lock);
73485 +           RESTART OPERATION, THERE WAS A RACE;
73486 +         }
73487 +
73488 +       ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
73489 +
73490 +   (core of this is implemented in trylock_throttle() function)
73491 +
73492 +   See the jnode_get_atom() function for a common case.
73493 +
73494 +   As an additional (and important) optimization allowing to avoid restarts,
73495 +   it is possible to re-check required pre-conditions at the HERE point in
73496 +   code above and proceed without restarting if they are still satisfied.
73497 +*/
73498 +
73499 +/* An atomic transaction: this is the underlying system representation
73500 +   of a transaction, not the one seen by clients.
73501 +
73502 +   Invariants involving this data-type:
73503 +
73504 +      [sb-fake-allocated]
73505 +*/
73506 +struct txn_atom {
73507 +       /* The spinlock protecting the atom, held during fusion and various other state
73508 +          changes. */
73509 +       spinlock_t alock;
73510 +
73511 +       /* The atom's reference counter, increasing (in case of a duplication
73512 +          of an existing reference or when we are sure that some other
73513 +          reference exists) may be done without taking spinlock, decrementing
73514 +          of the ref. counter requires a spinlock to be held.
73515 +
73516 +          Each transaction handle counts in ->refcount. All jnodes count as
73517 +          one reference acquired in atom_begin_andlock(), released in
73518 +          commit_current_atom().
73519 +        */
73520 +       atomic_t refcount;
73521 +
73522 +       /* The atom_id identifies the atom in persistent records such as the log. */
73523 +       __u32 atom_id;
73524 +
73525 +       /* Flags holding any of the txn_flags enumerated values (e.g.,
73526 +          ATOM_FORCE_COMMIT). */
73527 +       __u32 flags;
73528 +
73529 +       /* Number of open handles. */
73530 +       __u32 txnh_count;
73531 +
73532 +       /* The number of znodes captured by this atom.  Equal to the sum of lengths of the
73533 +          dirty_nodes[level] and clean_nodes lists. */
73534 +       __u32 capture_count;
73535 +
73536 +#if REISER4_DEBUG
73537 +       int clean;
73538 +       int dirty;
73539 +       int ovrwr;
73540 +       int wb;
73541 +       int fq;
73542 +#endif
73543 +
73544 +       __u32 flushed;
73545 +
73546 +       /* Current transaction stage. */
73547 +       txn_stage stage;
73548 +
73549 +       /* Start time. */
73550 +       unsigned long start_time;
73551 +
73552 +       /* The atom's delete set. It collects block numbers of the nodes
73553 +          which were deleted during the transaction. */
73554 +       struct list_head delete_set;
73555 +
73556 +       /* The atom's wandered_block mapping. */
73557 +       struct list_head wandered_map;
73558 +
73559 +       /* The transaction's list of dirty captured nodes--per level.  Index
73560 +          by (level). dirty_nodes[0] is for znode-above-root */
73561 +       struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
73562 +
73563 +       /* The transaction's list of clean captured nodes. */
73564 +       struct list_head clean_nodes;
73565 +
73566 +       /* The atom's overwrite set */
73567 +       struct list_head ovrwr_nodes;
73568 +
73569 +       /* nodes which are being written to disk */
73570 +       struct list_head writeback_nodes;
73571 +
73572 +       /* list of inodes */
73573 +       struct list_head inodes;
73574 +
73575 +       /* List of handles associated with this atom. */
73576 +       struct list_head txnh_list;
73577 +
73578 +       /* Transaction list link: list of atoms in the transaction manager. */
73579 +       struct list_head atom_link;
73580 +
73581 +       /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
73582 +       struct list_head fwaitfor_list;
73583 +
73584 +       /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
73585 +       struct list_head fwaiting_list;
73586 +
73587 +       /* Numbers of objects which were deleted/created in this transaction
73588 +          thereby numbers of objects IDs which were released/deallocated. */
73589 +       int nr_objects_deleted;
73590 +       int nr_objects_created;
73591 +       /* number of blocks allocated during the transaction */
73592 +       __u64 nr_blocks_allocated;
73593 +       /* All atom's flush queue objects are on this list  */
73594 +       struct list_head flush_queues;
73595 +#if REISER4_DEBUG
73596 +       /* number of flush queues for this atom. */
73597 +       int nr_flush_queues;
73598 +       /* Number of jnodes which were removed from atom's lists and put
73599 +          on flush_queue */
73600 +       int num_queued;
73601 +#endif
73602 +       /* number of threads who wait for this atom to complete commit */
73603 +       int nr_waiters;
73604 +       /* number of threads which do jnode_flush() over this atom */
73605 +       int nr_flushers;
73606 +       /* number of flush queues which are IN_USE and jnodes from fq->prepped
73607 +          are submitted to disk by the reiser4_write_fq() routine. */
73608 +       int nr_running_queues;
73609 +       /* A counter of grabbed unformatted nodes, see a description of the
73610 +        * reiser4 space reservation scheme at block_alloc.c */
73611 +       reiser4_block_nr flush_reserved;
73612 +#if REISER4_DEBUG
73613 +       void *committer;
73614 +#endif
73615 +       struct super_block *super;
73616 +};
73617 +
73618 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
73619 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
73620 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
73621 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
73622 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
73623 +
73624 +#define NODE_LIST(node) (node)->list
73625 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
73626 +ON_DEBUG(void
73627 +        count_jnode(txn_atom *, jnode *, atom_list old_list,
73628 +                    atom_list new_list, int check_lists));
73629 +
73630 +/* A transaction handle: the client obtains and commits this handle which is assigned by
73631 +   the system to a txn_atom. */
73632 +struct txn_handle {
73633 +       /* Spinlock protecting ->atom pointer */
73634 +       spinlock_t hlock;
73635 +
73636 +       /* Flags for controlling commit_txnh() behavior */
73637 +       /* from txn_handle_flags_t */
73638 +       txn_handle_flags_t flags;
73639 +
73640 +       /* Whether it is READ_FUSING or WRITE_FUSING. */
73641 +       txn_mode mode;
73642 +
73643 +       /* If assigned, the atom it is part of. */
73644 +       txn_atom *atom;
73645 +
73646 +       /* Transaction list link. Head is in txn_atom. */
73647 +       struct list_head txnh_link;
73648 +};
73649 +
73650 +/* The transaction manager: one is contained in the reiser4_super_info_data */
73651 +struct txn_mgr {
73652 +       /* A spinlock protecting the atom list, id_count, flush_control */
73653 +       spinlock_t tmgr_lock;
73654 +
73655 +       /* List of atoms. */
73656 +       struct list_head atoms_list;
73657 +
73658 +       /* Number of atoms. */
73659 +       int atom_count;
73660 +
73661 +       /* A counter used to assign atom->atom_id values. */
73662 +       __u32 id_count;
73663 +
73664 +       /* a mutex object for commit serialization */
73665 +       struct mutex commit_mutex;
73666 +
73667 +       /* a list of all txnmgrs served by a particular daemon. */
73668 +       struct list_head linkage;
73669 +
73670 +       /* description of daemon for this txnmgr */
73671 +       ktxnmgrd_context *daemon;
73672 +
73673 +       /* parameters. Adjustable through mount options. */
73674 +       unsigned int atom_max_size;
73675 +       unsigned int atom_max_age;
73676 +       unsigned int atom_min_size;
73677 +       /* max number of concurrent flushers for one atom, 0 - unlimited.  */
73678 +       unsigned int atom_max_flushers;
73679 +       struct dentry *debugfs_atom_count;
73680 +       struct dentry *debugfs_id_count;
73681 +};
73682 +
73683 +/* FUNCTION DECLARATIONS */
73684 +
73685 +/* These are the externally (within Reiser4) visible transaction functions, therefore they
73686 +   are prefixed with "txn_".  For comments, see txnmgr.c. */
73687 +
73688 +extern int init_txnmgr_static(void);
73689 +extern void done_txnmgr_static(void);
73690 +
73691 +extern void reiser4_init_txnmgr(txn_mgr *);
73692 +extern void reiser4_done_txnmgr(txn_mgr *);
73693 +
73694 +extern int reiser4_txn_reserve(int reserved);
73695 +
73696 +extern void reiser4_txn_begin(reiser4_context * context);
73697 +extern int reiser4_txn_end(reiser4_context * context);
73698 +
73699 +extern void reiser4_txn_restart(reiser4_context * context);
73700 +extern void reiser4_txn_restart_current(void);
73701 +
73702 +extern int txnmgr_force_commit_all(struct super_block *, int);
73703 +extern int current_atom_should_commit(void);
73704 +
73705 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
73706 +
73707 +extern int commit_some_atoms(txn_mgr *);
73708 +extern int force_commit_atom(txn_handle *);
73709 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
73710 +
73711 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
73712 +
73713 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
73714 +
73715 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
73716 +                          int alloc_value);
73717 +extern void atom_dec_and_unlock(txn_atom * atom);
73718 +
73719 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
73720 +extern int try_capture_page_to_invalidate(struct page *pg);
73721 +
73722 +extern void reiser4_uncapture_page(struct page *pg);
73723 +extern void reiser4_uncapture_block(jnode *);
73724 +extern void reiser4_uncapture_jnode(jnode *);
73725 +
73726 +extern int reiser4_capture_inode(struct inode *);
73727 +extern int reiser4_uncapture_inode(struct inode *);
73728 +
73729 +extern txn_atom *get_current_atom_locked_nocheck(void);
73730 +
73731 +#if REISER4_DEBUG
73732 +
73733 +/**
73734 + * atom_is_protected - make sure that nobody but us can do anything with atom
73735 + * @atom: atom to be checked
73736 + *
73737 + * This is used to assert that atom either entered commit stages or is spin
73738 + * locked.
73739 + */
73740 +static inline int atom_is_protected(txn_atom *atom)
73741 +{
73742 +       if (atom->stage >= ASTAGE_PRE_COMMIT)
73743 +               return 1;
73744 +       assert_spin_locked(&(atom->alock));
73745 +       return 1;
73746 +}
73747 +
73748 +#endif
73749 +
73750 +/* Get the current atom and spinlock it if current atom present. May not return NULL */
73751 +static inline txn_atom *get_current_atom_locked(void)
73752 +{
73753 +       txn_atom *atom;
73754 +
73755 +       atom = get_current_atom_locked_nocheck();
73756 +       assert("zam-761", atom != NULL);
73757 +
73758 +       return atom;
73759 +}
73760 +
73761 +extern txn_atom *jnode_get_atom(jnode *);
73762 +
73763 +extern void reiser4_atom_wait_event(txn_atom *);
73764 +extern void reiser4_atom_send_event(txn_atom *);
73765 +
73766 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
73767 +extern int reiser4_capture_super_block(struct super_block *s);
73768 +int capture_bulk(jnode **, int count);
73769 +
73770 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
73771 +   calling convention of these three routines. */
73772 +extern void blocknr_set_init(struct list_head * bset);
73773 +extern void blocknr_set_destroy(struct list_head * bset);
73774 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
73775 +extern int blocknr_set_add_extent(txn_atom * atom,
73776 +                                 struct list_head * bset,
73777 +                                 blocknr_set_entry ** new_bsep,
73778 +                                 const reiser4_block_nr * start,
73779 +                                 const reiser4_block_nr * len);
73780 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
73781 +                               blocknr_set_entry ** new_bsep,
73782 +                               const reiser4_block_nr * a,
73783 +                               const reiser4_block_nr * b);
73784 +
73785 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
73786 +                                   const reiser4_block_nr *, void *);
73787 +
73788 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
73789 +                               blocknr_set_actor_f actor, void *data,
73790 +                               int delete);
73791 +
73792 +/* flush code takes care about how to fuse flush queues */
73793 +extern void flush_init_atom(txn_atom * atom);
73794 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
73795 +
73796 +static inline void spin_lock_atom(txn_atom *atom)
73797 +{
73798 +       /* check that spinlocks of lower priorities are not held */
73799 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73800 +                   LOCK_CNT_NIL(spin_locked_atom) &&
73801 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
73802 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
73803 +                   LOCK_CNT_NIL(rw_locked_dk) &&
73804 +                   LOCK_CNT_NIL(rw_locked_tree)));
73805 +
73806 +       spin_lock(&(atom->alock));
73807 +
73808 +       LOCK_CNT_INC(spin_locked_atom);
73809 +       LOCK_CNT_INC(spin_locked);
73810 +}
73811 +
73812 +static inline void spin_lock_atom_nested(txn_atom *atom)
73813 +{
73814 +       assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
73815 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
73816 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
73817 +                   LOCK_CNT_NIL(rw_locked_dk) &&
73818 +                   LOCK_CNT_NIL(rw_locked_tree)));
73819 +
73820 +       spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
73821 +
73822 +       LOCK_CNT_INC(spin_locked_atom);
73823 +       LOCK_CNT_INC(spin_locked);
73824 +}
73825 +
73826 +static inline int spin_trylock_atom(txn_atom *atom)
73827 +{
73828 +       if (spin_trylock(&(atom->alock))) {
73829 +               LOCK_CNT_INC(spin_locked_atom);
73830 +               LOCK_CNT_INC(spin_locked);
73831 +               return 1;
73832 +       }
73833 +       return 0;
73834 +}
73835 +
73836 +static inline void spin_unlock_atom(txn_atom *atom)
73837 +{
73838 +       assert_spin_locked(&(atom->alock));
73839 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
73840 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73841 +
73842 +       LOCK_CNT_DEC(spin_locked_atom);
73843 +       LOCK_CNT_DEC(spin_locked);
73844 +
73845 +       spin_unlock(&(atom->alock));
73846 +}
73847 +
73848 +static inline void spin_lock_txnh(txn_handle *txnh)
73849 +{
73850 +       /* check that spinlocks of lower priorities are not held */
73851 +       assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
73852 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
73853 +                   LOCK_CNT_NIL(rw_locked_tree)));
73854 +
73855 +       spin_lock(&(txnh->hlock));
73856 +
73857 +       LOCK_CNT_INC(spin_locked_txnh);
73858 +       LOCK_CNT_INC(spin_locked);
73859 +}
73860 +
73861 +static inline int spin_trylock_txnh(txn_handle *txnh)
73862 +{
73863 +       if (spin_trylock(&(txnh->hlock))) {
73864 +               LOCK_CNT_INC(spin_locked_txnh);
73865 +               LOCK_CNT_INC(spin_locked);
73866 +               return 1;
73867 +       }
73868 +       return 0;
73869 +}
73870 +
73871 +static inline void spin_unlock_txnh(txn_handle *txnh)
73872 +{
73873 +       assert_spin_locked(&(txnh->hlock));
73874 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
73875 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73876 +
73877 +       LOCK_CNT_DEC(spin_locked_txnh);
73878 +       LOCK_CNT_DEC(spin_locked);
73879 +
73880 +       spin_unlock(&(txnh->hlock));
73881 +}
73882 +
73883 +#define spin_ordering_pred_txnmgr(tmgr) /* no lower-priority lock is held */ \
73884 +       ( LOCK_CNT_NIL(spin_locked_atom) &&     \
73885 +         LOCK_CNT_NIL(spin_locked_txnh) &&     \
73886 +         LOCK_CNT_NIL(spin_locked_jnode) &&    \
73887 +         LOCK_CNT_NIL(spin_locked_zlock) &&    \
73888 +         LOCK_CNT_NIL(rw_locked_dk) &&         \
73889 +         LOCK_CNT_NIL(rw_locked_tree) )
73890 +
73891 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
73892 +{
73893 +       /* check that spinlocks of lower priorities are not held */
73894 +       assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
73895 +                   LOCK_CNT_NIL(spin_locked_txnh) &&
73896 +                   LOCK_CNT_NIL(spin_locked_jnode) &&
73897 +                   LOCK_CNT_NIL(spin_locked_zlock) &&
73898 +                   LOCK_CNT_NIL(rw_locked_dk) &&
73899 +                   LOCK_CNT_NIL(rw_locked_tree)));
73900 +
73901 +       spin_lock(&(mgr->tmgr_lock));
73902 +
73903 +       LOCK_CNT_INC(spin_locked_txnmgr);
73904 +       LOCK_CNT_INC(spin_locked);
73905 +}
73906 +
73907 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
73908 +{
73909 +       if (spin_trylock(&(mgr->tmgr_lock))) {
73910 +               LOCK_CNT_INC(spin_locked_txnmgr);
73911 +               LOCK_CNT_INC(spin_locked);
73912 +               return 1;
73913 +       }
73914 +       return 0;
73915 +}
73916 +
73917 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
73918 +{
73919 +       assert_spin_locked(&(mgr->tmgr_lock));
73920 +       assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
73921 +       assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
73922 +
73923 +       LOCK_CNT_DEC(spin_locked_txnmgr);
73924 +       LOCK_CNT_DEC(spin_locked);
73925 +
73926 +       spin_unlock(&(mgr->tmgr_lock));
73927 +}
73928 +
73929 +typedef enum {
73930 +       FQ_IN_USE = 0x1
73931 +} flush_queue_state_t;
73932 +
73933 +typedef struct flush_queue flush_queue_t;
73934 +
73935 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
73936 +   is filled by the jnode_flush() routine, and written to disk under memory
73937 +   pressure or at atom commit time. */
73938 +/* LOCKING: fq state and fq->atom are protected by guard spinlock, fq->nr_queued
73939 +   field and fq->prepped list can be modified if atom is spin-locked and fq
73940 +   object is "in-use" state.  For read-only traversal of the fq->prepped list
73941 +   and reading of the fq->nr_queued field it is enough to keep fq "in-use" or
73942 +   only have atom spin-locked. */
73943 +struct flush_queue {
73944 +       /* linkage element is the first in this structure to make debugging
73945 +          easier.  See field in atom struct for description of list. */
73946 +       struct list_head alink;
73947 +       /* A spinlock to protect changes of fq state and fq->atom pointer */
73948 +       spinlock_t guard;
73949 +       /* flush_queue state: [in_use | ready] */
73950 +       flush_queue_state_t state;
73951 +       /* A list which contains queued nodes, queued nodes are removed from any
73952 +        * atom's list and put on this ->prepped one. */
73953 +       struct list_head prepped;
73954 +       /* number of submitted i/o requests */
73955 +       atomic_t nr_submitted;
73956 +       /* number of i/o errors */
73957 +       atomic_t nr_errors;
73958 +       /* An atom this flush queue is attached to */
73959 +       txn_atom *atom;
73960 +       /* A wait queue head to wait on i/o completion */
73961 +       wait_queue_head_t wait;
73962 +#if REISER4_DEBUG
73963 +       /* A thread which took this fq in exclusive use, NULL if fq is free,
73964 +        * used for debugging. */
73965 +       struct task_struct *owner;
73966 +#endif
73967 +};
73968 +
73969 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
73970 +extern void reiser4_fq_put_nolock(flush_queue_t *);
73971 +extern void reiser4_fq_put(flush_queue_t *);
73972 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
73973 +extern void queue_jnode(flush_queue_t *, jnode *);
73974 +
73975 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
73976 +extern int current_atom_finish_all_fq(void);
73977 +extern void init_atom_fq_parts(txn_atom *);
73978 +
73979 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
73980 +
73981 +extern void znode_make_dirty(znode * node);
73982 +extern void jnode_make_dirty_locked(jnode * node);
73983 +
73984 +extern int reiser4_sync_atom(txn_atom * atom);
73985 +
73986 +#if REISER4_DEBUG
73987 +extern int atom_fq_parts_are_clean(txn_atom *);
73988 +#endif
73989 +
73990 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
73991 +extern flush_queue_t *get_fq_for_current_atom(void);
73992 +
73993 +void reiser4_invalidate_list(struct list_head * head);
73994 +
73995 +# endif                                /* __REISER4_TXNMGR_H__ */
73996 +
73997 +/* Make Linus happy.
73998 +   Local variables:
73999 +   c-indentation-style: "K&R"
74000 +   mode-name: "LC"
74001 +   c-basic-offset: 8
74002 +   tab-width: 8
74003 +   fill-column: 120
74004 +   End:
74005 +*/
74006 diff -urN linux-2.6.27.orig/fs/reiser4/type_safe_hash.h linux-2.6.27/fs/reiser4/type_safe_hash.h
74007 --- linux-2.6.27.orig/fs/reiser4/type_safe_hash.h       1970-01-01 03:00:00.000000000 +0300
74008 +++ linux-2.6.27/fs/reiser4/type_safe_hash.h    2008-10-12 18:20:01.000000000 +0400
74009 @@ -0,0 +1,320 @@
74010 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74011 + * reiser4/README */
74012 +
74013 +/* A hash table class that uses hash chains (singly-linked) and is
74014 +   parametrized to provide type safety.  */
74015 +
74016 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
74017 +#define __REISER4_TYPE_SAFE_HASH_H__
74018 +
74019 +#include "debug.h"
74020 +
74021 +#include <asm/errno.h>
74022 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to declare the PREFIX_hash_table and
74023 +   PREFIX_hash_link types.  Only pointers to ITEM_TYPE are used here, so the
74024 +   item type needs a declaration before this point, its definition after. */
74025 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE)                                                     \
74026 +                                                                                              \
74027 +typedef struct PREFIX##_hash_table_  PREFIX##_hash_table;                                     \
74028 +typedef struct PREFIX##_hash_link_   PREFIX##_hash_link;                                      \
74029 +                                                                                              \
74030 +struct PREFIX##_hash_table_                                                                   \
74031 +{                                                                                             \
74032 +  ITEM_TYPE  **_table;                                                                        \
74033 +  __u32        _buckets;                                                                      \
74034 +};                                                                                            \
74035 +                                                                                              \
74036 +struct PREFIX##_hash_link_                                                                    \
74037 +{                                                                                             \
74038 +  ITEM_TYPE *_next;                                                                           \
74039 +}
74040 +
74041 +/* Step 2: Define the object type of the hash: give it field of type
74042 +   PREFIX_hash_link. */
74043 +
74044 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
74045 +   the type and field name used in step 2.  The arguments are:
74046 +
74047 +   ITEM_TYPE    The item type being hashed
74048 +   KEY_TYPE     The type of key being hashed
74049 +   KEY_NAME     The name of the key field within the item
74050 +   LINK_NAME    The name of the link field within the item (which must have type PREFIX_hash_link)
74051 +   HASH_FUNC    The name of the hash function (or macro, takes const pointer to key)
74052 +   EQ_FUNC      The name of the equality function (or macro, takes const pointer to two keys)
74053 +
74054 +   It implements these functions:
74055 +
74056 +   prefix_hash_init           Initialize the table given its size.
74057 +   prefix_hash_insert         Insert an item
74058 +   prefix_hash_insert_index   Insert an item w/ precomputed hash_index
74059 +   prefix_hash_find           Find an item by key
74060 +   prefix_hash_find_index     Find an item w/ precomputed hash_index
74061 +   prefix_hash_remove         Remove an item, returns 1 if found, 0 if not found
74062 +   prefix_hash_remove_index   Remove an item w/ precomputed hash_index
74063 +
74064 +   If you'd like something to be done differently, feel free to ask me
74065 +   for modifications.  Additional features that could be added but
74066 +   have not been:
74067 +
74068 +   prefix_hash_remove_key           Find and remove an item by key
74069 +   prefix_hash_remove_key_index     Find and remove an item by key w/ precomputed hash_index
74070 +
74071 +   The hash_function currently receives only the key as an argument,
74072 +   meaning it must somehow know the number of buckets.  If this is a
74073 +   problem let me know.
74074 +
74075 +   This hash table uses a single-linked hash chain.  This means
74076 +   insertion is fast but deletion requires searching the chain.
74077 +
74078 +   There is also the doubly-linked hash chain approach, under which
74079 +   deletion requires no search but the code is longer and it takes two
74080 +   pointers per item.
74081 +
74082 +   The circularly-linked approach has the shortest code but requires
74083 +   two pointers per bucket, doubling the size of the bucket array (in
74084 +   addition to two pointers per item).
74085 +*/
74086 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC)  \
74087 +/* Assert that @hash is a valid bucket index of @table. */                             \
74088 +static __inline__ void                                                                 \
74089 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG,                            \
74090 +                    __u32                hash UNUSED_ARG)                              \
74091 +{                                                                                      \
74092 +       assert("nikita-2780", hash < table->_buckets);                                  \
74093 +}                                                                                      \
74094 +/* Allocate and zero an array of @buckets chain heads; 0 or RETERR(-ENOMEM). */        \
74095 +static __inline__ int                                                                  \
74096 +PREFIX##_hash_init (PREFIX##_hash_table *hash,                                         \
74097 +                   __u32                buckets)                                       \
74098 +{                                                                                      \
74099 +  hash->_table   = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets);              \
74100 +  hash->_buckets = buckets;                                                            \
74101 +  if (hash->_table == NULL)                                                            \
74102 +    {                                                                                  \
74103 +      return RETERR(-ENOMEM);                                                          \
74104 +    }                                                                                  \
74105 +  memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets);                             \
74106 +  ON_DEBUG(printk(#PREFIX "_hash_table: %u buckets\n", buckets));                      \
74107 +  return 0;                                                                            \
74108 +}                                                                                      \
74109 +/* Free the bucket array; if REISER4_DEBUG, first assert every chain is empty. */      \
74110 +static __inline__ void                                                                 \
74111 +PREFIX##_hash_done (PREFIX##_hash_table *hash)                                         \
74112 +{                                                                                      \
74113 +  if (REISER4_DEBUG && hash->_table != NULL) {                                          \
74114 +           __u32 i;                                                                    \
74115 +           for (i = 0 ; i < hash->_buckets ; ++ i)                                     \
74116 +                   assert("nikita-2905", hash->_table[i] == NULL);                     \
74117 +  }                                                                                     \
74118 +  if (hash->_table != NULL)                                                            \
74119 +    KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets);                                \
74120 +  hash->_table = NULL;                                                                 \
74121 +}                                                                                      \
74122 +/* Prefetch the item that follows @item in its chain. */                               \
74123 +static __inline__ void                                                                 \
74124 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item)                                          \
74125 +{                                                                                      \
74126 +       prefetch(item->LINK_NAME._next);                                                \
74127 +}                                                                                      \
74128 +/* Prefetch the head item of bucket @index. */                                         \
74129 +static __inline__ void                                                                 \
74130 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash,                              \
74131 +                              __u32                index)                              \
74132 +{                                                                                      \
74133 +       prefetch(hash->_table[index]);                                                  \
74134 +}                                                                                      \
74135 +/* First item of chain @hash_index whose key matches @find_key, or NULL. */            \
74136 +static __inline__ ITEM_TYPE*                                                           \
74137 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash,                                   \
74138 +                         __u32                hash_index,                              \
74139 +                         KEY_TYPE const      *find_key)                                \
74140 +{                                                                                      \
74141 +  ITEM_TYPE *item;                                                                     \
74142 +                                                                                       \
74143 +  PREFIX##_check_hash(hash, hash_index);                                               \
74144 +  /* the second prefetch offsets in bytes, hence the cast to char pointer */           \
74145 +  for (item  = hash->_table[hash_index];                                               \
74146 +       item != NULL;                                                                   \
74147 +       item  = item->LINK_NAME._next)                                                  \
74148 +    {                                                                                  \
74149 +      prefetch(item->LINK_NAME._next);                                                 \
74150 +      prefetch((char *) item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME));        \
74151 +      if (EQ_FUNC (& item->KEY_NAME, find_key))                                                \
74152 +        {                                                                              \
74153 +          return item;                                                                 \
74154 +        }                                                                              \
74155 +    }                                                                                  \
74156 +                                                                                       \
74157 +  return NULL;                                                                         \
74158 +}                                                                                      \
74159 +/* As _hash_find_index, but a found item is moved to the head of its chain. */         \
74160 +static __inline__ ITEM_TYPE*                                                           \
74161 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash,                               \
74162 +                             __u32                hash_index,                          \
74163 +                             KEY_TYPE const      *find_key)                            \
74164 +{                                                                                      \
74165 +  ITEM_TYPE ** item = &hash->_table[hash_index];                                        \
74166 +                                                                                       \
74167 +  PREFIX##_check_hash(hash, hash_index);                                               \
74168 +                                                                                        \
74169 +  while (*item != NULL) {                                                               \
74170 +    prefetch(&(*item)->LINK_NAME._next);                                               \
74171 +    if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) {                                       \
74172 +      ITEM_TYPE *found;                                                                \
74173 +                                                                                       \
74174 +      found = *item;                                                                   \
74175 +      *item = found->LINK_NAME._next;                                                   \
74176 +      found->LINK_NAME._next = hash->_table[hash_index];                               \
74177 +      hash->_table[hash_index] = found;                                                        \
74178 +      return found;                                                                     \
74179 +    }                                                                                   \
74180 +    item = &(*item)->LINK_NAME._next;                                                   \
74181 +  }                                                                                    \
74182 +  return NULL;                                                                         \
74183 +}                                                                                      \
74184 +/* Unlink @del_item from chain @hash_index: 1 if it was found, 0 if not. */            \
74185 +static __inline__ int                                                                  \
74186 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash,                                 \
74187 +                           __u32                hash_index,                            \
74188 +                           ITEM_TYPE           *del_item)                              \
74189 +{                                                                                      \
74190 +  ITEM_TYPE ** hash_item_p = &hash->_table[hash_index];                                 \
74191 +                                                                                       \
74192 +  PREFIX##_check_hash(hash, hash_index);                                               \
74193 +                                                                                        \
74194 +  while (*hash_item_p != NULL) {                                                        \
74195 +    prefetch(&(*hash_item_p)->LINK_NAME._next);                                                \
74196 +    if (*hash_item_p == del_item) {                                                     \
74197 +      *hash_item_p = (*hash_item_p)->LINK_NAME._next;                                   \
74198 +      return 1;                                                                         \
74199 +    }                                                                                   \
74200 +    hash_item_p = &(*hash_item_p)->LINK_NAME._next;                                     \
74201 +  }                                                                                    \
74202 +  return 0;                                                                            \
74203 +}                                                                                      \
74204 +/* Make @ins_item the new head of chain @hash_index. */                                \
74205 +static __inline__ void                                                                 \
74206 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash,                                 \
74207 +                           __u32                hash_index,                            \
74208 +                           ITEM_TYPE           *ins_item)                              \
74209 +{                                                                                      \
74210 +  PREFIX##_check_hash(hash, hash_index);                                               \
74211 +                                                                                       \
74212 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
74213 +  hash->_table[hash_index]  = ins_item;                                                        \
74214 +}                                                                                      \
74215 +/* As _hash_insert_index, with smp_wmb() before the new head is stored. */             \
74216 +static __inline__ void                                                                 \
74217 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash,                             \
74218 +                               __u32                hash_index,                        \
74219 +                               ITEM_TYPE           *ins_item)                          \
74220 +{                                                                                      \
74221 +  PREFIX##_check_hash(hash, hash_index);                                               \
74222 +                                                                                       \
74223 +  ins_item->LINK_NAME._next = hash->_table[hash_index];                                        \
74224 +  smp_wmb();                                                                           \
74225 +  hash->_table[hash_index]  = ins_item;                                                        \
74226 +}                                                                                      \
74227 +/* _hash_find_index on the bucket HASH_FUNC picks for @find_key. */                    \
74228 +static __inline__ ITEM_TYPE*                                                           \
74229 +PREFIX##_hash_find (PREFIX##_hash_table *hash,                                         \
74230 +                   KEY_TYPE const      *find_key)                                      \
74231 +{                                                                                      \
74232 +  return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key);         \
74233 +}                                                                                      \
74234 +/* _hash_find_index_lru on the bucket HASH_FUNC picks for @find_key. */                \
74235 +static __inline__ ITEM_TYPE*                                                           \
74236 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash,                                     \
74237 +                       KEY_TYPE const      *find_key)                                  \
74238 +{                                                                                      \
74239 +  return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key);     \
74240 +}                                                                                      \
74241 +/* _hash_remove_index on the bucket HASH_FUNC picks for @del_item's key. */            \
74242 +static __inline__ int                                                                  \
74243 +PREFIX##_hash_remove (PREFIX##_hash_table *hash,                                       \
74244 +                     ITEM_TYPE           *del_item)                                    \
74245 +{                                                                                      \
74246 +  return PREFIX##_hash_remove_index (hash,                                             \
74247 +                                     HASH_FUNC(hash, &del_item->KEY_NAME), del_item);  \
74248 +}                                                                                      \
74249 +/* Plain alias of _hash_remove. */                                                     \
74250 +static __inline__ int                                                                  \
74251 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash,                                   \
74252 +                     ITEM_TYPE           *del_item)                                    \
74253 +{                                                                                      \
74254 +  return PREFIX##_hash_remove (hash, del_item);                                                \
74255 +}                                                                                      \
74256 +/* _hash_insert_index on the bucket HASH_FUNC picks for @ins_item's key. */            \
74257 +static __inline__ void                                                                 \
74258 +PREFIX##_hash_insert (PREFIX##_hash_table *hash,                                       \
74259 +                     ITEM_TYPE           *ins_item)                                    \
74260 +{                                                                                      \
74261 +  PREFIX##_hash_insert_index (hash,                                                    \
74262 +                              HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item);         \
74263 +}                                                                                      \
74264 +/* _hash_insert_index_rcu on the bucket HASH_FUNC picks for @ins_item's key. */        \
74265 +static __inline__ void                                                                 \
74266 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash,                                   \
74267 +                         ITEM_TYPE           *ins_item)                                \
74268 +{                                                                                      \
74269 +  PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME),          \
74270 +                                  ins_item);                                           \
74271 +}                                                                                      \
74272 +/* Head of the first non-empty bucket with index >= @ind, or NULL. */                  \
74273 +static __inline__ ITEM_TYPE *                                                          \
74274 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind)                             \
74275 +{                                                                                      \
74276 +  ITEM_TYPE *first;                                                                    \
74277 +                                                                                       \
74278 +  for (first = NULL; ind < hash->_buckets; ++ ind) {                                   \
74279 +    first = hash->_table[ind];                                                         \
74280 +    if (first != NULL)                                                                 \
74281 +      break;                                                                           \
74282 +  }                                                                                    \
74283 +  return first;                                                                                \
74284 +}                                                                                      \
74285 +/* Item after @item: next in its chain, else head of the next non-empty bucket. */     \
74286 +static __inline__ ITEM_TYPE *                                                          \
74287 +PREFIX##_hash_next (PREFIX##_hash_table *hash,                                         \
74288 +                   ITEM_TYPE           *item)                                          \
74289 +{                                                                                      \
74290 +  ITEM_TYPE  *next;                                                                    \
74291 +                                                                                       \
74292 +  if (item == NULL)                                                                    \
74293 +    return NULL;                                                                       \
74294 +  next = item->LINK_NAME._next;                                                                \
74295 +  if (next == NULL)                                                                    \
74296 +    next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1);           \
74297 +  return next;                                                                         \
74298 +}                                                                                      \
74299 +/* Unterminated typedef: the macro invocation supplies the closing semicolon. */       \
74300 +typedef struct {} PREFIX##_hash_dummy
74301 +/* Point (head) at every bucket slot of (table) in turn, from _table[0] up. */
74302 +#define for_all_ht_buckets(table, head)                                        \
74303 +for ((head) = &(table) -> _table[ 0 ] ;                                        \
74304 +     (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
74305 +/* Walk the chain at *(bucket); (next) is loaded before the loop body runs. */
74306 +#define for_all_in_bucket(bucket, item, next, field)                           \
74307 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ;      \
74308 +     (item) != NULL ;                                                          \
74309 +     (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
74310 +/* Walk every item of (table) via prefix_hash_first()/prefix_hash_next(). */
74311 +#define for_all_in_htable(table, prefix, item, next)   \
74312 +for ((item) = prefix ## _hash_first ((table), 0),      \
74313 +     (next) = prefix ## _hash_next ((table), (item)) ; \
74314 +     (item) != NULL ;                                  \
74315 +     (item) = (next),                                  \
74316 +     (next) = prefix ## _hash_next ((table), (item)))
74317 +
74318 +/* __REISER4_TYPE_SAFE_HASH_H__ */
74319 +#endif
74320 +
74321 +/* Make Linus happy.
74322 +   Local variables:
74323 +   c-indentation-style: "K&R"
74324 +   mode-name: "LC"
74325 +   c-basic-offset: 8
74326 +   tab-width: 8
74327 +   fill-column: 120
74328 +   End:
74329 +*/
74330 diff -urN linux-2.6.27.orig/fs/reiser4/vfs_ops.c linux-2.6.27/fs/reiser4/vfs_ops.c
74331 --- linux-2.6.27.orig/fs/reiser4/vfs_ops.c      1970-01-01 03:00:00.000000000 +0300
74332 +++ linux-2.6.27/fs/reiser4/vfs_ops.c   2008-10-12 18:20:01.000000000 +0400
74333 @@ -0,0 +1,259 @@
74334 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74335 + * reiser4/README */
74336 +
74337 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
74338 +   here. */
74339 +
74340 +#include "forward.h"
74341 +#include "debug.h"
74342 +#include "dformat.h"
74343 +#include "coord.h"
74344 +#include "plugin/item/item.h"
74345 +#include "plugin/file/file.h"
74346 +#include "plugin/security/perm.h"
74347 +#include "plugin/disk_format/disk_format.h"
74348 +#include "plugin/plugin.h"
74349 +#include "plugin/plugin_set.h"
74350 +#include "plugin/object.h"
74351 +#include "txnmgr.h"
74352 +#include "jnode.h"
74353 +#include "znode.h"
74354 +#include "block_alloc.h"
74355 +#include "tree.h"
74356 +#include "vfs_ops.h"
74357 +#include "inode.h"
74358 +#include "page_cache.h"
74359 +#include "ktxnmgrd.h"
74360 +#include "super.h"
74361 +#include "reiser4.h"
74362 +#include "entd.h"
74363 +#include "status_flags.h"
74364 +#include "flush.h"
74365 +#include "dscale.h"
74366 +
74367 +#include <linux/profile.h>
74368 +#include <linux/types.h>
74369 +#include <linux/mount.h>
74370 +#include <linux/vfs.h>
74371 +#include <linux/mm.h>
74372 +#include <linux/buffer_head.h>
74373 +#include <linux/dcache.h>
74374 +#include <linux/list.h>
74375 +#include <linux/pagemap.h>
74376 +#include <linux/slab.h>
74377 +#include <linux/seq_file.h>
74378 +#include <linux/init.h>
74379 +#include <linux/module.h>
74380 +#include <linux/writeback.h>
74381 +#include <linux/blkdev.h>
74382 +#include <linux/quotaops.h>
74383 +#include <linux/security.h>
74384 +#include <linux/reboot.h>
74385 +#include <linux/rcupdate.h>
74386 +
74387 +/* Write the stat-data of @object back through its file plugin.  Nothing is
74388 +   written, and 0 is returned, when IS_RDONLY() holds for @object. */
74389 +int reiser4_update_sd(struct inode *object)
74390 +{
74391 +       file_plugin *plug;
74392 +
74393 +       assert("nikita-2338", object != NULL);
74394 +       if (!IS_RDONLY(object)) {
74395 +               plug = inode_file_plugin(object);
74396 +               assert("nikita-2339", plug != NULL);
74397 +               return plug->write_sd_by_inode(object);
74398 +       }
74399 +       return 0;
74400 +}
74401 +
74402 +/* Add one more link to @object and, when asked to, write its stat-data.
74403 + *
74404 + * @object:     inode to which the link is added
74405 + * @parent:     parent where the new entry will be
74406 + * @write_sd_p: non-zero if stat-data has to be written after the update
74407 + *
74408 + * Used by link/create and during creation of dot and dotdot in mkdir.
74409 + * Returns RETERR(-EMLINK) when the file plugin refuses another link,
74410 + * otherwise the result of add_link or of the optional stat-data write. */
74411 +int reiser4_add_nlink(struct inode *object, struct inode *parent,
74412 +                     int write_sd_p)
74413 +{
74414 +       file_plugin *plug;
74415 +       int ret;
74416 +
74417 +       assert("nikita-1351", object != NULL);
74418 +       plug = inode_file_plugin(object);
74419 +       assert("nikita-1445", plug != NULL);
74420 +
74421 +       /* the plugin decides whether yet another link is allowed */
74422 +       if (!plug->can_add_link(object))
74423 +               return RETERR(-EMLINK);
74424 +
74425 +       assert("nikita-2211", plug->add_link != NULL);
74426 +       ret = plug->add_link(object, parent);
74427 +       if (ret != 0)
74428 +               return ret;
74429 +
74430 +       /* the link is in place; stat-data is written only on request */
74431 +       if (!write_sd_p)
74432 +               return 0;
74433 +       return plug->write_sd_by_inode(object);
74434 +}
74435 +
74436 +/* Drop one link from @object and, when asked to, write its stat-data.
74437 + *
74438 + * @object:     inode from which the link is removed
74439 + * @parent:     parent where the entry was
74440 + * @write_sd_p: non-zero if stat-data has to be written after the update
74441 + *
74442 + * Used by unlink/create.  Returns the result of the plugin's rem_link or,
74443 + * if that succeeded and @write_sd_p is set, of the stat-data write.
74444 + */
74445 +int reiser4_del_nlink(struct inode *object, struct inode *parent,
74446 +                     int write_sd_p)
74447 +{
74448 +       file_plugin *plug;
74449 +       int ret;
74450 +
74451 +       assert("nikita-1349", object != NULL);
74452 +       plug = inode_file_plugin(object);
74453 +       assert("nikita-1350", plug != NULL);
74454 +       assert("nikita-1446", object->i_nlink > 0);
74455 +       assert("nikita-2210", plug->rem_link != NULL);
74456 +
74457 +       ret = plug->rem_link(object, parent);
74458 +       if (ret != 0)
74459 +               return ret;
74460 +       /* the link is gone; stat-data is written only on request */
74461 +       if (!write_sd_p)
74462 +               return 0;
74463 +       return plug->write_sd_by_inode(object);
74464 +}
74465 +
74466 +/* d_op->d_release() method: passes @dentry to reiser4_free_dentry_fsdata(). */
74467 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
74468 +{
74469 +       reiser4_free_dentry_fsdata(dentry);
74470 +}
74471 +
74472 +/*
74473 + * Speculative write-back: called by reiser4_sync_inodes() (through pdflush,
74474 + * or balance_dirty_pages()).  Performs early flushing, trying to free some
74475 + * memory. If there is nothing to flush, commits some atoms.
74476 + *
74477 + * @sb:  super block to write out
74478 + * @wbc: write-back control; ->nr_to_write is decreased by what
74479 + *       flush_some_atom() reports as submitted, ->encountered_congestion
74480 + *       is set when the loop stops because of write congestion
74481 + */
74482 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
74483 +{
74484 +       int result;
74485 +       struct address_space *mapping;
74486 +
74487 +       /* Any sync mode but WB_SYNC_NONE (per the original note: sys_sync() or
74488 +          sys_fsync() callers) commits all atoms and does nothing else. */
74489 +       if (wbc->sync_mode != WB_SYNC_NONE) {
74490 +               txnmgr_force_commit_all(sb, 0);
74491 +               return;
74492 +       }
74493 +
74494 +       BUG_ON(reiser4_get_super_fake(sb) == NULL);
74495 +       mapping = reiser4_get_super_fake(sb)->i_mapping;
74496 +       do {
74497 +               long nr_submitted = 0;
74498 +               jnode *node = NULL;
74499 +
74500 +               /* do not put more requests to overload write queue */
74501 +               if (wbc->nonblocking &&
74502 +                   bdi_write_congested(mapping->backing_dev_info)) {
74503 +                       blk_run_address_space(mapping);
74504 +                       wbc->encountered_congestion = 1;
74505 +                       break;
74506 +               }
74507 +               BUG_ON(wbc->nr_to_write <= 0);
74508 +
74509 +               if (get_current_context()->entd) {
74510 +                       entd_context *ent = get_entd_context(sb);
74511 +
74512 +                       if (ent->cur_request->node)
74513 +                               /*
74514 +                                * this is ent thread and it managed to capture
74515 +                                * requested page itself - start flush from
74516 +                                * that page
74517 +                                */
74518 +                               node = jref(ent->cur_request->node);
74519 +               }
74520 +
74521 +               result = flush_some_atom(node, &nr_submitted, wbc,
74522 +                                        JNODE_FLUSH_WRITE_BLOCKS);
74523 +               /* a failed flush is only reported; it does not end the loop */
74524 +               if (result != 0)
74525 +                       warning("nikita-31001", "Flush failed: %i", result);
74526 +               /* pairs with the jref() above */
74527 +               if (node)
74528 +                       jput(node);
74529 +               /* nothing was submitted this round: stop trying */
74530 +               if (!nr_submitted)
74531 +                       break;
74532 +
74533 +               wbc->nr_to_write -= nr_submitted;
74534 +       } while (wbc->nr_to_write > 0);
74535 +}
74536 +
74537 +void reiser4_throttle_write(struct inode *inode)
74538 +{
74539 +       reiser4_txn_restart_current();
74540 +       balance_dirty_pages_ratelimited(inode->i_mapping);
74541 +}
74542 +
74543 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
74544 +const int REISER4_MAGIC_OFFSET = 16 * 4096;    /* offset to magic string from the
74545 +                                                * beginning of device */
74546 +
74547 +/*
74548 + * Reiser4 initialization/shutdown.
74549 + *
74550 + * Code below performs global reiser4 initialization that is done either as
74551 + * part of kernel initialization (when reiser4 is statically built-in), or
74552 + * during reiser4 module load (when compiled as module).
74553 + */
74554 +
74555 +void reiser4_handle_error(void)
74556 +{
74557 +       struct super_block *sb = reiser4_get_current_sb();
74558 +
74559 +       if (!sb)
74560 +               return;
74561 +       reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
74562 +                            "Filesystem error occured");
74563 +       switch (get_super_private(sb)->onerror) {
74564 +       case 0:
74565 +               reiser4_panic("foobar-42", "Filesystem error occured\n");
74566 +       case 1:
74567 +       default:
74568 +               if (sb->s_flags & MS_RDONLY)
74569 +                       return;
74570 +               sb->s_flags |= MS_RDONLY;
74571 +               break;
74572 +       }
74573 +}
74574 +
74575 +struct dentry_operations reiser4_dentry_operations = {
74576 +       .d_revalidate = NULL,
74577 +       .d_hash = NULL,
74578 +       .d_compare = NULL,
74579 +       .d_delete = NULL,
74580 +       .d_release = reiser4_d_release,
74581 +       .d_iput = NULL,
74582 +};
74583 +
74584 +/* Make Linus happy.
74585 +   Local variables:
74586 +   c-indentation-style: "K&R"
74587 +   mode-name: "LC"
74588 +   c-basic-offset: 8
74589 +   tab-width: 8
74590 +   fill-column: 120
74591 +   End:
74592 +*/
74593 diff -urN linux-2.6.27.orig/fs/reiser4/vfs_ops.h linux-2.6.27/fs/reiser4/vfs_ops.h
74594 --- linux-2.6.27.orig/fs/reiser4/vfs_ops.h      1970-01-01 03:00:00.000000000 +0300
74595 +++ linux-2.6.27/fs/reiser4/vfs_ops.h   2008-10-12 18:20:01.000000000 +0400
74596 @@ -0,0 +1,53 @@
74597 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74598 + * reiser4/README */
74599 +
74600 +/* vfs_ops.c's exported symbols */
74601 +
74602 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
74603 +#define __FS_REISER4_VFS_OPS_H__
74604 +
74605 +#include "forward.h"
74606 +#include "coord.h"
74607 +#include "seal.h"
74608 +#include "plugin/file/file.h"
74609 +#include "super.h"
74610 +#include "readahead.h"
74611 +
74612 +#include <linux/types.h>       /* for loff_t */
74613 +#include <linux/fs.h>          /* for struct address_space */
74614 +#include <linux/dcache.h>      /* for struct dentry */
74615 +#include <linux/mm.h>
74616 +#include <linux/backing-dev.h>
74617 +
74618 +/* address space operations */
74619 +int reiser4_writepage(struct page *, struct writeback_control *);
74620 +int reiser4_set_page_dirty(struct page *);
74621 +void reiser4_invalidatepage(struct page *, unsigned long offset);
74622 +int reiser4_releasepage(struct page *, gfp_t);
74623 +
74624 +extern int reiser4_update_sd(struct inode *);
74625 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
74626 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
74627 +
74628 +extern int reiser4_start_up_io(struct page *page);
74629 +extern void reiser4_throttle_write(struct inode *);
74630 +extern int jnode_is_releasable(jnode *);
74631 +
74632 +#define CAPTURE_APAGE_BURST (1024l)
74633 +void reiser4_writeout(struct super_block *, struct writeback_control *);
74634 +
74635 +extern void reiser4_handle_error(void);
74636 +
74637 +/* __FS_REISER4_VFS_OPS_H__ */
74638 +#endif
74639 +
74640 +/* Make Linus happy.
74641 +   Local variables:
74642 +   c-indentation-style: "K&R"
74643 +   mode-name: "LC"
74644 +   c-basic-offset: 8
74645 +   tab-width: 8
74646 +   fill-column: 120
74647 +   scroll-step: 1
74648 +   End:
74649 +*/
74650 diff -urN linux-2.6.27.orig/fs/reiser4/wander.c linux-2.6.27/fs/reiser4/wander.c
74651 --- linux-2.6.27.orig/fs/reiser4/wander.c       1970-01-01 03:00:00.000000000 +0300
74652 +++ linux-2.6.27/fs/reiser4/wander.c    2008-10-12 18:20:01.000000000 +0400
74653 @@ -0,0 +1,1797 @@
74654 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
74655 + * reiser4/README */
74656 +
74657 +/* Reiser4 Wandering Log */
74658 +
74659 +/* You should read http://www.namesys.com/txn-doc.html
74660 +
74661 +   That describes how filesystem operations are performed as atomic
74662 +   transactions, and how we try to arrange it so that we can write most of the
74663 +   data only once while performing the operation atomically.
74664 +
74665 +   For the purposes of this code, it is enough for it to understand that it
74666 +   has been told a given block should be written either once, or twice (if
74667 +   twice then once to the wandered location and once to the real location).
74668 +
74669 +   This code guarantees that those blocks that are defined to be part of an
74670 +   atom either all take effect or none of them take effect.
74671 +
74672 +   Relocate set nodes are submitted to write by the jnode_flush() routine, and
74673 +   the overwrite set is submitted by reiser4_write_log().  This is because with
74674 +   the overwrite set we seek to optimize writes, and with the relocate set we
74675 +   seek to cause disk order to correlate with the parent first pre-order.
74676 +
74677 +   reiser4_write_log() allocates and writes wandered blocks and maintains
74678 +   additional on-disk structures of the atom as wander records (each wander
74679 +   record occupies one block) for storing of the "wandered map" (a table which
74680 +   contains a relation between wandered and real block numbers) and other
74681 +   information which might be needed at transaction recovery time.
74682 +
74683 +   The wander records are unidirectionally linked into a circle: each wander
74684 +   record contains a block number of the next wander record, the last wander
74685 +   record points to the first one.
74686 +
74687 +   One wander record (named "tx head" in this file) has a format which is
74688 +   different from the other wander records. The "tx head" has a reference to the
74689 +   "tx head" block of the previously committed atom.  Also, "tx head" contains
74690 +   fs information (the free blocks counter, and the oid allocator state) which
74691 +   is logged in a special way.
74692 +
74693 +   There are two journal control blocks, named journal header and journal
74694 +   footer which have fixed on-disk locations.  The journal header has a
74695 +   reference to the "tx head" block of the last committed atom.  The journal
74696 +   footer points to the "tx head" of the last flushed atom.  The atom is
74697 +   "played" when all blocks from its overwrite set are written to disk the
74698 +   second time (i.e. written to their real locations).
74699 +
74700 +   NOTE: People who know reiserfs internals and its journal structure might be
74701 +   confused with these terms journal footer and journal header. There is a table
74702 +   with terms of similar semantics in reiserfs (reiser3) and reiser4:
74703 +
74704 +   REISER3 TERM        |  REISER4 TERM         | DESCRIPTION
74705 +   --------------------+-----------------------+----------------------------
74706 +   commit record       |  journal header       | atomic write of this record
74707 +                       |                       | ends transaction commit
74708 +   --------------------+-----------------------+----------------------------
74709 +   journal header      |  journal footer       | atomic write of this record
74710 +                       |                       | ends post-commit writes.
74711 +                       |                       | After successful
74712 +                       |                       | writing of this, journal
74713 +                       |                       | blocks (in reiser3) or
74714 +                       |                       | wandered blocks/records are
74715 +                       |                       | free for re-use.
74716 +   --------------------+-----------------------+----------------------------
74717 +
74718 +   The atom commit process is the following:
74719 +
74720 +   1. The overwrite set is taken from atom's clean list, and its size is
74721 +      counted.
74722 +
74723 +   2. The number of necessary wander records (including tx head) is calculated,
74724 +      and the wander record blocks are allocated.
74725 +
74726 +   3. Allocate wandered blocks and populate wander records by wandered map.
74727 +
74728 +   4. submit write requests for wander records and wandered blocks.
74729 +
74730 +   5. wait until submitted write requests complete.
74731 +
74732 +   6. update journal header: change the pointer to the block number of just
74733 +   written tx head, submit an i/o for modified journal header block and wait
74734 +   for i/o completion.
74735 +
74736 +   NOTE: The special logging for bitmap blocks and some reiser4 super block
74737 +   fields makes processes of atom commit, flush and recovering a bit more
74738 +   complex (see comments in the source code for details).
74739 +
74740 +   The atom playing process is the following:
74741 +
74742 +   1. Write atom's overwrite set in-place.
74743 +
74744 +   2. Wait on i/o.
74745 +
74746 +   3. Update journal footer: change the pointer to block number of tx head
74747 +   block of the atom we are currently flushing, submit an i/o, wait on i/o
74748 +   completion.
74749 +
74750 +   4. Free disk space which was used for wandered blocks and wander records.
74751 +
74752 +   After the freeing of wandered blocks and wander records we have that journal
74753 +   footer points to the on-disk structure which might be overwritten soon.
74754 +   Neither the log writer nor the journal recovery procedure use that pointer
74755 +   for accessing the data.  When the journal recovery procedure finds the oldest
74756 +   transaction it compares the journal footer pointer value with the "prev_tx"
74757 +   pointer value in tx head, if values are equal the oldest not flushed
74758 +   transaction is found.
74759 +
74760 +   NOTE on disk space leakage: the information about which blocks and how many
74761 +   blocks are allocated for wandered blocks and wander records is not written to
74762 +   the disk because of special logging for bitmaps and some super block
74763 +   counters.  After a system crash reiser4 does not remember those
74764 +   allocations, thus we have no disk space leakage of this kind.
74765 +*/
74766 +
74767 +/* Special logging of reiser4 super block fields. */
74768 +
74769 +/* There are some reiser4 super block fields (free block count and OID allocator
74770 +   state (number of files and next free OID) which are logged separately from
74771 +   super block to avoid unnecessary atom fusion.
74772 +
74773 +   So, the reiser4 super block need not be captured by a transaction which
74774 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
74775 +   the reiser4 on-disk super block is not touched when such a transaction is
74776 +   committed and flushed.  Those "counters logged specially" are logged in "tx
74777 +   head" blocks and in the journal footer block.
74778 +
74779 +   A step-by-step description of special logging:
74780 +
74781 +   0. The per-atom information about deleted or created files and allocated or
74782 +   freed blocks is collected during the transaction.  The atom's
74783 +   ->nr_objects_created and ->nr_objects_deleted are for object
74784 +   deletion/creation tracking, the numbers of allocated and freed blocks are
74785 +   calculated using atom's delete set and atom's capture list -- all new and
74786 +   relocated nodes should be on atom's clean list and should have JNODE_RELOC
74787 +   bit set.
74788 +
74789 +   1. The "logged specially" reiser4 super block fields have their "committed"
74790 +   versions in the reiser4 in-memory super block.  They get modified only at
74791 +   atom commit time.  The atom's commit thread has an exclusive access to those
74792 +   "committed" fields because the log writer implementation supports only one
74793 +   atom commit at a time (there is a per-fs "commit" mutex).  At
74794 +   that time "committed" counters are modified using per-atom information
74795 +   collected during the transaction. These counters are stored on disk as a
74796 +   part of tx head block when atom is committed.
74797 +
74798 +   2. When the atom is flushed the value of the free block counter and the OID
74799 +   allocator state get written to the journal footer block.  A special journal
74800 +   procedure (journal_recover_sb_data()) takes those values from the journal
74801 +   footer and updates the reiser4 in-memory super block.
74802 +
74803 +   NOTE: That means free block count and OID allocator state are logged
74804 +   separately from the reiser4 super block regardless of the fact that the
74805 +   reiser4 super block has fields to store both the free block counter and the
74806 +   OID allocator.
74807 +
74808 +   Writing the whole super block at commit time requires knowing true values of
74809 +   all its fields without changes made by not yet committed transactions. It is
74810 +   possible by having their "committed" version of the super block like the
74811 +   reiser4 bitmap blocks have "committed" and "working" versions.  However,
74812 +   another scheme was implemented which stores special logged values in the
74813 +   unused free space inside transaction head block.  In my opinion it has an
74814 +   advantage of not writing whole super block when only part of it was
74815 +   modified. */
74816 +
74817 +#include "debug.h"
74818 +#include "dformat.h"
74819 +#include "txnmgr.h"
74820 +#include "jnode.h"
74821 +#include "znode.h"
74822 +#include "block_alloc.h"
74823 +#include "page_cache.h"
74824 +#include "wander.h"
74825 +#include "reiser4.h"
74826 +#include "super.h"
74827 +#include "vfs_ops.h"
74828 +#include "writeout.h"
74829 +#include "inode.h"
74830 +#include "entd.h"
74831 +
74832 +#include <linux/types.h>
74833 +#include <linux/fs.h>          /* for struct super_block  */
74834 +#include <linux/mm.h>          /* for struct page */
74835 +#include <linux/pagemap.h>
74836 +#include <linux/bio.h>         /* for struct bio */
74837 +#include <linux/blkdev.h>
74838 +
74839 +static int write_jnodes_to_disk_extent(
74840 +       jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
74841 +
74842 +/* The commit_handle is a container for objects needed at atom commit time  */
74843 +struct commit_handle {
74844 +       /* A pointer to atom's list of OVRWR nodes */
74845 +       struct list_head *overwrite_set;
74846 +       /* atom's overwrite set size */
74847 +       int overwrite_set_size;
74848 +       /* jnodes for wander record blocks */
74849 +       struct list_head tx_list;
74850 +       /* number of wander records */
74851 +       __u32 tx_size;
74852 +       /* 'committed' sb counters are saved here until atom is completely
74853 +          flushed  */
74854 +       __u64 free_blocks;
74855 +       __u64 nr_files;
74856 +       __u64 next_oid;
74857 +       /* A pointer to the atom which is being committed */
74858 +       txn_atom *atom;
74859 +       /* A pointer to current super block */
74860 +       struct super_block *super;
74861 +       /* The counter of modified bitmaps */
74862 +       reiser4_block_nr nr_bitmap;
74863 +};
74864 +
74865 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
74866 +{
74867 +       memset(ch, 0, sizeof(struct commit_handle));
74868 +       INIT_LIST_HEAD(&ch->tx_list);
74869 +
74870 +       ch->atom = atom;
74871 +       ch->super = reiser4_get_current_sb();
74872 +}
74873 +
74874 +static void done_commit_handle(struct commit_handle *ch)
74875 +{
74876 +       assert("zam-690", list_empty(&ch->tx_list));
74877 +}
74878 +
74879 +static inline int reiser4_use_write_barrier(struct super_block * s)
74880 +{
74881 +       return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
74882 +}
74883 +
74884 +static void disable_write_barrier(struct super_block * s)
74885 +{
74886 +       notice("zam-1055", "%s does not support write barriers,"
74887 +              " using synchronous write instead.", s->s_id);
74888 +       set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
74889 +}
74890 +
74891 +/* fill journal header block data  */
74892 +static void format_journal_header(struct commit_handle *ch)
74893 +{
74894 +       struct reiser4_super_info_data *sbinfo;
74895 +       struct journal_header *header;
74896 +       jnode *txhead;
74897 +
74898 +       sbinfo = get_super_private(ch->super);
74899 +       assert("zam-479", sbinfo != NULL);
74900 +       assert("zam-480", sbinfo->journal_header != NULL);
74901 +
74902 +       txhead = list_entry(ch->tx_list.next, jnode, capture_link);
74903 +
74904 +       jload(sbinfo->journal_header);
74905 +
74906 +       header = (struct journal_header *)jdata(sbinfo->journal_header);
74907 +       assert("zam-484", header != NULL);
74908 +
74909 +       put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
74910 +                     &header->last_committed_tx);
74911 +
74912 +       jrelse(sbinfo->journal_header);
74913 +}
74914 +
74915 +/* Fill journal footer block data: last flushed tx head and sb counters. */
74916 +static void format_journal_footer(struct commit_handle *ch)
74917 +{
74918 +       struct reiser4_super_info_data *sbinfo;
74919 +       struct journal_footer *footer;
74920 +       jnode *tx_head;
74921 +
74922 +       sbinfo = get_super_private(ch->super);
74923 +
74924 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74925 +
74926 +       assert("zam-493", sbinfo != NULL);
74927 +       assert("zam-494", sbinfo->journal_footer != NULL);
74928 +       /* the footer jnode is the one loaded, filled and released below */
74929 +       check_me("zam-691", jload(sbinfo->journal_footer) == 0);
74930 +
74931 +       footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
74932 +       assert("zam-495", footer != NULL);
74933 +       /* on-disk fields are little-endian; use unaligned-safe stores */
74934 +       put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
74935 +                     &footer->last_flushed_tx);
74936 +       put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
74937 +       /* OID allocator state: number of files and next free OID */
74938 +       put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
74939 +       put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
74940 +
74941 +       jrelse(sbinfo->journal_footer);
74942 +}
74943 +
74944 +/* wander record capacity depends on current block size */
74945 +static int wander_record_capacity(const struct super_block *super)
74946 +{
74947 +       return (super->s_blocksize -
74948 +               sizeof(struct wander_record_header)) /
74949 +           sizeof(struct wander_entry);
74950 +}
74951 +
74952 +/* Fill first wander record (tx head) in accordance with the supplied data */
74953 +static void format_tx_head(struct commit_handle *ch)
74954 +{
74955 +       jnode *tx_head;
74956 +       jnode *next;
74957 +       struct tx_header *header;
74958 +
74959 +       tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
74960 +       assert("zam-692", &ch->tx_list != &tx_head->capture_link);
74961 +
74962 +       next = list_entry(tx_head->capture_link.next, jnode, capture_link);
74963 +       if (&ch->tx_list == &next->capture_link)
74964 +               next = tx_head;
74965 +
74966 +       header = (struct tx_header *)jdata(tx_head);
74967 +
74968 +       assert("zam-460", header != NULL);
74969 +       assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
74970 +
74971 +       memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
74972 +       memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
74973 +
74974 +       put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
74975 +       put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
74976 +                     &header->prev_tx);
74977 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
74978 +       put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
74979 +       put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
74980 +       put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
74981 +}
74982 +
74983 +/* prepare ordinary wander record block (fill all service fields) */
74984 +static void
74985 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
74986 +{
74987 +       struct wander_record_header *LRH;
74988 +       jnode *next;
74989 +
74990 +       assert("zam-464", node != NULL);
74991 +
74992 +       LRH = (struct wander_record_header *)jdata(node);
74993 +       next = list_entry(node->capture_link.next, jnode, capture_link);
74994 +
74995 +       if (&ch->tx_list == &next->capture_link)
74996 +               next = list_entry(ch->tx_list.next, jnode, capture_link);
74997 +
74998 +       assert("zam-465", LRH != NULL);
74999 +       assert("zam-463",
75000 +              ch->super->s_blocksize > sizeof(struct wander_record_header));
75001 +
75002 +       memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
75003 +       memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
75004 +
75005 +       put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
75006 +       put_unaligned(cpu_to_le32(serial), &LRH->serial);
75007 +       put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
75008 +}
75009 +
75010 +/* add one wandered map entry to formatted wander record */
75011 +static void
75012 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
75013 +           const reiser4_block_nr * b)
75014 +{
75015 +       char *data;
75016 +       struct wander_entry *pairs;
75017 +
75018 +       data = jdata(node);
75019 +       assert("zam-451", data != NULL);
75020 +
75021 +       pairs =
75022 +           (struct wander_entry *)(data + sizeof(struct wander_record_header));
75023 +
75024 +       put_unaligned(cpu_to_le64(*a), &pairs[index].original);
75025 +       put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
75026 +}
75027 +
75028 +/* currently, wander records contain only the wandered map, whose size depends
75029 +   on the overwrite set size */
75030 +static void get_tx_size(struct commit_handle *ch)
75031 +{
75032 +       assert("zam-440", ch->overwrite_set_size != 0);
75033 +       assert("zam-695", ch->tx_size == 0);
75034 +
75035 +       /* count all ordinary wander records
75036 +          (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
75037 +          for tx head block */
75038 +       ch->tx_size =
75039 +           (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
75040 +           2;
75041 +}
75042 +
75043 +/* State kept by store_wmap_actor() between successive calls while it fills
75044 +   the wander records with wandered map entries */
75045 +struct store_wmap_params {
75046 +       jnode *cur;             /* jnode of current wander record to fill */
75047 +       int idx;                /* free element index in wander record  */
75048 +       int capacity;           /* idx limit; reaching it moves to next record */
75049 +
75050 +#if REISER4_DEBUG
75051 +       struct list_head *tx_list;      /* list head, checked in assertion */
75052 +#endif
75053 +};
75054 +
75055 +/* an actor for use in blocknr_set_iterator routine which populates the list
75056 +   of pre-formatted wander records by wandered map info */
75057 +static int
75058 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
75059 +                const reiser4_block_nr * b, void *data)
75060 +{
75061 +       struct store_wmap_params *params = data;
75062 +
75063 +       if (params->idx >= params->capacity) {
75064 +               /* a new wander record should be taken from the tx_list */
75065 +               params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
75066 +               assert("zam-454",
75067 +                      params->tx_list != &params->cur->capture_link);
75068 +
75069 +               params->idx = 0;
75070 +       }
75071 +
75072 +       store_entry(params->cur, params->idx, a, b);
75073 +       params->idx++;
75074 +
75075 +       return 0;
75076 +}
75077 +
75078 +/* This function is called after Relocate set gets written to disk, Overwrite
75079 +   set is written to wandered locations and all wander records are written
75080 +   also. The updated journal header block contains a pointer (block number) to
75081 +   the first wander record of the just written transaction */
75082 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
75083 +{
75084 +       struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75085 +       jnode *jh = sbinfo->journal_header;
75086 +       jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
75087 +       int ret;
75088 +
75089 +       format_journal_header(ch);
75090 +
75091 +       ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
75092 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
75093 +       if (ret)
75094 +               return ret;
75095 +
75096 +       // blk_run_address_space(sbinfo->fake->i_mapping);
75097 +       /*blk_run_queues(); */
75098 +
75099 +       ret = jwait_io(jh, WRITE);
75100 +
75101 +       if (ret)
75102 +               return ret;
75103 +
75104 +       sbinfo->last_committed_tx = *jnode_get_block(head);
75105 +
75106 +       return 0;
75107 +}
75108 +
75109 +/* This function is called after write-back is finished. We update journal
75110 +   footer block and free blocks which were occupied by wandered blocks and
75111 +   transaction wander records */
75112 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
75113 +{
75114 +       reiser4_super_info_data *sbinfo = get_super_private(ch->super);
75115 +
75116 +       jnode *jf = sbinfo->journal_footer;
75117 +
75118 +       int ret;
75119 +
75120 +       format_journal_footer(ch);
75121 +
75122 +       ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
75123 +                                         use_barrier ? WRITEOUT_BARRIER : 0);
75124 +       if (ret)
75125 +               return ret;
75126 +
75127 +       // blk_run_address_space(sbinfo->fake->i_mapping);
75128 +       /*blk_run_queue(); */
75129 +
75130 +       ret = jwait_io(jf, WRITE);
75131 +       if (ret)
75132 +               return ret;
75133 +
75134 +       return 0;
75135 +}
75136 +
75137 +/* free block numbers of wander records of already written in place transaction */
75138 +static void dealloc_tx_list(struct commit_handle *ch)
75139 +{
75140 +       while (!list_empty(&ch->tx_list)) {
75141 +               jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
75142 +               list_del(&cur->capture_link);
75143 +               ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
75144 +               reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
75145 +                                     BA_FORMATTED);
75146 +
75147 +               unpin_jnode_data(cur);
75148 +               reiser4_drop_io_head(cur);
75149 +       }
75150 +}
75151 +
75152 +/* An actor for use in block_nr_iterator() routine which frees wandered blocks
75153 +   from atom's overwrite set. */
75154 +static int
75155 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
75156 +                  const reiser4_block_nr * a UNUSED_ARG,
75157 +                  const reiser4_block_nr * b, void *data UNUSED_ARG)
75158 +{
75159 +
75160 +       assert("zam-499", b != NULL);
75161 +       assert("zam-500", *b != 0);
75162 +       assert("zam-501", !reiser4_blocknr_is_fake(b));
75163 +
75164 +       reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
75165 +       return 0;
75166 +}
75167 +
75168 +/* free wandered block locations of already written in place transaction */
75169 +static void dealloc_wmap(struct commit_handle *ch)
75170 +{
75171 +       assert("zam-696", ch->atom != NULL);
75172 +
75173 +       blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
75174 +                            dealloc_wmap_actor, NULL, 1);
75175 +}
75176 +
75177 +/* helper function for alloc wandered blocks, which refill set of block
75178 +   numbers needed for wandered blocks  */
75179 +static int
75180 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
75181 +{
75182 +       reiser4_blocknr_hint hint;
75183 +       int ret;
75184 +
75185 +       reiser4_block_nr wide_len = count;
75186 +
75187 +       /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
75188 +          ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
75189 +          reserved allocation area so as to get the best qualities of fixed
75190 +          journals? */
75191 +       reiser4_blocknr_hint_init(&hint);
75192 +       hint.block_stage = BLOCK_GRABBED;
75193 +
75194 +       ret = reiser4_alloc_blocks(&hint, start, &wide_len,
75195 +                                  BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
75196 +       *len = (int)wide_len;
75197 +
75198 +       return ret;
75199 +}
75200 +
75201 +/*
75202 + * roll back changes made before issuing BIO in the case of IO error.
75203 + */
75204 +static void undo_bio(struct bio *bio)
75205 +{
75206 +       int i;
75207 +
75208 +       for (i = 0; i < bio->bi_vcnt; ++i) {
75209 +               struct page *pg;
75210 +               jnode *node;
75211 +
75212 +               pg = bio->bi_io_vec[i].bv_page;
75213 +               end_page_writeback(pg);
75214 +               node = jprivate(pg);
75215 +               spin_lock_jnode(node);
75216 +               JF_CLR(node, JNODE_WRITEBACK);
75217 +               JF_SET(node, JNODE_DIRTY);
75218 +               spin_unlock_jnode(node);
75219 +       }
75220 +       bio_put(bio);
75221 +}
75222 +
75223 +/* call jrelse_tail() for every jnode linked on the ch->overwrite_set list */
75224 +static void put_overwrite_set(struct commit_handle *ch)
75225 +{
75226 +       jnode *cur;
75227 +       /* presumably balances the jload_gfp() calls of get_overwrite_set() */
75228 +       list_for_each_entry(cur, ch->overwrite_set, capture_link)
75229 +               jrelse_tail(cur);
75230 +}
75231 +
75232 +/* Count the overwrite set of ch->atom, load its jnodes into memory and grab
75233 +   disk space for wandered copies of bitmap and other not leaf nodes (leaves
75234 +   already have "flush reserved" space). Returns the overwrite set size on
75235 +   success or an error from log_super() / reiser4_grab_space_force(). */
75236 +static int get_overwrite_set(struct commit_handle *ch)
75237 +{
75238 +       int ret;
75239 +       jnode *cur;
75240 +       __u64 nr_not_leaves = 0;
75241 +#if REISER4_DEBUG
75242 +       __u64 nr_formatted_leaves = 0;
75243 +       __u64 nr_unformatted_leaves = 0;
75244 +#endif
75245 +
75246 +       assert("zam-697", ch->overwrite_set_size == 0);
75247 +
75248 +       ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
75249 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75250 +
75251 +       while (ch->overwrite_set != &cur->capture_link) {
75252 +               jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
75253 +               /* @next is taken up front: @cur may be uncaptured below */
75254 +               /* Count bitmap locks for getting correct statistics what number
75255 +                * of blocks were cleared by the transaction commit. */
75256 +               if (jnode_get_type(cur) == JNODE_BITMAP)
75257 +                       ch->nr_bitmap++;
75258 +
75259 +               assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
75260 +                      || jnode_get_type(cur) == JNODE_BITMAP);
75261 +
75262 +               if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
75263 +                       /* we replace fake znode by another (real)
75264 +                          znode which is suggested by disk_layout
75265 +                          plugin */
75266 +
75267 +                       /* FIXME: it looks like fake znode should be
75268 +                          replaced by jnode supplied by
75269 +                          disk_layout. */
75270 +
75271 +                       struct super_block *s = reiser4_get_current_sb();
75272 +                       reiser4_super_info_data *sbinfo =
75273 +                           get_current_super_private();
75274 +
75275 +                       if (sbinfo->df_plug->log_super) {
75276 +                               jnode *sj = sbinfo->df_plug->log_super(s);
75277 +
75278 +                               assert("zam-593", sj != NULL);
75279 +
75280 +                               if (IS_ERR(sj))
75281 +                                       return PTR_ERR(sj);
75282 +
75283 +                               spin_lock_jnode(sj);
75284 +                               JF_SET(sj, JNODE_OVRWR);
75285 +                               insert_into_atom_ovrwr_list(ch->atom, sj);
75286 +                               spin_unlock_jnode(sj);
75287 +
75288 +                               /* jload it as the rest of overwrite set */
75289 +                               jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
75290 +                               /* NOTE(review): jload_gfp() result is not checked */
75291 +                               ch->overwrite_set_size++;
75292 +                       }
75293 +                       spin_lock_jnode(cur);
75294 +                       reiser4_uncapture_block(cur);
75295 +                       jput(cur);
75296 +
75297 +               } else {
75298 +                       int ret; /* shadows the function-level ret */
75299 +                       ch->overwrite_set_size++;
75300 +                       ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
75301 +                       if (ret)
75302 +                               reiser4_panic("zam-783",
75303 +                                             "cannot load e-flushed jnode back (ret = %d)\n",
75304 +                                             ret);
75305 +               }
75306 +
75307 +               /* Count not leaves here because we have to grab disk space
75308 +                * for wandered blocks. They were not counted as "flush
75309 +                * reserved". Counting should be done _after_ nodes are pinned
75310 +                * into memory by jload(). */
75311 +               if (!jnode_is_leaf(cur))
75312 +                       nr_not_leaves++;
75313 +               else {
75314 +#if REISER4_DEBUG
75315 +                       /* at this point @cur either has JNODE_FLUSH_RESERVED
75316 +                        * or is eflushed. Locking is not strong enough to
75317 +                        * write an assertion checking for this. */
75318 +                       if (jnode_is_znode(cur))
75319 +                               nr_formatted_leaves++;
75320 +                       else
75321 +                               nr_unformatted_leaves++;
75322 +#endif
75323 +                       JF_CLR(cur, JNODE_FLUSH_RESERVED);
75324 +               }
75325 +
75326 +               cur = next;
75327 +       }
75328 +
75329 +       /* Grab space for writing (wandered blocks) of not leaves found in
75330 +        * overwrite set. */
75331 +       ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
75332 +       if (ret)
75333 +               return ret;
75334 +
75335 +       /* Disk space for allocation of wandered blocks of leaf nodes already
75336 +        * reserved as "flush reserved", move it to grabbed space counter. */
75337 +       spin_lock_atom(ch->atom);
75338 +       assert("zam-940",
75339 +              nr_formatted_leaves + nr_unformatted_leaves <=
75340 +              ch->atom->flush_reserved);
75341 +       flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
75342 +       spin_unlock_atom(ch->atom);
75343 +
75344 +       return ch->overwrite_set_size;
75345 +}
75346 +
75347 +/**
75348 + * write_jnodes_to_disk_extent - submit write request
75349 + * @first: first jnode of the list
75350 + * @nr: number of jnodes on the list
75351 + * @block_p: points to the number of the first block of the disk region the
75352 + *           jnodes are written to
75353 + * @fq: flush queue passed to add_fq_to_bio() for every submitted bio
75354 + * @flags: WRITEOUT_BARRIER selects WRITE_BARRIER instead of WRITE
75355 + *
75356 + * Submits a write request for @nr jnodes beginning from the @first, other
75357 + * jnodes are after the @first on the double-linked "capture" list.  All jnodes
75358 + * will be written to the disk region of @nr blocks starting with @block_p block
75359 + * number.  If @fq is not NULL it means that waiting for i/o completion will be
75360 + * done more efficiently by using flush_queue_t objects.
75361 + * This function is the one which writes list of jnodes in batch mode. It does
75362 + * all low-level things as bio construction and page states manipulation.
75363 + * Returns 0 on success, -ENOMEM or -EOPNOTSUPP on failure.
75364 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
75365 + * aggregated in this function instead of being left to the layers below
75366 + *
75367 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
75368 + * Why that layer needed? Why BIOs cannot be constructed here?
75369 + */
75370 +static int write_jnodes_to_disk_extent(
75371 +       jnode *first, int nr, const reiser4_block_nr *block_p,
75372 +       flush_queue_t *fq, int flags)
75373 +{
75374 +       struct super_block *super = reiser4_get_current_sb();
75375 +       int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
75376 +       int max_blocks;
75377 +       jnode *cur = first;
75378 +       reiser4_block_nr block;
75379 +
75380 +       assert("zam-571", first != NULL);
75381 +       assert("zam-572", block_p != NULL);
75382 +       assert("zam-570", nr > 0);
75383 +
75384 +       block = *block_p;
75385 +       max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
75386 +       /* one bio of at most max_blocks pages per iteration */
75387 +       while (nr > 0) {
75388 +               struct bio *bio;
75389 +               int nr_blocks = min(nr, max_blocks);
75390 +               int i;
75391 +               int nr_used; /* pages actually added to this bio */
75392 +
75393 +               bio = bio_alloc(GFP_NOIO, nr_blocks);
75394 +               if (!bio)
75395 +                       return RETERR(-ENOMEM);
75396 +
75397 +               bio->bi_bdev = super->s_bdev;
75398 +               bio->bi_sector = block * (super->s_blocksize >> 9); /* 512-byte units */
75399 +               for (nr_used = 0, i = 0; i < nr_blocks; i++) {
75400 +                       struct page *pg;
75401 +
75402 +                       pg = jnode_page(cur);
75403 +                       assert("zam-573", pg != NULL);
75404 +
75405 +                       page_cache_get(pg);
75406 +
75407 +                       lock_and_wait_page_writeback(pg);
75408 +
75409 +                       if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
75410 +                               /*
75411 +                                * underlying device is satiated. Stop adding
75412 +                                * pages to the bio.
75413 +                                */
75414 +                               unlock_page(pg);
75415 +                               page_cache_release(pg);
75416 +                               break;
75417 +                       }
75418 +
75419 +                       spin_lock_jnode(cur);
75420 +                       assert("nikita-3166",
75421 +                              pg->mapping == jnode_get_mapping(cur));
75422 +                       assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
75423 +#if REISER4_DEBUG
75424 +                       spin_lock(&cur->load);
75425 +                       assert("nikita-3165", !jnode_is_releasable(cur));
75426 +                       spin_unlock(&cur->load);
75427 +#endif
75428 +                       JF_SET(cur, JNODE_WRITEBACK);
75429 +                       JF_CLR(cur, JNODE_DIRTY);
75430 +                       ON_DEBUG(cur->written++);
75431 +                       spin_unlock_jnode(cur);
75432 +
75433 +                       ClearPageError(pg);
75434 +                       set_page_writeback(pg);
75435 +
75436 +                       if (get_current_context()->entd) {
75437 +                               /* this is ent thread */
75438 +                               entd_context *ent = get_entd_context(super);
75439 +                               struct wbq *rq, *next;
75440 +
75441 +                               spin_lock(&ent->guard);
75442 +
75443 +                               if (pg == ent->cur_request->page) {
75444 +                                       /*
75445 +                                        * entd is called for this page. This
75446 +                                        * request is not in the todo list
75447 +                                        */
75448 +                                       ent->cur_request->written = 1;
75449 +                               } else {
75450 +                                       /*
75451 +                                        * if we have written a page for which writepage
75452 +                                        * is called for - move request to another list.
75453 +                                        */
75454 +                                       list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
75455 +                                               assert("", rq->magic == WBQ_MAGIC);
75456 +                                               if (pg == rq->page) {
75457 +                                                       /*
75458 +                                                        * remove request from
75459 +                                                        * entd's queue, but do
75460 +                                                        * not wake up a thread
75461 +                                                        * which put this
75462 +                                                        * request
75463 +                                                        */
75464 +                                                       list_del_init(&rq->link);
75465 +                                                       ent->nr_todo_reqs --;
75466 +                                                       list_add_tail(&rq->link, &ent->done_list);
75467 +                                                       ent->nr_done_reqs ++;
75468 +                                                       rq->written = 1;
75469 +                                                       break;
75470 +                                               }
75471 +                                       }
75472 +                               }
75473 +                               spin_unlock(&ent->guard);
75474 +                       }
75475 +
75476 +                       clear_page_dirty_for_io(pg);
75477 +
75478 +                       unlock_page(pg);
75479 +
75480 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75481 +                       nr_used++;
75482 +               }
75483 +               if (nr_used > 0) {
75484 +                       assert("nikita-3453",
75485 +                              bio->bi_size == super->s_blocksize * nr_used);
75486 +                       assert("nikita-3454", bio->bi_vcnt == nr_used);
75487 +
75488 +                       /* Check if we are allowed to write at all */
75489 +                       if (super->s_flags & MS_RDONLY)
75490 +                               undo_bio(bio);
75491 +                       else {
75492 +                               int not_supported;
75493 +
75494 +                               add_fq_to_bio(fq, bio);
75495 +                               bio_get(bio); /* keep bio so its flags can be read below */
75496 +                               reiser4_submit_bio(write_op, bio);
75497 +                               not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
75498 +                               bio_put(bio);
75499 +                               if (not_supported)
75500 +                                       return -EOPNOTSUPP;
75501 +                       }
75502 +                       /* the hint gets the last block covered by this bio */
75503 +                       block += nr_used - 1;
75504 +                       update_blocknr_hint_default(super, &block);
75505 +                       block += 1;
75506 +               } else {
75507 +                       bio_put(bio); /* NOTE(review): nr stays unchanged, the loop retries */
75508 +               }
75509 +               nr -= nr_used;
75510 +       }
75511 +
75512 +       return 0;
75513 +}
75514 +
75515 +/* Find runs of jnodes with consecutive disk block numbers on the list @head
75516 +   and submit one write request per run. If @nr_submitted is not NULL it is
75517 +   advanced by the number of jnodes submitted. Returns 0 or the first error. */
75518 +int
75519 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
75520 +                long *nr_submitted, int flags)
75521 +{
75522 +       int ret;
75523 +       jnode *beg = list_entry(head->next, jnode, capture_link);
75524 +
75525 +       while (head != &beg->capture_link) {
75526 +               int nr = 1; /* length of the run starting at @beg */
75527 +               jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
75528 +
75529 +               while (head != &cur->capture_link) {
75530 +                       if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
75531 +                               break;
75532 +                       ++nr;
75533 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75534 +               }
75535 +
75536 +               ret = write_jnodes_to_disk_extent(
75537 +                       beg, nr, jnode_get_block(beg), fq, flags);
75538 +               if (ret)
75539 +                       return ret;
75540 +
75541 +               if (nr_submitted)
75542 +                       *nr_submitted += nr;
75543 +
75544 +               beg = cur; /* first jnode that did not fit into the run */
75545 +       }
75546 +
75547 +       return 0;
75548 +}
75549 +
75550 +/* map @len jnodes starting at @cur to wandered blocks starting at *@block_p */
75551 +static int
75552 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
75553 +{
75554 +       int ret;
75555 +       blocknr_set_entry *new_bsep = NULL;
75556 +       reiser4_block_nr block;
75557 +
75558 +       txn_atom *atom;
75559 +
75560 +       assert("zam-568", block_p != NULL);
75561 +       block = *block_p;
75562 +       assert("zam-569", len > 0);
75563 +       /* inside the loop @len counts the blocks that follow @block */
75564 +       while ((len--) > 0) {
75565 +               do {
75566 +                       atom = get_current_atom_locked();
75567 +                       assert("zam-536",
75568 +                              !reiser4_blocknr_is_fake(jnode_get_block(cur)));
75569 +                       ret =
75570 +                           blocknr_set_add_pair(atom, &atom->wandered_map,
75571 +                                                &new_bsep,
75572 +                                                jnode_get_block(cur), &block);
75573 +               } while (ret == -E_REPEAT);
75574 +
75575 +               if (ret) {
75576 +                       /* deallocate blocks which were not added to wandered
75577 +                          map: @block itself and the @len blocks after it */
75578 +                       reiser4_block_nr wide_len = len + 1;
75579 +
75580 +                       reiser4_dealloc_blocks(&block, &wide_len,
75581 +                                              BLOCK_NOT_COUNTED,
75582 +                                              BA_FORMATTED
75583 +                                              /* formatted, without defer */ );
75584 +                       /* NOTE(review): confirm the atom is not left locked here */
75585 +                       return ret;
75586 +               }
75587 +
75588 +               spin_unlock_atom(atom);
75589 +
75590 +               cur = list_entry(cur->capture_link.next, jnode, capture_link);
75591 +               ++block;
75592 +       }
75593 +
75594 +       return 0;
75595 +}
75596 +
75597 +/* Allocate wandered blocks for current atom's OVERWRITE SET extent by extent,
75598 +   record each extent in the wandered map and immediately submit IO for it.
75599 +   We assume atom fusion is impossible now and the atom is unlocked. */
75600 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
75601 +{
75602 +       reiser4_block_nr block; /* first block of the extent just allocated */
75603 +
75604 +       int rest; /* jnodes which have no wandered block yet */
75605 +       int len; /* length of the extent just allocated */
75606 +       int ret;
75607 +
75608 +       jnode *cur;
75609 +
75610 +       assert("zam-534", ch->overwrite_set_size > 0);
75611 +
75612 +       rest = ch->overwrite_set_size;
75613 +
75614 +       cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
75615 +       while (ch->overwrite_set != &cur->capture_link) {
75616 +               assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
75617 +
75618 +               ret = get_more_wandered_blocks(rest, &block, &len);
75619 +               if (ret)
75620 +                       return ret;
75621 +
75622 +               rest -= len;
75623 +
75624 +               ret = add_region_to_wmap(cur, len, &block);
75625 +               if (ret)
75626 +                       return ret;
75627 +
75628 +               ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
75629 +               if (ret)
75630 +                       return ret;
75631 +               /* step @cur over the @len jnodes which were just submitted */
75632 +               while ((len--) > 0) {
75633 +                       assert("zam-604",
75634 +                              ch->overwrite_set != &cur->capture_link);
75635 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75636 +               }
75637 +       }
75638 +
75639 +       return 0;
75640 +}
75641 +
75642 +/* Allocate ch->tx_size blocks for the tx head and wander records, link their
75643 +   jnodes on ch->tx_list, fill and submit them; returns 0 or an error code */
75644 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
75645 +{
75646 +       reiser4_blocknr_hint hint;
75647 +       reiser4_block_nr allocated = 0;
75648 +       reiser4_block_nr first, len;
75649 +       jnode *cur;
75650 +       jnode *txhead;
75651 +       int ret;
75652 +       reiser4_context *ctx;
75653 +       reiser4_super_info_data *sbinfo;
75654 +
75655 +       assert("zam-698", ch->tx_size > 0);
75656 +       assert("zam-699", list_empty_careful(&ch->tx_list));
75657 +
75658 +       ctx = get_current_context();
75659 +       sbinfo = get_super_private(ctx->super);
75660 +
75661 +       while (allocated < (unsigned)ch->tx_size) {
75662 +               len = (ch->tx_size - allocated);
75663 +
75664 +               reiser4_blocknr_hint_init(&hint);
75665 +
75666 +               hint.block_stage = BLOCK_GRABBED;
75667 +
75668 +               /* FIXME: there should be some block allocation policy for
75669 +                  nodes which contain wander records */
75670 +
75671 +               /* We assume that disk space for wandered record blocks can be
75672 +                * taken from reserved area. */
75673 +               ret = reiser4_alloc_blocks(&hint, &first, &len,
75674 +                                          BA_FORMATTED | BA_RESERVED |
75675 +                                          BA_USE_DEFAULT_SEARCH_START);
75676 +               reiser4_blocknr_hint_done(&hint);
75677 +
75678 +               if (ret)
75679 +                       return ret;
75680 +
75681 +               allocated += len;
75682 +
75683 +               /* create jnodes for all wander records */
75684 +               while (len--) {
75685 +                       cur = reiser4_alloc_io_head(&first);
75686 +
75687 +                       if (cur == NULL) {
75688 +                               ret = RETERR(-ENOMEM);
75689 +                               goto free_not_assigned;
75690 +                       }
75691 +
75692 +                       ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
75693 +
75694 +                       if (ret != 0) {
75695 +                               jfree(cur);
75696 +                               goto free_not_assigned;
75697 +                       }
75698 +
75699 +                       pin_jnode_data(cur);
75700 +
75701 +                       list_add_tail(&cur->capture_link, &ch->tx_list);
75702 +
75703 +                       first++;
75704 +               }
75705 +       }
75706 +
75707 +       { /* format a on-disk linked list of wander records */
75708 +               int serial = 1;
75709 +
75710 +               txhead = list_entry(ch->tx_list.next, jnode, capture_link);
75711 +               format_tx_head(ch);
75712 +
75713 +               cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75714 +               while (&ch->tx_list != &cur->capture_link) {
75715 +                       format_wander_record(ch, cur, serial++);
75716 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75717 +               }
75718 +       }
75719 +
75720 +       { /* Fill wander records with Wandered Set */
75721 +               struct store_wmap_params params;
75722 +               txn_atom *atom;
75723 +
75724 +               params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
75725 +
75726 +               params.idx = 0;
75727 +               params.capacity =
75728 +                   wander_record_capacity(reiser4_get_current_sb());
75729 +
75730 +               atom = get_current_atom_locked();
75731 +               blocknr_set_iterator(atom, &atom->wandered_map,
75732 +                                    &store_wmap_actor, &params, 0);
75733 +               spin_unlock_atom(atom);
75734 +       }
75735 +
75736 +       { /* relse all jnodes from tx_list */
75737 +               cur = list_entry(ch->tx_list.next, jnode, capture_link);
75738 +               while (&ch->tx_list != &cur->capture_link) {
75739 +                       jrelse(cur);
75740 +                       cur = list_entry(cur->capture_link.next, jnode, capture_link);
75741 +               }
75742 +       }
75743 +
75744 +       ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
75745 +
75746 +       return ret;
75747 +
75748 +      free_not_assigned:
75749 +       /* Deallocate blocks not yet assigned to jnodes on tx_list: @first and
75750 +          the @len blocks after it. The caller invalidates the tx list. */
75751 +       len++; /* "while (len--)" has already counted off the block @first */
75752 +       reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
75753 +       return ret;
75754 +}
75755 +/* write wandered blocks and tx records, then update the journal header */
75756 +static int commit_tx(struct commit_handle *ch)
75757 +{
75758 +       flush_queue_t *fq;
75759 +       int barrier;
75760 +       int ret;
75761 +
75762 +       /* Grab more space for wandered records. */
75763 +       ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
75764 +       if (ret)
75765 +               return ret;
75766 +
75767 +       fq = get_fq_for_current_atom();
75768 +       if (IS_ERR(fq))
75769 +               return PTR_ERR(fq);
75770 +
75771 +       spin_unlock_atom(fq->atom);
75772 +       do { /* single pass: "break" skips straight to reiser4_fq_put() */
75773 +               ret = alloc_wandered_blocks(ch, fq);
75774 +               if (ret)
75775 +                       break;
75776 +               ret = alloc_tx(ch, fq);
75777 +               if (ret)
75778 +                       break;
75779 +       } while (0);
75780 +
75781 +       reiser4_fq_put(fq);
75782 +       if (ret)
75783 +               return ret;
75784 + repeat_wo_barrier:
75785 +       barrier = reiser4_use_write_barrier(ch->super);
75786 +       if (!barrier) {
75787 +               ret = current_atom_finish_all_fq(); /* before the header update */
75788 +               if (ret)
75789 +                       return ret;
75790 +       }
75791 +       ret = update_journal_header(ch, barrier);
75792 +       if (barrier) {
75793 +               if (ret) {
75794 +                       if (ret == -EOPNOTSUPP) { /* retry without barriers */
75795 +                               disable_write_barrier(ch->super);
75796 +                               goto repeat_wo_barrier;
75797 +                       }
75798 +                       return ret;
75799 +               }
75800 +               ret = current_atom_finish_all_fq(); /* after the header update */
75801 +       }
75802 +       return ret;
75803 +}
75804 +/* write the overwrite set in place, then update the journal footer */
75805 +static int write_tx_back(struct commit_handle * ch)
75806 +{
75807 +       flush_queue_t *fq;
75808 +       int ret;
75809 +       int barrier;
75810 +
75811 +       reiser4_post_commit_hook();
75812 +       fq = get_fq_for_current_atom();
75813 +       if (IS_ERR(fq))
75814 +               return  PTR_ERR(fq);
75815 +       spin_unlock_atom(fq->atom);
75816 +       ret = write_jnode_list(
75817 +               ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
75818 +       reiser4_fq_put(fq);
75819 +       if (ret)
75820 +               return ret;
75821 + repeat_wo_barrier:
75822 +       barrier = reiser4_use_write_barrier(ch->super);
75823 +       if (!barrier) {
75824 +               ret = current_atom_finish_all_fq(); /* before the footer update */
75825 +               if (ret)
75826 +                       return ret;
75827 +       }
75828 +       ret = update_journal_footer(ch, barrier);
75829 +       if (barrier) {
75830 +               if (ret) {
75831 +                       if (ret == -EOPNOTSUPP) { /* retry without barriers */
75832 +                               disable_write_barrier(ch->super);
75833 +                               goto repeat_wo_barrier;
75834 +                       }
75835 +                       return ret;
75836 +               }
75837 +               ret = current_atom_finish_all_fq(); /* after the footer update */
75838 +       }
75839 +       if (ret)
75840 +               return ret;
75841 +       reiser4_post_write_back_hook(); /* reached on success only */
75842 +       return 0;
75843 +}
75844 +
75845 +/* Commit the current atom using wandered logs. We assume that at this moment
75846 +   all captured blocks are marked as RELOC or WANDER (belong to Relocate or
75847 +   Overwrite set), all nodes from Relocate set are submitted to write.
75848 +   Returns 0 or an error code; writeout mode is left on every exit path. */
75849 +
75850 +int reiser4_write_logs(long *nr_submitted)
75851 +{
75852 +       txn_atom *atom;
75853 +       struct super_block *super = reiser4_get_current_sb();
75854 +       reiser4_super_info_data *sbinfo = get_super_private(super);
75855 +       struct commit_handle ch;
75856 +       int ret;
75857 +
75858 +       writeout_mode_enable();
75859 +
75860 +       /* block allocator may add j-nodes to the clean_list */
75861 +       ret = reiser4_pre_commit_hook();
75862 +       if (ret)
75863 +               goto out_disable; /* do not return with writeout mode on */
75864 +
75865 +       /* No locks are required if we take atom which stage >=
75866 +        * ASTAGE_PRE_COMMIT */
75867 +       atom = get_current_context()->trans->atom;
75868 +       assert("zam-965", atom != NULL);
75869 +
75870 +       /* relocate set is on the atom->clean_nodes list after
75871 +        * current_atom_complete_writes() finishes. It can be safely
75872 +        * uncaptured after commit_mutex is locked, because any atom that
75873 +        * captures these nodes is guaranteed to commit after current one.
75874 +        *
75875 +        * This can only be done after reiser4_pre_commit_hook(), because it is where
75876 +        * early flushed jnodes with CREATED bit are transferred to the
75877 +        * overwrite list. */
75878 +       reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
75879 +       spin_lock_atom(atom);
75880 +       /* There might be waiters for the relocate nodes which we have
75881 +        * released, wake them up. */
75882 +       reiser4_atom_send_event(atom);
75883 +       spin_unlock_atom(atom);
75884 +
75885 +       if (REISER4_DEBUG) {
75886 +               int level;
75887 +
75888 +               for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
75889 +                       assert("nikita-3352",
75890 +                              list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
75891 +       }
75892 +
75893 +       sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
75894 +       sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
75895 +
75896 +       init_commit_handle(&ch, atom);
75897 +
75898 +       ch.free_blocks = sbinfo->blocks_free_committed;
75899 +       ch.nr_files = sbinfo->nr_files_committed;
75900 +       /* ZAM-FIXME-HANS: email me what the contention level is for the super
75901 +        * lock. */
75902 +       ch.next_oid = oid_next(super);
75903 +
75904 +       /* count overwrite set and place it in a separate list */
75905 +       ret = get_overwrite_set(&ch);
75906 +
75907 +       if (ret <= 0) {
75908 +               /* It is possible that overwrite set is empty here, it means
75909 +                  all captured nodes are clean */
75910 +               goto up_and_ret;
75911 +       }
75912 +
75913 +       /* Inform the caller about what number of dirty pages will be
75914 +        * submitted to disk. */
75915 +       *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
75916 +
75917 +       /* count all records needed for storing of the wandered set */
75918 +       get_tx_size(&ch);
75919 +
75920 +       ret = commit_tx(&ch);
75921 +       if (ret)
75922 +               goto up_and_ret;
75923 +
75924 +       spin_lock_atom(atom);
75925 +       reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
75926 +       spin_unlock_atom(atom);
75927 +
75928 +       ret = write_tx_back(&ch);
75929 +       reiser4_post_write_back_hook(); /* NOTE(review): write_tx_back() calls it too */
75930 +
75931 +      up_and_ret:
75932 +       if (ret) {
75933 +               /* there could be fq attached to current atom; the only way to
75934 +                  remove them is: */
75935 +               current_atom_finish_all_fq();
75936 +       }
75937 +
75938 +       /* free blocks of flushed transaction */
75939 +       dealloc_tx_list(&ch);
75940 +       dealloc_wmap(&ch);
75941 +
75942 +       put_overwrite_set(&ch);
75943 +
75944 +       done_commit_handle(&ch);
75945 +      out_disable:
75946 +       writeout_mode_disable();
75947 +
75948 +       return ret;
75949 +}
75950 +
75951 +/* Consistency checks for journal data/control blocks: header, footer, log
75952 +   records, transaction head blocks. All functions return zero on success. */
75953 +
75954 +static int check_journal_header(const jnode * node UNUSED_ARG)
75955 +{
75956 +       /* FIXME: journal header has no magic field yet, nothing is checked. */
75957 +       return 0;
75958 +}
75959 +
75960 +/* wait for writeback of all jnode pages on @head; returns nr of pages in error */
75961 +static int wait_on_jnode_list(struct list_head *head)
75962 +{
75963 +       jnode *scan;
75964 +       int ret = 0; /* number of pages seen with PageError set */
75965 +
75966 +       list_for_each_entry(scan, head, capture_link) {
75967 +               struct page *pg = jnode_page(scan);
75968 +
75969 +               if (pg) { /* jnodes without a page are skipped */
75970 +                       if (PageWriteback(pg))
75971 +                               wait_on_page_writeback(pg);
75972 +
75973 +                       if (PageError(pg))
75974 +                               ret++;
75975 +               }
75976 +       }
75977 +
75978 +       return ret;
75979 +}
75980 +
75981 +static int check_journal_footer(const jnode * node UNUSED_ARG)
75982 +{
75983 +       /* FIXME: journal footer has no magic field yet, so nothing is checked. */
75984 +       return 0;
75985 +}
75986 +
75987 +static int check_tx_head(const jnode * node)
75988 +{
75989 +       struct tx_header *txh = (struct tx_header *)jdata(node);
75990 +
75991 +       /* a valid transaction head starts with TX_HEADER_MAGIC */
75992 +       if (!memcmp(&txh->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE))
75993 +               return 0;
75994 +
75995 +       warning("zam-627", "tx head at block %s corrupted\n",
75996 +               sprint_address(jnode_get_block(node)));
75997 +       return RETERR(-EIO);
75998 +}
75999 +
76000 +static int check_wander_record(const jnode * node)
76001 +{
76002 +       struct wander_record_header *rec;
76003 +
76004 +       rec = (struct wander_record_header *)jdata(node);
76005 +
76006 +       /* a valid wander record starts with WANDER_RECORD_MAGIC */
76007 +       if (!memcmp(&rec->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE))
76008 +               return 0;
76009 +
76010 +       warning("zam-628", "wander record at block %s corrupted\n",
76011 +               sprint_address(jnode_get_block(node)));
76012 +       return RETERR(-EIO);
76013 +}
76014 +
76015 +/* Fill @ch with what update_journal_footer() needs: the counters logged in
76016 +   the tx head block; @tx_head itself gets linked into ch->tx_list. */
76017 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
76018 +{
76019 +       struct tx_header *txh;
76020 +       int err = jload(tx_head);
76021 +
76022 +       if (err)
76023 +               return err;
76024 +
76025 +       /* copy the logged counters out while the tx head is loaded */
76026 +       txh = (struct tx_header *)jdata(tx_head);
76027 +       ch->free_blocks = le64_to_cpu(get_unaligned(&txh->free_blocks));
76028 +       ch->nr_files = le64_to_cpu(get_unaligned(&txh->nr_files));
76029 +       ch->next_oid = le64_to_cpu(get_unaligned(&txh->next_oid));
76030 +       jrelse(tx_head);
76031 +
76032 +       /* link the tx head into the handle's list of transaction blocks */
76033 +       list_add(&tx_head->capture_link, &ch->tx_list);
76034 +
76035 +       return 0;
76036 +}
76037 +
76038 +/* Replay one transaction: rebuild the overwrite set from the wander records
76039 +   chained from *@log_rec_block_p up to *@end_block, write it in place and
76040 +   update the journal footer.  @nr_wander_records is the record count taken
76041 +   from the tx head; a mismatch with the chain length yields -EIO.  Returns 0
76042 +   or a negative error code.  Once @tx_head is linked to the on-stack commit
76043 +   handle every exit goes through free_ow_set, which undoes that linkage. */
76044 +static int replay_transaction(const struct super_block *s,
76045 +                             jnode * tx_head,
76046 +                             const reiser4_block_nr * log_rec_block_p,
76047 +                             const reiser4_block_nr * end_block,
76048 +                             unsigned int nr_wander_records)
76049 +{
76050 +       reiser4_block_nr log_rec_block = *log_rec_block_p;
76051 +       struct commit_handle ch;
76052 +       LIST_HEAD(overwrite_set);
76053 +       jnode *log;
76054 +       int ret;
76055 +
76056 +       init_commit_handle(&ch, NULL);
76057 +       ch.overwrite_set = &overwrite_set;
76058 +
76059 +       ret = restore_commit_handle(&ch, tx_head);
76060 +       if (ret) {
76061 +               /* tx_head was not linked to ch.tx_list; nothing to undo */
76062 +               done_commit_handle(&ch);
76063 +               return ret;
76064 +       }
76065 +
76066 +       while (log_rec_block != *end_block) {
76067 +               struct wander_record_header *header;
76068 +               struct wander_entry *entry;
76069 +               int i;
76070 +
76071 +               if (nr_wander_records == 0) {
76072 +                       warning("zam-631",
76073 +                               "number of wander records in the linked list"
76074 +                               " greater than number stored in tx head.\n");
76075 +                       ret = RETERR(-EIO);
76076 +                       goto free_ow_set;
76077 +               }
76078 +
76079 +               log = reiser4_alloc_io_head(&log_rec_block);
76080 +               if (log == NULL) {
76081 +                       ret = RETERR(-ENOMEM);
76082 +                       goto free_ow_set;
76083 +               }
76084 +
76085 +               ret = jload(log);
76086 +               if (ret < 0) {
76087 +                       reiser4_drop_io_head(log);
76088 +                       goto free_ow_set;
76089 +               }
76090 +
76091 +               ret = check_wander_record(log);
76092 +               if (ret) {
76093 +                       jrelse(log);
76094 +                       reiser4_drop_io_head(log);
76095 +                       goto free_ow_set;
76096 +               }
76097 +
76098 +               header = (struct wander_record_header *)jdata(log);
76099 +               log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
76100 +               entry = (struct wander_entry *)(header + 1);
76101 +
76102 +               /* restore overwrite set from wander record content */
76103 +               for (i = 0; i < wander_record_capacity(s); i++) {
76104 +                       reiser4_block_nr block;
76105 +                       jnode *node;
76106 +
76107 +                       block = le64_to_cpu(get_unaligned(&entry->wandered));
76108 +                       if (block == 0)
76109 +                               break;
76110 +
76111 +                       node = reiser4_alloc_io_head(&block);
76112 +                       if (node == NULL) {
76113 +                               ret = RETERR(-ENOMEM);
76114 +                               /* drop the wander record before bailing out */
76115 +                               jrelse(log);
76116 +                               reiser4_drop_io_head(log);
76117 +                               goto free_ow_set;
76118 +                       }
76119 +
76120 +                       ret = jload(node);
76121 +                       if (ret < 0) {
76122 +                               reiser4_drop_io_head(node);
76123 +                               /* drop the wander record before bailing out */
76124 +                               jrelse(log);
76125 +                               reiser4_drop_io_head(log);
76126 +                               goto free_ow_set;
76127 +                       }
76128 +
76129 +                       block = le64_to_cpu(get_unaligned(&entry->original));
76130 +                       assert("zam-603", block != 0);
76131 +                       jnode_set_block(node, &block);
76132 +
76133 +                       list_add_tail(&node->capture_link, ch.overwrite_set);
76134 +
76135 +                       ++entry;
76136 +               }
76137 +
76138 +               jrelse(log);
76139 +               reiser4_drop_io_head(log);
76140 +
76141 +               --nr_wander_records;
76142 +       }
76143 +
76144 +       if (nr_wander_records != 0) {
76145 +               warning("zam-632", "number of wander records in the linked list"
76146 +                       " less than number stored in tx head.\n");
76147 +               ret = RETERR(-EIO);
76148 +               goto free_ow_set;
76149 +       }
76150 +
76151 +       /* write wandered set in place */
76152 +       write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
76153 +       ret = wait_on_jnode_list(ch.overwrite_set);
76154 +       if (ret) {
76155 +               ret = RETERR(-EIO);
76156 +               goto free_ow_set;
76157 +       }
76158 +
76159 +       ret = update_journal_footer(&ch, 0);
76160 +
76161 +      free_ow_set:
76162 +       while (!list_empty(ch.overwrite_set)) {
76163 +               jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
76164 +               list_del_init(&cur->capture_link);
76165 +               jrelse(cur);
76166 +               reiser4_drop_io_head(cur);
76167 +       }
76168 +
76169 +       list_del_init(&tx_head->capture_link);
76170 +
76171 +       done_commit_handle(&ch);
76172 +
76173 +       return ret;
76174 +}
76175 +
76176 +/* Find the oldest committed but not yet replayed transaction and replay it.
76177 + * Such a transaction was committed and the journal header block was updated,
76178 + * but writing the atom's overwrite set in place and updating the journal
76179 + * footer block were not completed. This function finishes that work: it reads
76180 + * the overwrite set from its wandered locations, writes it in place and updates
76181 + * the footer. Returns 0 when nothing is left, -E_REPEAT after one replay. */
76182 +static int replay_oldest_transaction(struct super_block *s)
76183 +{
76184 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76185 +       jnode *jf = sbinfo->journal_footer;
76186 +       unsigned int total;
76187 +       struct journal_footer *F;
76188 +       struct tx_header *T;
76189 +
76190 +       reiser4_block_nr prev_tx;
76191 +       reiser4_block_nr last_flushed_tx;
76192 +       reiser4_block_nr log_rec_block = 0;
76193 +
76194 +       jnode *tx_head;
76195 +
76196 +       int ret;
76197 +
76198 +       if ((ret = jload(jf)) < 0)
76199 +               return ret;
76200 +
76201 +       F = (struct journal_footer *)jdata(jf);
76202 +
76203 +       last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
76204 +
76205 +       jrelse(jf);
76206 +
76207 +       if (sbinfo->last_committed_tx == last_flushed_tx) {
76208 +               /* last committed tx is already flushed: nothing to replay */
76209 +               return 0;
76210 +       }
76211 +
76212 +       prev_tx = sbinfo->last_committed_tx;
76213 +
76214 +       /* follow prev_tx links back to the oldest transaction not yet flushed */
76215 +       while (1) {
76216 +               tx_head = reiser4_alloc_io_head(&prev_tx);
76217 +               if (!tx_head)
76218 +                       return RETERR(-ENOMEM);
76219 +
76220 +               ret = jload(tx_head);
76221 +               if (ret < 0) {
76222 +                       reiser4_drop_io_head(tx_head);
76223 +                       return ret;
76224 +               }
76225 +
76226 +               ret = check_tx_head(tx_head);
76227 +               if (ret) {
76228 +                       jrelse(tx_head);
76229 +                       reiser4_drop_io_head(tx_head);
76230 +                       return ret;
76231 +               }
76232 +
76233 +               T = (struct tx_header *)jdata(tx_head);
76234 +
76235 +               prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
76236 +
76237 +               if (prev_tx == last_flushed_tx)
76238 +                       break;  /* tx_head is still loaded; released below */
76239 +
76240 +               jrelse(tx_head);
76241 +               reiser4_drop_io_head(tx_head);
76242 +       }
76243 +
76244 +       total = le32_to_cpu(get_unaligned(&T->total));
76245 +       log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
76246 +
76247 +       pin_jnode_data(tx_head);
76248 +       jrelse(tx_head);
76249 +
76250 +       ret =
76251 +           replay_transaction(s, tx_head, &log_rec_block,
76252 +                              jnode_get_block(tx_head), total - 1); /* total counts the tx head too */
76253 +
76254 +       unpin_jnode_data(tx_head);
76255 +       reiser4_drop_io_head(tx_head);
76256 +
76257 +       if (ret)
76258 +               return ret;
76259 +       return -E_REPEAT;       /* one transaction replayed; caller calls again */
76260 +}
76261 +
76262 +/* The current reiser4 journal implementation is optimized not to capture
76263 +   the super block when only certain super block fields are modified.
76264 +   Currently this set is (<free block count>, <OID allocator>). These fields
76265 +   are logged in a special way: they are stored in each transaction head block
76266 +   at atom commit time and written to the journal footer block at atom flush
76267 +   time.  To get this info from the journal footer block into the in-memory
76268 +   super block there is a special function,
76269 +   reiser4_journal_recover_sb_data(), which should be called after the disk
76270 +   format plugin re-reads the super block after journal replaying.
76271 +*/
76272 +
76273 +/* Load what the journal footer logged (free block counter and OID allocator
76274 +   state) into the in-memory super block.  Returns 0 or an error code. */
76275 +int reiser4_journal_recover_sb_data(struct super_block *s)
76276 +{
76277 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76278 +       struct journal_footer *footer;
76279 +       int err;
76280 +
76281 +       assert("zam-673", sbinfo->journal_footer != NULL);
76282 +
76283 +       err = jload(sbinfo->journal_footer);
76284 +       if (err != 0)
76285 +               return err;
76286 +
76287 +       err = check_journal_footer(sbinfo->journal_footer);
76288 +       if (err == 0) {
76289 +               footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
76290 +
76291 +               /* a zero last_flushed_tx means no transaction was flushed
76292 +                  yet, so the footer holds nothing to restore */
76293 +               if (footer->last_flushed_tx) {
76294 +                       /* free block counter logged in this transaction */
76295 +                       reiser4_set_free_blocks(s,
76296 +                               le64_to_cpu(get_unaligned(&footer->free_blocks)));
76297 +                       /* oid allocator state */
76298 +                       oid_init_allocator(s,
76299 +                               le64_to_cpu(get_unaligned(&footer->nr_files)),
76300 +                               le64_to_cpu(get_unaligned(&footer->next_oid)));
76301 +               }
76302 +       }
76303 +
76304 +       jrelse(sbinfo->journal_footer);
76305 +       return err;
76306 +}
76307 +
76308 +/* Replay the journal: validate journal footer and header, record the last
76309 +   committed transaction in the in-memory super block, then replay committed
76310 +   transactions until none is left.  Returns 0 or a negative error code. */
76311 +int reiser4_journal_replay(struct super_block *s)
76312 +{
76313 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76314 +       jnode *jh, *jf;
76315 +       struct journal_header *header;
76316 +       int ret;
76317 +
76318 +       assert("zam-582", sbinfo != NULL);
76319 +
76320 +       jh = sbinfo->journal_header;
76321 +       jf = sbinfo->journal_footer;
76322 +
76323 +       if (!jh || !jf) {
76324 +               /* it is possible that disk layout does not support journal
76325 +                  structures, we just warn about this */
76326 +               warning("zam-583",
76327 +                       "journal control blocks were not loaded by disk layout plugin.  "
76328 +                       "journal replaying is not possible.\n");
76329 +               return 0;
76330 +       }
76331 +
76332 +       /* make sure the journal footer block can be read and passes its
76333 +          consistency check before anything is replayed */
76334 +       ret = jload(jf);
76335 +       if (ret < 0)
76336 +               return ret;
76337 +
76338 +       ret = check_journal_footer(jf);
76339 +       jrelse(jf);
76340 +       if (ret)
76341 +               return ret;
76342 +
76343 +       /* store last committed transaction info in reiser4 in-memory super
76344 +          block */
76345 +       ret = jload(jh);
76346 +       if (ret < 0)
76347 +               return ret;
76348 +
76349 +       ret = check_journal_header(jh);
76350 +       if (ret) {
76351 +               jrelse(jh);
76352 +               return ret;
76353 +       }
76354 +
76355 +       header = (struct journal_header *)jdata(jh);
76356 +       sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
76357 +
76358 +       jrelse(jh);
76359 +
76360 +       /* replay committed transactions one at a time; -E_REPEAT means one
76361 +          was replayed and more may be pending */
76362 +       do {
76363 +               ret = replay_oldest_transaction(s);
76364 +       } while (ret == -E_REPEAT);
76365 +
76366 +       return ret;
76367 +}
76368 +
76369 +/* Allocate an io-head jnode for journal control block @block (header or
76370 +   footer), load it and pin its data.  On failure *@node is left NULL. */
76371 +static int
76372 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
76373 +{
76374 +       jnode *cb = reiser4_alloc_io_head(block);
76375 +       int err;
76376 +
76377 +       *node = cb;
76378 +       if (cb == NULL)
76379 +               return RETERR(-ENOMEM);
76380 +
76381 +       err = jload(cb);
76382 +       if (err == 0) {
76383 +               pin_jnode_data(cb);
76384 +               jrelse(cb);
76385 +               return 0;
76386 +       }
76387 +
76388 +       reiser4_drop_io_head(cb);
76389 +       *node = NULL;
76390 +       return err;
76391 +}
76392 +
76393 +/* unpin and free a journal header/footer jnode; no-op when *@node is NULL */
76394 +static void unload_journal_control_block(jnode ** node)
76395 +{
76396 +       if (*node == NULL)
76397 +               return;
76398 +       unpin_jnode_data(*node);
76399 +       reiser4_drop_io_head(*node);
76400 +       *node = NULL;
76401 +}
76402 +
76403 +/* Release the journal header and footer control blocks of super block @s. */
76404 +void reiser4_done_journal_info(struct super_block *s)
76405 +{
76406 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76407 +
76408 +       assert("zam-476", sbinfo != NULL);
76409 +
76410 +       unload_journal_control_block(&sbinfo->journal_header);
76411 +       unload_journal_control_block(&sbinfo->journal_footer);
76412 +       rcu_barrier();  /* NOTE(review): presumably waits for RCU-deferred jnode frees - confirm */
76413 +}
76414 +
76415 +/* Load the journal header and footer control blocks named by sbinfo->jloc.
76416 +   If the footer cannot be loaded the header is released again, so on error
76417 +   neither block stays loaded.  Returns 0 or a negative error code. */
76418 +int reiser4_init_journal_info(struct super_block *s)
76419 +{
76420 +       reiser4_super_info_data *sbinfo = get_super_private(s);
76421 +       journal_location *loc = &sbinfo->jloc;
76422 +       int err;
76423 +
76424 +       /* both block locations must be known by now */
76425 +       assert("zam-651", loc != NULL);
76426 +       assert("zam-652", loc->header != 0);
76427 +       assert("zam-653", loc->footer != 0);
76428 +
76429 +       err = load_journal_control_block(&sbinfo->journal_header, &loc->header);
76430 +       if (err)
76431 +               return err;
76432 +
76433 +       err = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
76434 +       if (err) {
76435 +               /* roll back the header load */
76436 +               unload_journal_control_block(&sbinfo->journal_header);
76437 +       }
76438 +
76439 +       return err;
76440 +}
76441 +
76442 +/* Make Linus happy.
76443 +   Local variables:
76444 +   c-indentation-style: "K&R"
76445 +   mode-name: "LC"
76446 +   c-basic-offset: 8
76447 +   tab-width: 8
76448 +   fill-column: 80
76449 +   End:
76450 +*/
76451 diff -urN linux-2.6.27.orig/fs/reiser4/wander.h linux-2.6.27/fs/reiser4/wander.h
76452 --- linux-2.6.27.orig/fs/reiser4/wander.h       1970-01-01 03:00:00.000000000 +0300
76453 +++ linux-2.6.27/fs/reiser4/wander.h    2008-10-12 18:20:01.000000000 +0400
76454 @@ -0,0 +1,135 @@
76455 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
76456 +
76457 +#if !defined (__FS_REISER4_WANDER_H__)
76458 +#define __FS_REISER4_WANDER_H__
76459 +
76460 +#include "dformat.h"
76461 +
76462 +#include <linux/fs.h>          /* for struct super_block  */
76463 +
76464 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES   */
76465 +
76466 +#define TX_HEADER_MAGIC  "TxMagic4"
76467 +#define WANDER_RECORD_MAGIC "LogMagc4"
76468 +
76469 +#define TX_HEADER_MAGIC_SIZE  (8)
76470 +#define WANDER_RECORD_MAGIC_SIZE (8)
76471 +
76472 +/* journal header block format */
76473 +struct journal_header {
76474 +       /* location (block number) of the last written transaction head */
76475 +       d64 last_committed_tx;
76476 +};
76477 +
76478 +typedef struct journal_location {
76479 +       reiser4_block_nr footer;        /* block number of the journal footer */
76480 +       reiser4_block_nr header;        /* block number of the journal header */
76481 +} journal_location;
76482 +
76483 +/* The wander.c head comment describes usage and semantics of all these structures */
76484 +/* journal footer block format */
76485 +struct journal_footer {
76486 +       /* last flushed transaction location. */
76487 +       /* This block number is no longer valid once the transaction it points
76488 +          to gets flushed; it is used only at journal replay time to detect
76489 +          the end of the on-disk list of committed transactions which were
76490 +          not flushed completely */
76491 +       d64 last_flushed_tx;
76492 +
76493 +       /* the free block counter is written to the journal footer at
76494 +          transaction flush time, not to the super block, because it is logged
76495 +          in a different way than super block fields (the root pointer, for
76496 +          example). */
76497 +       d64 free_blocks;
76498 +
76499 +       /* number of used OIDs and maximal used OID are logged separately from
76500 +          the super block */
76501 +       d64 nr_files;
76502 +       d64 next_oid;
76503 +};
76504 +
76505 +/* Each wander record (except the first one) has a unified format: a wander
76506 +   record header followed by an array of log entries */
76507 +struct wander_record_header {
76508 +       /* when there is no predefined location for wander records, this magic
76509 +          string should help reiser4fsck. */
76510 +       char magic[WANDER_RECORD_MAGIC_SIZE];
76511 +
76512 +       /* transaction id */
76513 +       d64 id;
76514 +
76515 +       /* total number of wander records in the current transaction */
76516 +       d32 total;
76517 +
76518 +       /* number of this block within the transaction */
76519 +       d32 serial;
76520 +
76521 +       /* block number of the next record in the chain, as followed by replay */
76522 +       d64 next_block;
76523 +};
76524 +
76525 +/* The first wander record (transaction head) of a written transaction has a
76526 +   special format */
76527 +struct tx_header {
76528 +       /* magic string makes the first block of a transaction different from
76529 +          other logged blocks, it should help fsck. */
76530 +       char magic[TX_HEADER_MAGIC_SIZE];
76531 +
76532 +       /* transaction id */
76533 +       d64 id;
76534 +
76535 +       /* total number of records (including this first tx head) in the
76536 +          transaction */
76537 +       d32 total;
76538 +
76539 +       /* aligns the next field to an 8-byte boundary; this field is always zero */
76540 +       d32 padding;
76541 +
76542 +       /* block number of the previous transaction head */
76543 +       d64 prev_tx;
76544 +
76545 +       /* location of the next wander record */
76546 +       d64 next_block;
76547 +
76548 +       /* committed version of the free blocks counter */
76549 +       d64 free_blocks;
76550 +
76551 +       /* number of used OIDs (nr_files) and maximal used OID are logged
76552 +          separately from the super block */
76553 +       d64 nr_files;
76554 +       d64 next_oid;
76555 +};
76556 +
76557 +/* A transaction gets written to disk as a set of wander records (each wander
76558 +   record has the size of one fs block) */
76559 +
76560 +/* As said above, the rest of a wander record (after its header) is filled by
76561 +   these log entries; unused space is filled by zeroes */
76562 +struct wander_entry {
76563 +       d64 original;           /* block original location */
76564 +       d64 wandered;           /* block wandered location */
76565 +};
76566 +
76567 +/* REISER4 JOURNAL WRITER FUNCTIONS   */
76568 +
76569 +extern int reiser4_write_logs(long *);
76570 +extern int reiser4_journal_replay(struct super_block *);
76571 +extern int reiser4_journal_recover_sb_data(struct super_block *);
76572 +
76573 +extern int reiser4_init_journal_info(struct super_block *);
76574 +extern void reiser4_done_journal_info(struct super_block *);
76575 +
76576 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
76577 +
76578 +#endif                         /* __FS_REISER4_WANDER_H__ */
76579 +
76580 +/* Make Linus happy.
76581 +   Local variables:
76582 +   c-indentation-style: "K&R"
76583 +   mode-name: "LC"
76584 +   c-basic-offset: 8
76585 +   tab-width: 8
76586 +   fill-column: 80
76587 +   scroll-step: 1
76588 +   End:
76589 +*/
76590 diff -urN linux-2.6.27.orig/fs/reiser4/writeout.h linux-2.6.27/fs/reiser4/writeout.h
76591 --- linux-2.6.27.orig/fs/reiser4/writeout.h     1970-01-01 03:00:00.000000000 +0300
76592 +++ linux-2.6.27/fs/reiser4/writeout.h  2008-10-12 18:20:01.000000000 +0400
76593 @@ -0,0 +1,21 @@
76594 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README  */
76595 +
76596 +#if !defined (__FS_REISER4_WRITEOUT_H__)
76597 +#define __FS_REISER4_WRITEOUT_H__      /* makes the include guard effective */
76598 +#define WRITEOUT_SINGLE_STREAM (0x1)
76599 +#define WRITEOUT_FOR_PAGE_RECLAIM  (0x2)
76600 +#define WRITEOUT_BARRIER (0x4)
76601 +
76602 +extern int reiser4_get_writeout_flags(void);
76603 +
76604 +#endif                         /* __FS_REISER4_WRITEOUT_H__ */
76605 +
76606 +/* Make Linus happy.
76607 +   Local variables:
76608 +   c-indentation-style: "K&R"
76609 +   mode-name: "LC"
76610 +   c-basic-offset: 8
76611 +   tab-width: 8
76612 +   fill-column: 80
76613 +   End:
76614 +*/
76615 diff -urN linux-2.6.27.orig/fs/reiser4/znode.c linux-2.6.27/fs/reiser4/znode.c
76616 --- linux-2.6.27.orig/fs/reiser4/znode.c        1970-01-01 03:00:00.000000000 +0300
76617 +++ linux-2.6.27/fs/reiser4/znode.c     2008-10-12 18:20:01.000000000 +0400
76618 @@ -0,0 +1,1029 @@
76619 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76620 + * reiser4/README */
76621 +/* Znode manipulation functions. */
76622 +/* Znode is the in-memory header for a tree node. It is stored
76623 +   separately from the node itself so that it does not get written to
76624 +   disk.  In this respect znode is like buffer head or page head. We
76625 +   also use znodes for additional reiser4 specific purposes:
76626 +
76627 +    . they are organized into tree structure which is a part of whole
76628 +      reiser4 tree.
76629 +    . they are used to implement node grained locking
76630 +    . they are used to keep additional state associated with a
76631 +      node
76632 +    . they contain links to lists used by the transaction manager
76633 +
76634 +   Znode is attached to some variable "block number" which is instance of
76635 +   fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
76636 +   appropriate node being actually loaded in memory. Existence of znode itself
76637 +   is regulated by reference count (->x_count) in it. Each time thread
76638 +   acquires reference to znode through call to zget(), ->x_count is
76639 +   incremented and decremented on call to zput().  Data (content of node) are
76640 +   brought in memory through call to zload(), which also increments ->d_count
76641 +   reference counter.  zload can block waiting on IO.  Call to zrelse()
76642 +   decreases this counter. Also, ->c_count keeps track of number of child
76643 +   znodes and prevents parent znode from being recycled until all of its
76644 +   children are. ->c_count is decremented whenever child goes out of existence
76645 +   (being actually recycled in zdestroy()) which can be some time after last
76646 +   reference to this child dies if we support some form of LRU cache for
76647 +   znodes.
76648 +
76649 +*/
76650 +/* EVERY ZNODE'S STORY
76651 +
76652 +   1. His infancy.
76653 +
76654 +   Once upon a time, the znode was born deep inside of zget() by call to
76655 +   zalloc(). At the return from zget() znode had:
76656 +
76657 +    . reference counter (x_count) of 1
76658 +    . assigned block number, marked as used in bitmap
76659 +    . pointer to parent znode. Root znode parent pointer points
76660 +      to its father: "fake" znode. This, in turn, has NULL parent pointer.
76661 +    . hash table linkage
76662 +    . no data loaded from disk
76663 +    . no node plugin
76664 +    . no sibling linkage
76665 +
76666 +   2. His childhood
76667 +
76668 +   Each node is either brought into memory as a result of tree traversal, or
76669 +   created afresh, creation of the root being a special case of the latter. In
76670 +   either case it's inserted into sibling list. This will typically require
76671 +   some ancillary tree traversing, but ultimately both sibling pointers will
76672 +   exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
76673 +   zjnode.state.
76674 +
76675 +   3. His youth.
76676 +
76677 +   If znode is bound to already existing node in a tree, its content is read
76678 +   from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
76679 +   in zjnode.state and zdata() function starts to return non null for this
76680 +   znode. zload() further calls zparse() that determines which node layout
76681 +   this node is rendered in, and sets ->nplug on success.
76682 +
76683 +   If znode is for new node just created, memory for it is allocated and
76684 +   zinit_new() function is called to initialise data, according to selected
76685 +   node layout.
76686 +
76687 +   4. His maturity.
76688 +
76689 +   After this point, znode lingers in memory for some time. Threads can
76690 +   acquire references to znode either by blocknr through call to zget(), or by
76691 +   following a pointer to unallocated znode from internal item. Each time
76692 +   reference to znode is obtained, x_count is increased. Thread can read/write
76693 +   lock znode. Znode data can be loaded through calls to zload(), d_count will
76694 +   be increased appropriately. If all references to znode are released
76695 +   (x_count drops to 0), znode is not recycled immediately. Rather, it is
76696 +   still cached in the hash table in the hope that it will be accessed
76697 +   shortly.
76698 +
76699 +   There are two ways in which znode existence can be terminated:
76700 +
76701 +    . sudden death: node bound to this znode is removed from the tree
76702 +    . overpopulation: znode is purged out of memory due to memory pressure
76703 +
76704 +   5. His death.
76705 +
76706 +   Death is complex process.
76707 +
76708 +   When we irrevocably commit ourselves to decision to remove node from the
76709 +   tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
76710 +   znode. This is done either in ->kill_hook() of internal item or in
76711 +   reiser4_kill_root() function when tree root is removed.
76712 +
76713 +   At this moment znode still has:
76714 +
76715 +    . locks held on it, necessary write ones
76716 +    . references to it
76717 +    . disk block assigned to it
76718 +    . data loaded from the disk
76719 +    . pending requests for lock
76720 +
76721 +   But once JNODE_HEARD_BANSHEE bit is set, last call to unlock_znode() does node
76722 +   deletion. Node deletion includes two phases. First all ways to get
76723 +   references to that znode (sibling and parent links and hash lookup using
76724 +   block number stored in parent node) should be deleted -- it is done through
76725 +   sibling_list_remove(), also we assume that nobody uses down link from
76726 +   parent node due to its nonexistence or proper parent node locking and
76727 +   nobody uses parent pointers from children due to absence of them. Second we
76728 +   invalidate all pending lock requests which still are on znode's lock
76729 +   request queue, this is done by reiser4_invalidate_lock(). Another
76730 +   JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
76731 +   Once it is set, all requesters are forced to return -EINVAL from
76732 +   longterm_lock_znode(). Future locking attempts are not possible because all
76733 +   ways to get references to that znode are removed already. Last, node is
76734 +   uncaptured from transaction.
76735 +
76736 +   When last reference to the dying znode is just about to be released,
76737 +   block number for this lock is released and znode is removed from the
76738 +   hash table.
76739 +
76740 +   Now znode can be recycled.
76741 +
76742 +   [it's possible to free bitmap block and remove znode from the hash
76743 +   table when last lock is released. This will result in having
76744 +   referenced but completely orphaned znode]
76745 +
76746 +   6. Limbo
76747 +
76748 +   As has been mentioned above, znodes with reference counter 0 are
76749 +   still cached in a hash table. Once memory pressure increases they are
76750 +   purged out of there [this requires something like LRU list for
76751 +   efficient implementation. LRU list would also greatly simplify
76752 +   implementation of coord cache that would in this case morph to just
76753 +   scanning some initial segment of LRU list]. Data loaded into
76754 +   unreferenced znode are flushed back to the durable storage if
76755 +   necessary and memory is freed. Znodes themselves can be recycled at
76756 +   this point too.
76757 +
76758 +*/
76759 +
76760 +#include "debug.h"
76761 +#include "dformat.h"
76762 +#include "key.h"
76763 +#include "coord.h"
76764 +#include "plugin/plugin_header.h"
76765 +#include "plugin/node/node.h"
76766 +#include "plugin/plugin.h"
76767 +#include "txnmgr.h"
76768 +#include "jnode.h"
76769 +#include "znode.h"
76770 +#include "block_alloc.h"
76771 +#include "tree.h"
76772 +#include "tree_walk.h"
76773 +#include "super.h"
76774 +#include "reiser4.h"
76775 +
76776 +#include <linux/pagemap.h>
76777 +#include <linux/spinlock.h>
76778 +#include <linux/slab.h>
76779 +#include <linux/err.h>
76780 +
76781 +static z_hash_table *get_htable(reiser4_tree *,
76782 +                               const reiser4_block_nr * const blocknr);
76783 +static z_hash_table *znode_get_htable(const znode *);
76784 +static void zdrop(znode *);
76785 +
76786 +/* hash table support */
76787 +
76788 +/* compare two block numbers for equality (non-zero iff *@b1 == *@b2). Used by hash-table macros */
76789 +static inline int
76790 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
76791 +{
76792 +       assert("nikita-534", b1 != NULL);
76793 +       assert("nikita-535", b2 != NULL);
76794 +
76795 +       return *b1 == *b2;
76796 +}
76797 +
76798 +/* Hash znode by block number (*@b masked with table size - 1; @table is unused). Used by hash-table macros */
76799 +/* Audited by: umka (2002.06.11) */
76800 +static inline __u32
76801 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
76802 +{
76803 +       assert("nikita-536", b != NULL);
76804 +
76805 +       return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
76806 +}
76807 +
76808 +/* The hash table definition; KMALLOC/KFREE are defined only around the macro expansion */
76809 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
76810 +#define KFREE(ptr, size) kfree(ptr)
76811 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
76812 +                     blknrhashfn, blknreq);
76813 +#undef KFREE
76814 +#undef KMALLOC
76815 +
76816 +/* slab for znodes; created by init_znodes(), destroyed by done_znodes() */
76817 +static struct kmem_cache *znode_cache;
76818 +
76819 +int znode_shift_order; /* set by init_znodes(): ceil(log2(sizeof(znode))) - 1 */
76820 +
76821 +/**
76822 + * init_znodes - create znode cache and compute znode_shift_order
76823 + *
76824 + * Initializes slab cache of znodes as part of reiser4 module initialization; returns 0 or -ENOMEM.
76825 + */
76826 +int init_znodes(void)
76827 +{
76828 +       znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
76829 +                                       SLAB_HWCACHE_ALIGN |
76830 +                                       SLAB_RECLAIM_ACCOUNT, NULL);
76831 +       if (znode_cache == NULL)
76832 +               return RETERR(-ENOMEM);
76833 +
76834 +       for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
76835 +            ++znode_shift_order); /* empty body: stops at first order covering sizeof(znode) */
76836 +       --znode_shift_order; /* then step back one order */
76837 +       return 0;
76838 +}
76839 +
76840 +/**
76841 + * done_znodes - delete znode cache
76842 + *
76843 + * Hands znode_cache to destroy_reiser4_cache(). This is called on reiser4 module unloading or system shutdown.
76844 + */
76845 +void done_znodes(void)
76846 +{
76847 +       destroy_reiser4_cache(&znode_cache);
76848 +}
76849 +
76850 +/* call this to initialise tree of znodes: dk_lock plus both hash tables; returns 0 or z_hash_init() error */
76851 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
76852 +{
76853 +       int result;
76854 +       assert("umka-050", tree != NULL);
76855 +
76856 +       rwlock_init(&tree->dk_lock);
76857 +
76858 +       result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76859 +       if (result != 0)
76860 +               return result;
76861 +       result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
76862 +       return result; /* NOTE(review): on failure here zhash_table stays initialised; confirm caller cleans up */
76863 +}
76864 +
76865 +/* free this znode back to znode_cache; the asserts list what must already be detached */
76866 +void zfree(znode * node /* znode to free */ )
76867 +{
76868 +       assert("nikita-465", node != NULL);
76869 +       assert("nikita-2120", znode_page(node) == NULL);
76870 +       assert("nikita-2301", list_empty_careful(&node->lock.owners));
76871 +       assert("nikita-2302", list_empty_careful(&node->lock.requestors));
76872 +       assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
76873 +                              NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
76874 +       assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
76875 +       assert("nikita-3293", !znode_is_right_connected(node));
76876 +       assert("nikita-3294", !znode_is_left_connected(node));
76877 +       assert("nikita-3295", node->left == NULL);
76878 +       assert("nikita-3296", node->right == NULL);
76879 +
76880 +       /* not yet phash_jnode_destroy(ZJNODE(node)); */
76881 +
76882 +       kmem_cache_free(znode_cache, node);
76883 +}
76884 +
76885 +/* call this to free tree of znodes */
76886 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
76887 +{
76888 +       znode *node;
76889 +       znode *next;
76890 +       z_hash_table *ztable;
76891 +
76892 +       /* scan both znode hash-tables (skipping one whose _table is NULL), kill
76893 +        * all znodes via zdrop(), then free hash tables themselves. */
76894 +
76895 +       assert("nikita-795", tree != NULL);
76896 +
76897 +       ztable = &tree->zhash_table;
76898 +
76899 +       if (ztable->_table != NULL) {
76900 +               for_all_in_htable(ztable, z, node, next) {
76901 +                       node->c_count = 0;
76902 +                       node->in_parent.node = NULL;
76903 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76904 +                       zdrop(node);
76905 +               }
76906 +
76907 +               z_hash_done(&tree->zhash_table);
76908 +       }
76909 +
76910 +       ztable = &tree->zfake_table;
76911 +
76912 +       if (ztable->_table != NULL) {
76913 +               for_all_in_htable(ztable, z, node, next) {
76914 +                       node->c_count = 0;
76915 +                       node->in_parent.node = NULL;
76916 +                       assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
76917 +                       zdrop(node);
76918 +               }
76919 +
76920 +               z_hash_done(&tree->zfake_table);
76921 +       }
76922 +}
76923 +
76924 +/* ZNODE STRUCTURES */
76925 +
76926 +/* allocate fresh znode from znode_cache; fields are set up separately by zinit() */
76927 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
76928 +{
76929 +       znode *node;
76930 +
76931 +       node = kmem_cache_alloc(znode_cache, gfp_flag);
76932 +       return node;
76933 +}
76934 +
76935 +/* Initialize fields of znode: zero it, then set up its jnode, lock and parent coord
76936 +   @node:    znode to initialize;
76937 +   @parent:  parent znode, recorded in @node->in_parent;
76938 +   @tree:    tree we are in. */
76939 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
76940 +{
76941 +       assert("nikita-466", node != NULL);
76942 +       assert("umka-268", current_tree != NULL);
76943 +
76944 +       memset(node, 0, sizeof *node);
76945 +
76946 +       assert("umka-051", tree != NULL);
76947 +
76948 +       jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
76949 +       reiser4_init_lock(&node->lock);
76950 +       init_parent_coord(&node->in_parent, parent);
76951 +}
76952 +
76953 +/*
76954 + * remove znode from indices. This is called by jput() when last reference on
76955 + * znode is released. The tree write lock must be held (asserted below).
76956 + */
76957 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
76958 +{
76959 +       assert("nikita-2108", node != NULL);
76960 +       assert("nikita-470", node->c_count == 0);
76961 +       assert_rw_write_locked(&(tree->tree_lock));
76962 +
76963 +       /* remove reference to this znode from cbk cache */
76964 +       cbk_cache_invalidate(node, tree);
76965 +
76966 +       /* update c_count of parent */
76967 +       if (znode_parent(node) != NULL) {
76968 +               assert("nikita-472", znode_parent(node)->c_count > 0);
76969 +               /* father, onto your hands I forward my spirit... */
76970 +               znode_parent(node)->c_count--;
76971 +               node->in_parent.node = NULL;
76972 +       } else {
76973 +               /* orphaned znode?! Root? */
76974 +       }
76975 +
76976 +       /* remove znode from hash-table */
76977 +       z_hash_remove_rcu(znode_get_htable(node), node);
76978 +}
76979 +
76980 +/* zdrop() -- Remove znode from the tree (delegates to jdrop() on its jnode).
76981 +
76982 +   This is called when znode is removed from the memory. */
76983 +static void zdrop(znode * node /* znode to finish with */ )
76984 +{
76985 +       jdrop(ZJNODE(node));
76986 +}
76987 +
76988 +/*
76989 + * put znode into right place in the hash table. This is called by relocate
76990 + * code. Takes the tree write lock itself; always returns 0.
76991 + */
76992 +int znode_rehash(znode * node /* node to rehash */ ,
76993 +                const reiser4_block_nr * new_block_nr /* new block number */ )
76994 +{
76995 +       z_hash_table *oldtable;
76996 +       z_hash_table *newtable;
76997 +       reiser4_tree *tree;
76998 +
76999 +       assert("nikita-2018", node != NULL);
77000 +
77001 +       tree = znode_get_tree(node);
77002 +       oldtable = znode_get_htable(node);
77003 +       newtable = get_htable(tree, new_block_nr);
77004 +
77005 +       write_lock_tree(tree);
77006 +       /* remove znode from hash-table */
77007 +       z_hash_remove_rcu(oldtable, node);
77008 +
77009 +       /* assertion no longer valid due to RCU */
77010 +       /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
77011 +
77012 +       /* update blocknr (both the jnode block and the hash key) */
77013 +       znode_set_block(node, new_block_nr);
77014 +       node->zjnode.key.z = *new_block_nr;
77015 +
77016 +       /* insert it into hash */
77017 +       z_hash_insert_rcu(newtable, node);
77018 +       write_unlock_tree(tree);
77019 +       return 0;
77020 +}
77021 +
77022 +/* ZNODE LOOKUP, GET, PUT */
77023 +
77024 +/* zlook() - get znode with given block_nr in a hash table or return NULL
77025 +
77026 +   If result is non-NULL then the znode's x_count is incremented.  Internal version
77027 +   accepts pre-computed hash index.  The hash table is accessed under caller's
77028 +   tree->hash_lock. NOTE(review): this body takes only rcu_read_lock(); confirm.
77029 +*/
77030 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
77031 +{
77032 +       znode *result;
77033 +       __u32 hash;
77034 +       z_hash_table *htable;
77035 +
77036 +       assert("jmacd-506", tree != NULL);
77037 +       assert("jmacd-507", blocknr != NULL);
77038 +
77039 +       htable = get_htable(tree, blocknr);
77040 +       hash = blknrhashfn(htable, blocknr);
77041 +
77042 +       rcu_read_lock();
77043 +       result = z_hash_find_index(htable, hash, blocknr);
77044 +
77045 +       if (result != NULL) {
77046 +               add_x_ref(ZJNODE(result));
77047 +               result = znode_rip_check(tree, result);
77048 +       }
77049 +       rcu_read_unlock();
77050 +
77051 +       return result;
77052 +}
77053 +
77054 +/* return hash table where znode with block @blocknr is (or should be)
77055 + * stored: zfake_table for unallocated disk addresses, else zhash_table */
77056 +static z_hash_table *get_htable(reiser4_tree * tree,
77057 +                               const reiser4_block_nr * const blocknr)
77058 +{
77059 +       z_hash_table *table;
77060 +       if (is_disk_addr_unallocated(blocknr))
77061 +               table = &tree->zfake_table;
77062 +       else
77063 +               table = &tree->zhash_table;
77064 +       return table;
77065 +}
77066 +
77067 +/* return hash table where znode @node is (or should be) stored, judged by its current block number */
77068 +static z_hash_table *znode_get_htable(const znode * node)
77069 +{
77070 +       return get_htable(znode_get_tree(node), znode_get_block(node));
77071 +}
77072 +
77073 +/* zget() - get znode from hash table, allocating it if necessary.
77074 +
77075 +   First a lookup like zlook()'s, locating a x-referenced znode if one
77076 +   exists.  If znode is not found, allocate new one and return.  Result
77077 +   is returned with x_count reference increased, or as ERR_PTR(): -ENOMEM
77078 +   if allocation fails, -EIO if the znode found has a different level.
77079 +   LOCKS TAKEN:   TREE_LOCK, ZNODE_LOCK
77080 +   LOCK ORDERING: NONE
77081 +*/
77082 +znode *zget(reiser4_tree * tree,
77083 +           const reiser4_block_nr * const blocknr,
77084 +           znode * parent, tree_level level, gfp_t gfp_flag)
77085 +{
77086 +       znode *result;
77087 +       __u32 hashi;
77088 +
77089 +       z_hash_table *zth;
77090 +
77091 +       assert("jmacd-512", tree != NULL);
77092 +       assert("jmacd-513", blocknr != NULL);
77093 +       assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
77094 +
77095 +       zth = get_htable(tree, blocknr);
77096 +       hashi = blknrhashfn(zth, blocknr);
77097 +
77098 +       /* NOTE-NIKITA address-as-unallocated-blocknr still is not
77099 +          implemented. */
77100 +
77101 +       z_hash_prefetch_bucket(zth, hashi);
77102 +
77103 +       rcu_read_lock();
77104 +       /* Find a matching BLOCKNR in the hash table.  If the znode is found,
77105 +          we obtain a reference (x_count) but the znode remains unlocked.
77106 +          Have to worry about race conditions later. */
77107 +       result = z_hash_find_index(zth, hashi, blocknr);
77108 +       /* According to the current design, the hash table lock protects new
77109 +          znode references. */
77110 +       if (result != NULL) {
77111 +               add_x_ref(ZJNODE(result));
77112 +               /* NOTE-NIKITA it should be so, but special case during
77113 +                  creation of new root makes such assertion highly
77114 +                  complicated.  */
77115 +               assert("nikita-2131", 1 || znode_parent(result) == parent ||
77116 +                      (ZF_ISSET(result, JNODE_ORPHAN)
77117 +                       && (znode_parent(result) == NULL)));
77118 +               result = znode_rip_check(tree, result);
77119 +       }
77120 +
77121 +       rcu_read_unlock();
77122 +
77123 +       if (!result) {
77124 +               znode *shadow;
77125 +
77126 +               result = zalloc(gfp_flag);
77127 +               if (!result) {
77128 +                       return ERR_PTR(RETERR(-ENOMEM));
77129 +               }
77130 +
77131 +               zinit(result, parent, tree);
77132 +               ZJNODE(result)->blocknr = *blocknr;
77133 +               ZJNODE(result)->key.z = *blocknr;
77134 +               result->level = level;
77135 +
77136 +               write_lock_tree(tree);
77137 +
77138 +               shadow = z_hash_find_index(zth, hashi, blocknr); /* re-check under the write lock */
77139 +               if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
77140 +                       jnode_list_remove(ZJNODE(result));
77141 +                       zfree(result);
77142 +                       result = shadow; /* a live znode for this block already exists: use it */
77143 +               } else {
77144 +                       result->version = znode_build_version(tree);
77145 +                       z_hash_insert_index_rcu(zth, hashi, result);
77146 +
77147 +                       if (parent != NULL)
77148 +                               ++parent->c_count;
77149 +               }
77150 +
77151 +               add_x_ref(ZJNODE(result));
77152 +
77153 +               write_unlock_tree(tree);
77154 +       }
77155 +#if REISER4_DEBUG
77156 +       if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
77157 +               reiser4_check_block(blocknr, 1);
77158 +#endif
77159 +       /* Check for invalid tree level, return -EIO */
77160 +       if (unlikely(znode_get_level(result) != level)) {
77161 +               warning("jmacd-504",
77162 +                       "Wrong level for cached block %llu: %i expecting %i",
77163 +                       (unsigned long long)(*blocknr), znode_get_level(result),
77164 +                       level);
77165 +               zput(result);
77166 +               return ERR_PTR(RETERR(-EIO));
77167 +       }
77168 +
77169 +       assert("nikita-1227", znode_invariant(result));
77170 +
77171 +       return result;
77172 +}
77173 +
77174 +/* ZNODE PLUGINS/DATA */
77175 +
77176 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
77177 +   stored at the fixed offset from the beginning of the node. */
77178 +static node_plugin *znode_guess_plugin(const znode * node      /* znode to guess
77179 +                                                                * plugin of */ )
77180 +{
77181 +       reiser4_tree *tree;
77182 +
77183 +       assert("nikita-1053", node != NULL);
77184 +       assert("nikita-1055", zdata(node) != NULL);
77185 +
77186 +       tree = znode_get_tree(node);
77187 +       assert("umka-053", tree != NULL);
77188 +
77189 +       if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
77190 +               return tree->nplug; /* single-plugin tree: no need to read the header */
77191 +       } else {
77192 +               return node_plugin_by_disk_id
77193 +                   (tree, &((common_node_header *) zdata(node))->plugin_id);
77194 +#ifdef GUESS_EXISTS /* NOTE(review): follows an unconditional return, so unreachable */
77195 +               reiser4_plugin *plugin;
77196 +
77197 +               /* NOTE-NIKITA add locking here when dynamic plugins will be
77198 +                * implemented */
77199 +               for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
77200 +                       if ((plugin->u.node.guess != NULL)
77201 +                           && plugin->u.node.guess(node))
77202 +                               return plugin;
77203 +               }
77204 +               warning("nikita-1057", "Cannot guess node plugin");
77205 +               print_znode("node", node);
77206 +               return NULL;
77207 +#endif
77208 +       }
77209 +}
77210 +
77211 +/* parse node header and install ->nplug (nothing to do if already set); returns 0 or error */
77212 +int zparse(znode * node /* znode to parse */ )
77213 +{
77214 +       int result;
77215 +
77216 +       assert("nikita-1233", node != NULL);
77217 +       assert("nikita-2370", zdata(node) != NULL);
77218 +
77219 +       if (node->nplug == NULL) {
77220 +               node_plugin *nplug;
77221 +
77222 +               nplug = znode_guess_plugin(node);
77223 +               if (likely(nplug != NULL)) {
77224 +                       result = nplug->parse(node);
77225 +                       if (likely(result == 0))
77226 +                               node->nplug = nplug; /* installed only after a successful parse */
77227 +               } else {
77228 +                       result = RETERR(-EIO);
77229 +               }
77230 +       } else
77231 +               result = 0;
77232 +       return result;
77233 +}
77234 +
77235 +/* zload with readahead: runs formatted_readahead() when @info != NULL, then jload() */
77236 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
77237 +{
77238 +       int result;
77239 +
77240 +       assert("nikita-484", node != NULL);
77241 +       assert("nikita-1377", znode_invariant(node));
77242 +       assert("jmacd-7771", !znode_above_root(node));
77243 +       assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
77244 +       assert("nikita-3016", reiser4_schedulable());
77245 +
77246 +       if (info)
77247 +               formatted_readahead(node, info);
77248 +
77249 +       result = jload(ZJNODE(node));
77250 +       assert("nikita-1378", znode_invariant(node));
77251 +       return result;
77252 +}
77253 +
77254 +/* load content of node into memory (zload_ra() without readahead info) */
77255 +int zload(znode * node)
77256 +{
77257 +       return zload_ra(node, NULL);
77258 +}
77259 +
77260 +/* call node plugin to initialise newly allocated node (done through jinit_new()). */
77261 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
77262 +{
77263 +       return jinit_new(ZJNODE(node), gfp_flags);
77264 +}
77265 +
77266 +/* drop reference to node data (via jrelse()). When last reference is dropped,
77267 +   data are unloaded. */
77268 +void zrelse(znode * node /* znode to release references to */ )
77269 +{
77270 +       assert("nikita-1381", znode_invariant(node));
77271 +
77272 +       jrelse(ZJNODE(node));
77273 +}
77274 +
77275 +/* returns free space in node, as reported by its node plugin's ->free_space() */
77276 +unsigned znode_free_space(znode * node /* znode to query */ )
77277 +{
77278 +       assert("nikita-852", node != NULL);
77279 +       return node_plugin_by_node(node)->free_space(node);
77280 +}
77281 +
77282 +/* right delimiting key of znode; dk_lock must be held (see asserts) */
77283 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
77284 +{
77285 +       assert("nikita-958", node != NULL);
77286 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77287 +       assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
77288 +       assert("nikita-30671", node->rd_key_version != 0);
77289 +       return &node->rd_key;
77290 +}
77291 +
77292 +/* left delimiting key of znode; dk_lock must be held (see asserts) */
77293 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
77294 +{
77295 +       assert("nikita-974", node != NULL);
77296 +       assert_rw_locked(&(znode_get_tree(node)->dk_lock));
77297 +       assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
77298 +       assert("nikita-30681", node->ld_key_version != 0);
77299 +       return &node->ld_key;
77300 +}
77301 +
77302 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
77303 +    ) /* counter stamped into {ld,rd}_key_version by the znode_set_*_key() setters */
77304 +
77305 +/* update right-delimiting key of @node (dk_lock write-held); returns address of the stored copy */
77306 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
77307 +{
77308 +       assert("nikita-2937", node != NULL);
77309 +       assert("nikita-2939", key != NULL);
77310 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77311 +       assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
77312 +       assert("nikita-2944",
77313 +              znode_is_any_locked(node) ||
77314 +              znode_get_level(node) != LEAF_LEVEL ||
77315 +              keyge(key, &node->rd_key) ||
77316 +              keyeq(&node->rd_key, reiser4_min_key()) ||
77317 +              ZF_ISSET(node, JNODE_HEARD_BANSHEE));
77318 +
77319 +       node->rd_key = *key;
77320 +       ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
77321 +       return &node->rd_key;
77322 +}
77323 +
77324 +/* update left-delimiting key of @node (dk_lock write-held); returns address of the stored copy */
77325 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
77326 +{
77327 +       assert("nikita-2940", node != NULL);
77328 +       assert("nikita-2941", key != NULL);
77329 +       assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
77330 +       assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
77331 +       assert("nikita-2943",
77332 +              znode_is_any_locked(node) || keyeq(&node->ld_key,
77333 +                                                 reiser4_min_key()));
77334 +
77335 +       node->ld_key = *key;
77336 +       ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
77337 +       return &node->ld_key;
77338 +}
77339 +
77340 +/* true if @key is inside key range for @node (both bounds inclusive); caller holds dk_lock */
77341 +int znode_contains_key(znode * node /* znode to look in */ ,
77342 +                      const reiser4_key * key /* key to look for */ )
77343 +{
77344 +       assert("nikita-1237", node != NULL);
77345 +       assert("nikita-1238", key != NULL);
77346 +
77347 +       /* left_delimiting_key <= key <= right_delimiting_key */
77348 +       return keyle(znode_get_ld_key(node), key)
77349 +           && keyle(key, znode_get_rd_key(node));
77350 +}
77351 +
77352 +/* same as znode_contains_key(), but takes dk lock for reading around the check */
77353 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
77354 +                           const reiser4_key * key /* key to look for */ )
77355 +{
77356 +       int result;
77357 +
77358 +       assert("umka-056", node != NULL);
77359 +       assert("umka-057", key != NULL);
77360 +
77361 +       read_lock_dk(znode_get_tree(node));
77362 +       result = znode_contains_key(node, key);
77363 +       read_unlock_dk(znode_get_tree(node));
77364 +       return result;
77365 +}
77366 +
77367 +/* get parent pointer, assuming tree is not locked (plain read of in_parent.node) */
77368 +znode *znode_parent_nolock(const znode * node /* child znode */ )
77369 +{
77370 +       assert("nikita-1444", node != NULL);
77371 +       return node->in_parent.node;
77372 +}
77373 +
77374 +/* get parent pointer of znode; asserts that a tree lock is held */
77375 +znode *znode_parent(const znode * node /* child znode */ )
77376 +{
77377 +       assert("nikita-1226", node != NULL);
77378 +       assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
77379 +       return znode_parent_nolock(node);
77380 +}
77381 +
77382 +/* detect uber znode used to protect in-superblock tree root pointer (blocknr == UBER_TREE_ADDR) */
77383 +int znode_above_root(const znode * node /* znode to query */ )
77384 +{
77385 +       assert("umka-059", node != NULL);
77386 +
77387 +       return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
77388 +}
77389 +
77390 +/* check that @node is root---that its block number is recorded in the tree as
77391 +   that of root node (debug builds only) */
77392 +#if REISER4_DEBUG
77393 +static int znode_is_true_root(const znode * node /* znode to query */ )
77394 +{
77395 +       assert("umka-060", node != NULL);
77396 +       assert("umka-061", current_tree != NULL);
77397 +
77398 +       return disk_addr_eq(znode_get_block(node),
77399 +                           &znode_get_tree(node)->root_block);
77400 +}
77401 +#endif
77402 +
77403 +/* check that @node is root: its level equals the height of its tree */
77404 +int znode_is_root(const znode * node /* znode to query */ )
77405 +{
77406 +       assert("nikita-1206", node != NULL);
77407 +
77408 +       return znode_get_level(node) == znode_get_tree(node)->height;
77409 +}
77410 +
77411 +/* Returns true if @node was just created by zget() and wasn't ever loaded
77412 +   into memory (tested as: it has no page attached). */
77413 +/* NIKITA-HANS: yes */
77414 +int znode_just_created(const znode * node)
77415 +{
77416 +       assert("nikita-2188", node != NULL);
77417 +       return (znode_page(node) == NULL);
77418 +}
77419 +
77420 +/* obtain updated ->znode_epoch (incremented under epoch_lock). See seal.c for description. */
77421 +__u64 znode_build_version(reiser4_tree * tree)
77422 +{
77423 +       __u64 result;
77424 +
77425 +       spin_lock(&tree->epoch_lock);
77426 +       result = ++tree->znode_epoch;
77427 +       spin_unlock(&tree->epoch_lock);
77428 +       return result;
77429 +}
77430 +
77431 +void init_load_count(load_count * dh /* handle to zero-fill */ )
77432 +{
77433 +       assert("nikita-2105", dh != NULL);
77434 +       memset(dh, 0, sizeof *dh);
77435 +}
77436 +
77437 +void done_load_count(load_count * dh /* handle to release */ )
77438 +{
77439 +       assert("nikita-2106", dh != NULL);
77440 +       if (dh->node != NULL) {
77441 +               for (; dh->d_ref > 0; --dh->d_ref) /* one zrelse() per counted load */
77442 +                       zrelse(dh->node);
77443 +               dh->node = NULL;
77444 +       }
77445 +}
77446 +
77447 +static int incr_load_count(load_count * dh /* handle with ->node already set */ )
77448 +{
77449 +       int result;
77450 +
77451 +       assert("nikita-2110", dh != NULL);
77452 +       assert("nikita-2111", dh->node != NULL);
77453 +
77454 +       result = zload(dh->node);
77455 +       if (result == 0)
77456 +               ++dh->d_ref; /* only successful loads are counted */
77457 +       return result;
77458 +}
77459 +
77460 +int incr_load_count_znode(load_count * dh, znode * node /* node to bind to @dh and load */ )
77461 +{
77462 +       assert("nikita-2107", dh != NULL);
77463 +       assert("nikita-2158", node != NULL);
77464 +       assert("nikita-2109",
77465 +              ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
77466 +
77467 +       dh->node = node;
77468 +       return incr_load_count(dh); /* zload() result: 0 on success */
77469 +}
77470 +
77471 +int incr_load_count_jnode(load_count * dh, jnode * node /* jnode, possibly a znode */ )
77472 +{
77473 +       if (jnode_is_znode(node)) {
77474 +               return incr_load_count_znode(dh, JZNODE(node));
77475 +       }
77476 +       return 0; /* non-znode jnodes are left alone and reported as success */
77477 +}
77478 +
77479 +void copy_load_count(load_count * new, load_count * old /* handle to duplicate */ )
77480 +{
77481 +       int ret = 0;
77482 +       done_load_count(new); /* drop whatever @new held before */
77483 +       new->node = old->node;
77484 +       new->d_ref = 0;
77485 +
77486 +       while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
77487 +       } /* empty body: load until @new holds as many loads as @old, or a load fails */
77488 +
77489 +       assert("jmacd-87589", ret == 0);
77490 +}
77491 +
77492 +void move_load_count(load_count * new, load_count * old /* emptied on return */ )
77493 +{
77494 +       done_load_count(new); /* drop whatever @new held before */
77495 +       new->node = old->node;
77496 +       new->d_ref = old->d_ref;
77497 +       old->node = NULL; /* loads now belong to @new; nothing is re-loaded */
77498 +       old->d_ref = 0;
77499 +}
77500 +
77501 +/* convert parent pointer into coord: same node and item_pos, ->between set to AT_UNIT */
77502 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
77503 +{
77504 +       assert("nikita-3204", pcoord != NULL);
77505 +       assert("nikita-3205", coord != NULL);
77506 +
77507 +       coord_init_first_unit_nocheck(coord, pcoord->node);
77508 +       coord_set_item_pos(coord, pcoord->item_pos);
77509 +       coord->between = AT_UNIT;
77510 +}
77511 +
77512 +/* pack coord into parent_coord_t (only ->node and ->item_pos are copied) */
77513 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
77514 +{
77515 +       assert("nikita-3206", pcoord != NULL);
77516 +       assert("nikita-3207", coord != NULL);
77517 +
77518 +       pcoord->node = coord->node;
77519 +       pcoord->item_pos = coord->item_pos;
77520 +}
77521 +
77522 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
77523 +   look for comments there) */
77524 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
77525 +{
77526 +       pcoord->node = (znode *) node; /* cast drops const */
77527 +       pcoord->item_pos = (unsigned short)~0; /* all-ones; presumably "position unknown" -- confirm */
77528 +}
77529 +
77530 +#if REISER4_DEBUG
77531 +
77532 +/* debugging aid: znode invariant; non-0 if all clauses hold, *@msg names the last clause evaluated */
77533 +static int znode_invariant_f(const znode * node /* znode to check */ ,
77534 +                            char const **msg   /* where to store error
77535 +                                                * message, if any */ )
77536 +{
77537 +#define _ergo(ant, con)                                                \
77538 +       ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
77539 +
77540 +#define _equi(e1, e2)                                          \
77541 +       ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
77542 +
77543 +#define _check(exp) ((*msg) = #exp, (exp))
77544 +
77545 +       return jnode_invariant_f(ZJNODE(node), msg) &&
77546 +           /* [znode-fake] invariant */
77547 +           /* fake znode doesn't have a parent, and */
77548 +           _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
77549 +           /* there is another way to express this very check, and */
77550 +           _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
77551 +           /* it has special block number, and */
77552 +           _ergo(znode_get_level(node) == 0,
77553 +                 disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77554 +           /* it is the only znode with such block number, and */
77555 +           _ergo(!znode_above_root(node) && znode_is_loaded(node),
77556 +                 !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
77557 +           /* it is parent of the tree root node */
77558 +           _ergo(znode_is_true_root(node),
77559 +                 znode_above_root(znode_parent(node))) &&
77560 +           /* [znode-level] invariant */
77561 +           /* level of parent znode is one larger than that of child,
77562 +              except for the fake znode, and */
77563 +           _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
77564 +                 znode_get_level(znode_parent(node)) ==
77565 +                 znode_get_level(node) + 1) &&
77566 +           /* left neighbor is at the same level, and */
77567 +           _ergo(znode_is_left_connected(node) && node->left != NULL,
77568 +                 znode_get_level(node) == znode_get_level(node->left)) &&
77569 +           /* right neighbor is at the same level */
77570 +           _ergo(znode_is_right_connected(node) && node->right != NULL,
77571 +                 znode_get_level(node) == znode_get_level(node->right)) &&
77572 +           /* [znode-connected] invariant */
77573 +           _ergo(node->left != NULL, znode_is_left_connected(node)) &&
77574 +           _ergo(node->right != NULL, znode_is_right_connected(node)) &&
77575 +           _ergo(!znode_is_root(node) && node->left != NULL,
77576 +                 znode_is_right_connected(node->left) &&
77577 +                 node->left->right == node) &&
77578 +           _ergo(!znode_is_root(node) && node->right != NULL,
77579 +                 znode_is_left_connected(node->right) &&
77580 +                 node->right->left == node) &&
77581 +           /* [znode-c_count] invariant */
77582 +           /* for any znode, c_count of its parent is greater than 0 */
77583 +           _ergo(znode_parent(node) != NULL &&
77584 +                 !znode_above_root(znode_parent(node)),
77585 +                 znode_parent(node)->c_count > 0) &&
77586 +           /* leaves don't have children */
77587 +           _ergo(znode_get_level(node) == LEAF_LEVEL,
77588 +                 node->c_count == 0) &&
77589 +           _check(node->zjnode.jnodes.prev != NULL) &&
77590 +           _check(node->zjnode.jnodes.next != NULL) &&
77591 +           /* orphan doesn't have a parent */
77592 +           _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
77593 +           /* [znode-modify] invariant */
77594 +           /* if znode is not write-locked, its checksum remains
77595 +            * invariant */
77596 +           /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
77597 +            * cannot check this. */
77598 +           /* [znode-refs] invariant */
77599 +           /* only referenced znode can be long-term locked */
77600 +           _ergo(znode_is_locked(node),
77601 +                 atomic_read(&ZJNODE(node)->x_count) != 0);
77602 +}
77603 +
77604 +/* debugging aid: check znode invariant; report a failed condition via warning(). Returns 0 iff the check failed. */
77605 +int znode_invariant(znode * node /* znode to check */ )
77606 +{
77607 +       char const *failed_msg;
77608 +       int result;
77609 +
77610 +       assert("umka-063", node != NULL);
77611 +       assert("umka-064", current_tree != NULL);
77612 +
77613 +       spin_lock_znode(node);  /* check runs under the znode spin lock ... */
77614 +       read_lock_tree(znode_get_tree(node));   /* ... and the tree read lock */
77615 +       result = znode_invariant_f(node, &failed_msg);
77616 +       if (!result) {
77617 +               /* print_znode("corrupted node", node); */
77618 +               warning("jmacd-555", "Condition %s failed", failed_msg);
77619 +       }
77620 +       read_unlock_tree(znode_get_tree(node)); /* release in reverse order of acquisition */
77621 +       spin_unlock_znode(node);
77622 +       return result;
77623 +}
77624 +
77625 +/* return non-0 iff data are loaded into znode */
77626 +int znode_is_loaded(const znode * node /* znode to query */ )
77627 +{
77628 +       assert("nikita-497", node != NULL);
77629 +       return jnode_is_loaded(ZJNODE(node));
77630 +}
77631 +
77632 +unsigned long znode_times_locked(const znode * z)
77633 +{
77634 +       return z->times_locked;
77635 +}
77636 +
77637 +#endif                         /* REISER4_DEBUG */
77638 +
77639 +/* Make Linus happy.
77640 +   Local variables:
77641 +   c-indentation-style: "K&R"
77642 +   mode-name: "LC"
77643 +   c-basic-offset: 8
77644 +   tab-width: 8
77645 +   fill-column: 120
77646 +   End:
77647 +*/
77648 diff -urN linux-2.6.27.orig/fs/reiser4/znode.h linux-2.6.27/fs/reiser4/znode.h
77649 --- linux-2.6.27.orig/fs/reiser4/znode.h        1970-01-01 03:00:00.000000000 +0300
77650 +++ linux-2.6.27/fs/reiser4/znode.h     2008-10-12 18:20:01.000000000 +0400
77651 @@ -0,0 +1,433 @@
77652 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
77653 + * reiser4/README */
77654 +
77655 +/* Declaration of znode (Zam's node). See znode.c for more details. */
77656 +
77657 +#ifndef __ZNODE_H__
77658 +#define __ZNODE_H__
77659 +
77660 +#include "forward.h"
77661 +#include "debug.h"
77662 +#include "dformat.h"
77663 +#include "key.h"
77664 +#include "coord.h"
77665 +#include "plugin/node/node.h"
77666 +#include "jnode.h"
77667 +#include "lock.h"
77668 +#include "readahead.h"
77669 +
77670 +#include <linux/types.h>
77671 +#include <linux/spinlock.h>
77672 +#include <linux/pagemap.h>     /* for PAGE_CACHE_SIZE */
77673 +#include <asm/atomic.h>
77674 +
77675 +/* znode tracks its position within parent (internal item in a parent node,
77676 + * that contains znode's block number). */
77677 +typedef struct parent_coord {
77678 +       znode *node;
77679 +       pos_in_node_t item_pos;
77680 +} parent_coord_t;
77681 +
77682 +/* &znode - node in a reiser4 tree.
77683 +
77684 +   NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
77685 +   cacheline pressure.
77686 +
77687 +   Locking:
77688 +
77689 +   Long term: data in a disk node attached to this znode are protected
77690 +   by long term, deadlock aware lock ->lock;
77691 +
77692 +   Spin lock: the following fields are protected by the spin lock:
77693 +
77694 +    ->lock
77695 +
77696 +   Following fields are protected by the global tree lock:
77697 +
77698 +    ->left
77699 +    ->right
77700 +    ->in_parent
77701 +    ->c_count
77702 +
77703 +   Following fields are protected by the global delimiting key lock (dk_lock):
77704 +
77705 +    ->ld_key (to update ->ld_key long-term lock on the node is also required)
77706 +    ->rd_key
77707 +
77708 +   Following fields are protected by the long term lock:
77709 +
77710 +    ->nr_items
77711 +
77712 +   ->node_plugin is never changed once set. This means that after code made
77713 +   itself sure that field is valid it can be accessed without any additional
77714 +   locking.
77715 +
77716 +   ->level is immutable.
77717 +
77718 +   Invariants involving this data-type:
77719 +
77720 +      [znode-fake]
77721 +      [znode-level]
77722 +      [znode-connected]
77723 +      [znode-c_count]
77724 +      [znode-refs]
77725 +      [jnode-refs]
77726 +      [jnode-queued]
77727 +      [znode-modify]
77728 +
77729 +    For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
77730 +    Suggestions for how to do that are desired.*/
77731 +struct znode {
77732 +       /* Embedded jnode. */
77733 +       jnode zjnode;
77734 +
77735 +       /* contains two subfields, node and item_pos (see parent_coord_t).
77736 +          NOTE(review): older text named pos_in_node/pos_in_unit - confirm.
77737 +          item_pos is only a hint that is cached to
77738 +          speed up lookups during balancing. It is not required to be up to
77739 +          date. Synched in find_child_ptr().
77740 +
77741 +          This value allows us to avoid expensive binary searches.
77742 +
77743 +          in_parent.node points to the parent of this node, and is NOT a
77744 +          hint.
77745 +        */
77746 +       parent_coord_t in_parent;
77747 +
77748 +       /*
77749 +        * sibling list pointers
77750 +        */
77751 +
77752 +       /* left-neighbor */
77753 +       znode *left;
77754 +       /* right-neighbor */
77755 +       znode *right;
77756 +
77757 +       /* long term lock on node content. This lock supports deadlock
77758 +          detection. See lock.c
77759 +        */
77760 +       zlock lock;
77761 +
77762 +       /* You cannot remove from memory a node that has children in
77763 +          memory. This is because we rely on the fact that parent of given
77764 +          node can always be reached without blocking for io. When reading a
77765 +          node into memory you must increase the c_count of its parent, when
77766 +          removing it from memory you must decrease the c_count.  This makes
77767 +          the code simpler, and the cases where it is suboptimal are truly
77768 +          obscure.
77769 +        */
77770 +       int c_count;
77771 +
77772 +       /* plugin of node attached to this znode. NULL if znode is not
77773 +          loaded. */
77774 +       node_plugin *nplug;
77775 +
77776 +       /* version of znode data. This is increased on each modification. This
77777 +        * is necessary to implement seals (see seal.[ch]) efficiently. */
77778 +       __u64 version;
77779 +
77780 +       /* left delimiting key. Necessary to efficiently perform
77781 +          balancing with node-level locking. Kept in memory only. */
77782 +       reiser4_key ld_key;
77783 +       /* right delimiting key. */
77784 +       reiser4_key rd_key;
77785 +
77786 +       /* znode's tree level */
77787 +       __u16 level;
77788 +       /* number of items in this node. This field is modified by node
77789 +        * plugin. */
77790 +       __u16 nr_items;
77791 +
77792 +#if REISER4_DEBUG
77793 +       void *creator;
77794 +       reiser4_key first_key;
77795 +       unsigned long times_locked;
77796 +       int left_version;       /* when node->left was updated */
77797 +       int right_version;      /* when node->right was updated */
77798 +       int ld_key_version;     /* when node->ld_key was updated */
77799 +       int rd_key_version;     /* when node->rd_key was updated */
77800 +#endif
77801 +
77802 +} __attribute__ ((aligned(16)));
77803 +
77804 +ON_DEBUG(extern atomic_t delim_key_version;
77805 +    )
77806 +
77807 +/* In general I think these macros should not be exposed. */
77808 +#define znode_is_locked(node)          (lock_is_locked(&node->lock))
77809 +#define znode_is_rlocked(node)         (lock_is_rlocked(&node->lock))
77810 +#define znode_is_wlocked(node)         (lock_is_wlocked(&node->lock))
77811 +#define znode_is_wlocked_once(node)    (lock_is_wlocked_once(&node->lock))
77812 +#define znode_can_be_rlocked(node)     (lock_can_be_rlocked(&node->lock))
77813 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
77814 +/* Macros for accessing the znode state. */
77815 +#define        ZF_CLR(p,f)             JF_CLR  (ZJNODE(p), (f))
77816 +#define        ZF_ISSET(p,f)           JF_ISSET(ZJNODE(p), (f))
77817 +#define        ZF_SET(p,f)             JF_SET  (ZJNODE(p), (f))
77818 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
77819 +                  znode * parent, tree_level level, gfp_t gfp_flag);
77820 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
77821 +extern int zload(znode * node);
77822 +extern int zload_ra(znode * node, ra_info_t * info);
77823 +extern int zinit_new(znode * node, gfp_t gfp_flags);
77824 +extern void zrelse(znode * node);
77825 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
77826 +
77827 +/* size of data in znode: always PAGE_CACHE_SIZE; @node is only checked by the assertion */
77828 +static inline unsigned
77829 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
77830 +{
77831 +       assert("nikita-1416", node != NULL);
77832 +       return PAGE_CACHE_SIZE;
77833 +}
77834 +
77835 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
77836 +                                 coord_t * coord);
77837 +extern void coord_to_parent_coord(const coord_t * coord,
77838 +                                 parent_coord_t * pcoord);
77839 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
77840 +
77841 +extern unsigned znode_free_space(znode * node);
77842 +
77843 +extern reiser4_key *znode_get_rd_key(znode * node);
77844 +extern reiser4_key *znode_get_ld_key(znode * node);
77845 +
77846 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
77847 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
77848 +
77849 +/* `connected' state checks */
77850 +static inline int znode_is_right_connected(const znode * node)
77851 +{
77852 +       return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
77853 +}
77854 +
77855 +static inline int znode_is_left_connected(const znode * node)
77856 +{
77857 +       return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
77858 +}
77859 +
77860 +static inline int znode_is_connected(const znode * node)
77861 +{
77862 +       return znode_is_right_connected(node) && znode_is_left_connected(node);
77863 +}
77864 +
77865 +extern int znode_shift_order;
77866 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
77867 +extern void znode_remove(znode *, reiser4_tree *);
77868 +extern znode *znode_parent(const znode * node);
77869 +extern znode *znode_parent_nolock(const znode * node);
77870 +extern int znode_above_root(const znode * node);
77871 +extern int init_znodes(void);
77872 +extern void done_znodes(void);
77873 +extern int znodes_tree_init(reiser4_tree * ztree);
77874 +extern void znodes_tree_done(reiser4_tree * ztree);
77875 +extern int znode_contains_key(znode * node, const reiser4_key * key);
77876 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
77877 +extern unsigned znode_save_free_space(znode * node);
77878 +extern unsigned znode_recover_free_space(znode * node);
77879 +extern znode *zalloc(gfp_t gfp_flag);
77880 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
77881 +extern int zparse(znode * node);
77882 +
77883 +extern int znode_just_created(const znode * node);
77884 +
77885 +extern void zfree(znode * node);
77886 +
77887 +#if REISER4_DEBUG
77888 +extern void print_znode(const char *prefix, const znode * node);
77889 +#else
77890 +#define print_znode( p, n ) noop
77891 +#endif
77892 +
77893 +/* Make it look like various znode functions exist instead of treating znodes as
77894 +   jnodes in znode-specific code. */
77895 +#define znode_page(x)               jnode_page ( ZJNODE(x) )
77896 +#define zdata(x)                    jdata ( ZJNODE(x) )
77897 +#define znode_get_block(x)          jnode_get_block ( ZJNODE(x) )
77898 +#define znode_created(x)            jnode_created ( ZJNODE(x) )
77899 +#define znode_set_created(x)        jnode_set_created ( ZJNODE(x) )
77900 +#define znode_convertible(x)        jnode_convertible (ZJNODE(x))
77901 +#define znode_set_convertible(x)    jnode_set_convertible (ZJNODE(x))
77902 +
77903 +#define znode_is_dirty(x)           jnode_is_dirty    ( ZJNODE(x) )
77904 +#define znode_check_dirty(x)        jnode_check_dirty ( ZJNODE(x) )
77905 +#define znode_make_clean(x)         jnode_make_clean   ( ZJNODE(x) )
77906 +#define znode_set_block(x, b)       jnode_set_block ( ZJNODE(x), (b) )
77907 +
77908 +#define spin_lock_znode(x)          spin_lock_jnode ( ZJNODE(x) )
77909 +#define spin_unlock_znode(x)        spin_unlock_jnode ( ZJNODE(x) )
77910 +#define spin_trylock_znode(x)       spin_trylock_jnode ( ZJNODE(x) )
77911 +#define spin_znode_is_locked(x)     spin_jnode_is_locked ( ZJNODE(x) )
77912 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
77913 +
77914 +#if REISER4_DEBUG
77915 +extern int znode_x_count_is_protected(const znode * node);
77916 +extern int znode_invariant(znode * node);
77917 +#endif
77918 +
77919 +/* acquire reference to @node */
77920 +static inline znode *zref(znode * node)
77921 +{
77922 +       /* change of x_count from 0 to 1 is protected by tree spin-lock */
77923 +       return JZNODE(jref(ZJNODE(node)));
77924 +}
77925 +
77926 +/* release reference to @node: asserts znode_invariant(@node), then calls jput() on the embedded jnode */
77927 +static inline void zput(znode * node)
77928 +{
77929 +       assert("nikita-3564", znode_invariant(node));
77930 +       jput(ZJNODE(node));
77931 +}
77932 +
77933 +/* get the level field for a znode */
77934 +static inline tree_level znode_get_level(const znode * node)
77935 +{
77936 +       return node->level;
77937 +}
77938 +
77939 +/* get the level field for a jnode */
77940 +static inline tree_level jnode_get_level(const jnode * node)
77941 +{
77942 +       if (jnode_is_znode(node))
77943 +               return znode_get_level(JZNODE(node));
77944 +       else
77945 +               /* unformatted nodes are all at the LEAF_LEVEL and for
77946 +                  "semi-formatted" nodes like bitmaps, level doesn't matter. */
77947 +               return LEAF_LEVEL;
77948 +}
77949 +
77950 +/* true if jnode is on leaf level */
77951 +static inline int jnode_is_leaf(const jnode * node)
77952 +{
77953 +       if (jnode_is_znode(node))
77954 +               return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
77955 +       if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
77956 +               return 1;
77957 +       return 0;
77958 +}
77959 +
77960 +/* return znode's tree */
77961 +static inline reiser4_tree *znode_get_tree(const znode * node)
77962 +{
77963 +       assert("nikita-2692", node != NULL);
77964 +       return jnode_get_tree(ZJNODE(node));
77965 +}
77966 +
77967 +/* resolve race with zput: returns JZNODE() of what jnode_rip_sync() yields for @node, or NULL if it yields NULL */
77968 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
77969 +{
77970 +       jnode *j;
77971 +
77972 +       j = jnode_rip_sync(tree, ZJNODE(node));
77973 +       if (likely(j != NULL))
77974 +               node = JZNODE(j);
77975 +       else
77976 +               node = NULL;
77977 +       return node;
77978 +}
77979 +
77980 +#if defined(REISER4_DEBUG)
77981 +int znode_is_loaded(const znode * node /* znode to query */ );
77982 +#endif
77983 +
77984 +extern __u64 znode_build_version(reiser4_tree * tree);
77985 +
77986 +/* Data-handles.  A data handle object manages pairing calls to zload() and zrelse().  We
77987 +   must load the data for a node in many places.  We could do this by simply calling
77988 +   zload() everywhere; the difficulty arises when we must release the loaded data by
77989 +   calling zrelse.  In a function with many possible error/return paths, it requires extra
77990 +   work to figure out which exit paths must call zrelse and those which do not.  The data
77991 +   handle automatically calls zrelse for every zload that it is responsible for.  In that
77992 +   sense, it acts much like a lock_handle.
77993 +*/
77994 +typedef struct load_count {
77995 +       znode *node;
77996 +       int d_ref;
77997 +} load_count;
77998 +
77999 +extern void init_load_count(load_count * lc);  /* Initialize a load_count: set the current node to NULL. */
78000 +extern void done_load_count(load_count * dh);  /* Finalize a load_count: call zrelse() if necessary */
78001 +extern int incr_load_count_znode(load_count * dh, znode * node);       /* Set the argument znode to the current node, call zload(). */
78002 +extern int incr_load_count_jnode(load_count * dh, jnode * node);       /* If the argument jnode is formatted, do the same as
78003 +                                                                        * incr_load_count_znode, otherwise do nothing (unformatted nodes
78004 +                                                                        * don't require zload/zrelse treatment). */
78005 +extern void move_load_count(load_count * new, load_count * old);       /* Move the contents of a load_count.  Old handle is released. */
78006 +extern void copy_load_count(load_count * new, load_count * old);       /* Copy the contents of a load_count.  Old handle remains held. */
78007 +
78008 +/* Variable initializers for load_count: compound literals of type load_count (not a pointer to it). */
78009 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
78010 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
78011 +/* A convenience macro for use in assertions or debug-only code, where loaded
78012 +   data is only required to perform the debugging check.  This macro
78013 +   encapsulates an expression inside a pair of calls to zload()/zrelse(). */
78014 +#define WITH_DATA( node, exp )                         \
78015 +({                                                     \
78016 +       long __with_dh_result;                          \
78017 +       znode *__with_dh_node;                          \
78018 +                                                       \
78019 +       __with_dh_node = ( node );                      \
78020 +       __with_dh_result = zload( __with_dh_node );     \
78021 +       if( __with_dh_result == 0 ) {                   \
78022 +               __with_dh_result = ( long )( exp );     \
78023 +               zrelse( __with_dh_node );               \
78024 +       }                                               \
78025 +       __with_dh_result;                               \
78026 +})
78027 +
78028 +/* Same as above, but accepts a return value in case zload fails. */
78029 +#define WITH_DATA_RET( node, ret, exp )                        \
78030 +({                                                     \
78031 +       int __with_dh_result;                           \
78032 +       znode *__with_dh_node;                          \
78033 +                                                       \
78034 +       __with_dh_node = ( node );                      \
78035 +       __with_dh_result = zload( __with_dh_node );     \
78036 +       if( __with_dh_result == 0 ) {                   \
78037 +               __with_dh_result = ( int )( exp );      \
78038 +               zrelse( __with_dh_node );               \
78039 +       } else                                          \
78040 +               __with_dh_result = ( ret );             \
78041 +       __with_dh_result;                               \
78042 +})
78043 +
78044 +#define WITH_COORD(coord, exp)                 \
78045 +({                                             \
78046 +       coord_t *__coord;                       \
78047 +                                               \
78048 +       __coord = (coord);                      \
78049 +       coord_clear_iplug(__coord);             \
78050 +       WITH_DATA(__coord->node, exp);          \
78051 +})
78052 +
78053 +#if REISER4_DEBUG
78054 +#define STORE_COUNTERS                                         \
78055 +       reiser4_lock_cnt_info __entry_counters =                \
78056 +               *reiser4_lock_counters()
78057 +#define CHECK_COUNTERS                                                 \
78058 +ON_DEBUG_CONTEXT(                                                      \
78059 +({                                                                     \
78060 +       __entry_counters.x_refs = reiser4_lock_counters() -> x_refs;    \
78061 +       __entry_counters.t_refs = reiser4_lock_counters() -> t_refs;    \
78062 +       __entry_counters.d_refs = reiser4_lock_counters() -> d_refs;    \
78063 +       assert("nikita-2159",                                           \
78064 +              !memcmp(&__entry_counters, reiser4_lock_counters(),      \
78065 +                      sizeof __entry_counters));                       \
78066 +}) )
78067 +
78068 +#else
78069 +#define STORE_COUNTERS
78070 +#define CHECK_COUNTERS noop
78071 +#endif
78072 +
78073 +/* __ZNODE_H__ */
78074 +#endif
78075 +
78076 +/* Make Linus happy.
78077 +   Local variables:
78078 +   c-indentation-style: "K&R"
78079 +   mode-name: "LC"
78080 +   c-basic-offset: 8
78081 +   tab-width: 8
78082 +   fill-column: 120
78083 +   End:
78084 +*/
78085 diff -urN linux-2.6.27.orig/include/linux/fs.h linux-2.6.27/include/linux/fs.h
78086 --- linux-2.6.27.orig/include/linux/fs.h        2008-10-13 01:35:44.000000000 +0400
78087 +++ linux-2.6.27/include/linux/fs.h     2008-10-12 18:20:01.000000000 +0400
78088 @@ -1321,6 +1321,8 @@
78089         void (*clear_inode) (struct inode *);
78090         void (*umount_begin) (struct super_block *);
78091
78092 +       void (*sync_inodes) (struct super_block *sb,
78093 +                               struct writeback_control *wbc);
78094         int (*show_options)(struct seq_file *, struct vfsmount *);
78095         int (*show_stats)(struct seq_file *, struct vfsmount *);
78096  #ifdef CONFIG_QUOTA
78097 @@ -1747,6 +1749,7 @@
78098  extern void generic_sync_sb_inodes(struct super_block *sb,
78099                                 struct writeback_control *wbc);
78100  extern int write_inode_now(struct inode *, int);
78101 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
78102  extern int filemap_fdatawrite(struct address_space *);
78103  extern int filemap_flush(struct address_space *);
78104  extern int filemap_fdatawait(struct address_space *);
78105 diff -urN linux-2.6.27.orig/mm/filemap.c linux-2.6.27/mm/filemap.c
78106 --- linux-2.6.27.orig/mm/filemap.c      2008-10-13 01:35:44.000000000 +0400
78107 +++ linux-2.6.27/mm/filemap.c   2008-10-12 18:20:01.000000000 +0400
78108 @@ -134,6 +134,7 @@
78109                 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
78110         }
78111  }
78112 +EXPORT_SYMBOL(__remove_from_page_cache);
78113
78114  void remove_from_page_cache(struct page *page)
78115  {
78116 @@ -145,6 +146,7 @@
78117         __remove_from_page_cache(page);
78118         spin_unlock_irq(&mapping->tree_lock);
78119  }
78120 +EXPORT_SYMBOL(remove_from_page_cache);
78121
78122  static int sync_page(void *word)
78123  {
78124 @@ -796,6 +798,7 @@
78125         rcu_read_unlock();
78126         return ret;
78127  }
78128 +EXPORT_SYMBOL(add_to_page_cache_lru);
78129
78130  /**
78131   * find_get_pages_contig - gang contiguous pagecache lookup
78132 @@ -968,6 +971,7 @@
78133
78134         ra->ra_pages /= 4;
78135  }
78136 +EXPORT_SYMBOL(find_get_pages);
78137
78138  /**
78139   * do_generic_file_read - generic file read routine